-
Notifications
You must be signed in to change notification settings - Fork 120
/
prepBirthWeightData.R
83 lines (68 loc) · 2.23 KB
/
prepBirthWeightData.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
# Load the tab-separated natality sample. The first row holds the column
# names, and string columns are kept as character vectors (not factors).
data <- read.table(
  "natal2010Sample.tsv.gz",
  header = TRUE,
  sep = "\t",
  stringsAsFactors = FALSE
)
# Convert a Y/N coded column into a logical vector.
#   col: vector of codes; "Y" maps to TRUE, "" and "U" map to NA,
#        and every other non-missing value maps to FALSE.
# Returns a logical vector the same length as col.
makevarYN <- function(col) {
  is_unknown <- col %in% c("", "U")
  is_yes <- col == "Y"
  ifelse(is_unknown, NA, is_yes)
}
# Replace a sentinel code in a numeric column with NA.
#   col:      vector of values
#   sentinel: the value that stands for "missing" in col
# Returns col with every element equal to sentinel replaced by NA.
makevarNum <- function(col, sentinel) {
  is_sentinel <- col == sentinel
  ifelse(is_sentinel, yes = NA, no = col)
}
# Convert a 1/2/9 coded column into a logical vector.
#   col: vector of codes; 1 maps to TRUE, 9 maps to NA,
#        and every other non-missing value maps to FALSE.
# Returns a logical vector the same length as col.
makevar12 <- function(col) {
  is_unknown <- col == 9
  is_yes <- col == 1
  ifelse(is_unknown, NA, is_yes)
}
# Clean the raw columns in place: turn coded flags into logicals and
# replace numeric "unknown" sentinels with NA.

# tobacco use: CIG_REC (Y, N, U, Blank)
data$CIG_REC <- makevarYN(data$CIG_REC)

# maternal prepregnancy weight (pounds); 999 is treated as missing
# capped at 400lbs
data$PWGT <- makevarNum(data$PWGT, 999)

# maternal delivery weight (pounds); 999 is treated as missing.
# Fix: this was previously computed from data$PWGT, which silently
# overwrote delivery weight with prepregnancy weight.
# NOTE(review): assumes the input file has a DWGT column -- confirm.
data$DWGT <- makevarNum(data$DWGT, 999)

# weight gain during pregnancy; 99 is treated as missing
data$WTGAIN <- makevarNum(data$WTGAIN, 99)

# birth weight in grams; 9999 is treated as missing
data$DBWT <- makevarNum(data$DBWT, 9999)

# mother's age; 99 is treated as missing
data$MAGER <- makevarNum(data$MAGER, 99)

# number of prenatal visits; 99 is treated as missing
data$UPREVIS <- makevarNum(data$UPREVIS, 99)

# We don't use these in the chapter, but I'll leave them
# in, anyway
# risk factors (1, 2, 9, Blank):
# diabetes, chronic hypertension, pregnancy-associated hypertension,
# eclampsia
riskfactors <- c("URF_DIAB", "URF_CHYPER", "URF_PHYPER", "URF_ECLAM")

# Convert each risk-factor column from 1/2/9 codes to TRUE/FALSE/NA.
data[riskfactors] <- lapply(data[riskfactors], makevar12)
# Map integer codes to labels and reset the "default" (reference) level
# on a categorical variable.
#   col: vector of integer codes used as indices into map
#   map: character vector of labels; map[k] is the label for code k
#   ref: the label that becomes the first (reference) level
# Returns a factor of labels whose first level is ref.
recode <- function(col, map, ref) {
  labels <- map[col]
  as_factor <- as.factor(labels)
  relevel(as_factor, ref = ref)
}
# gestation length
# GESTREC3 codes 1, 2, 3 -> "< 37 weeks" (premature), ">= 37 weeks", NA
grmap <- c("< 37 weeks", ">= 37 weeks", NA)
data$GESTREC3 <- recode(data$GESTREC3, grmap, grmap[[2]])

# DPLURAL : birth plurality; codes 3, 4 and 5 all collapse into
# "triplet or higher"
plmap <- c("single", "twin", rep("triplet or higher", 3))
data$DPLURAL <- recode(data$DPLURAL, plmap, "single")

# input variables, followed by the columns kept alongside them
x <- c(
  "PWGT", "WTGAIN", "MAGER", "UPREVIS",
  "CIG_REC", "GESTREC3", "DPLURAL",
  riskfactors
)
sdata <- data[, c(x, "DBWT", "ORIGRANDGROUP")]

# get rid of the NA data before splitting into train and test
# noNAs is TRUE for rows that contain no NA in any column
noNAs <- complete.cases(sdata)
sdata <- sdata[noNAs, ]

save(sdata, file = "NatalBirthData.rData")