<- read_csv("../data/linkco2013us_num.csv.zip")
linked_deaths
linked_deaths<- read_csv("../data/linkco2013us_den.csv.zip")
linked_births linked_births
1 Load Data
The CSV files for the 2013 cohort were downloaded from the NBER collection of Birth Cohort Linked Birth and Infant Death Data of the National Vital Statistics System of the National Center for Health Statistics, on the NBER website.
Each row corresponds to a newborn. When the newborn died, an identified with a unique ID (column idnumber
) is provided in both tables. A lot of columns from both files contain the same information. We will only keep some of those columns.
1.1 Deaths
First, let us focus on the subsample of deaths, only on the following variables.
<-
deaths %>%
linked_deaths select(idnumber, d_restatus, hospd, weekdayd, dthyr, dthmon,
dob_yy, dob_mm, dob_wk, aged)
Let us encode the values for categorical variables:
Show the R codes
<-
deaths %>%
deaths mutate(
d_restatus = factor(
d_restatus, levels = c(1:4),
labels = c("Residents", "Intrastate Nonresidents",
"Interstate or Interterritory Nonresidents", "Foreign Residents")
),hospd = factor(
hospd,levels = c(1:7, 9),
labels = c("Hospital, clinic or Medical Center - Inpatient",
"Hospital, clinic or Medical Center - ER",
"Hospital, clinic or Medical Center - Dead on Arrival",
"Decedent's home",
"Hospice facility",
"Nursing home/long term care",
"Other",
"Unknown"
),
),weekdayd = factor(
weekdayd, levels = c(1:7, 9),
labels = c("Sunday", "Monday", "Tuesday", "Wednesday", "Thursday",
"Friday", "Saturday", "Unknown")
),dthmon = factor(dthmon, levels = 1:12),
dob_mm = factor(dob_mm, levels = 1:12),
dob_wk = factor(
dob_wk,levels = c(1:7, 9),
labels = c("Sunday", "Monday", "Tuesday", "Wednesday", "Thursday",
"Friday", "Saturday", "Unknown")
) )
The description of those columns can be added to our tibble, using the set_variable_labels()
function from {labelled}.
library(labelled)
Show the R codes
<-
deaths %>%
deaths ::set_variable_labels(
labelledd_restatus = "Death Resident Status",
hospd = "Place of Death and Decedent's Status",
weekdayd = "Day of Week of Death",
dthyr = "Year of Death",
dthmon = "Month of Death",
dob_yy = "Birth Year",
dob_mm = "Birth Month",
dob_wk = "Birth Weekday",
aged = "Age at Death in Days"
)
Let us save this tibble which contains only the information that we are interested in.
save(deaths, file = "../data/deaths.RData")
Here is an overview of the first rows:
deaths
# A tibble: 23,159 × 10
idnumber d_restatus hospd weekd…¹ dthyr dthmon dob_yy dob_mm dob_wk aged
<dbl> <fct> <fct> <fct> <dbl> <fct> <dbl> <fct> <fct> <dbl>
1 1 Residents Hosp… Thursd… 2013 2 2013 1 Thurs… 56
2 2 Intrastate No… Hosp… Friday 2013 1 2013 1 Sunday 12
3 3 Residents Hosp… Saturd… 2013 1 2013 1 Satur… 0
4 4 Residents Hosp… Sunday 2013 1 2013 1 Sunday 0
5 5 Intrastate No… Unkn… Tuesday 2013 1 2013 1 Monday 1
6 6 Residents Hosp… Sunday 2013 1 2013 1 Sunday 0
7 7 Residents Hosp… Sunday 2013 1 2013 1 Monday 6
8 8 Residents Hosp… Friday 2013 1 2013 1 Wedne… 9
9 9 Residents Hosp… Monday 2013 7 2013 1 Wedne… 180
10 10 Residents Hosp… Tuesday 2013 1 2013 1 Tuesd… 0
# … with 23,149 more rows, and abbreviated variable name ¹weekdayd
1.2 Births
Now let us turn to the births dataset. As for the deaths dataset, let us recode the values and add some labels.
<-
births %>%
linked_births select(
# Mother's characteristics
mager41, mbrace, mracerec, mar, meduc,# Father's characteristics
fagecomb, fbrace, fracerec,# Child's characteristics
idnumber, bfacil, bfacil3, restatus, sex, dbwt,# Pregnancy
lbo, tbo, precare_rec, uprevis, wtgain, cig_1, cig_2, cig_3, cig_rec,
dlmp_mm, dlmp_yy, estgest, combgest,# Risk Factor
rf_diab, rf_gest, rf_phyp, rf_ghyp, rf_eclam, rf_ppterm,
rf_ppoutc, rf_cesar,# Obstetric Procedures
op_cerv, op_tocol, op_ecvs, op_ecvf, uop_induc, uop_tocol,# Onset of Labor
on_ruptr, on_abrup, on_prolg,# Charact. of Labor and Delivery
ld_induct, ld_augment, ld_nvrtx, ld_steroids, ld_antibio,
ld_chorio, ld_mecon, ld_fintol, ld_anesth,# Complications of Labor and Delivery
uld_meco, uld_precip, uld_breech,# Method of Delivery, Delivery
md_present, md_route, md_trial, rdmeth_rec, dmeth_rec,
attend, apgar5, dplural,# Abnormal Conditions of the Newborn
ab_vent, ab_vent6, ab_nicu, ab_surfac, ab_antibio, ab_seiz, ab_inj,# Congenital Anomalies of the Newborn
ca_anen, ca_menin, ca_heart, ca_hernia, ca_ompha, ca_gastro,
ca_limb, ca_cleftlp, ca_cleft, ca_downs, ca_chrom, ca_hypos,# Death
manner )
Recoding the observations, following the guide:
Show the R codes
<-
births %>%
births mutate(
mager41 = factor(
mager41,levels = c(12:50),
labels = c("10-12", 13:49, "50-64"),
),bfacil = factor(
bfacil,levels = c(1:7, 9),
labels = c(
"Hospital",
"Freestanding Birth Center",
"Home (intended)",
"Home (not intended)",
"Home (unknown if intended)",
"Clinic / Doctor’s Office",
"Other",
"Unknown"
)
),bfacil3 = factor(
bfacil3,levels = c(1,2,3),
labels = c("Hospital", "Not in Hospital", "Unknown or Not Stated")
),restatus = factor(
restatus,levels = c(1:4),
labels = c("Residents", "Intrastate Nonresidents",
"Interstate or Interterritory Nonresidents", "Foreign Residents")
),mbrace = factor(
mbrace,levels = c(1:14, 21:24),
labels = c(
"White – single race",
"Black – single race",
"American Indian – single race",
"Asian Indian – single race",
"Chinese – single race",
"Filipino – single race",
"Japanese – single race",
"Korean – single race",
"Vietnamese – single race",
"Other Asian – single race",
"Hawaiian – single race",
"Guamanian – single race",
"Samoan – single race",
"Other Pacific Islander – single race",
"White – bridged multiple race",
"Black – bridged multiple race",
"American Indian & Alaskan Native – bridged multiple race",
"Asian / Pacific Islander – bridged multiple race"
)
),mracerec = factor(
mracerec,levels = c(0, 1:4),
labels = c(
"Other (not classified as White or Black)",
"White",
"Black",
"American Indian / Alaskan Native ",
"Asian / Pacific Islander"
)
),mar = factor(
mar,levels = c(1,2, 3, 9),
labels = c("Yes", "No", "Unmarried parents not living together",
"Unknown or not Stated")
),meduc = factor(
meduc,levels = c(1:9),
labels = c(
"8th grade or less",
"9th through 12th grade with no diploma",
"High school graduate or GED completed",
"Some college credit, but not a degree",
"Associate degree (AA,AS)",
"Bachelor’s degree (BA, AB, BS)",
"Master’s degree (MA, MS, MEng, MEd, MSW, MBA)",
"Doctorate (PhD, EdD) or Professional Degree (MD, DDS, DVM, LLB, JD)",
"Unknown"
)
),fagecomb = ifelse(fagecomb == 99, NA, fagecomb),
fbrace = factor(
fbrace,levels = c(1:14, 21:24, 99),
labels = c(
"White – single race",
"Black – single race",
"American Indian – single race",
"Asian Indian – single race",
"Chinese – single race",
"Filipino – single race",
"Japanese – single race",
"Korean – single race",
"Vietnamese – single race",
"Other Asian – single race",
"Hawaiian – single race",
"Guamanian – single race",
"Samoan – single race",
"Other Pacific Islander – single race",
"White – bridged multiple race",
"Black – bridged multiple race",
"American Indian & Alaskan Native – bridged multiple race",
"Asian / Pacific Islander – bridged multiple race",
"Unknown or not stated, also includes states not reporting multiple race"
)
),fracerec = factor(
fracerec,levels = c(0:4, 9),
labels = c(
"Other (not classified as White or Black)",
"White",
"Black",
"American Indian / Alaskan Native",
"Asian / Pacific Islander",
"Unknown or not stated"
),
),lbo = ifelse(lbo == 9, NA, lbo), # careful: 8: 8+
tbo = ifelse(tbo == 9, NA, tbo), # careful: 8: 8+
precare_rec = factor(
precare_rec,levels = c(1:5),
labels = c(
"1st to 3rd month",
"4th to 6th month",
"7th to final month",
"No prenatal care",
"Unknown or not stated"
)
),uprevis = ifelse(uprevis == 99, NA, uprevis),
wtgain = ifelse(wtgain == 99, NA, wtgain), # careful: 98: 98+
cig_1 = ifelse(cig_1 == 99, NA, cig_1), # careful: 98: 98+
cig_2 = ifelse(cig_2 == 99, NA, cig_1), # careful: 98: 98+
cig_3 = ifelse(cig_3 == 99, NA, cig_1), # careful: 98: 98+
cig_rec = factor(
cig_rec,levels = c("Y", "N", "U"),
labels = c("Yes", "No", "Unknown or not stated")
),rf_diab = factor(
rf_diab,levels = c("Y", "N", "U"),
labels = c("Yes", "No", "Unknown or not stated")
),rf_gest = factor(
rf_gest,levels = c("Y", "N", "U"),
labels = c("Yes", "No", "Unknown or not stated")
),rf_phyp = factor(
rf_phyp,levels = c("Y", "N", "U"),
labels = c("Yes", "No", "Unknown or not stated")
),rf_ghyp = factor(
rf_ghyp,levels = c("Y", "N", "U"),
labels = c("Yes", "No", "Unknown or not stated")
),rf_eclam = factor(
rf_eclam,levels = c("Y", "N", "U"),
labels = c("Yes", "No", "Unknown or not stated")
),rf_ppterm = factor(
rf_ppterm,levels = c("Y", "N", "U"),
labels = c("Yes", "No", "Unknown or not stated")
),rf_ppoutc = factor(
rf_ppoutc,levels = c("Y", "N", "U"),
labels = c("Yes", "No", "Unknown or not stated")
),rf_cesar = factor(
rf_cesar,levels = c("Y", "N", "U"),
labels = c("Yes", "No", "Unknown or not stated")
),op_cerv = factor(
op_cerv,levels = c("Y", "N", "U"),
labels = c("Yes", "No", "Unknown or not stated")
),op_tocol = factor(
op_tocol,levels = c("Y", "N", "U"),
labels = c("Yes", "No", "Unknown or not stated")
),op_ecvs = factor(
op_ecvs,levels = c("Y", "N", "U"),
labels = c("Yes", "No", "Unknown or not stated")
),op_ecvf = factor(
op_ecvf,levels = c("Y", "N", "U"),
labels = c("Yes", "No", "Unknown or not stated")
),uop_induc = factor(
uop_induc,levels = c(1,2,8,9),
labels = c("Yes", "No", "Not on certificate", "Unknown or not stated")
),uop_tocol = factor(
uop_tocol,levels = c(1,2,8,9),
labels = c("Yes", "No", "Not on certificate", "Unknown or not stated")
),on_ruptr = factor(
on_ruptr,levels = c("Y", "N", "U"),
labels = c("Yes", "No", "Unknown or not stated")
),on_abrup = factor(
on_abrup,levels = c("Y", "N", "U"),
labels = c("Yes", "No", "Unknown or not stated")
),on_prolg = factor(
on_prolg,levels = c("Y", "N", "U"),
labels = c("Yes", "No", "Unknown or not stated")
),ld_induct = factor(
ld_induct,levels = c("Y", "N", "U"),
labels = c("Yes", "No", "Unknown or not stated")
),ld_augment = factor(
ld_augment,levels = c("Y", "N", "U"),
labels = c("Yes", "No", "Unknown or not stated")
),ld_nvrtx = factor(
ld_nvrtx,levels = c("Y", "N", "U"),
labels = c("Yes", "No", "Unknown or not stated")
),ld_steroids = factor(
ld_steroids,levels = c("Y", "N", "U"),
labels = c("Yes", "No", "Unknown or not stated")
),ld_antibio = factor(
ld_antibio,levels = c("Y", "N", "U"),
labels = c("Yes", "No", "Unknown or not stated")
),ld_chorio = factor(
ld_chorio,levels = c("Y", "N", "U"),
labels = c("Yes", "No", "Unknown or not stated")
),ld_mecon = factor(
ld_mecon,levels = c("Y", "N", "U"),
labels = c("Yes", "No", "Unknown or not stated")
),ld_fintol = factor(
ld_fintol,levels = c("Y", "N", "U"),
labels = c("Yes", "No", "Unknown or not stated")
),ld_anesth = factor(
ld_anesth,levels = c("Y", "N", "U"),
labels = c("Yes", "No", "Unknown or not stated")
),uld_meco = factor(
uld_meco,levels = c(1,2,8,9),
labels = c("Yes", "No", "Not on certificate", "Unknown or not stated")
),uld_precip = factor(
uld_precip,levels = c(1,2,8,9),
labels = c("Yes", "No", "Not on certificate", "Unknown or not stated")
),uld_breech = factor(
uld_breech,levels = c(1,2,8,9),
labels = c("Yes", "No", "Not on certificate", "Unknown or not stated")
),md_present = factor(
md_present,levels = c(1,2,3,9),
labels = c(
"Cephalic",
"Breech",
"Other",
"Unknown or not stated"
)
),md_route = factor(
md_route,levels = c(1,2,3,4,9),
labels = c(
"Spontaneous",
"Forceps",
"Vacuum",
"Cesarean",
"Unknown or not stated"
)
),md_trial = factor(
md_trial,levels = c("Y", "N", "X", "U"),
labels = c(
"Yes", "No",
"Not applicable ", "Unknown or not stated"
)
),rdmeth_rec = factor(
rdmeth_rec,levels = c(1:6, 9),
labels = c(
"Vaginal",
"Vaginal after previous c-section",
"Primary C-section",
"Repeat C-section",
"Vaginal (unknown if previous c-section)",
"C-section (unknown if previous c-section)",
"Not stated"
)
),dmeth_rec = factor(
dmeth_rec,levels = c(1,2,9),
labels = c("Vaginal", "C-Section", "Unknown")
),attend = factor(
attend,levels = c(1:5, 9),
labels = c(
"Doctor of Medicine",
"Doctor of Osteopathy",
"Certified Nurse Midwife",
"Other Midwife",
"Other",
"Unknown or not stated"
)
),apgar5 = ifelse(apgar5 == 99, yes = NA, no = apgar5),
dplural = factor(
dplural,levels = c(1:5),
labels = c(
"Single",
"Twin",
"Triplet",
"Quadruplet",
"Quintuplet or higher"
)
),sex = factor(
sex,levels = c("M", "F"),
labels = c("Male", "Female")
),dlmp_mm = ifelse(dlmp_mm == 99, NA, dlmp_mm),
dlmp_yy = ifelse(dlmp_yy == 9999, NA, dlmp_yy),
estgest = ifelse(estgest == 99, yes = NA, estgest),
combgest = ifelse(combgest == 99, NA, combgest),
dbwt = ifelse(dbwt == 9999, NA, dbwt),
ab_vent = factor(
ab_vent,levels = c("Y", "N", "U"),
labels = c(
"Yes, Complication reported",
"No Complication reported",
"Unknown or not stated"
)
),ab_vent6 = factor(
ab_vent6,levels = c("Y", "N", "U"),
labels = c(
"Yes, Complication reported",
"No Complication reported",
"Unknown or not stated"
)
),ab_nicu = factor(
ab_nicu,levels = c("Y", "N", "U"),
labels = c(
"Yes, Complication reported",
"No Complication reported",
"Unknown or not stated"
)
),ab_surfac = factor(
ab_surfac,levels = c("Y", "N", "U"),
labels = c(
"Yes, Complication reported",
"No Complication reported",
"Unknown or not stated"
)
),ab_antibio = factor(
ab_antibio,levels = c("Y", "N", "U"),
labels = c(
"Yes, Complication reported",
"No Complication reported",
"Unknown or not stated"
)
),ab_seiz = factor(
ab_seiz,levels = c("Y", "N", "U"),
labels = c(
"Yes, Complication reported",
"No Complication reported",
"Unknown or not stated"
)
),ab_inj = factor(
ab_inj,levels = c("Y", "N", "U"),
labels = c(
"Yes, Complication reported",
"No Complication reported",
"Unknown or not stated"
)
),ca_anen = factor(
ca_anen,levels = c("Y", "N", "U"),
labels = c(
"Yes, anomaly reported",
"No, anomaly not reported",
"Unknown"
)
),ca_menin = factor(
ca_menin,levels = c("Y", "N", "U"),
labels = c(
"Yes, anomaly reported",
"No, anomaly not reported",
"Unknown"
)
),ca_heart = factor(
ca_heart,levels = c("Y", "N", "U"),
labels = c(
"Yes, anomaly reported",
"No, anomaly not reported",
"Unknown"
)
),ca_hernia = factor(
ca_hernia,levels = c("Y", "N", "U"),
labels = c(
"Yes, anomaly reported",
"No, anomaly not reported",
"Unknown"
)
),ca_ompha = factor(
ca_ompha,levels = c("Y", "N", "U"),
labels = c(
"Yes, anomaly reported",
"No, anomaly not reported",
"Unknown"
)
),ca_gastro = factor(
ca_gastro,levels = c("Y", "N", "U"),
labels = c(
"Yes, anomaly reported",
"No, anomaly not reported",
"Unknown"
)
),ca_limb = factor(
ca_limb,levels = c("Y", "N", "U"),
labels = c(
"Yes, anomaly reported",
"No, anomaly not reported",
"Unknown"
)
),ca_cleftlp = factor(
ca_cleftlp,levels = c("Y", "N", "U"),
labels = c(
"Yes, anomaly reported",
"No, anomaly not reported",
"Unknown"
)
),ca_cleft = factor(
ca_cleft,levels = c("Y", "N", "U"),
labels = c(
"Yes, anomaly reported",
"No, anomaly not reported",
"Unknown"
)
),ca_downs = factor(
ca_downs,levels = c("Y", "N", "U"),
labels = c(
"Yes, anomaly reported",
"No, anomaly not reported",
"Unknown"
)
),ca_chrom = factor(
ca_chrom,levels = c("Y", "N", "U"),
labels = c(
"Yes, anomaly reported",
"No, anomaly not reported",
"Unknown"
)
),ca_hypos = factor(
ca_hypos,levels = c("Y", "N", "U"),
labels = c(
"Yes, anomaly reported",
"No, anomaly not reported",
"Unknown"
)
),manner = factor(
manner,levels = c(1:7),
labels = c(
"Accident",
"Suicide",
"Homicide",
"Pending investigation",
"Could not determine",
"Self-inflicted",
"Natural"
)
) )
Then, let us add some labels to each column.
Show the R codes
<-
births %>%
births ::set_variable_labels(
labelled# Child's characteristics
bfacil = "Birth Place",
bfacil3 = "Birth Place",
restatus ="Resident Status",
sex = "Sex of Infant",
dbwt = "Birth Weight (in Grams)",
# Death of the newborn
manner = "Manner of Death",
# Mother's characteristics
mager41 = "Mother's Age",
mbrace = "Mother's Bridged Race",
mracerec = "Mother's Race",
mar ="Mother's Marital Status",
meduc = "Mother's Education",
# Father's characteristics
fagecomb = "Father's Combined Age",
fbrace = "Father's Bridged Race",
fracerec = "Father's Race",
# Pregnancy
lbo = "Live Birth Order",
tbo = "Total Birth Order",
precare_rec = "Month Prenatal Care Began",
uprevis = "Number of Prenatal Visits",
wtgain = "Weight Gain",
cig_1 = "Cigarettes 1st Trimester",
cig_2 = "Cigarettes 2nd Trimester",
cig_3 = "Cigarettes 3rd Trimester",
cig_rec = "Smokes cigarettes",
dlmp_mm = "Last Normal Menses: Month",
dlmp_yy = "Last Normal Menses: Year",
estgest = "Obstetric/Clinical Gestation Est.",
combgest = "Gestation – Detail in Weeks",
# Risk factor
rf_diab = "Risk Factor: Prepregnancy Diabetes",
rf_gest = "Risk Factor: Gestational Diabetes",
rf_phyp = "Risk Factor: Prepregnancy Hypertension",
rf_ghyp = "Risk Factor: Gestational Hypertension",
rf_eclam = "Risk Factor: Hypertension Eclampsia",
rf_ppterm = "Risk Factor: Previous Preterm Birth",
rf_ppoutc = "Risk Factor: Poor Pregnancy Outcome",
rf_cesar = "Risk Factor: Previous Cesarean Deliveries",
# Obstetric procedures
op_cerv = "Obstetric Procedures: Cervical Cerclage",
op_tocol = "Obstetric Procedures: Tocolysis",
op_ecvs = "Obstetric Procedures: Successful External Cephalic",
op_ecvf = "Obstetric Procedures: Failed External Cephalic",
uop_induc = "Obstetric Procedures: Induction of labor",
uop_tocol = "Obstetric Procedures: Tocolysis",
on_ruptr = "Onset of Labor: Premature Rupture of Membrane",
on_abrup = "Onset of Labor: Abruptio placenta",
on_prolg = "Onset of Labor: Prolonged Labor",
# Labor
ld_induct = "Charact. of Labor and Delivery: Induction of Labor",
ld_augment = "Charact. of Labor and Delivery: Augmentation of Labor",
ld_nvrtx = "Charact. of Labor and Delivery: Non-Vertex Presentation",
ld_steroids = "Charact. of Labor and Delivery: Steroids",
ld_antibio = "Charact. of Labor and Delivery: Antibiotics",
ld_chorio = "Charact. of Labor and Delivery: Chorioamnionitis",
ld_mecon = "Charact. of Labor and Delivery: Meconium Staining",
ld_fintol = "Charact. of Labor and Delivery: Fetal Intolerance",
ld_anesth = "Charact. of Labor and Delivery: Anesthesia",
uld_meco = "Complications of Labor and Delivery: Meconium",
uld_precip = "Complications of Labor and Delivery: Precipitous labor",
uld_breech = "Complications of Labor and Delivery: Breech",
md_present = "Method of Delivery: Fetal Presentation",
md_route = "Method of Delivery: Final Route and Method of Delivery",
md_trial = "Method of Delivery: Trial of Labor Attempted",
rdmeth_rec = "Delivery Method",
dmeth_rec = "Delivery Method",
attend = "Attendant",
apgar5 = "Five Minute Apgar Score",
dplural = "Plurality",
# Abnormal Conditions and anomalies of the newborn
ab_vent = "Abnormal Conditions of the Newborn: Assisted Ventilation",
ab_vent6 = "Abnormal Conditions of the Newborn: Assisted Ventilation >6hrs",
ab_nicu = "Abnormal Conditions of the Newborn: Admission to NICU",
ab_surfac = "Abnormal Conditions of the Newborn: Surfactant",
ab_antibio = "Abnormal Conditions of the Newborn: Antibiotics",
ab_seiz = "Abnormal Conditions of the Newborn: Seizures",
ab_inj = "Abnormal Conditions of the Newborn: Birth Injury",
#
ca_anen = "Congenital Anomalies of the Newborn: Anencephaly",
ca_menin = "Congenital Anomalies of the Newborn: Meningomyelocele/Spina Bifida",
ca_heart = "Congenital Anomalies of the Newborn: Cyanotic Congenital Heart Disease",
ca_hernia = "Congenital Anomalies of the Newborn: Congenital Diaphragmatic Hernia",
ca_ompha = "Congenital Anomalies of the Newborn: Omphalocele",
ca_gastro = "Congenital Anomalies of the Newborn: Gastroschisis",
ca_limb = "Congenital Anomalies of the Newborn: Limb Reduction Deficit",
ca_cleftlp = "Congenital Anomalies of the Newborn: Cleft Lip w/ or w/o Cleft Palate",
ca_cleft = "Congenital Anomalies of the Newborn: Cleft Palate Alone",
ca_downs = "Congenital Anomalies of the Newborn: Downs Syndrome",
ca_chrom = "Congenital Anomalies of the Newborn: Suspected Chromosonal Disorder",
ca_hypos = "Congenital Anomalies of the Newborn: Hypospadias"
)
And let us save the data:
save(births, file = "../data/births.RData")
Here is an extract from the first rows of the births data:
births
# A tibble: 3,940,764 × 84
mager41 mbrace mrace…¹ mar meduc fagec…² fbrace frace…³ idnum…⁴ bfacil
<fct> <fct> <fct> <fct> <fct> <dbl> <fct> <fct> <dbl> <fct>
1 22 White – si… "White" Yes High… 23 Unkno… Unknow… NA Hospi…
2 25 White – br… "White" Yes Some… 27 White… White NA Hospi…
3 25 White – si… "White" Yes Bach… 25 White… White NA Hospi…
4 30 White – si… "White" Yes Bach… 31 White… White NA Hospi…
5 23 American I… "Ameri… No 9th … 30 Ameri… Americ… NA Hospi…
6 32 White – br… "White" No High… 32 Ameri… Americ… NA Hospi…
7 23 American I… "Ameri… No Some… 26 Ameri… Americ… NA Hospi…
8 21 White – si… "White" Yes Some… 21 White… White NA Hospi…
9 31 White – si… "White" Yes Bach… 35 White… White NA Frees…
10 25 White – si… "White" Yes Some… 32 White… White NA Hospi…
# … with 3,940,754 more rows, 74 more variables: bfacil3 <fct>, restatus <fct>,
# sex <fct>, dbwt <dbl>, lbo <dbl>, tbo <dbl>, precare_rec <fct>,
# uprevis <dbl>, wtgain <dbl>, cig_1 <dbl>, cig_2 <dbl>, cig_3 <dbl>,
# cig_rec <fct>, dlmp_mm <dbl>, dlmp_yy <dbl>, estgest <dbl>, combgest <dbl>,
# rf_diab <fct>, rf_gest <fct>, rf_phyp <fct>, rf_ghyp <fct>, rf_eclam <fct>,
# rf_ppterm <fct>, rf_ppoutc <fct>, rf_cesar <fct>, op_cerv <fct>,
# op_tocol <fct>, op_ecvs <fct>, op_ecvf <fct>, uop_induc <fct>, …
2 Descriptive statistics
<-
df %>%
births left_join(deaths)
We can quickly produce some descriptive statistics from those data, using the tbl_summary()
function from {gtsummary}.
library(gtsummary)
<- function(df, variables){
get_table_desc_stat %>%
df select(!!variables) %>%
tbl_summary(
# by = ,
type = all_continuous() ~ "continuous2",
statistic = list(
all_continuous() ~ c("{mean} ({sd})", "{median} ({p25}, {p75})"),
all_categorical() ~ "{n} ({p}%)"),
digits = list(
all_continuous() ~ 2,
all_categorical() ~ 0
),missing_text = "Missing value"
%>%
) modify_header(label ~ "**Variable**") %>%
add_stat_label(
label = list(
all_continuous() ~ c("Mean (Std)", "Median (IQR)"),
all_categorical() ~ "n (%)"
)
) }
2.1 Raw data
Show the R codes
<- c("sex", "dbwt", "bfacil", "bfacil3", "restatus")
childrens_characteristics
get_table_desc_stat(df, childrens_characteristics)
Variable | N = 3,940,764 |
---|---|
Sex of Infant, n (%) | |
Male | 2,017,374 (51%) |
Female | 1,923,390 (49%) |
Birth Weight (in Grams) | |
Mean (Std) | 3,270.78 (593.63) |
Median (IQR) | 3,316.00 (2,977.00, 3,630.00) |
Missing value | 646 |
Birth Place, n (%) | |
Hospital | 3,510,218 (98%) |
Freestanding Birth Center | 17,531 (0%) |
Home (intended) | 25,499 (1%) |
Home (not intended) | 3,693 (0%) |
Home (unknown if intended) | 4,552 (0%) |
Clinic / Doctor’s Office | 355 (0%) |
Other | 2,456 (0%) |
Unknown | 113 (0%) |
Missing value | 376,347 |
Birth Place, n (%) | |
Hospital | 3,883,310 (99%) |
Not in Hospital | 57,331 (1%) |
Unknown or Not Stated | 123 (0%) |
Resident Status, n (%) | |
Residents | 2,847,673 (72%) |
Intrastate Nonresidents | 999,320 (25%) |
Interstate or Interterritory Nonresidents | 85,188 (2%) |
Foreign Residents | 8,583 (0%) |
Show the R codes
<- c("d_restatus", "hospd", "weekdayd", "dthyr", "dthmon",
death_variables "dob_yy", "dob_mm", "dob_wk", "aged", "manner")
get_table_desc_stat(df, death_variables)
Variable | N = 3,940,764 |
---|---|
Death Resident Status, n (%) | |
Residents | 14,430 (62%) |
Intrastate Nonresidents | 7,508 (32%) |
Interstate or Interterritory Nonresidents | 1,176 (5%) |
Foreign Residents | 45 (0%) |
Missing value | 3,917,605 |
Place of Death and Decedent's Status, n (%) | |
Hospital, clinic or Medical Center - Inpatient | 17,943 (77%) |
Hospital, clinic or Medical Center - ER | 2,907 (13%) |
Hospital, clinic or Medical Center - Dead on Arrival | 283 (1%) |
Decedent's home | 1,586 (7%) |
Hospice facility | 54 (0%) |
Nursing home/long term care | 24 (0%) |
Other | 323 (1%) |
Unknown | 39 (0%) |
Missing value | 3,917,605 |
Day of Week of Death, n (%) | |
Sunday | 3,223 (14%) |
Monday | 3,120 (13%) |
Tuesday | 3,249 (14%) |
Wednesday | 3,469 (15%) |
Thursday | 3,376 (15%) |
Friday | 3,391 (15%) |
Saturday | 3,331 (14%) |
Unknown | 0 (0%) |
Missing value | 3,917,605 |
Year of Death, n (%) | |
2013 | 20,487 (88%) |
2014 | 2,672 (12%) |
Missing value | 3,917,605 |
Month of Death, n (%) | |
1 | 1,897 (8%) |
2 | 1,799 (8%) |
3 | 1,973 (9%) |
4 | 1,920 (8%) |
5 | 2,014 (9%) |
6 | 1,977 (9%) |
7 | 1,930 (8%) |
8 | 1,937 (8%) |
9 | 1,946 (8%) |
10 | 2,019 (9%) |
11 | 1,816 (8%) |
12 | 1,931 (8%) |
Missing value | 3,917,605 |
Birth Year, n (%) | |
2013 | 23,159 (100%) |
Missing value | 3,917,605 |
Birth Month, n (%) | |
1 | 1,951 (8%) |
2 | 1,740 (8%) |
3 | 1,897 (8%) |
4 | 1,853 (8%) |
5 | 2,030 (9%) |
6 | 1,971 (9%) |
7 | 1,991 (9%) |
8 | 2,010 (9%) |
9 | 1,995 (9%) |
10 | 2,032 (9%) |
11 | 1,795 (8%) |
12 | 1,894 (8%) |
Missing value | 3,917,605 |
Birth Weekday, n (%) | |
Sunday | 2,733 (12%) |
Monday | 3,316 (14%) |
Tuesday | 3,667 (16%) |
Wednesday | 3,578 (15%) |
Thursday | 3,511 (15%) |
Friday | 3,474 (15%) |
Saturday | 2,880 (12%) |
Unknown | 0 (0%) |
Missing value | 3,917,605 |
Age at Death in Days | |
Mean (Std) | 41.56 (72.56) |
Median (IQR) | 3.00 (0.00, 52.00) |
Missing value | 3,917,605 |
Manner of Death, n (%) | |
Accident | 1,205 (6%) |
Suicide | 0 (0%) |
Homicide | 261 (1%) |
Pending investigation | 301 (2%) |
Could not determine | 2,017 (10%) |
Self-inflicted | 0 (0%) |
Natural | 15,622 (81%) |
Missing value | 3,921,358 |
Show the R codes
<- c("mager41", "mbrace", "mracerec", "mar", "meduc")
mother_variables get_table_desc_stat(df, mother_variables)
Variable | N = 3,940,764 |
---|---|
Mother's Age, n (%) | |
10-12 | 71 (0%) |
13 | 459 (0%) |
14 | 2,569 (0%) |
15 | 9,439 (0%) |
16 | 22,596 (1%) |
17 | 42,956 (1%) |
18 | 76,261 (2%) |
19 | 122,346 (3%) |
20 | 151,182 (4%) |
21 | 167,417 (4%) |
22 | 185,362 (5%) |
23 | 194,103 (5%) |
24 | 200,099 (5%) |
25 | 208,091 (5%) |
26 | 215,347 (5%) |
27 | 227,545 (6%) |
28 | 234,861 (6%) |
29 | 237,526 (6%) |
30 | 237,377 (6%) |
31 | 231,394 (6%) |
32 | 212,709 (5%) |
33 | 191,239 (5%) |
34 | 166,761 (4%) |
35 | 143,151 (4%) |
36 | 118,161 (3%) |
37 | 93,346 (2%) |
38 | 73,923 (2%) |
39 | 56,507 (1%) |
40 | 41,758 (1%) |
41 | 29,721 (1%) |
42 | 19,821 (1%) |
43 | 11,936 (0%) |
44 | 6,502 (0%) |
45 | 3,701 (0%) |
46 | 1,887 (0%) |
47 | 977 (0%) |
48 | 587 (0%) |
49 | 387 (0%) |
50-64 | 689 (0%) |
Mother's Bridged Race, n (%) | |
White – single race | 2,689,736 (75%) |
Black – single race | 556,437 (15%) |
American Indian – single race | 33,908 (1%) |
Asian Indian – single race | 56,790 (2%) |
Chinese – single race | 49,453 (1%) |
Filipino – single race | 32,191 (1%) |
Japanese – single race | 7,108 (0%) |
Korean – single race | 14,910 (0%) |
Vietnamese – single race | 19,687 (1%) |
Other Asian – single race | 41,740 (1%) |
Hawaiian – single race | 1,136 (0%) |
Guamanian – single race | 1,282 (0%) |
Samoan – single race | 2,285 (0%) |
Other Pacific Islander – single race | 6,022 (0%) |
White – bridged multiple race | 38,302 (1%) |
Black – bridged multiple race | 24,363 (1%) |
American Indian & Alaskan Native – bridged multiple race | 5,243 (0%) |
Asian / Pacific Islander – bridged multiple race | 14,306 (0%) |
Missing value | 345,865 |
Mother's Race, n (%) | |
Other (not classified as White or Black) | 0 (0%) |
White | 2,993,686 (76%) |
Black | 635,120 (16%) |
American Indian / Alaskan Native | 46,011 (1%) |
Asian / Pacific Islander | 265,947 (7%) |
Mother's Marital Status, n (%) | |
Yes | 2,342,660 (59%) |
No | 1,598,104 (41%) |
Unmarried parents not living together | 0 (0%) |
Unknown or not Stated | 0 (0%) |
Mother's Education, n (%) | |
8th grade or less | 136,701 (4%) |
9th through 12th grade with no diploma | 421,293 (12%) |
High school graduate or GED completed | 879,956 (25%) |
Some college credit, but not a degree | 753,056 (21%) |
Associate degree (AA,AS) | 280,660 (8%) |
Bachelor’s degree (BA, AB, BS) | 669,170 (19%) |
Master’s degree (MA, MS, MEng, MEd, MSW, MBA) | 297,054 (8%) |
Doctorate (PhD, EdD) or Professional Degree (MD, DDS, DVM, LLB, JD) | 84,707 (2%) |
Unknown | 41,820 (1%) |
Missing value | 376,347 |
Show the R codes
<- c("fagecomb", "fbrace", "fracerec")
father_variables get_table_desc_stat(df, father_variables)
Variable | N = 3,940,764 |
---|---|
Father's Combined Age | |
Mean (Std) | 31.09 (6.89) |
Median (IQR) | 31.00 (26.00, 35.00) |
Missing value | 826,291 |
Father's Bridged Race, n (%) | |
White – single race | 2,210,002 (61%) |
Black – single race | 418,722 (12%) |
American Indian – single race | 22,558 (1%) |
Asian Indian – single race | 55,700 (2%) |
Chinese – single race | 42,055 (1%) |
Filipino – single race | 21,498 (1%) |
Japanese – single race | 4,108 (0%) |
Korean – single race | 11,221 (0%) |
Vietnamese – single race | 15,867 (0%) |
Other Asian – single race | 33,597 (1%) |
Hawaiian – single race | 941 (0%) |
Guamanian – single race | 983 (0%) |
Samoan – single race | 2,325 (0%) |
Other Pacific Islander – single race | 4,458 (0%) |
White – bridged multiple race | 28,772 (1%) |
Black – bridged multiple race | 14,438 (0%) |
American Indian & Alaskan Native – bridged multiple race | 7,617 (0%) |
Asian / Pacific Islander – bridged multiple race | 12,986 (0%) |
Unknown or not stated, also includes states not reporting multiple race | 687,051 (19%) |
Missing value | 345,865 |
Father's Race, n (%) | |
Other (not classified as White or Black) | 0 (0%) |
White | 2,466,993 (63%) |
Black | 476,535 (12%) |
American Indian / Alaskan Native | 35,143 (1%) |
Asian / Pacific Islander | 222,529 (6%) |
Unknown or not stated | 739,564 (19%) |
Show the R codes
<- c("lbo", "tbo", "precare_rec", "uprevis", "wtgain",
pregnancy_variables "cig_1", "cig_2", "cig_3", "cig_rec",
"dlmp_mm", "dlmp_yy", "estgest", "combgest")
get_table_desc_stat(df, pregnancy_variables)
Variable | N = 3,940,764 |
---|---|
Live Birth Order, n (%) | |
1 | 1,550,114 (40%) |
2 | 1,246,847 (32%) |
3 | 654,946 (17%) |
4 | 276,936 (7%) |
5 | 108,168 (3%) |
6 | 44,188 (1%) |
7 | 20,301 (1%) |
8 | 20,732 (1%) |
Missing value | 18,532 |
Total Birth Order, n (%) | |
1 | 1,272,200 (33%) |
2 | 1,108,655 (28%) |
3 | 714,320 (18%) |
4 | 392,389 (10%) |
5 | 202,062 (5%) |
6 | 99,603 (3%) |
7 | 51,311 (1%) |
8 | 61,099 (2%) |
Missing value | 39,125 |
Month Prenatal Care Began, n (%) | |
1st to 3rd month | 2,539,077 (71%) |
4th to 6th month | 672,306 (19%) |
7th to final month | 158,902 (4%) |
No prenatal care | 51,392 (1%) |
Unknown or not stated | 142,740 (4%) |
Missing value | 376,347 |
Number of Prenatal Visits | |
Mean (Std) | 11.27 (4.04) |
Median (IQR) | 12.00 (9.00, 13.00) |
Missing value | 118,625 |
Weight Gain | |
Mean (Std) | 30.33 (14.92) |
Median (IQR) | 30.00 (20.00, 39.00) |
Missing value | 190,653 |
Cigarettes 1st Trimester | |
Mean (Std) | 0.88 (3.79) |
Median (IQR) | 0.00 (0.00, 0.00) |
Missing value | 549,170 |
Cigarettes 2nd Trimester | |
Mean (Std) | 0.88 (3.79) |
Median (IQR) | 0.00 (0.00, 0.00) |
Missing value | 550,029 |
Cigarettes 3rd Trimester | |
Mean (Std) | 0.88 (3.78) |
Median (IQR) | 0.00 (0.00, 0.00) |
Missing value | 550,272 |
Smokes cigarettes, n (%) | |
Yes | 287,250 (8%) |
No | 3,104,885 (87%) |
Unknown or not stated | 172,282 (5%) |
Missing value | 376,347 |
Last Normal Menses: Month | |
Mean (Std) | 6.60 (3.49) |
Median (IQR) | 7.00 (4.00, 10.00) |
Missing value | 195,745 |
Last Normal Menses: Year, n (%) | |
2011 | 1,455 (0%) |
2012 | 2,810,941 (75%) |
2013 | 940,736 (25%) |
Missing value | 187,632 |
Obstetric/Clinical Gestation Est. | |
Mean (Std) | 38.51 (2.14) |
Median (IQR) | 39.00 (38.00, 40.00) |
Missing value | 7,483 |
Gestation – Detail in Weeks | |
Mean (Std) | 38.65 (2.50) |
Median (IQR) | 39.00 (38.00, 40.00) |
Missing value | 3,695 |
Show the R codes
<- c("rf_diab", "rf_gest", "rf_phyp", "rf_ghyp",
risk_factor_variables "rf_eclam", "rf_ppterm", "rf_ppoutc", "rf_cesar")
get_table_desc_stat(df, risk_factor_variables)
Variable | N = 3,940,764 |
---|---|
Risk Factor: Prepregnancy Diabetes, n (%) | |
Yes | 26,896 (1%) |
No | 3,529,191 (99%) |
Unknown or not stated | 8,330 (0%) |
Missing value | 376,347 |
Risk Factor: Gestational Diabetes, n (%) | |
Yes | 187,064 (5%) |
No | 3,369,023 (95%) |
Unknown or not stated | 8,330 (0%) |
Missing value | 376,347 |
Risk Factor: Prepregnancy Hypertension, n (%) | |
Yes | 54,198 (2%) |
No | 3,501,889 (98%) |
Unknown or not stated | 8,330 (0%) |
Missing value | 376,347 |
Risk Factor: Gestational Hypertension, n (%) | |
Yes | 172,919 (5%) |
No | 3,383,168 (95%) |
Unknown or not stated | 8,330 (0%) |
Missing value | 376,347 |
Risk Factor: Hypertension Eclampsia, n (%) | |
Yes | 8,148 (0%) |
No | 3,547,939 (100%) |
Unknown or not stated | 8,330 (0%) |
Missing value | 376,347 |
Risk Factor: Previous Preterm Birth, n (%) | |
Yes | 92,331 (3%) |
No | 3,463,756 (97%) |
Unknown or not stated | 8,330 (0%) |
Missing value | 376,347 |
Risk Factor: Poor Pregnancy Outcome, n (%) | |
Yes | 78,310 (2%) |
No | 3,477,777 (98%) |
Unknown or not stated | 8,330 (0%) |
Missing value | 376,347 |
Risk Factor: Previous Cesarean Deliveries, n (%) | |
Yes | 523,128 (15%) |
No | 3,032,959 (85%) |
Unknown or not stated | 8,330 (0%) |
Missing value | 376,347 |
Show the R codes
<- c("op_cerv", "op_tocol", "op_ecvs", "op_ecvf",
obstetric_procedures_variables "uop_induc", "uop_tocol", "on_ruptr",
"on_abrup", "on_prolg")
get_table_desc_stat(df, obstetric_procedures_variables)
Variable | N = 3,940,764 |
---|---|
Obstetric Procedures: Cervical Cerclage, n (%) | |
Yes | 10,900 (0%) |
No | 3,542,733 (99%) |
Unknown or not stated | 10,784 (0%) |
Missing value | 376,347 |
Obstetric Procedures: Tocolysis, n (%) | |
Yes | 33,727 (1%) |
No | 3,519,906 (99%) |
Unknown or not stated | 10,784 (0%) |
Missing value | 376,347 |
Obstetric Procedures: Successful External Cephalic, n (%) | |
Yes | 4,790 (0%) |
No | 3,548,843 (100%) |
Unknown or not stated | 10,784 (0%) |
Missing value | 376,347 |
Obstetric Procedures: Failed External Cephalic, n (%) | |
Yes | 4,070 (0%) |
No | 3,549,563 (100%) |
Unknown or not stated | 10,784 (0%) |
Missing value | 376,347 |
Obstetric Procedures: Induction of labor, n (%) | |
Yes | 904,437 (23%) |
No | 3,031,183 (77%) |
Not on certificate | 0 (0%) |
Unknown or not stated | 5,144 (0%) |
Obstetric Procedures: Tocolysis, n (%) | |
Yes | 37,355 (1%) |
No | 3,892,312 (99%) |
Not on certificate | 0 (0%) |
Unknown or not stated | 11,097 (0%) |
Onset of Labor: Premature Rupture of Membrane, n (%) | |
Yes | 123,588 (3%) |
No | 3,429,618 (96%) |
Unknown or not stated | 11,211 (0%) |
Missing value | 376,347 |
Onset of Labor: Abruptio placenta, n (%) | |
Yes | 110,247 (3%) |
No | 3,442,959 (97%) |
Unknown or not stated | 11,211 (0%) |
Missing value | 376,347 |
Onset of Labor: Prolonged Labor, n (%) | |
Yes | 47,198 (1%) |
No | 3,506,008 (98%) |
Unknown or not stated | 11,211 (0%) |