\[ \usepackage{dsfont} \usepackage{xcolor} \require{mathtools} \definecolor{bayesred}{RGB}{147, 30, 24} \definecolor{bayesblue}{RGB}{32, 35, 91} \definecolor{bayesorange}{RGB}{218, 120, 1} \definecolor{grey}{RGB}{128, 128, 128} \definecolor{couleur1}{RGB}{0,163,137} \definecolor{couleur2}{RGB}{255,124,0} \definecolor{couleur3}{RGB}{0, 110, 158} \definecolor{coul1}{RGB}{255,37,0} \definecolor{coul2}{RGB}{242,173,0} \definecolor{col_neg}{RGB}{155, 191, 221} \definecolor{col_pos}{RGB}{255, 128, 106} {\color{bayesorange} P (\text{H} \mid \text{E})} = \frac {{\color{bayesred} P(\text{H})} \times {\color{bayesblue}P(\text{E} \mid \text{H})}} {\color{grey} {P(\text{E})}} \]
In this notebook, we compute the SCATE at three different points: \(\boldsymbol{x}=(2500,60)\), \(\boldsymbol{x}=(4200,60)\), and \(\boldsymbol{x}=(2500,20)\). At each point, we will estimate the SCATE using a GAM model with Gaussian transport depending on the number of observations used to train the models.
1 Data
To illustrate the methods, we rely on the Linked Birth/Infant Death Cohort Data (2013 cohort). The CSV files for the 2013 cohort were downloaded from the NBER collection of Birth Cohort Linked Birth and Infant Death Data of the National Vital Statistics System of the National Center for Health Statistics, on the NBER website. We formated the data (see ./births_stats.html).
Let us load the birth data.
library(tidyverse)
load("../data/births.RData")
Then, we can only keep a subsample of variables to illustrate the method.
<-
base %>%
births mutate(
black_mother = mracerec == "Black",
nonnatural_delivery = rdmeth_rec != "Vaginal"
%>%
) select(
sex, dbwt, cig_rec, wtgain,
black_mother, nonnatural_delivery,
mracerec, rdmeth_rec%>%
) mutate(
cig_rec = replace_na(cig_rec, "Unknown or not stated"),
black_mother = ifelse(black_mother, yes = "Yes", no = "No"),
is_girl = ifelse(sex == "Female", yes = "Yes", no = "No")
%>%
) ::set_variable_labels(
labelledblack_mother = "Is the mother Black?",
is_girl = "Is the newborn a girl?",
nonnatural_delivery = "Is the delivery method non-natural?"
%>%
) rename(birth_weight = dbwt)
base
# A tibble: 3,940,764 × 9
sex birth_weight cig_rec wtgain black_mo…¹ nonna…² mrace…³ rdmet…⁴ is_girl
<fct> <dbl> <fct> <dbl> <chr> <lgl> <fct> <fct> <chr>
1 Male 3017 No 1 No FALSE "White" Vaginal No
2 Female 3367 No 18 No FALSE "White" Vaginal Yes
3 Male 4409 No 36 No FALSE "White" Vaginal No
4 Male 4047 No 25 No FALSE "White" Vaginal No
5 Male 3119 Yes 0 No FALSE "Ameri… Vaginal No
6 Female 4295 Yes 34 No FALSE "White" Vaginal Yes
7 Male 3160 No 47 No FALSE "Ameri… Vaginal No
8 Male 3711 No 39 No FALSE "White" Vaginal No
9 Male 3430 No NA No FALSE "White" Vaginal No
10 Female 3772 No 36 No FALSE "White" Vaginal Yes
# … with 3,940,754 more rows, and abbreviated variable names ¹black_mother,
# ²nonnatural_delivery, ³mracerec, ⁴rdmeth_rec
The variables that were kept are the following:
sex
: sex of the newbornbirth_weight
: Birth Weight (in Grams)cig_rec
: mother smokes cigaretteswtgain
: mother weight gainmracerec
: mother’s raceblack_mother
: is mother black?rdmeth_rec
: delivery methodnonnatural_delivery
: is the delivery method non-natural?
Then, let us discard individuals with missing values.
<-
base %>%
base filter(
!is.na(birth_weight),
!is.na(wtgain),
!is.na(nonnatural_delivery),
!is.na(black_mother)
)
Let us define some colours for later use:
library(wesanderson)
<- wes_palette("Darjeeling1")
colr1 <- wes_palette("Darjeeling2")
colr2 <- colr1[2]
couleur1 <- colr1[4]
couleur2 <- colr2[2]
couleur3
<- "#882255"
coul1 <- "#DDCC77" coul2
We need to load some packages.
library(car)
library(transport)
library(splines)
library(mgcv)
library(expm)
library(tidyverse)
2 Functions
Let us define the same functions as that used in reproduction_univariate.html.
2.1 Models
Let us define a function that estimates \(\color{couleur1}{\widehat{m}_0}\) or \(\color{couleur2}{\widehat{m}_1}\) with a logistic GAM model (cubic spline first).
library(splines)
#' Returns $\hat{m}_0()$ or $\hat{m}_1()$, using GAM
#' @param target name of the target variable
#' @param treatment_name name of the treatment variable (column in `data`)
#' @param x_m_name name of the mediator variable
#' @param data data frame with the observations
#' @param treatment either non-treated (`0`) or treated (`1`)
#' @param treatment_0 treatment value for non-treated
#' @param treatment_1 treatment value for treated
#' @param df degrees of freedom (in the `bs()` function, deefault to `df=3`)
<- function(target, treatment_name, x_m_name, data, treatment = c(0,1), treatment_0=NULL, treatment_1=NULL, df = 3){
models_spline if(treatment == 0){
stopifnot(!is.null(treatment_0))
# \hat{m}_0()
<-
reg bquote(glm(.(target)~bs(.(x_m_name), df = df), data = data,
family=binomial, subset = (.(treatment_name) == .(treatment_0))),
list(target = as.name(target), x_m_name=as.name(x_m_name),
treatment_name = as.name(treatment_name),
treatment_0 = treatment_0)) %>%
eval()
else{
}stopifnot(!is.null(treatment_1))
# \hat{m}_1()
<- bquote(glm(.(target)~bs(.(x_m_name), df = df), data = data,
reg family=binomial, subset = (.(treatment_name) == .(treatment_1))),
list(target = as.name(target), x_m_name=as.name(x_m_name),
treatment_name = as.name(treatment_name),
treatment_1 = treatment_1)) %>%
eval()
}
reg }
2.2 Prediction Function
Let us define a function that predicts the target variable on new data, using an estimated model (cubic splines).
#' @param object regression model (GAM)
#' @param newdata data frame in which to look for the mediator variable used to predict the target
<- function(object, newdata){
model_spline_predict predict(object, newdata = newdata, type="response")
}
2.3 Transport
2.3.1 Quantile-based Assumption
The transport function can be defined as follows:
#' Quantile-based transport (Univariate)
#' @param x_0 vector of values to be transported
#' @param x_m_name name of the mediator variable
#' @param treatment_name name of the treatment variable (column in `data`)
#' @param treatment_0 value for non treated
#' @param treatment_1 value for treated
#' @param data data frame with both T=0 or T=1
<- function(x_0, x_m_name, treatment_name, treatment_0, treatment_1, data){
transport_quantile <- pull(data, treatment_name) == treatment_0
ind_0 <- pull(data, x_m_name)
x_val # Empirical Cumulative Distribution Function for values of the mediator variable for non-treated
<- ecdf(x_val)
Fn # Probability associated in the non-treated
<- Fn(x_0)
u # Transported_values
<- pull(data, x_m_name)[pull(data, treatment_name) == treatment_1]
x_1 <- quantile(x_1, u)
x_t_quantile list(x_0 = x_0, u = u, x_t_quantile = x_t_quantile)
}
2.4 Sample Condional Average Treatment Effect
#' Computes the Sample Average Treatment Effect with and without transport
#' @param x_0 vector of values at which to compute $\hat{m}_0(x_0)$, $\hat{m}_1(x_0)$
#' @param x_t vector of values at which to compute $\hat{m}_1(\mathcal{T}(x_t))$
#' @param x_m_name vector of names of the mediator variables
#' @param mod_0 model $\hat{m}_0$
#' @param mod_1 model $\hat{m}_1$
#' @param pred_mod_0 prediction function for model $\hat{m}_0(\cdot)$
#' @param pred_mod_1 prediction function for model $\hat{m}_1(\cdot)$
#' @param return_x if `TRUE` (default) the mediator variables are returned in the table, as well as their transported values
<- function(x_0, x_t, x_m_names, mod_0, mod_1, pred_mod_0, pred_mod_1, return_x = TRUE){
sate if(is.vector(x_0)){
# Univariate case
<- tibble(!!x_m_names := x_0)
new_data else{
}<- x_0
new_data
}if(is.vector(x_t)){
# Univariate case
<- tibble(!!x_m_names := x_t)
new_data_t else{
}<- x_t
new_data_t
}
# $\hat{m}_0(x_0)$
<- pred_mod_0(object = mod_0, newdata = new_data)
y_0 # $\hat{m}_1(x_0)$
<- pred_mod_1(object = mod_1, newdata = new_data)
y_1 # $\hat{m}_1(\mathcal{T}(x_0))$
<- pred_mod_1(object = mod_1, newdata = new_data_t)
y_1_t
<-
scate_tab tibble(y_0 = y_0, y_1 = y_1, y_1_t = y_1_t,
CATE = y_1-y_0, SCATE = y_1_t-y_0)
if(return_x){
<- new_data_t %>% rename_all(~paste0(.x, "_t"))
new_data_t <- bind_cols(new_data, new_data_t, scate_tab)
scate_tab
}
scate_tab }
3 Example with \(n=4000\) for Smoker mothers and birth weight
Let us estimate the Sample Conditional Average Treatment Effect, using GAM models and a quantile-based method to transport the mediator variable. The outcome is the probability of having a natural birth, the treatment is whether the mothers smokes or not, and the mediator is the birth weight of the newborn.
Let us estimate the SCATE using only a subset of 4,000 randomly drawn observations from both groups (treated = smoker mothers and non-treated = non-smoker mothers).
First, we create the two sub-samples:
<- base %>% filter(cig_rec == "No") %>% sample_n(4000)
base_0 <- base %>% filter(cig_rec == "Yes") %>% sample_n(4000) base_1
The, we can estimate \(\color{couleur1}{\widehat{m}_0}\) or \(\color{couleur2}{\widehat{m}_1}\):
<-
reg_univ_0 models_spline(target = "nonnatural_delivery",
treatment_name = "cig_rec",
treatment = 0,
x_m_name = "birth_weight",
data = base_0, treatment_0 = "No", df = 3)
<-
reg_univ_1 models_spline(target = "nonnatural_delivery",
treatment_name = "cig_rec",
treatment = 1,
x_m_name = "birth_weight",
data = base_1, treatment_1 = "Yes", df = 3)
We will estimate the SCATE at the following weight gains:
<- seq(2000,4500,by=500)
w_0 w_0
[1] 2000 2500 3000 3500 4000 4500
The corresponding transported values (quantile-based):
<-
trans_w_0 transport_quantile(x_0 = w_0, x_m_name = "birth_weight",
treatment_name = "cig_rec",
treatment_0 = "No", treatment_1 = "Yes",
data = bind_rows(base_0, base_1))
trans_w_0
$x_0
[1] 2000 2500 3000 3500 4000 4500
$u
[1] 0.035750 0.101125 0.318250 0.706500 0.938750 0.992750
$x_t_quantile
3.575% 10.1125% 31.825% 70.65% 93.875% 99.275%
1899.000 2410.000 2894.000 3410.000 3945.000 4394.058
Lastly, the SCATE computed at the different points, for this sample:
sate(x_0 = w_0,
x_t = trans_w_0$x_t_quantile,
x_m_names = "birth_weight",
mod_0 = reg_univ_0,
mod_1 = reg_univ_1,
pred_mod_0 = model_spline_predict, pred_mod_1 = model_spline_predict) %>%
mutate(size_n = 4000, x_m_name = "birth_weight", idx = row_number())
# A tibble: 6 × 10
birth_weight birth_…¹ y_0 y_1 y_1_t CATE SCATE size_n x_m_n…² idx
<dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <chr> <int>
1 2000 1899 0.543 0.491 0.513 -0.0520 -0.0299 4000 birth_… 1
2 2500 2410 0.418 0.384 0.402 -0.0338 -0.0160 4000 birth_… 2
3 3000 2894 0.321 0.311 0.323 -0.00956 0.00157 4000 birth_… 3
4 3500 3410 0.288 0.299 0.296 0.0105 0.00727 4000 birth_… 4
5 4000 3945 0.351 0.377 0.362 0.0262 0.0114 4000 birth_… 5
6 4500 4394. 0.574 0.601 0.541 0.0275 -0.0330 4000 birth_… 6
# … with abbreviated variable names ¹birth_weight_t, ²x_m_name
4 Simulations
Let us estimate the SCATE using different sizes of the samples (\(4,000\) or \(20,000\) or \(100,000\)).
We can consider different treatments (whether the mother smokes or not, whether the mother is Black or not, and whether the newborn is a girl or a boy) and different mediator variables (newborn weight and mother’s weight gain). We can define a function that wraps the codes presented in Section 3 in a single function.
#' @param size_n
#' @param w_0
#' @param x_m_name name of the mediator variable
#' @param treatment_name name of the treatment variable (column in `data`)
#' @param treatment_0 value for non treated
#' @param treatment_1 value for treated
#' @param data data frame with both T=0 or T=1
<- function(size_n, w_0, target, x_m_name, treatment_name, treatment_0, treatment_1, data){
simul_univ # Random sample of size `size_n` in each group
<- data %>% filter(!!sym(treatment_name) == treatment_0) %>% sample_n(size_n)
base_0 <- data %>% filter(!!sym(treatment_name) == treatment_1) %>% sample_n(size_n)
base_1 # Estimation of $\hat{m}_0()$
<-
reg_univ_0 models_spline(target = target,
treatment_name = treatment_name,
treatment = 0,
x_m_name = x_m_name,
data = base_0, treatment_0 = treatment_0, df = 3)
# Estimation of $\hat{m}_1()$
<-
reg_univ_1 models_spline(target = target,
treatment_name = treatment_name,
treatment = 1,
x_m_name = x_m_name,
data = base_1, treatment_1 = treatment_1, df = 3)
# Transported values (quantile-based)
<-
trans_w_0 transport_quantile(x_0 = w_0, x_m_name = x_m_name,
treatment_name = treatment_name,
treatment_0 = treatment_0, treatment_1 = treatment_1,
data = bind_rows(base_0, base_1))
# SATE
<-
cate_univ sate(x_0 = w_0,
x_t = trans_w_0$x_t_quantile,
x_m_names = x_m_name,
mod_0 = reg_univ_0,
mod_1 = reg_univ_1,
pred_mod_0 = model_spline_predict, pred_mod_1 = model_spline_predict) %>%
mutate(size_n = size_n, x_m_name = x_m_name, idx = row_number())
cate_univ }
This function will be evaluated 200 times for each sample size. At each time, a new random sample for both treated and non-treated will be drawn.
<- 200 nb_replicate
Let us run the simulations in parallel.
library(parallel)
<- detectCores()-1
ncl <- makeCluster(ncl)
cl invisible(clusterEvalQ(cl, library(tidyverse, warn.conflicts=FALSE, quietly=TRUE)))
invisible(clusterEvalQ(cl, library(splines, warn.conflicts=FALSE, quietly=TRUE)))
invisible(clusterEvalQ(cl, library(expm, warn.conflicts=FALSE, quietly=TRUE)))
clusterExport(cl, c("models_spline", "model_spline_predict", "sate", "transport_quantile"))
clusterExport(cl, c("base"))
<- "nonnatural_delivery"
target <- "cig_rec"
treatment_name <- "birth_weight"
x_m_name <- "No"
treatment_0 <- "Yes"
treatment_1 # Some values
<- seq(2000,4500,by=500)
w_0
<-
cate_sim_smoker_bw ::pblapply(rep(c(4000, 20000, 100000), each = nb_replicate),
pbapplycl = cl,
simul_univ, w_0 = w_0, target = target, x_m_name = x_m_name,
treatment_name = treatment_name,
treatment_0 = treatment_0, treatment_1 = treatment_1,
data = base)
The results can be saved:
save(cate_sim_smoker_bw, file = "../output/simulations/cate_sim_smoker_bw.rda")
And then loaded:
load("../output/simulations/cate_sim_smoker_bw.rda")
<- "nonnatural_delivery"
target <- "cig_rec"
treatment_name <- "wtgain"
x_m_name <- "No"
treatment_0 <- "Yes"
treatment_1 # Some values
<- seq(5,55, by=10)
w_0
<-
cate_sim_smoker_wg ::pblapply(rep(c(4000, 20000, 100000), each = nb_replicate),
pbapplycl = cl,
simul_univ, w_0 = w_0, target = target, x_m_name = x_m_name,
treatment_name = treatment_name,
treatment_0 = treatment_0, treatment_1 = treatment_1,
data = base)
The results can be saved:
save(cate_sim_smoker_wg, file = "../output/simulations/cate_sim_smoker_wg.rda")
And then loaded:
load("../output/simulations/cate_sim_smoker_wg.rda")
<- "nonnatural_delivery"
target <- "black_mother"
treatment_name <- "birth_weight"
x_m_name <- "No"
treatment_0 <- "Yes"
treatment_1 # Some values
<- seq(2000,4500,by=500)
w_0
<-
cate_sim_blackm_bw ::pblapply(rep(c(4000, 20000, 100000), each = nb_replicate),
pbapplycl = cl,
simul_univ, w_0 = w_0, target = target, x_m_name = x_m_name,
treatment_name = treatment_name,
treatment_0 = treatment_0, treatment_1 = treatment_1,
data = base)
The results can be saved:
save(cate_sim_blackm_bw, file = "../output/simulations/cate_sim_blackm_bw.rda")
And then loaded:
load("../output/simulations/cate_sim_blackm_bw.rda")
<- "nonnatural_delivery"
target <- "black_mother"
treatment_name <- "wtgain"
x_m_name <- "No"
treatment_0 <- "Yes"
treatment_1 # Some values
<- seq(5,55, by=10)
w_0
<-
cate_sim_blackm_wg ::pblapply(rep(c(4000, 20000, 100000), each = nb_replicate),
pbapplycl = cl,
simul_univ, w_0 = w_0, target = target, x_m_name = x_m_name,
treatment_name = treatment_name,
treatment_0 = treatment_0, treatment_1 = treatment_1,
data = base)
The results can be saved:
save(cate_sim_blackm_wg, file = "../output/simulations/cate_sim_blackm_wg.rda")
And then loaded:
load("../output/simulations/cate_sim_blackm_wg.rda")
<- "nonnatural_delivery"
target <- "sex"
treatment_name <- "birth_weight"
x_m_name <- "Male"
treatment_0 <- "Female"
treatment_1 # Some values
<- seq(2000,4500,by=500)
w_0
<-
cate_sim_sex_bw ::pblapply(rep(c(4000, 20000, 100000), each = nb_replicate),
pbapplycl = cl,
simul_univ, w_0 = w_0, target = target, x_m_name = x_m_name,
treatment_name = treatment_name,
treatment_0 = treatment_0, treatment_1 = treatment_1,
data = base)
The results can be saved:
save(cate_sim_sex_bw, file = "../output/simulations/cate_sim_sex_bw.rda")
And then loaded:
load("../output/simulations/cate_sim_sex_bw.rda")
<- "nonnatural_delivery"
target <- "sex"
treatment_name <- "wtgain"
x_m_name <- "Male"
treatment_0 <- "Female"
treatment_1 # Some values
<- seq(5,55, by=10)
w_0
<-
cate_sim_sex_wg ::pblapply(rep(c(4000, 20000, 100000), each = nb_replicate),
pbapplycl = cl,
simul_univ, w_0 = w_0, target = target, x_m_name = x_m_name,
treatment_name = treatment_name,
treatment_0 = treatment_0, treatment_1 = treatment_1,
data = base)
The results can be saved:
save(cate_sim_sex_wg, file = "../output/simulations/cate_sim_sex_wg.rda")
And then loaded:
load("../output/simulations/cate_sim_sex_wg.rda")
Let us close the clusters:
stopCluster(cl = cl)
4.1 Results
4.1.1 Distribution of the transported values
Let us plot the distribution of the transported values, given both the size sample \(n\) and the value transported.
#' @param res_simul results of the simulation
#' @param size_n size of the sample used to make simulations
#' (column `size_n` in the tables obtained in `res_simul`)
#' @param x_m_name
#' @param x_lim limits of the x-axis (and the y-axis)
#' @param scale_factor_density scaling factor for the height of the densities
#' @param offset_density offset for the densities
#' @param arrow_length length of the arrow
#' @param x_lab label for the x-axis
<- function(res_simul, size_n, x_m_name, x_lim, scale_factor_density, offset_density, arrow_length = unit(0.25, "inches"), x_lab){
plot_distrib_transported_val <- paste0(x_m_name, "_t")
x_m_name_t
# Initial values and transported values, for the `size_n` simulations
<-
val_t_sim bind_rows(res_simul) %>%
filter(size_n == !!size_n) %>%
select(!!sym(x_m_name), !!sym(x_m_name_t))
# Points at which the SCATE were computed
<- unique(pull(val_t_sim, x_m_name))
w_0
# Mean of the transported values in the simulations, for each point of interest
<-
val_t_sim_mean %>% group_by(!!sym(x_m_name)) %>%
val_t_sim summarise(mean_t = mean(!!sym(x_m_name_t)))
# Estimation of the density of the transported values at each point of interest
<-
val_t_sim_d %>%
val_t_sim group_by(!!sym(x_m_name)) %>%
nest() %>%
mutate(density = map(data, ~density(pull(.x, x_m_name_t))))
# Plot
plot(NULL,xlab=x_lab,
ylab="", axes=FALSE,
xlim = x_lim, ylim = x_lim)
segments(x_lim[1],x_lim[1], x_lim[2], x_lim[2],col=couleur3)
points(w_0, w_0, pch = 19, col = couleur3)
arrows(w_0, w_0 , w_0, val_t_sim_mean$mean_t, col=couleur3, length = arrow_length)
axis(1)
axis(2)
for(i in 1:length(w_0)){
polygon(w_0[i] + offset_density + val_t_sim_d$density[[i]]$y*scale_factor_density,
$density[[i]]$x,
val_t_sim_dcol=scales::alpha(couleur1,.4),border=NA)
} }
Show the codes used to create the plot
<- cate_sim_smoker_bw
res_simul <- c(1800, 4900)
x_lim <- 10000
scale_factor_density <- "birth_weight"
x_m_name <- "Weight of the baby (mother non-smoker → smoker)"
x_lab
layout(mat = matrix(c(1,2,3), ncol=1))
plot_distrib_transported_val(res_simul = res_simul,size_n = 4000, x_m_name = x_m_name,
x_lim = x_lim, scale_factor_density = scale_factor_density,
offset_density = 100, x_lab = x_lab, unit(0.05, "inches"))
plot_distrib_transported_val(res_simul = res_simul,size_n = 20000, x_m_name = x_m_name,
x_lim = x_lim, scale_factor_density = scale_factor_density,
offset_density = 100, x_lab = x_lab, unit(0.05, "inches"))
plot_distrib_transported_val(res_simul = res_simul,size_n = 100000, x_m_name = x_m_name,
x_lim = x_lim, scale_factor_density = scale_factor_density,
offset_density = 100, x_lab = x_lab, unit(0.05, "inches"))
Show the codes used to create the plot
<- cate_sim_smoker_wg
res_simul <- c(0, 60)
x_lim <- 100
scale_factor_density <- "wtgain"
x_m_name <- "Weight gain of the mother (mother non-smoker → smoker)"
x_lab
layout(mat = matrix(c(1,2,3), ncol=1))
plot_distrib_transported_val(res_simul = res_simul,size_n = 4000, x_m_name = x_m_name,
x_lim = x_lim, scale_factor_density = scale_factor_density,
offset_density = 0.5, x_lab = x_lab)
plot_distrib_transported_val(res_simul = res_simul,size_n = 20000, x_m_name = x_m_name,
x_lim = x_lim, scale_factor_density = scale_factor_density,
offset_density = 0.5, x_lab = x_lab)
plot_distrib_transported_val(res_simul = res_simul,size_n = 100000, x_m_name = x_m_name,
x_lim = x_lim, scale_factor_density = scale_factor_density,
offset_density = 0.5, x_lab = x_lab)
Show the codes used to create the plot
<- cate_sim_blackm_bw
res_simul <- c(1800, 4900)
x_lim <- 10000
scale_factor_density <- "birth_weight"
x_m_name <- "Weight of the baby (mother non-Black → Black)"
x_lab
layout(mat = matrix(c(1,2,3), ncol=1))
plot_distrib_transported_val(res_simul = res_simul,size_n = 4000, x_m_name = x_m_name,
x_lim = x_lim, scale_factor_density = scale_factor_density,
offset_density = 100, x_lab = x_lab, unit(0.05, "inches"))
plot_distrib_transported_val(res_simul = res_simul,size_n = 20000, x_m_name = x_m_name,
x_lim = x_lim, scale_factor_density = scale_factor_density,
offset_density = 100, x_lab = x_lab, unit(0.05, "inches"))
plot_distrib_transported_val(res_simul = res_simul,size_n = 100000, x_m_name = x_m_name,
x_lim = x_lim, scale_factor_density = scale_factor_density,
offset_density = 100, x_lab = x_lab, unit(0.05, "inches"))
Show the codes used to create the plot
<- cate_sim_blackm_wg
res_simul <- c(0, 60)
x_lim <- 100
scale_factor_density <- "wtgain"
x_m_name <- "Weight gain of the mother (mother non-Black → Black)"
x_lab
layout(mat = matrix(c(1,2,3), ncol=1))
plot_distrib_transported_val(res_simul = res_simul,size_n = 4000, x_m_name = x_m_name,
x_lim = x_lim, scale_factor_density = scale_factor_density,
offset_density = 0.5, x_lab = x_lab)
plot_distrib_transported_val(res_simul = res_simul,size_n = 20000, x_m_name = x_m_name,
x_lim = x_lim, scale_factor_density = scale_factor_density,
offset_density = 0.5, x_lab = x_lab)
plot_distrib_transported_val(res_simul = res_simul,size_n = 100000, x_m_name = x_m_name,
x_lim = x_lim, scale_factor_density = scale_factor_density,
offset_density = 0.5, x_lab = x_lab)
Show the codes used to create the plot
<- cate_sim_sex_bw
res_simul <- c(1800, 4900)
x_lim <- 10000
scale_factor_density <- "birth_weight"
x_m_name <- "Weight of the baby (baby boy → girl)"
x_lab
layout(mat = matrix(c(1,2,3), ncol=1))
plot_distrib_transported_val(res_simul = res_simul,size_n = 4000, x_m_name = x_m_name,
x_lim = x_lim, scale_factor_density = scale_factor_density,
offset_density = 100, x_lab = x_lab, unit(0.05, "inches"))
plot_distrib_transported_val(res_simul = res_simul,size_n = 20000, x_m_name = x_m_name,
x_lim = x_lim, scale_factor_density = scale_factor_density,
offset_density = 100, x_lab = x_lab, unit(0.05, "inches"))
plot_distrib_transported_val(res_simul = res_simul,size_n = 100000, x_m_name = x_m_name,
x_lim = x_lim, scale_factor_density = scale_factor_density,
offset_density = 100, x_lab = x_lab, unit(0.05, "inches"))
Show the codes used to create the plot
<- cate_sim_sex_wg
res_simul <- c(0, 60)
x_lim <- 100
scale_factor_density <- "wtgain"
x_m_name <- "Weight gain of the mother (baby boy → girl)"
x_lab
layout(mat = matrix(c(1,2,3), ncol=1))
plot_distrib_transported_val(res_simul = res_simul,size_n = 4000, x_m_name = x_m_name,
x_lim = x_lim, scale_factor_density = scale_factor_density,
offset_density = 0.5, x_lab = x_lab)
plot_distrib_transported_val(res_simul = res_simul,size_n = 20000, x_m_name = x_m_name,
x_lim = x_lim, scale_factor_density = scale_factor_density,
offset_density = 0.5, x_lab = x_lab)
plot_distrib_transported_val(res_simul = res_simul,size_n = 100000, x_m_name = x_m_name,
x_lim = x_lim, scale_factor_density = scale_factor_density,
offset_density = 0.5, x_lab = x_lab)
Let us plot the distributions differently. We define a function that estimates the density of the transported values and returns both the estimated values and their range.
<- function(x){
compute_density <- density(x)
d <- min(d$x)
x_min <- max(d$x)
x_max <- min(d$y)
y_min <- max(d$y)
y_max <- tibble(x_min = x_min, x_max = x_max, y_min = y_min, y_max = y_max)
limits <- tibble(x = d$x, y = d$y)
val list(val = val, limits = limits)
}
Then, we can plot the estimated densities depending on the sample size, for each value of interest of the mediator variable.
#' @param res_simul results of the simulation
#' @param x_m_name mediator variable
#' @param x_m_name_label label of the mediator
#' @param x_lab label for the x-axis
<- function(res_simul, x_m_name, x_m_name_label, x_lab){
plot_distrib_transported_val <- paste0(x_m_name, "_t")
x_m_name_t # Points of interest
<- unique(pull(bind_rows(res_simul), !!sym(x_m_name)))
w_0 # Simulation results
<-
val_t_sim bind_rows(res_simul) %>%
mutate(size_n = factor(size_n, levels = c(4000, 20000, 100000), labels = c("4,000", "20,000", "100,000")))
# Average of the transported values, for each sample size and each point of interest
<-
mean_t_df %>%
val_t_sim group_by(size_n, !!sym(x_m_name)) %>%
summarise(!!sym(x_m_name_t) := mean(!!sym(x_m_name_t)))
# Densities of the transported values over the simulations,
# for each value of the mediator variable and each size sample
<-
densities %>% group_by(!!sym(x_m_name), size_n) %>%
val_t_sim nest() %>%
mutate(density = map(data, ~compute_density(pull(.x, x_m_name_t))))
# 3x2 plot matrix + 1 legend at the bottom
layout(mat = matrix(c(1,2,3,4,5,6,7,7,7),nrow = 3,ncol = 3,byrow = TRUE),
heights = c(0.4,0.4,0.2))
# loop over the different points of the mediator variable
for(i in 1:length(w_0)){
# densities of the transported values for the current point
<- densities %>% filter(!!sym(x_m_name) == w_0[i])
val_t_sim_i
# limits of the densities values (over the different sample sizes)
<-
limits %>%
val_t_sim_i mutate(limits = map(density, "limits")) %>%
unnest(cols = limits) %>%
ungroup() %>%
summarise(
x_min = min(x_min),
x_max = max(x_max),
y_min = min(y_min),
y_max = max(y_max)
)par(mar=c(5.1,4.1, 4.1, 2.1))
plot(NULL, xlab = x_lab,
main = paste0(x_m_name_label, " = ", scales::number(w_0[i], big.mark = ",")),
ylab = "", axes = FALSE,
xlim = c(min(limits$x_min, w_0[i]), max(limits$x_max, w_0[i])),
ylim = c(limits$y_min, limits$y_max))
axis(1)
# point of interest
abline(v = w_0[i], lty = 2, col = "grey")
for(j in 1:nrow(val_t_sim_i)){
<-
val_t_sim_i_j %>% ungroup() %>% slice(j) %>%
val_t_sim_i mutate(density = map(density, "val")) %>%
unnest(cols = density)
# Mean of trransported values
<- mean_t_df %>%
mean_t_i_j filter(size_n == unique(val_t_sim_i_j$size_n), !!sym(x_m_name) == w_0[i]) %>%
pull(!!sym(x_m_name_t))
abline(v = mean_t_i_j, col = colr1[j], lty = 2)
# density
polygon(x = val_t_sim_i_j$x, y = val_t_sim_i_j$y, col=scales::alpha(colr1[j],.4),border=NA)
}
}par(mar=c(0,0,1,0))
plot(1, type = "n", axes=FALSE, xlab="", ylab="")
<- scales::alpha(colr1[1:3],.4)
plot_colors legend(x = "top", inset = 0,
legend = c("4,000", "20,000", "100,000"), col = plot_colors, lwd = 4, cex = .8, horiz = TRUE)
}
Show the codes used to create the plot
plot_distrib_transported_val(res_simul = cate_sim_smoker_bw, x_m_name = "birth_weight",
x_m_name_label = "Birth weight",
x_lab = "Weight of the baby (mother non-smoker → smoker)")
Show the codes used to create the plot
plot_distrib_transported_val(res_simul = cate_sim_smoker_wg, x_m_name = "wtgain",
x_m_name_label = "Weight gain",
x_lab = "Weight gain of the mother (mother non-smoker → smoker)")
Show the codes used to create the plot
plot_distrib_transported_val(res_simul = cate_sim_blackm_bw, x_m_name = "birth_weight",
x_m_name_label = "Birth weight",
x_lab = "Weight of the baby (non-Black mother → Black mother)")
Show the codes used to create the plot
plot_distrib_transported_val(res_simul = cate_sim_blackm_wg, x_m_name = "wtgain",
x_m_name_label = "Weight gain",
x_lab = "Weight gain of the mother (non-Black mother → Black mother)")
Show the codes used to create the plot
plot_distrib_transported_val(res_simul = cate_sim_sex_bw, x_m_name = "birth_weight",
x_m_name_label = "Birth weight",
x_lab = "Weight of the baby (baby boy → baby girl)")
Show the codes used to create the plot
plot_distrib_transported_val(res_simul = cate_sim_sex_wg, x_m_name = "wtgain",
x_m_name_label = "Weight gain",
x_lab = "Weight gain of the mother (baby boy → baby girl)")
Same kind of representation, with ggplot2.
#' @param res_simul results of the simulation
#' @param x_m_name mediator variable
#' @param x_m_name_label label of the mediator
#' @param x_lab label for the x-axis
<- function(res_simul, x_m_name, x_m_name_label, x_lab, type = c("density", "histogram")){
plot_distrib_transported_val_2 <- paste0(x_m_name, "_t")
x_m_name_t # transported values (simulations)
<-
val_t_sim bind_rows(res_simul) %>%
mutate(size_n = factor(size_n, levels = c(4000, 20000, 100000), labels = c("4,000", "20,000", "100,000")))
# Points of interest
<- unique(pull(bind_rows(val_t_sim), !!sym(x_m_name)))
w_0
# Average of the transported values for each sample size and each point of interest
<-
mean_t_df bind_rows(val_t_sim) %>%
group_by(size_n, !!sym(x_m_name)) %>%
summarise(!!sym(x_m_name_t) := mean(!!sym(x_m_name_t)))
# Labels for faceting
<- paste0(x_m_name_label, "=", scales::number(w_0, big.mark = ","))
w_0_labels names(w_0_labels) <- w_0
<-
p ggplot(data = val_t_sim) +
geom_vline(data = mean_t_df, mapping = aes(xintercept = !!sym(x_m_name_t), colour = size_n), linetype = "dashed")
if(type == "density") p <- p +
geom_density(mapping = aes(x = !!sym(x_m_name_t), fill = size_n), alpha = .8, colour = NA)
if(type == "histogram") p <- p +
geom_histogram(mapping = aes(x = !!sym(x_m_name_t), fill = size_n), alpha = .8, colour = NA, position = "identity")
<- p +
p geom_vline(data = tibble(!!sym(x_m_name) := w_0), mapping = aes(xintercept = !!sym(x_m_name)), colour = "grey", linetype = "dashed") +
facet_wrap(vars(!!sym(x_m_name)), scales = "free", labeller = labeller(!!sym(x_m_name) := w_0_labels), ncol = 2) +
labs(x = x_lab, y = NULL) +
scale_fill_manual("Sample size", values = c("4,000" = colr1[1], "20,000" = colr1[2], "100,000" = colr1[3])) +
scale_colour_manual("Sample size", values = c("4,000" = colr1[1], "20,000" = colr1[2], "100,000" = colr1[3])) +
theme_minimal() +
theme(legend.position = "bottom", axis.text.y = element_blank(), strip.text = element_text(face = "bold"))
p }
Show the codes used to create the plot
plot_distrib_transported_val_2(res_simul = cate_sim_smoker_bw, x_m_name = "birth_weight",
x_m_name_label = "Birth weight",
x_lab = "Weight of the baby (mother non-smoker → smoker)",
type = "density")
Show the codes used to create the plot
plot_distrib_transported_val_2(res_simul = cate_sim_smoker_wg, x_m_name = "wtgain",
x_m_name_label = "Weight gain",
x_lab = "Weight gain of the mother (mother non-smoker → smoker)",
type = "histogram")
Show the codes used to create the plot
plot_distrib_transported_val_2(res_simul = cate_sim_blackm_bw, x_m_name = "birth_weight",
x_m_name_label = "Birth weight",
x_lab = "Weight of the baby (non-Black mother → Black mother)",
type = "density")
Show the codes used to create the plot
plot_distrib_transported_val_2(res_simul = cate_sim_blackm_wg, x_m_name = "wtgain",
x_m_name_label = "Weight gain",
x_lab = "Weight gain of the mother (non-Black mother → Black mother)",
type = "histogram")
Show the codes used to create the plot
plot_distrib_transported_val_2(res_simul = cate_sim_sex_bw, x_m_name = "birth_weight",
x_m_name_label = "Birth weight",
x_lab = "Weight of the baby (baby boy → baby girl)",
type = "density")
Show the codes used to create the plot
plot_distrib_transported_val_2(res_simul = cate_sim_sex_wg, x_m_name = "wtgain",
x_m_name_label = "Weight gain",
x_lab = "Weight gain of the mother (baby boy → baby girl)",
type = "histogram")
4.2 SCATE with GAM, quantile-based transport
Let us now display the distribution of \(\text{CATE}(x)\) and \(\text{SCATE}(x)\) obtained in the simulations.
#' @param res_simul results of the simulation
#' @param size_n size of the sample used to make simulations
#' (column `size_n` in the tables obtained in `res_simul`)
#' @param x_m_name
#' @param x_lim limits of the x-axis
#' @param y_lim limits of the y-axis
#' @param offset_density offset for the boplots
#' @param x_lab label for the x-axis
<- function(res_simul, size_n, x_m_name, x_lim, y_lim, scale_factor_density, offset_density, x_lab){
boxplot_cate_simul <- paste0(x_m_name, "_t")
x_m_name_t
# Initial values and transported values, for the `size_n` simulations
<-
val_t_sim bind_rows(res_simul) %>%
filter(size_n == !!size_n) %>%
select(!!sym(x_m_name), CATE, SCATE)
# Points at which the SCATE were computed
<- unique(pull(bind_rows(res_simul), !!sym(x_m_name)))
w_0
plot(NULL, xlab = x_lab,
ylab = paste0("CATE estimate, n=", scales::number(size_n, big.mark = ",")),
axes=FALSE,
xlim=x_lim, ylim = y_lim)
axis(1)
axis(2)
# Loop over the points of interest
for(i in 1:length(w_0)){
<- pull(val_t_sim %>% filter(!!sym(x_m_name) == w_0[i]), CATE)
cate_i <- pull(val_t_sim %>% filter(!!sym(x_m_name) == w_0[i]), SCATE)
scate_i
# Boxplot for CATE
boxplot(cate_i, add = TRUE,
at = w_0[i] - offset_boxplot, border = scales::alpha(couleur3, .25), axes = FALSE)
points(w_0[i] - offset_boxplot, mean(cate_i), pch = 19, col = scales::alpha(couleur3,.25))
# Boxplot for SCATE
boxplot(scate_i, add = TRUE,
at = w_0[i] + offset_boxplot, border = couleur3, axes = FALSE)
points(w_0[i] + offset_boxplot, mean(scate_i), pch = 19, col = couleur3)
} }
Show the codes used to create the plot
<- cate_sim_smoker_bw
res_simul <- c(1800, 4900)
x_lim <- c(-.2, .2)
y_lim <- 50
offset_boxplot <- "birth_weight"
x_m_name <- "Weight of the baby (mother non-smoker → smoker)"
x_lab
layout(mat = matrix(c(1,2,3), ncol=1))
boxplot_cate_simul(res_simul = res_simul,size_n = 4000, x_m_name = x_m_name,
x_lim = x_lim, y_lim = y_lim, scale_factor_density = scale_factor_density,
offset_density = 0.5, x_lab = x_lab)
boxplot_cate_simul(res_simul = res_simul,size_n = 20000, x_m_name = x_m_name,
x_lim = x_lim, y_lim = y_lim, scale_factor_density = scale_factor_density,
offset_density = 0.5, x_lab = x_lab)
boxplot_cate_simul(res_simul = res_simul,size_n = 100000, x_m_name = x_m_name,
x_lim = x_lim, y_lim = y_lim, scale_factor_density = scale_factor_density,
offset_density = 0.5, x_lab = x_lab)
Show the codes used to create the plot
<- cate_sim_smoker_wg
res_simul <- c(0, 60)
x_lim <- c(-.05, .05)
y_lim <- 1
offset_boxplot <- "wtgain"
x_m_name <- "Weight gain of the mother (mother non-smoker → smoker)"
x_lab
layout(mat = matrix(c(1,2,3), ncol=1))
boxplot_cate_simul(res_simul = res_simul,size_n = 4000, x_m_name = x_m_name,
x_lim = x_lim, y_lim = y_lim, scale_factor_density = scale_factor_density,
offset_density = 0.5, x_lab = x_lab)
boxplot_cate_simul(res_simul = res_simul,size_n = 20000, x_m_name = x_m_name,
x_lim = x_lim, y_lim = y_lim, scale_factor_density = scale_factor_density,
offset_density = 0.5, x_lab = x_lab)
boxplot_cate_simul(res_simul = res_simul,size_n = 100000, x_m_name = x_m_name,
x_lim = x_lim, y_lim = y_lim, scale_factor_density = scale_factor_density,
offset_density = 0.5, x_lab = x_lab)
Show the codes used to create the plot
<- cate_sim_blackm_bw
res_simul <- c(1800, 4900)
x_lim <- c(-.3, .3)
y_lim <- 50
offset_boxplot <- "birth_weight"
x_m_name <- "Weight of the baby (mother non-Black → Black)"
x_lab
layout(mat = matrix(c(1,2,3), ncol=1))
boxplot_cate_simul(res_simul = res_simul,size_n = 4000, x_m_name = x_m_name,
x_lim = x_lim, y_lim = y_lim, scale_factor_density = scale_factor_density,
offset_density = 0.5, x_lab = x_lab)
boxplot_cate_simul(res_simul = res_simul,size_n = 20000, x_m_name = x_m_name,
x_lim = x_lim, y_lim = y_lim, scale_factor_density = scale_factor_density,
offset_density = 0.5, x_lab = x_lab)
boxplot_cate_simul(res_simul = res_simul,size_n = 100000, x_m_name = x_m_name,
x_lim = x_lim, y_lim = y_lim, scale_factor_density = scale_factor_density,
offset_density = 0.5, x_lab = x_lab)
Show the codes used to create the plot
<- cate_sim_blackm_wg
res_simul <- c(0, 60)
x_lim <- c(-.1, .1)
y_lim <- 1
offset_boxplot <- "wtgain"
x_m_name <- "Weight gain of the mother (mother non-Black → Black)"
x_lab
layout(mat = matrix(c(1,2,3), ncol=1))
boxplot_cate_simul(res_simul = res_simul,size_n = 4000, x_m_name = x_m_name,
x_lim = x_lim, y_lim = y_lim, scale_factor_density = scale_factor_density,
offset_density = 0.5, x_lab = x_lab)
boxplot_cate_simul(res_simul = res_simul,size_n = 20000, x_m_name = x_m_name,
x_lim = x_lim, y_lim = y_lim, scale_factor_density = scale_factor_density,
offset_density = 0.5, x_lab = x_lab)
boxplot_cate_simul(res_simul = res_simul,size_n = 100000, x_m_name = x_m_name,
x_lim = x_lim, y_lim = y_lim, scale_factor_density = scale_factor_density,
offset_density = 0.5, x_lab = x_lab)
Show the codes used to create the plot
<- cate_sim_sex_bw
res_simul <- c(1800, 4900)
x_lim <- c(-.2, .2)
y_lim <- 50
offset_boxplot <- "birth_weight"
x_m_name <- "Weight of the baby (baby boy → girl)"
x_lab
layout(mat = matrix(c(1,2,3), ncol=1))
boxplot_cate_simul(res_simul = res_simul,size_n = 4000, x_m_name = x_m_name,
x_lim = x_lim, y_lim = y_lim, scale_factor_density = scale_factor_density,
offset_density = 0.5, x_lab = x_lab)
boxplot_cate_simul(res_simul = res_simul,size_n = 20000, x_m_name = x_m_name,
x_lim = x_lim, y_lim = y_lim, scale_factor_density = scale_factor_density,
offset_density = 0.5, x_lab = x_lab)
boxplot_cate_simul(res_simul = res_simul,size_n = 100000, x_m_name = x_m_name,
x_lim = x_lim, y_lim = y_lim, scale_factor_density = scale_factor_density,
offset_density = 0.5, x_lab = x_lab)
Show the codes used to create the plot
<- cate_sim_sex_wg
res_simul <- c(0, 60)
x_lim <- c(-.05, .05)
y_lim <- 1
offset_boxplot <- "wtgain"
x_m_name <- "Weight gain of the mother (baby boy → girl)"
x_lab
layout(mat = matrix(c(1,2,3), ncol=1))
boxplot_cate_simul(res_simul = res_simul,size_n = 4000, x_m_name = x_m_name,
x_lim = x_lim, y_lim = y_lim, scale_factor_density = scale_factor_density,
offset_density = 0.5, x_lab = x_lab)
boxplot_cate_simul(res_simul = res_simul,size_n = 20000, x_m_name = x_m_name,
x_lim = x_lim, y_lim = y_lim, scale_factor_density = scale_factor_density,
offset_density = 0.5, x_lab = x_lab)
boxplot_cate_simul(res_simul = res_simul,size_n = 100000, x_m_name = x_m_name,
x_lim = x_lim, y_lim = y_lim, scale_factor_density = scale_factor_density,
offset_density = 0.5, x_lab = x_lab)
Let us get the same kind of graphs with {ggplot2}.
#' @param res_simul results of the simulation
#' (column `size_n` in the tables obtained in `res_simul`)
#' @param x_m_name
#' @param x_lab label for the x-axis
<- function(res_simul, x_m_name, x_lab){
boxplot_cate_simul_2 <- paste0(x_m_name, "_t")
x_m_name_t # Labels for faceting
<- bind_rows(res_simul)
val_sim <- unique(val_sim$size_n)
sample_sizes <- paste0("n=", scales::number(sample_sizes, big.mark = ","))
size_labels names(size_labels) <- sample_sizes
%>%
val_sim select(!!sym(x_m_name), CATE, SCATE, size_n) %>%
pivot_longer(cols = c(CATE, SCATE)) %>%
ggplot(data = .) +
geom_violin(mapping = aes(x = factor(!!sym(x_m_name)), y = value, fill = name)) +
scale_fill_manual(NULL, values = c("CATE" = scales::alpha(couleur3,.25), "SCATE" = couleur3)) +
labs(x = x_lab, y = "Conditional Average Treatment Effect") +
theme_minimal() +
theme(legend.position = "bottom", strip.text = element_text(face = "bold")) +
facet_wrap(~size_n, scales = "free", labeller = labeller(size_n = size_labels), ncol = 1)
}
Show the codes used to create the plot
boxplot_cate_simul_2(res_simul = cate_sim_smoker_bw,
x_m_name = "birth_weight",
x_lab = "Weight of the baby (mother non-smoker → smoker)")
Show the codes used to create the plot
boxplot_cate_simul_2(res_simul = cate_sim_smoker_wg,
x_m_name = "wtgain",
x_lab = "Weight gain of the mother (mother non-smoker → smoker)")
Show the codes used to create the plot
boxplot_cate_simul_2(res_simul = cate_sim_blackm_bw,
x_m_name = "birth_weight",
x_lab = "Weight of the baby (mother non-Black → Black)")
Show the codes used to create the plot
boxplot_cate_simul_2(res_simul = cate_sim_blackm_wg,
x_m_name = "wtgain",
x_lab = "Weight gain of the mother (mother non-Black → Black)")
Show the codes used to create the plot
boxplot_cate_simul_2(res_simul = cate_sim_sex_bw,
x_m_name = "birth_weight",
x_lab = "Weight of the baby (baby boy → girl)")
Show the codes used to create the plot
boxplot_cate_simul_2(res_simul = cate_sim_sex_wg,
x_m_name = "wtgain",
x_lab = "Weight gain of the mother (baby boy → girl)")