\[ \usepackage{dsfont} \usepackage{xcolor} \require{mathtools} \definecolor{bayesred}{RGB}{147, 30, 24} \definecolor{bayesblue}{RGB}{32, 35, 91} \definecolor{bayesorange}{RGB}{218, 120, 1} \definecolor{grey}{RGB}{128, 128, 128} \definecolor{couleur1}{RGB}{0,163,137} \definecolor{couleur2}{RGB}{255,124,0} \definecolor{couleur3}{RGB}{0, 110, 158} \definecolor{coul1}{RGB}{255,37,0} \definecolor{coul2}{RGB}{242,173,0} \definecolor{col_neg}{RGB}{155, 191, 221} \definecolor{col_pos}{RGB}{255, 128, 106} {\color{bayesorange} P (\text{H} \mid \text{E})} = \frac {{\color{bayesred} P(\text{H})} \times {\color{bayesblue}P(\text{E} \mid \text{H})}} {\color{grey} {P(\text{E})}} \]
In this notebook, we compute the SCATE at three different points: \(\boldsymbol{x}=(2500,60)\), \(\boldsymbol{x}=(4200,60)\), and \(\boldsymbol{x}=(2500,20)\). At each point, we will estimate the SCATE using a GAM model with Gaussian transport depending on the number of observations used to train the models.
1 Data
To illustrate the methods, we rely on the Linked Birth/Infant Death Cohort Data (2013 cohort). The CSV files for the 2013 cohort were downloaded from the NBER collection of Birth Cohort Linked Birth and Infant Death Data of the National Vital Statistics System of the National Center for Health Statistics, on the NBER website. We formated the data (see ./births_stats.html).
Let us load the birth data.
library(tidyverse)
load("../data/births.RData")
Then, we can only keep a subsample of variables to illustrate the method.
<-
base %>%
births mutate(
black_mother = mracerec == "Black",
nonnatural_delivery = rdmeth_rec != "Vaginal"
%>%
) select(
sex, dbwt, cig_rec, wtgain,
black_mother, nonnatural_delivery,
mracerec, rdmeth_rec%>%
) mutate(
cig_rec = replace_na(cig_rec, "Unknown or not stated"),
black_mother = ifelse(black_mother, yes = "Yes", no = "No"),
is_girl = ifelse(sex == "Female", yes = "Yes", no = "No")
%>%
) ::set_variable_labels(
labelledblack_mother = "Is the mother Black?",
is_girl = "Is the newborn a girl?",
nonnatural_delivery = "Is the delivery method non-natural?"
%>%
) rename(birth_weight = dbwt)
base
# A tibble: 3,940,764 × 9
sex birth_weight cig_rec wtgain black_mo…¹ nonna…² mrace…³ rdmet…⁴ is_girl
<fct> <dbl> <fct> <dbl> <chr> <lgl> <fct> <fct> <chr>
1 Male 3017 No 1 No FALSE "White" Vaginal No
2 Female 3367 No 18 No FALSE "White" Vaginal Yes
3 Male 4409 No 36 No FALSE "White" Vaginal No
4 Male 4047 No 25 No FALSE "White" Vaginal No
5 Male 3119 Yes 0 No FALSE "Ameri… Vaginal No
6 Female 4295 Yes 34 No FALSE "White" Vaginal Yes
7 Male 3160 No 47 No FALSE "Ameri… Vaginal No
8 Male 3711 No 39 No FALSE "White" Vaginal No
9 Male 3430 No NA No FALSE "White" Vaginal No
10 Female 3772 No 36 No FALSE "White" Vaginal Yes
# … with 3,940,754 more rows, and abbreviated variable names ¹black_mother,
# ²nonnatural_delivery, ³mracerec, ⁴rdmeth_rec
The variables that were kept are the following:
sex
: sex of the newbornbirth_weight
: Birth Weight (in Grams)cig_rec
: mother smokes cigaretteswtgain
: mother weight gainmracerec
: mother’s raceblack_mother
: is mother black?rdmeth_rec
: delivery methodnonnatural_delivery
: is the delivery method non-natural?
Then, let us discard individuals with missing values.
<-
base %>%
base filter(
!is.na(birth_weight),
!is.na(wtgain),
!is.na(nonnatural_delivery),
!is.na(black_mother)
)
Let us define some colours for later use:
library(wesanderson)
<- wes_palette("Darjeeling1")
colr1 <- wes_palette("Darjeeling2")
colr2 <- colr1[2]
couleur1 <- colr1[4]
couleur2 <- colr2[2]
couleur3
<- "#882255"
coul1 <- "#DDCC77" coul2
We need to load some packages.
library(car)
library(transport)
library(splines)
library(mgcv)
library(expm)
library(tidyverse)
Let us define the same functions as that used in reproduction_multivariate.html.
2 Functions
2.1 Models
Let us estimate \(\color{couleur1}{\widehat{m}_0}\) and \(\color{couleur2}{\widehat{m}_1}\) with a logistic GAM model (cubic spline first, then with more knots and degrees). To that end, we can define a function that will estimate both models and return them within a list.
library(splines)
#' Returns $\hat{m}_0()$ and $\hat{m}_1()$, using GAM
#' @param target name of the target variable
#' @param treatment_name name of the treatment variable (column in `data`)
#' @param x_m_names names of the mediator variable
#' @param data data frame with the observations
#' @param treatment_0 value for non treated
#' @param treatment_1 value for treated
#' @param df degrees of freedom (in the `bs()` function, deefault to `df=3`)
<- function(target, treatment_name, x_m_names, data, treatment_0, treatment_1, df = 3){
models_spline
# \hat{m}_0()
<- paste0(target, "~", paste0("bs(", x_m_names, ", df = ",df, ")", collapse = " + "))
formula_glm <- bquote(
reg_0 glm(formula = .(formula_glm), data=data, family = binomial,
subset = (.(treatment_name) == .(treatment_0))),
list(
formula_glm = formula_glm,
treatment_name = as.name(treatment_name),
treatment_0 = treatment_0)
%>% eval()
) # \hat{m}_1()
<- bquote(
reg_1 glm(formula = .(formula_glm), data=data, family = binomial,
subset = (.(treatment_name) == .(treatment_1))),
list(
formula_glm = formula_glm,
treatment_name = as.name(treatment_name),
treatment_1 = treatment_1)
%>% eval()
)
list(reg_0 = reg_0, reg_1 = reg_1)
}
2.1.1 GAM (with cubic splines)
<- "nonnatural_delivery"
target <- c("wtgain", "birth_weight")
x_m_names <- c(1,1/100)
scale <- "cig_rec"
treatment_name <- "No"
treatment_0 <- "Yes" treatment_1
\(\color{couleur1}{\widehat{m}_0}\) and \(\color{couleur2}{\widehat{m}_1}\):
<-
reg_gam_smoker models_spline(target = target,
treatment_name = treatment_name,
x_m_names = x_m_names,
data = base, treatment_0 = treatment_0, treatment_1 = treatment_1, df = c(3, 3))
<- reg_gam_smoker$reg_0
reg_gam_smoker_0 <- reg_gam_smoker$reg_1 reg_gam_smoker_1
<- "nonnatural_delivery"
target <- c("wtgain", "birth_weight")
x_m_names <- c(1,1/100)
scale <- "black_mother"
treatment_name <- "No"
treatment_0 <- "Yes" treatment_1
<-
reg_gam_blackm models_spline(target = target,
treatment_name = treatment_name,
x_m_names = x_m_names,
data = base, treatment_0 = treatment_0, treatment_1 = treatment_1, df = c(3, 3))
<- reg_gam_blackm$reg_0
reg_gam_blackm_0 <- reg_gam_blackm$reg_1 reg_gam_blackm_1
<- "nonnatural_delivery"
target <- c("wtgain", "birth_weight")
x_m_names <- c(1,1/100)
scale <- "sex"
treatment_name <- "Male"
treatment_0 <- "Female" treatment_1
\(\color{couleur1}{\widehat{m}_0}\) and \(\color{couleur2}{\widehat{m}_1}\):
<-
reg_gam_sex models_spline(target = target,
treatment_name = treatment_name,
x_m_names = x_m_names,
data = base, treatment_0 = treatment_0, treatment_1 = treatment_1, df = c(3, 3))
<- reg_gam_sex$reg_0
reg_gam_sex_0 <- reg_gam_sex$reg_1 reg_gam_sex_1
2.2 Prediction Functions
The prediction function for GAM:
#' @param object regression model (GAM)
#' @param newdata data frame in which to look for the mediator variable used to predict the target
<- function(object, newdata){
model_spline_predict predict(object, newdata = newdata, type="response")
}
2.3 Transport
2.3.1 Gaussian Assumption
The transport function can be defined as follows:
#' Optimal Transport assuming Gaussian distribution for the mediator variables (helper function)
#' @return A list with the mean and variance of the mediator in each subset, and the symmetric matrix A.
#' @param target name of the target variable
#' @param x_m_names vector of names of the mediator variables
#' @param scale vector of scaling to apply to each `x_m_names` variable to transport (default to 1)
#' @param treatment_name name of the treatment variable (column in `data`)
#' @param treatment_0 value for non treated
#' @param treatment_1 value for treated
<- function(target, x_m_names, scale=1, treatment_name, treatment_0, treatment_1){
transport_gaussian_param <-
base_0_unscaled %>%
base select(!!c(x_m_names, treatment_name)) %>%
filter(!!sym(treatment_name) == treatment_0)
<-
base_1_unscaled %>%
base select(!!c(x_m_names, treatment_name)) %>%
filter(!!sym(treatment_name) == treatment_1)
for(i in 1:length(scale)){
<-
base_0_scaled %>%
base_0_unscaled mutate(!!sym(x_m_names[i]) := !!sym(x_m_names[i]) * scale[i])
<-
base_1_scaled %>%
base_1_unscaled mutate(!!sym(x_m_names[i]) := !!sym(x_m_names[i]) * scale[i])
}
# Mean in each subset (i.e., T=0, T=1)
<- base_0_scaled %>% summarise(across(!!x_m_names, mean)) %>% as_vector()
m_0 <- base_1_scaled %>% summarise(across(!!x_m_names, mean)) %>% as_vector()
m_1 # Variance
<- base_0_scaled %>% select(!!x_m_names) %>% var()
S_0 <- base_1_scaled %>% select(!!x_m_names) %>% var()
S_1 # Matrix A
<- (solve(sqrtm(S_0))) %*% sqrtm( sqrtm(S_0) %*% S_1 %*% (sqrtm(S_0)) ) %*% solve(sqrtm(S_0))
A
list(m_0 = m_0, m_1 = m_1, S_0 = S_0, S_1 = S_1, A = A, scale = scale, x_m_names = x_m_names)
}
We create a function that transports a single observation, given the different parameters:
#' Gaussian transport for a single observation (helper function)
#' @param z vector of variables to transport (for a single observation)
#' @param A symmetric positive matrix that satisfies \(A\sigma_0A=\sigma_1\)
#' @param m_0,m_1 vectors of mean values in the subsets \(\mathcal{D}_0\) and \(\mathcal{D}_1\)
#' @param scale vector of scaling to apply to each variable to transport (default to 1)
<- function(z, A, m_0, m_1, scale = 1){
T_C_single <- z*scale
z as.vector(m_1 + A %*% (z-m_0))*(1/scale)
}
Lastly, we define a function that transports the mediator variables, under the Gaussian assumption. If the params
argument is NULL
, the arguments needed to estimate them through the transport_gaussian_param()
function are required (target
, x_m_names
, treatment_name
, treatment_0
, treatment_1
).
#' Gaussian transport
#' @param z data frame of variables to transport
#' @param params (optional) parameters to use for the transport (result of `transport_gaussian_param()`, default to NULL)
#' @param target name of the target variable
#' @param x_m_names vector of names of the mediator variables
#' @param scale vector of scaling to apply to each `x_m_names` variable to transport (default to 1)
#' @param treatment_name name of the treatment variable (column in `data`)
#' @param treatment_0 value for non treated
#' @param treatment_1 value for treated
<- function(z, params = NULL, ..., x_m_names, scale, treatment_name, treatment_0, treatment_1){
gaussian_transport if(is.null(params)){
# If the parameters of the transport function are not provided
# they need to be computed first
<-
params transport_gaussian_param(target = target, x_m_names = x_m_names,
scale = scale, treatment_name = treatment_name,
treatment_0 = treatment_0, treatment_1 = treatment_1)
}<- params$A
A <- params$m_0 ; m_1 <- params$m_1
m_0 <- params$scale
scale <- params$x_m_names
x_m_names
<- z %>% select(!!x_m_names)
values_to_transport <-
transported_val apply(values_to_transport, 1, T_C_single, A = A, m_0 = m_0, m_1 = m_1,
scale = scale, simplify = FALSE) %>%
do.call("rbind", .)
colnames(transported_val) <- colnames(z)
<- as_tibble(transported_val)
transported_val
structure(.Data = transported_val, params = params)
}
3 Example with \(n=1000\)
We will now compute the SCATE at three different points: \(\boldsymbol{x}=(2500,60)\), \(\boldsymbol{x}=(4200,60)\), and \(\boldsymbol{x}=(2500,20)\). At each point, we will estimate the SCATE using a GAM model with Gaussian transport depending on the number of observations used to train the models.
= tibble(
point_X wtgain = c(60,60,20),
birth_weight = c(2500,4200,2500)
)
We will make the sample size \(n\) vary. Let us first consider \(n=1000\).
<- 1000 size
Let us create a sample with 1,000 observation from base
:
<- base[sample(1:nrow(base), size=size), ] sbase
The transported values:
<- "nonnatural_delivery"
target <- c("wtgain", "birth_weight")
x_m_names <- c(1,1/100)
scale <- "cig_rec"
treatment_name <- "No"
treatment_0 <- "Yes" treatment_1
<-
point_X_t_n_smoker gaussian_transport(z = point_X, target = target,
x_m_names = x_m_names,
scale = scale, treatment_name = treatment_name,
treatment_0 = treatment_0, treatment_1 = treatment_1)
point_X_t_n_smoker
# A tibble: 3 × 2
wtgain birth_weight
* <dbl> <dbl>
1 64.9 2353.
2 65.2 4071.
3 18.3 2284.
<- "nonnatural_delivery"
target <- c("wtgain", "birth_weight")
x_m_names <- c(1,1/100)
scale <- "black_mother"
treatment_name <- "No"
treatment_0 <- "Yes" treatment_1
<-
point_X_t_n_blackm gaussian_transport(z = point_X, target = target,
x_m_names = x_m_names,
scale = scale, treatment_name = treatment_name,
treatment_0 = treatment_0, treatment_1 = treatment_1)
point_X_t_n_blackm
# A tibble: 3 × 2
wtgain birth_weight
* <dbl> <dbl>
1 62.4 2208.
2 62.4 4087.
3 16.9 2205.
<- "nonnatural_delivery"
target <- c("wtgain", "birth_weight")
x_m_names <- c(1,1/100)
scale <- "sex"
treatment_name <- "Male"
treatment_0 <- "Female" treatment_1
<-
point_X_t_n_sex gaussian_transport(z = point_X, target = target,
x_m_names = x_m_names,
scale = scale, treatment_name = treatment_name,
treatment_0 = treatment_0, treatment_1 = treatment_1)
point_X_t_n_sex
# A tibble: 3 × 2
wtgain birth_weight
* <dbl> <dbl>
1 59.0 2405.
2 58.9 4030.
3 19.3 2428.
The estimated parameters can be extracted as follows:
<- attr(point_X_t_n_smoker, "params")
params_smoker params_smoker
$m_0
wtgain birth_weight
30.40885 32.92174
$m_1
wtgain birth_weight
30.55169 31.02590
$S_0
wtgain birth_weight
wtgain 218.95677 14.54058
birth_weight 14.54058 34.09920
$S_1
wtgain birth_weight
wtgain 297.62083 22.09039
birth_weight 22.09039 35.41197
$A
[,1] [,2]
[1,] 1.16471785 0.01715655
[2,] 0.01715655 1.01085009
$scale
[1] 1.00 0.01
$x_m_names
[1] "wtgain" "birth_weight"
The estimated parameters can be extracted as follows:
<- attr(point_X_t_n_blackm, "params")
params_blackm params_blackm
$m_0
wtgain birth_weight
30.67049 33.09063
$m_1
wtgain birth_weight
29.02406 31.00026
$S_0
wtgain birth_weight
wtgain 215.83720 14.15844
birth_weight 14.15844 32.86016
$S_1
wtgain birth_weight
wtgain 278.8963 17.99050
birth_weight 17.9905 40.18716
$A
[,1] [,2]
[1,] 1.1366862866 0.0007016672
[2,] 0.0007016672 1.1055783355
$scale
[1] 1.00 0.01
$x_m_names
[1] "wtgain" "birth_weight"
The estimated parameters can be extracted as follows:
<- attr(point_X_t_n_sex, "params")
params_sex params_sex
$m_0
wtgain birth_weight
30.79877 33.32068
$m_1
wtgain birth_weight
30.00267 32.17229
$S_0
wtgain birth_weight
wtgain 227.8193 16.11780
birth_weight 16.1178 35.86006
$S_1
wtgain birth_weight
wtgain 224.11168 13.81543
birth_weight 13.81543 32.60395
$A
[,1] [,2]
[1,] 0.99222733 -0.00565936
[2,] -0.00565936 0.95595986
$scale
[1] 1.00 0.01
$x_m_names
[1] "wtgain" "birth_weight"
3.1 Estimation
Now that the two models \(\color{couleur1}{\widehat{m}_0(x)}\) and \(\color{couleur2}{\widehat{m}_1(x)}\) are defined and fitted to the data, we can compute the Mutatis Mutandis Sample Conditional Average Effect.
#' Computes the Sample Average Treatment Effect with and without transport
#' @param x_0 vector of values at which to compute $\hat{m}_0(x_0)$, $\hat{m}_1(x_0)$
#' @param x_t vector of values at which to compute $\hat{m}_1(\mathcal{T}(x_t))$
#' @param x_m_name vector of names of the mediator variables
#' @param mod_0 model $\hat{m}_0$
#' @param mod_1 model $\hat{m}_1$
#' @param pred_mod_0 prediction function for model $\hat{m}_0(\cdot)$
#' @param pred_mod_1 prediction function for model $\hat{m}_1(\cdot)$
#' @param return_x if `TRUE` (default) the mediator variables are returned in the table, as well as their transported values
<- function(x_0, x_t, x_m_names, mod_0, mod_1, pred_mod_0, pred_mod_1, return_x = TRUE){
sate if(is.vector(x_0)){
# Univariate case
<- tibble(!!x_m_names := x_0)
new_data else{
}<- x_0
new_data
}if(is.vector(x_t)){
# Univariate case
<- tibble(!!x_m_names := x_t)
new_data_t else{
}<- x_t
new_data_t
}
# $\hat{m}_0(x_0)$
<- pred_mod_0(object = mod_0, newdata = new_data)
y_0 # $\hat{m}_1(x_0)$
<- pred_mod_1(object = mod_1, newdata = new_data)
y_1 # $\hat{m}_1(\mathcal{T}(x_0))$
<- pred_mod_1(object = mod_1, newdata = new_data_t)
y_1_t
<-
scate_tab tibble(y_0 = y_0, y_1 = y_1, y_1_t = y_1_t,
CATE = y_1-y_0, SCATE = y_1_t-y_0)
if(return_x){
<- new_data_t %>% rename_all(~paste0(.x, "_t"))
new_data_t <- bind_cols(new_data, new_data_t, scate_tab)
scate_tab
}
scate_tab }
3.2 GAM (with cubic splines), Gaussian assumption for transport
Let us compute here the SCATE where the models \(\color{couleur1}{\widehat{m}_0(x)}\) and \(\color{couleur2}{\widehat{m}_1(x)}\) are GAM and where the transport method assumes a Gaussian distribution for both mediators.
<-
mm_sate_gam_smoker sate(
x_0 = point_X,
x_t = point_X_t_n_smoker,
x_m_names = c("wtgain", "birth_weight"),
mod_0 = reg_gam_smoker_0,
mod_1 = reg_gam_smoker_1,
pred_mod_0 = model_spline_predict, pred_mod_1 = model_spline_predict,
return = TRUE)
mm_sate_gam_smoker
# A tibble: 3 × 9
wtgain birth_weight wtgain_t birth_weight_t y_0 y_1 y_1_t CATE SCATE
<dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 60 2500 64.9 2353. 0.488 0.435 0.476 -0.0528 -0.0112
2 60 4200 65.2 4071. 0.469 0.461 0.443 -0.00779 -0.0255
3 20 2500 18.3 2284. 0.384 0.363 0.401 -0.0208 0.0166
<-
mm_sate_gam_blackm sate(
x_0 = point_X,
x_t = point_X_t_n_blackm,
x_m_names = c("wtgain", "birth_weight"),
mod_0 = reg_gam_blackm_0,
mod_1 = reg_gam_blackm_1,
pred_mod_0 = model_spline_predict, pred_mod_1 = model_spline_predict,
return = TRUE)
mm_sate_gam_blackm
# A tibble: 3 × 9
wtgain birth_weight wtgain_t birth_weight_t y_0 y_1 y_1_t CATE SCATE
<dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 60 2500 62.4 2208. 0.483 0.465 0.527 -0.0177 0.0441
2 60 4200 62.4 4087. 0.460 0.551 0.515 0.0917 0.0558
3 20 2500 16.9 2205. 0.380 0.387 0.441 0.00713 0.0608
<-
mm_sate_gam_sex sate(
x_0 = point_X,
x_t = point_X_t_n_sex,
x_m_names = c("wtgain", "birth_weight"),
mod_0 = reg_gam_sex_0,
mod_1 = reg_gam_sex_1,
pred_mod_0 = model_spline_predict, pred_mod_1 = model_spline_predict,
return = TRUE)
mm_sate_gam_sex
# A tibble: 3 × 9
wtgain birth_weight wtgain_t birth_weigh…¹ y_0 y_1 y_1_t CATE SCATE
<dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 60 2500 59.0 2405. 0.490 0.474 0.491 -0.0161 6.34e-4
2 60 4200 58.9 4030. 0.466 0.469 0.417 0.00245 -4.89e-2
3 20 2500 19.3 2428. 0.391 0.375 0.390 -0.0159 -9.93e-4
# … with abbreviated variable name ¹birth_weight_t
4 Simulations
Now, let us wrap all these estimation stepts in a single function, so that we can make simulations where we vary the size of the sample \(n\) used to compute the SCTE. For each value of \(n\), let us compute the mutatis mutandis CATE (SCATE) for our three individuals of \(\boldsymbol{x}\). We will further draw 500 different sub-samples over which the estimations will be computed.
#' @param size size of the sample over which to compute the SCATE
#' @param target name of the target variable
#' @param treatment_name name of the treatment variable (column in `data`)
#' @param x_m_names names of the mediator variables
#' @param treatment_name name of the treatment variable (column in `data`)
#' @param treatment_0 value for non treated
#' @param treatment_1 value for treated
#' @param scale vector of scaling to apply to each `x_m_names` variable to transport (default to 1)
<- function(size, target, x_m_names, treatment_name, treatment_0, treatment_1, scale){
simul_n # Random sub-sample of size `size`
<- base[sample(1:nrow(base), size=size), ]
sbase # Regressions on both subsets
<-
reg_gam models_spline(target = target,
treatment_name = treatment_name,
x_m_names = x_m_names,
data = sbase, treatment_0 = treatment_0, treatment_1 = treatment_1, df = c(3, 3))
<- reg_gam$reg_0
reg_gam_0 <- reg_gam$reg_1
reg_gam_1
# Transported values
<-
point_X_t_n gaussian_transport(z = point_X, target = target,
x_m_names = x_m_names,
scale = scale, treatment_name = treatment_name,
treatment_0 = treatment_0, treatment_1 = treatment_1)
# SCATE
<-
mm_sate_gam sate(
x_0 = point_X,
x_t = point_X_t_n,
x_m_names = c("wtgain", "birth_weight"),
mod_0 = reg_gam_0,
mod_1 = reg_gam_1,
pred_mod_0 = model_spline_predict, pred_mod_1 = model_spline_predict,
return = TRUE)
tibble(SCATE = mm_sate_gam$SCATE, idx = 1:nrow(point_X), size = size, treatment_name = treatment_name)
}
For example, with a size \(n=100\), when the treatment is whether the mother is a smoker or not:
<- "nonnatural_delivery"
target <- c("wtgain", "birth_weight")
x_m_names <- c(1,1/100)
scale <- "cig_rec"
treatment_name <- "No"
treatment_0 <- "Yes"
treatment_1
<-
resul_simul_100 simul_n(size = 100, target = target, x_m_names = x_m_names,
treatment_name = treatment_name, treatment_0 = treatment_0, treatment_1 = treatment_1, scale = scale)
resul_simul_100
# A tibble: 3 × 4
SCATE idx size treatment_name
<dbl> <int> <dbl> <chr>
1 0.268 1 100 cig_rec
2 0.388 2 100 cig_rec
3 0.286 3 100 cig_rec
The different values of \(n\):
<- 10^(seq(3,5.2,length=51))
sample_size sample_size
[1] 1000.000 1106.624 1224.616 1355.189 1499.685 1659.587
[7] 1836.538 2032.357 2249.055 2488.857 2754.229 3047.895
[13] 3372.873 3732.502 4130.475 4570.882 5058.247 5597.576
[19] 6194.411 6854.882 7585.776 8394.600 9289.664 10280.163
[25] 11376.273 12589.254 13931.568 15417.005 17060.824 18879.913
[31] 20892.961 23120.648 25585.859 28313.920 31332.857 34673.685
[37] 38370.725 42461.956 46989.411 51999.600 57543.994 63679.552
[43] 70469.307 77983.011 86297.855 95499.259 105681.751 116949.939
[49] 129419.584 143218.790 158489.319
The number of replications of the simulations for each value of \(n\):
<- 500 nb_replicate
Let us perform the simulations in parallel.
library(parallel)
<- detectCores()-1
ncl <- makeCluster(ncl)
cl invisible(clusterEvalQ(cl, library(tidyverse, warn.conflicts=FALSE, quietly=TRUE)))
invisible(clusterEvalQ(cl, library(splines, warn.conflicts=FALSE, quietly=TRUE)))
invisible(clusterEvalQ(cl, library(expm, warn.conflicts=FALSE, quietly=TRUE)))
The simul_n()
function we defined need to access some functions and some data in each cluster:
clusterExport(cl, c("models_spline", "gaussian_transport", "transport_gaussian_param",
"T_C_single" ,"mm_sate_gam", "model_spline_predict", "sate"))
clusterExport(cl, c("point_X", "base"))
<- "nonnatural_delivery"
target <- c("wtgain", "birth_weight")
x_m_names <- c(1,1/100)
scale <- "cig_rec"
treatment_name <- "No"
treatment_0 <- "Yes"
treatment_1
<-
cate_sim_smoker ::pblapply(rep(sample_size, each = nb_replicate),
pbapplycl = cl, target = target, x_m_names = x_m_names,
simul_n, treatment_name = treatment_name,
treatment_0 = treatment_0, treatment_1 = treatment_1, scale = scale)
The results can be saved:
save(cate_sim_smoker, file = "../output/simulations/cate_sim_smoker.rda")
And then loaded:
load("../output/simulations/cate_sim_smoker.rda")
1]] cate_sim_smoker[[
# A tibble: 3 × 4
SCATE idx size treatment_name
<dbl> <int> <dbl> <chr>
1 0.0827 1 1000 cig_rec
2 0.162 2 1000 cig_rec
3 0.0970 3 1000 cig_rec
<- "nonnatural_delivery"
target <- c("wtgain", "birth_weight")
x_m_names <- c(1,1/100)
scale <- "black_mother"
treatment_name <- "No"
treatment_0 <- "Yes"
treatment_1
<-
cate_sim_blackm ::pblapply(rep(sample_size, each = nb_replicate),
pbapplycl = cl, target = target, x_m_names = x_m_names,
simul_n, treatment_name = treatment_name,
treatment_0 = treatment_0, treatment_1 = treatment_1, scale = scale)
The results can be saved:
save(cate_sim_blackm, file = "../output/simulations/cate_sim_blackm.rda")
And then loaded:
load("../output/simulations/cate_sim_blackm.rda")
<- "nonnatural_delivery"
target <- c("wtgain", "birth_weight")
x_m_names <- c(1,1/100)
scale <- "sex"
treatment_name <- "Male"
treatment_0 <- "Female"
treatment_1
<-
cate_sim_sex ::pblapply(rep(sample_size, each = nb_replicate),
pbapplycl = cl, target = target, x_m_names = x_m_names,
simul_n, treatment_name = treatment_name,
treatment_0 = treatment_0, treatment_1 = treatment_1, scale = scale)
The results can be saved:
save(cate_sim_sex, file = "../output/simulations/cate_sim_sex.rda")
And then loaded:
load("../output/simulations/cate_sim_sex.rda")
stopCluster(cl = cl)
Then, we can plot the results of the simulations. First, we can compute the average value of the SCATE over the 500 runs of the simulation, for each sample size.
<-
cate_sim_df bind_rows(cate_sim_smoker) %>%
bind_rows(
bind_rows(cate_sim_blackm)
%>%
) bind_rows(
bind_rows(cate_sim_sex)
%>%
) group_by(treatment_name, idx, size) %>%
summarise(
q1_05 = quantile(SCATE, probs = .05),
mean = mean(SCATE),
q1_95 = quantile(SCATE, probs = .95),
)
%>%
cate_sim_df mutate(
name = factor(idx, levels = c(1:3),
labels = c("x = (2500, 60)", "x = (4200, 60)", "x = (2500, 20)")),
treatment_name = factor(treatment_name, levels = c("cig_rec", "black_mother", "sex"),
labels = c("Smoker mother", "Black mother", "Baby girl"))
%>%
) ggplot(data = .,
mapping = aes(x = size, y = mean)) +
geom_ribbon(mapping = aes(ymin = q1_05, ymax = q1_95), fill = couleur1, alpha = .3) +
geom_hline(yintercept = 0, colour = "grey", linetype = "dashed") +
geom_line(colour = couleur3) +
facet_grid(name~treatment_name, scales = "free") +
scale_x_log10() +
labs(x = "Number of observations (log scales)", y = "SCATE(x)") +
theme_bw()
Equivalently, we can create beautiful plots with {graphics}.
<-
cate_sim_smoker_df bind_rows(cate_sim_smoker) %>%
group_by(idx, size) %>%
summarise(
q1_05 = quantile(SCATE, probs = .05),
mean = mean(SCATE),
q1_95 = quantile(SCATE, probs = .95),
)
par(mfrow = c(1,3))
<- cate_sim_smoker_df %>% filter(idx == 1)
CATE plot(x = CATE$size, y = CATE$mean,
col=couleur3, lwd=2,
type="l", ylim=c(-.5,.5),
log="x",
xlab="Number of observations (log scales)",
ylab="CATE(2500,60)")
polygon(c(CATE$size, rev(CATE$size)), c(CATE$q1_05, rev(CATE$q1_95)),
col=scales::alpha(couleur1,.3),border=NA)
lines(CATE$size, CATE$q1_05, col = couleur1)
lines(CATE$size, CATE$q1_95, col = couleur1)
<- cate_sim_smoker_df %>% filter(idx == 2)
CATE plot(x = CATE$size, y = CATE$mean,
col=couleur3, lwd=2,
type = "l", ylim = c(-.5,.5), log = "x",
xlab = "Number of observations (log scales)",
ylab = "CATE(4200,60)")
polygon(c(CATE$size, rev(CATE$size)), c(CATE$q1_05, rev(CATE$q1_95)),
col = scales::alpha(couleur1,.3), border = NA)
lines(CATE$size, CATE$q1_05, col = couleur1)
lines(CATE$size, CATE$q1_95, col = couleur1)
<- cate_sim_smoker_df %>% filter(idx == 3)
CATE plot(x = CATE$size, y = CATE$mean,
col = couleur3, lwd=2,
type = "l",ylim=c(-.5,.5),log="x",
xlab = "Number of observations (log scales)",
ylab = "CATE(2500,20)")
polygon(c(CATE$size, rev(CATE$size)), c(CATE$q1_05, rev(CATE$q1_95)),
col = scales::alpha(couleur1,.3), border = NA)
lines(CATE$size, CATE$q1_05, col = couleur1)
lines(CATE$size, CATE$q1_95, col = couleur1)
<-
cate_sim_blackm_df bind_rows(cate_sim_blackm) %>%
group_by(idx, size) %>%
summarise(
q1_05 = quantile(SCATE, probs = .05),
mean = mean(SCATE),
q1_95 = quantile(SCATE, probs = .95),
)
par(mfrow = c(1,3))
<- cate_sim_blackm_df %>% filter(idx == 1)
CATE plot(x = CATE$size, y = CATE$mean,
col=couleur3, lwd=2,
type = "l", ylim = c(-.3,.3), log = "x",
xlab="Number of observations (log scales)",
ylab="CATE(2500,60)")
polygon(c(CATE$size, rev(CATE$size)), c(CATE$q1_05, rev(CATE$q1_95)),
col=scales::alpha(couleur1,.3),border=NA)
lines(CATE$size, CATE$q1_05, col = couleur1)
lines(CATE$size, CATE$q1_95, col = couleur1)
<- cate_sim_blackm_df %>% filter(idx == 2)
CATE plot(x = CATE$size, y = CATE$mean,
col=couleur3, lwd=2,
type = "l", ylim = c(-.3,.3), log = "x",
xlab = "Number of observations (log scales)",
ylab = "CATE(4200,60)")
polygon(c(CATE$size, rev(CATE$size)), c(CATE$q1_05, rev(CATE$q1_95)),
col = scales::alpha(couleur1,.3), border = NA)
lines(CATE$size, CATE$q1_05, col = couleur1)
lines(CATE$size, CATE$q1_95, col = couleur1)
<- cate_sim_blackm_df %>% filter(idx == 3)
CATE plot(x = CATE$size, y = CATE$mean,
col = couleur3, lwd=2,
type = "l", ylim = c(-.3,.3), log = "x",
xlab = "Number of observations (log scales)",
ylab = "CATE(2500,20)")
polygon(c(CATE$size, rev(CATE$size)), c(CATE$q1_05, rev(CATE$q1_95)),
col = scales::alpha(couleur1,.3), border = NA)
lines(CATE$size, CATE$q1_05, col = couleur1)
lines(CATE$size, CATE$q1_95, col = couleur1)
<-
cate_sim_sex_df bind_rows(cate_sim_sex) %>%
group_by(idx, size) %>%
summarise(
q1_05 = quantile(SCATE, probs = .05),
mean = mean(SCATE),
q1_95 = quantile(SCATE, probs = .95),
)
par(mfrow = c(1,3))
<- cate_sim_sex_df %>% filter(idx == 1)
CATE plot(x = CATE$size, y = CATE$mean,
col=couleur3, lwd=2,
type="l", ylim=c(-.2,.2), log="x",
xlab="Number of observations (log scales)",
ylab="CATE(2500,60)")
polygon(c(CATE$size, rev(CATE$size)), c(CATE$q1_05, rev(CATE$q1_95)),
col=scales::alpha(couleur1,.3),border=NA)
lines(CATE$size, CATE$q1_05, col = couleur1)
lines(CATE$size, CATE$q1_95, col = couleur1)
<- cate_sim_sex_df %>% filter(idx == 2)
CATE plot(x = CATE$size, y = CATE$mean,
col=couleur3, lwd=2,
type="l", ylim=c(-.2,.2), log="x",
xlab = "Number of observations (log scales)",
ylab = "CATE(4200,60)")
polygon(c(CATE$size, rev(CATE$size)), c(CATE$q1_05, rev(CATE$q1_95)),
col = scales::alpha(couleur1,.3), border = NA)
lines(CATE$size, CATE$q1_05, col = couleur1)
lines(CATE$size, CATE$q1_95, col = couleur1)
<- cate_sim_sex_df %>% filter(idx == 3)
CATE plot(x = CATE$size, y = CATE$mean,
col = couleur3, lwd=2,
type="l", ylim=c(-.2,.2), log="x",
xlab = "Number of observations (log scales)",
ylab = "CATE(2500,20)")
polygon(c(CATE$size, rev(CATE$size)), c(CATE$q1_05, rev(CATE$q1_95)),
col = scales::alpha(couleur1,.3), border = NA)
lines(CATE$size, CATE$q1_05, col = couleur1)
lines(CATE$size, CATE$q1_95, col = couleur1)