Jean-Philippe Boucher, Université du Québec À Montréal (🐦 @J_P_Boucher)
Arthur Charpentier, Université du Québec À Montréal (🐦 @freakonometrics)
Ewen Gallic, Aix-Marseille Université (🐦 @3wen)
Analyse the composition of the portfolio: covariates, frequency by covariates.
First, load the training and testing sets, located in the following folder: data/canada_panel/
(CanadaPanelTrain.csv
and CanadaPanelTest.csv
, respectively).
library(tidyverse)
canada_train <- readr::read_csv("./data/canada_panel/CanadaPanelTrain.csv")
canada_test <- readr::read_csv("./data/canada_panel/CanadaPanelTest.csv")
nrow(canada_train)
canada_train
Define a function that computes summary statistics for a vector of numerics:
Try to account for possible NA
values. Name the function my_summary
.
#' my_summary
#' Returns a tibble with summary statistics for a numerical vector
#' @param x vector of numerics
my_summary <- function(x) {
tibble(
Average = mean(x, na.rm=TRUE),
Min = min(x, na.rm = TRUE),
Max = max(x, na.rm = TRUE),
`10th percentile` = quantile(x, probs = .1, names = FALSE),
`25th percentile` = quantile(x, probs = .25, names = FALSE),
Median = median(x, na.rm = TRUE),
`75th percentile` = quantile(x, probs = .75, names = FALSE),
`90th percentile` = quantile(x, probs = .9, names = FALSE)
)
}
Use the function on a single variable from the training sample:
Apply this function to multiple variables of your choice to explore the dataset and put the results in a table. You may either use a for loop, lapply()
or map()
(pick the option you are more comfortable with). Do not forget to add the variable name to be able to identify the variables.
some_variables <- c("RA_DISTANCE_DRIVEN", "RA_NBTRIP", "RA_HOURS_DRIVEN")
# Using a for loop
summary_table_canada <- NULL
for( var in some_variables ) {
summary_table_canada <-
summary_table_canada %>%
bind_rows(
my_summary(canada_train[[var]])
)
}
summary_table_canada$variable <- some_variables
summary_table_canada
# Using lapply
summary_table_canada <-
lapply(some_variables, function(var) my_summary(canada_train[[var]])) %>%
bind_rows()
summary_table_canada$variable <- some_variables
summary_table_canada
# Using map
summary_table_canada <-
canada_train %>%
dplyr::select(!!some_variables) %>%
purrr::map(my_summary) %>%
bind_rows(.id = "variable")
summary_table_canada
Plot the distribution of claims on a barplot. You may display the distribution of gender among each claims levels (by showing the proportion of each gender within the bars). The result may look like the figure displayed below:
p_distribution_claims_gender <-
canada_train %>%
dplyr::group_by(RA_ACCIDENT_IND, RA_GENDER) %>%
dplyr::tally() %>%
mutate(RA_GENDER = factor(RA_GENDER, levels = c("2", "1", "3"),
labels = c("Female", "Male", "Unknown"))) %>%
ggplot(data = ., aes(x = reorder(RA_ACCIDENT_IND, -n), y = n, fill = RA_GENDER)) +
geom_col() +
scale_fill_manual(name = "Gender",
values = c("Female" = "#66c2a5",
"Male" = "#fc8d62",
"Unknown" = "#8da0cb")) +
labs(x = "Number of Claims", y = "Frequency",
title = "Distribution of Number of Claims and Gender") +
# Legend below the graph
theme(legend.position = "bottom")
ggsave(p_distribution_claims_gender,
file = "figs/p_distribution_claims_gender.png", width = 10, height = 6)
p_distribution_claims_gender
Plot a boxplot of exposure time depending on marital status and vehicle use. Use faceting to separate the data according to vehicule use. The resulting plot should look like the one below:
p_boxplot_distrib_exposure_time <-
canada_train %>%
dplyr::mutate(RA_MARITALSTATUS = factor(RA_MARITALSTATUS,
levels = c(0, 1),
labels = c("Single", "Other"))) %>%
dplyr::mutate(RA_VEH_USE = factor(RA_VEH_USE,
levels = c(1, 2, 0),
labels = c("Commute", "Pleasure", "Other"))) %>%
ggplot(data = .,
aes(y = RA_EXPOSURE_TIME, x = RA_MARITALSTATUS)) +
geom_boxplot(aes(fill = RA_MARITALSTATUS)) +
coord_flip() +
labs(x = "Marital Status", y = "Exposure Time") +
scale_fill_discrete("Marital Status") +
facet_wrap(~RA_VEH_USE, scales = "free_x")
ggsave(p_boxplot_distrib_exposure_time,
file = "figs/p_boxplot_distrib_exposure_time.png",
width = 10, height = 6)
p_boxplot_distrib_exposure_time
using GLM Poisson and several covariates (of your choice), compute the impact of some telematic informations on the frequency part of the premium. To do so, start with loading the training and testing sets located in the following folder: data/canada_panel/
(CanadaPanelTrain.csv
and CanadaPanelTest.csv
, respectively).
library(tidyverse)
canada_train <- readr::read_csv("./data/canada_panel/CanadaPanelTrain.csv")
canada_test <- readr::read_csv("./data/canada_panel/CanadaPanelTest.csv")
nrow(canada_train)
canada_train
First, some variables need to be recoded in both datasets.
The gender variable currently offers three levels: female (2), male (1) and unknown (3). Create a new variable (gender
) which takes the following values:
1
if male2
otherwise.Define Other
as the reference.
canada_train <-
canada_train %>%
dplyr::mutate(gender = ifelse(RA_GENDER == 1, yes = "Male", "Other"),
gender = factor(gender, levels = c("Other", "Male")))
canada_test <-
canada_test %>%
dplyr::mutate(gender = ifelse(RA_GENDER == 1, yes = "Male", "Other"),
gender = factor(gender, levels = c("Other", "Male")))
Change the marital status variable (RA_MARITALSTATUS
) to a factor variable which takes two values:
Single
if the insured is single (i.e., when RA_MARITALSTATUS
equals 0)Other
otherwise.canada_train <-
canada_train %>%
dplyr::mutate(RA_MARITALSTATUS = ifelse(RA_MARITALSTATUS == 0, yes = "Single", "Other"),
RA_MARITALSTATUS = factor(RA_MARITALSTATUS, levels = c("Other", "Single")))
canada_test <-
canada_test %>%
dplyr::mutate(RA_MARITALSTATUS = ifelse(RA_MARITALSTATUS == 0, yes = "Single", "Other"),
RA_MARITALSTATUS = factor(RA_MARITALSTATUS, levels = c("Other", "Single")))
Vehicle use (RA_VEH_USE
) is currently labelled as follows:
0
: other1
: commute2
: pleasure.Turn that variable into a factor and add the corresponding labels. Define Other
as the reference.
canada_train <-
canada_train %>%
dplyr::mutate(RA_VEH_USE = case_when(RA_VEH_USE == 1 ~ "Commute",
RA_VEH_USE == 2 ~ "Pleasure",
TRUE ~ "Other"),
RA_VEH_USE = factor(RA_VEH_USE, levels = c("Other", "Commute", "Pleasure"))
)
canada_test <-
canada_test %>%
dplyr::mutate(RA_VEH_USE = case_when(RA_VEH_USE == 1 ~ "Commute",
RA_VEH_USE == 2 ~ "Pleasure",
TRUE ~ "Other"),
RA_VEH_USE = factor(RA_VEH_USE, levels = c("Other", "Commute", "Pleasure"))
)
Using the glm()
function, fit a Poisson regression model on claims, without any offset. Use the following covariates: gender
, RA_MARITALSTATUS
, and RA_VEH_USE
. Store the estimation results in an object named mod_1
.
mod_1 <-
glm(RA_ACCIDENT_IND ~ gender + RA_MARITALSTATUS + RA_VEH_USE,
family="poisson", data=canada_train)
summary(mod_1)
Fit another model using exposure time (RA_EXPOSURE_TIME
) as an offset. Store the estimation results in a object named mod_2
.
mod_2 <-
glm(RA_ACCIDENT_IND ~ RA_GENDER + RA_MARITALSTATUS + RA_VEH_USE +
offset(log(RA_EXPOSURE_TIME)),
family="poisson", data=canada_train)
summary(mod_2)
Fit another model using distance driven (RA_DISTANCE_DRIVEN
) as an offset. Store the estimation results in a object named mod_3
.
mod_3 <-
glm(RA_ACCIDENT_IND ~ RA_GENDER + RA_MARITALSTATUS + RA_VEH_USE +
offset(log(RA_DISTANCE_DRIVEN)),
family="poisson", data=canada_train)
summary(mod_3)
Fit another model using the number of trips (RA_NBTRIP
) as an offset. Store the estimation results in a object named mod_4
.
mod_4 <-
glm(RA_ACCIDENT_IND ~ RA_GENDER + RA_MARITALSTATUS + RA_VEH_USE +
offset(log(RA_NBTRIP)),
family="poisson", data=canada_train)
summary(mod_4)
Fit another model using hours driven (RA_HOURS_DRIVEN
) as an offset. Store the estimation results in a object named mod_5
.
mod_5 <-
glm(RA_ACCIDENT_IND ~ RA_GENDER + RA_MARITALSTATUS + RA_VEH_USE +
offset(log(RA_HOURS_DRIVEN)),
family="poisson", data=canada_train)
summary(mod_5)
With the help of the function mtable()
from {memisc
}, create a table showing the results of the 5 estimations. Include in the table the number of observations as well as Akaike Information Criterion (AIC) values.
Let us assume an exponential “time” between claims. Use the neg_logl_Expo_Duration()
function defined below to fit different Gamma Duration using different measures of exposure (exposure time, distance driven, number of trips, hours driven).
#' neg_logl_Expo_Duration
#' @param parms vector of coefficients values
#' @param X matrix of predictors (including the constant as the first column)
#' @param y variable of interest
#' @param expo exposure values
neg_logl_Gamma_Duration <- function(parms, X, y, expo) {
beta <- parms[-length(parms)]
alpha <- parms[length(parms)]
lambda <- exp(X %*% beta)
Fct1 <- (y == 0) * 1 +
(y != 0) * pgamma(expo, scale = 1/lambda, shape = alpha * (pmax(y, 1)))
Fct2 <- pgamma(expo, scale = 1/lambda, shape = alpha * (y+1))
loglike <- t(log(Fct1 - Fct2)) %*% X[,1]
logliketot <- loglike
-logliketot
}
Using the model.matrix()
function, create the model matrix containing the dummy variables for the insured gender (gender
), marital status (RA_MARITALSTATUS
) and vehicle use (RA_VEH_USE
).
Then extract the variable of interest, i.e., the number of claims from the dataset and store it in a vector named nb_sin
:
Complete the body of the following fit_gamma()
function so that:
var_expo
)optim()
which is fed with some initial values (do not forget to set the parameter hessian
to TRUE
)The function should return a list with three elements:
fit
: the model fitcoefs
: the tibble with the estimates and their 95% confidence intervalaic
: the AIC#' fit_gamma
#' @param expo_name name of the variable to use as exposure
#' @param init starting values for the optimization algorithm
fit_gamma <- function(expo_name, init) {
var_expo <- magrittr::extract2(canada_train, expo_name)
fit_gamma <-
optim(par = init,
fn = neg_logl_Gamma_Duration,
X = predictors,
y = nb_sin,
expo = var_expo,
hessian=TRUE)
# Coefficients
coefs_fit_gamma <- fit_gamma$par
names(coefs_fit_gamma) <- c(colnames(predictors), "alpha")
# Standard errors
fisher_info <- solve(fit_gamma$hessian)
prop_sigma <- sqrt(abs(diag(fisher_info)))
# Confidence interval
upper <- fit_gamma$par + 1.96 * prop_sigma
lower <- fit_gamma$par - 1.96 * prop_sigma
coefs <-
tibble(
coef_name = c(colnames(predictors), "alpha"),
value = fit_gamma$par,
upper=upper,
lower=lower,
se = prop_sigma)
AIC_gamma <- 2 * fit_gamma$value + 2*(length(fit_gamma$par))
list(fit = fit_gamma, coefs = coefs, aic = AIC_gamma)
}
Use that function to estimate a Gamma Duration model for the number of claims, using exposure time in calendar year (RA_EXPOSURE_TIME
) as the exposure measure:
init <- c(-5, -0.2, -0.5, .1, -0.2, 1)
fit_gamma_exposure <-
fit_gamma(expo_name = "RA_EXPOSURE_TIME", init = init)
fit_gamma_exposure$coefs
fit_gamma_exposure$aic
Use that function to estimate a Gamma Duration model for the number of claims, using the distance driven (RA_DISTANCE_DRIVEN
) as the exposure measure:
init <- c(-10, -0.2, -0.5, .1, -0.2, 1)
fit_gamma_distance <-
fit_gamma(expo_name = "RA_DISTANCE_DRIVEN", init = init)
fit_gamma_distance$coefs
fit_gamma_distance$aic
Use that function to estimate a Gamma Duration model for the number of claims, using the number of trips (RA_NBTRIP
) as the exposure measure:
init <- c(-5, -0.2, -0.5, .1, -0.2, 1)
fit_gamma_nbtrips <-
fit_gamma(expo_name = "RA_NBTRIP", init = init)
fit_gamma_nbtrips$coefs
fit_gamma_nbtrips$aic
Use that function to estimate a Gamma Duration model for the number of claims, using the hours driven (RA_HOURS_DRIVEN
) as the exposure measure:
init <- c(-10, -0.2, -0.5, .1, -0.2, 1)
fit_gamma_hours <-
fit_gamma(expo_name = "RA_HOURS_DRIVEN", init = init)
fit_gamma_hours$coefs
fit_gamma_hours$aic
Now we should compare the performances of the different models with out-of-sample predictions.
Define the function sq_error()
which computes the squared error given some observations and prediction.
#' sq_error
#' Computes the Squared Error
#' @param obs vector of observed values
#' @param pred vector of predicted values
sq_error <- function(obs, pred){
sum((pred - obs)**2)
}
Define a function that you will name predict_gamma
which performs out of sample predictions for a Gamma Duration model. The function may be constructed as follows:
This function may return a list of three elements:
pred
: the predicted valuesproba
: the predicted probabilitiesscores
: a table with the out-of-sample scores#' predict_gamma
#' Performs out-of-sample predictions for a GCD model
#' @param fit fit of the model
#' @param expo_name name of the exposition variable
#' @param model_name name of the model
predict_gamma <- function(fit, expo_name, model_name) {
# Exposition variable
expo <- magrittr::extract2(canada_test, expo_name)
# Estimated coefficients
beta <- fit$par[-length(fit$par)]
alpha <- fit$par[length(fit$par)]
lambda <- exp(predictors %*% beta)
# Predicted values
dim <- nrow(canada_test)
mean_gcd <- rep(NA, dim)
pb <- txtProgressBar(min = 0, max = dim, style = 3)
for (j in 1:dim) {
meangcd <- 0
for (i in 1:50) {
Fct1 <- pgamma(expo[j], scale = 1/lambda[j], shape = alpha * (pmax(i,1)))
Fct2 <- pgamma(expo[j], scale = 1/lambda[j], shape = alpha*(i+1))
meangcd <- meangcd + i * (Fct1 - Fct2)
}
mean_gcd[j] <- meangcd
setTxtProgressBar(pb, j)
}
# Probability
Fct1 <- (nb_sin == 0) * 1 + (nb_sin != 0) *
pgamma(expo, scale = 1/lambda, shape = alpha * (pmax(nb_sin,1)))
Fct2 <- pgamma(expo, scale = 1/lambda, shape = alpha * (nb_sin+1))
proba <- Fct1 - Fct2
# Scores
scores_GCD <-
tibble(
Model = model_name,
Logarithmic = sum(-log(proba)),
`Squared Error` = sq_error(canada_test$RA_ACCIDENT_IND,
mean_gcd)
)
list(pred = mean_gcd, proba = proba, scores = scores_GCD)
}
Now you can use the function predict_gamma()
on the four Gamma Duration modes previously estimated.
predictors <-
model.matrix(~ gender + RA_MARITALSTATUS + RA_VEH_USE, data = canada_test)
nb_sin <- canada_test$RA_ACCIDENT_IND
Compute the out-of-sample predictions for the model which uses exposure time (RA_EXPOSURE_TIME
) as the exposure measure (fit_gamma_exposure
).
oos_exposure_time <-
predict_gamma(fit = fit_gamma_exposure$fit,
expo_name = "RA_EXPOSURE_TIME",
model_name = "Exp. Time")
oos_exposure_time$scores
Compute the out-of-sample predictions for the model which uses distance drivent (RA_DISTANCE_DRIVEN
) as the exposure measure (fit_gamma_distance
).
oos_distance_driven <-
predict_gamma(fit = fit_gamma_distance$fit,
expo_name = "RA_DISTANCE_DRIVEN",
model_name = "Distance")
oos_distance_driven$scores
Compute the out-of-sample predictions for the model which uses the number of trips (RA_NBTRIP
) as the exposure measure (fit_gamma_nbtrips
).
oos_nb_trips <-
predict_gamma(fit = fit_gamma_nbtrips$fit,
expo_name = "RA_NBTRIP",
model_name = "# Trips")
oos_nb_trips$scores
Compute the out-of-sample predictions for the model which uses hours driven (RA_HOURS_DRIVEN
) as the exposure measure (fit_gamma_hours
).
oos_hours_driven <-
predict_gamma(fit = fit_gamma_hours$fit,
expo_name = "RA_HOURS_DRIVEN",
model_name = "Hours")
oos_hours_driven$scores
Lastly, bind the rows of the tables containing the scores of the four models to compare the performances of your models with out-of-sample data.