2 Estimation of a generalized Richards model of epidemic curve

This section provides the codes used to estimate the models.

2.1 Load Data

library(tidyverse)

── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ dplyr     1.1.3     ✔ readr     2.1.4
✔ forcats   1.0.0     ✔ stringr   1.5.0
✔ ggplot2   3.4.4     ✔ tibble    3.2.1
✔ lubridate 1.9.3     ✔ tidyr     1.3.0
✔ purrr     1.0.2     
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

rm_NA_headtail <- function(x) {
  cumsum(!is.na(x)) != 0 & rev(cumsum(!is.na(rev(x)))) != 0
}

We divide the countries according to the following geo-political groups, as in the first part.

load("./data/output/list_countries.rda")

if (knitr::is_html_output()) {
  list_countries |>
  DT::datatable(
    options = list(
      searching = TRUE,
      pageLength = 10,
      lengthMenu = c(5, 10, 15, 20)
    )
  )
}

The names of the countries can be stored in a variable:

names_countries <- list_countries$country |> unique()

The data from the Covid-19 epidemic can be loaded as follows (see 01_Load_Data.html for more details):

load("./data/output/covid_cases.rda")
covid_cases

# A tibble: 61,410 × 5
   country     date       country_code value stringency_index
   <chr>       <date>     <chr>        <dbl>            <dbl>
 1 Afghanistan 2020-01-22 AFG              0                0
 2 Afghanistan 2020-01-23 AFG              0                0
 3 Afghanistan 2020-01-24 AFG              0                0
 4 Afghanistan 2020-01-25 AFG              0                0
 5 Afghanistan 2020-01-26 AFG              0                0
 6 Afghanistan 2020-01-27 AFG              0                0
 7 Afghanistan 2020-01-28 AFG              0                0
 8 Afghanistan 2020-01-29 AFG              0                0
 9 Afghanistan 2020-01-30 AFG              0                0
10 Afghanistan 2020-01-31 AFG              0                0
# ℹ 61,400 more rows

The population data also need to be loaded:

load("./data/output/population.rda")

Some country names need to be changed to match those found in the Oxford data:

population <- 
  population |> 
  mutate(
    country_wdi = country,
    country_wdi  = recode(
      country_wdi,
      "Brunei Darussalam" = "Brunei",
      "Cabo Verde" = "Cape Verde",
      "Congo, Dem. Rep." = "Democratic Republic of Congo",
      "Congo, Rep." = "Congo",
      "Egypt, Arab Rep." = "Egypt",
      "Gambia, The" = "Gambia",
      "Iran, Islamic Rep." = "Iran",
      "Lao PDR" = "Laos",
      "Russian Federation" = "Russia",
      "Syrian Arab Republic" = "Syria",
      "Venezuela, RB" = "Venezuela",
      "Yemen, Rep." = "Yemen"
    )
  )

2.2 Functions

Our approach is based on an empirical approach. We estimate the parameters of a generalized Richards equation which belongs to the family of exponential growth models, extensively used in the literature:

\[ \Delta C_t = C_{t+1} - C_t = r C_t^P \left[ 1- \left(\frac{C_t}{K}\right)^\alpha\right] + \varepsilon_t, \quad P \in [0,1]. \] where \(\varepsilon_t\) is the model error and is distributed as a Gaussian noise process \(N(0,\sigma^2_{\varepsilon}) \ \forall t\).

The four parameters will be estimated using non linear last square (using nls.lm() from {minpack.lm}).

Let us load the required package:

library(minpack.lm)

We need to define the function objective function, i.e., the one that will be optimized:

#' Generalized Richards
#' 
#' @param theta list of named parameters (r, P, alpha, and K)
#' @param x numeric, cumulative number of infected
generalized_richards_f <- function(theta, x) {
  r <- theta[["r"]];
  P <- theta[["P"]];
  alpha <- theta[["alpha"]] ; 
  K <- theta[["K"]] ; 
  r * x^P *(1-(x/K)^alpha) + x
}

The function that will be minimized, the quadratic error:

#' Loss function
#' @param theta list of named parameters
#' @param fun function to be optimized
#' @param y observed number of new infected
#' @param x cumulative number of infected
loss_function <- function(theta,
                          fun,
                          y,
                          x) {
  (y - fun(theta = theta, x = x))^2
}

Let us also define a function to plot multiple graphs on a single figure, with a shared legend:

g_legend <- function(a_gplot) {
  tmp <- ggplot_gtable(ggplot_build(a_gplot))
  leg <- which(sapply(tmp$grobs, function(x) x$name) == "guide-box")
  legend <- tmp$grobs[[leg]]
  return(legend)
}

We can define a theme for the graphs, as in the data section:

#' Theme for ggplot2
#'
#' @param ... arguments passed to the theme function
#' @export
#' @importFrom ggplot2 element_rect element_text element_blank element_line unit
#'   rel
theme_paper <- function (...) {
  theme(
    text = element_text(family = "Times"),
    plot.background = element_rect(fill = "transparent", color = NA),
    panel.background = element_rect(fill = "transparent", color = NA),
    panel.border = element_rect(fill = NA, colour = "grey50", linewidth = 1),
    axis.text = element_text(),
    legend.text = element_text(size = rel(1.1)),
    legend.title = element_text(size = rel(1.1)),
    legend.background = element_rect(fill = "transparent", color = NULL),
    legend.position = "bottom",
    legend.direction = "horizontal",
    legend.box = "vertical",
    legend.key = element_blank(),
    panel.spacing = unit(1, "lines"),
    panel.grid.major = element_line(colour = "grey90"),
    panel.grid.minor = element_blank(),
    plot.title = element_text(hjust = 0, size = rel(1.3), face = "bold"),
    plot.title.position = "plot",
    plot.margin = unit(c(1, 1, 1, 1), "lines"),
    strip.background = element_rect(fill = NA, colour = NA),
    strip.text = element_text(size = rel(1.1))
  )
}

save(theme_paper, file="./assets/theme_paper.rda")

2.3 Estimations

2.3.1 For a Single Country

Before estimating the parameters for all countries, let us go through an example for a single country, to better understand the codes.

Let us consider the first country:

i <- 1
country <- names_countries[i]
country

[1] "Algeria"

The population of the country:

pop_country <- population |> 
  filter(country_wdi == !!country) |> 
  pull(pop)
pop_country

[1] 41130281

The Oxford data need to be filtered to keep only those of the current country:

dat <- 
  covid_cases |> 
  filter(country %in% !!country)

Let us create \(C_{t+1}\) (C) and \(C_{t}\) (C_m):

dat <- dat |> 
  mutate(C = lead(value), C_m = value) |> 
  filter(rm_NA_headtail(C))

2.3.1.1 Non Linear Leat Square

We need to define starting valued for the optimization algorithm:

# starting param values
start <- list(
  r = 1,
  P  = .5,
  alpha = .5,
  K = .6*pop_country
)

Let us estimate the four parameters using nls.lm() from {minpack.lm}:

out <- nls.lm(
  par = start, 
  fn = loss_function,
  y = dat$C,
  x = dat$C_m,
  fun = generalized_richards_f,
  control = nls.lm.control(maxiter = 1024),
  lower = c(r=0,P=0,alpha=0,K=0), 
  upper = c(r=100,P=1,alpha=Inf,K=pop_country)
)

The estimated parameters:

nls_estimates <- out$par
nls_estimates

$r
[1] 26.29748

$P
[1] 0.2884883

$alpha
[1] 0.1584229

$K
[1] 41130281

Those can be used to compute the fitted values:

fit <- generalized_richards_f(nls_estimates, dat$C_m)
head(fit, 100)

  [1]    0.00000    0.00000    0.00000    0.00000    0.00000    0.00000
  [7]    0.00000    0.00000    0.00000    0.00000    0.00000    0.00000
 [13]    0.00000    0.00000    0.00000    0.00000    0.00000    0.00000
 [19]    0.00000    0.00000    0.00000    0.00000    0.00000    0.00000
 [25]    0.00000    0.00000    0.00000    0.00000    0.00000    0.00000
 [31]    0.00000   14.45817   17.67269   19.90339   21.67474   23.17191
 [37]   24.48413   27.73068   31.13966   38.11410   43.47919   49.94947
 [43]   55.53948   61.19269   66.11604   70.21619   72.32788   75.41277
 [49]   77.66918   82.53435   89.29164   96.86893  105.42271  116.50872
 [55]  128.94718  141.17733  160.05001  187.35624  217.76008  252.08860
 [61]  289.64705  334.94328  385.74821  435.30017  483.60924  538.33976
 [67]  607.68943  690.68949  784.32650  898.88698 1018.06442 1138.50904
 [73] 1262.97097 1374.20694 1481.20744 1581.37989 1668.16490 1752.49283
 [79] 1839.66129 1921.75711 2009.92753 2095.97140 2183.99208 2279.97786
 [85] 2383.47793 2487.77367 2594.90935 2702.84616 2812.02437 2919.53768
 [91] 3022.63108 3127.56261 3236.94691 3352.95833 3474.57532 3610.64030
 [97] 3755.48371 3904.31661 4054.82115 4212.93494

We can compute the Root Mean Square Error:

(err_nls <- sqrt(sum((dat$C - fit)^2)))

[1] 4997.746

2.3.2 For all Countries

Let us wrap the code used for a single country in a function.

estimate_country <- function(i) {
  country <- names_countries[i]
  
  pop_country <- population |> 
    filter(country == !!country) |> 
    pull(pop)
  
  # Filtering the Oxford data to keep only those of the current country
  dat <- 
    covid_cases |> 
    filter(country %in% !!country) |> 
    filter(rm_NA_headtail(value))
  
  if(nrow(dat) == 0){
    return(NULL)
  }
  
  dat <- dat |> 
    mutate(C = lead(value), C_m = value) |> 
    filter(rm_NA_headtail(C))
  
  # starting param values
  start <- list(
    r = 1,
    P  = .5,
    alpha = .5,
    K = .6*pop_country
  )
  
  # non linear least square estimation
  out <- try(
    nls.lm(
      par = start, 
      fn = loss_function,
      y = dat$C,
      x = dat$C_m,
      fun = generalized_richards_f,
      control = nls.lm.control(maxiter = 1024),
      lower = c(r=0, P=0, alpha=0, K=0), 
      upper = c(r=100, P=1, alpha=Inf, K=pop_country)
    )
  )
  
  if (!inherits(out, "try-error")) {
    # fitted values
    fit <- generalized_richards_f(out$par, dat$C_m)
    
    # extract estimated model parameters
    nls_estimates <- out$par
    
    # RMSE
    err_nls = sqrt(sum((dat$C - fit)^2))
  } else {
    fit <- nls_estimates <- err_nls <- NULL
  }
  
  resul_c <- 
    list(
      nls = list(
        estimates = nls_estimates,
        fit = fit,
        rmse = err_nls
      ),
      dat = dat
    )
  
  resul_c
}# End of estimate_country()

Then, let us loop over the countries. As each run is independent from the others, we can perform the computation in parallel. Please note that in this notebook, we will only load the results obtained using the same codes and not actually run those here.

library(parallel)
ncl <- detectCores()-1
(cl <- makeCluster(ncl))

Some packages need to be run in each cluster:

clusterEvalQ(cl, {
    library(dplyr)
    library(stringr)
  library(minpack.lm)
  }) |> 
    invisible()

Some data and functions also need to be sent to the clusters:

clusterExport(cl, c("names_countries", "population", "covid_cases", "rm_NA_headtail"))
clusterExport(cl, c("generalized_richards_f", "loss_function"))

Then we can loop over the index of names_countries, using pblapply() from {pbapply}:

resul_fit_countries <- pbapply::pblapply(
  1:length(names_countries), estimate_country, cl = cl
)
names(resul_fit_countries) <- names_countries
save(resul_fit_countries, file = "./estim/resul_fit_countries.rda")
stopCluster(cl)