1  Load Data

This chapter provides the codes used to obtain the final dataset used in the article. It contains two sections. The first one concerns the Covid-19 epidemic, the second one is devoted to the 2009-2010 H1N1 epidemic.

The {tidyverse} package is needed, as well as {lubridate}:

library(tidyverse)
── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ dplyr     1.1.3     ✔ readr     2.1.4
✔ forcats   1.0.0     ✔ stringr   1.5.0
✔ ggplot2   3.4.4     ✔ tibble    3.2.1
✔ lubridate 1.9.3     ✔ tidyr     1.3.0
✔ purrr     1.0.2     
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(lubridate)

For graphical output, we can define a theme:

#' Theme for ggplot2
#'
#' @param ... arguments passed to the theme function
#' @export
#' @importFrom ggplot2 element_rect element_text element_blank element_line unit
#'   rel
theme_paper <- function (...) {
  theme(
    text = element_text(family = "Times"),
    plot.background = element_rect(fill = "transparent", color = NA),
    panel.background = element_rect(fill = "transparent", color = NA),
    panel.border = element_rect(fill = NA, colour = "grey50", linewidth = 1),
    axis.text = element_text(),
    legend.text = element_text(size = rel(1.1)),
    legend.title = element_text(size = rel(1.1)),
    legend.background = element_rect(fill = "transparent", color = NULL),
    legend.position = "bottom",
    legend.direction = "horizontal",
    legend.box = "vertical",
    legend.key = element_blank(),
    panel.spacing = unit(1, "lines"),
    panel.grid.major = element_line(colour = "grey90"),
    panel.grid.minor = element_blank(),
    plot.title = element_text(hjust = 0, size = rel(1.3), face = "bold"),
    plot.title.position = "plot",
    plot.margin = unit(c(1, 1, 1, 1), "lines"),
    strip.background = element_rect(fill = NA, colour = NA),
    strip.text = element_text(size = rel(1.1))
  )
}

1.1 Covid-19

This section is devoted to Covid-19 data. The first part presents the data relative to the number of cases while the second part shows the socio-economic data associated to each country during the period of the epidemic.

1.1.1 Epidemic Data

The epidemic data come from the COVID-19 Data Repository by the Center for Systems Science and Engineering (CSSE) at Johns Hopkins University (GitHub).

To handle dates properly, some adjustments need to be done:

# source("themes.R")
if (.Platform$OS.type == "unix") {
  Sys.setlocale("LC_ALL", "en_US.UTF-8")
} else {
  Sys.setlocale("LC_ALL", "English_United States")
}
[1] "en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8"

Let us define the following function (as suggested by markus on stackoverflow) to remove trailing NAs:

rm_NA_headtail <- function(x) {
  cumsum(!is.na(x)) != 0 & rev(cumsum(!is.na(rev(x)))) != 0
}

We can create a table, list_countries, with two columns: one that contains the name of the countries, and another one that gives the group the corresponding country belongs to.

mena <- c(
  "Algeria", "Bahrain", "Egypt", "Iran", "Iraq", "Israel", "Jordan", "Kuwait",
  "Lebanon", "Morocco", "Oman", "Qatar", "Saudi Arabia", "Syria", "Tunisia",
  "Yemen"
)
ssa <- c(
  "Angola", "Benin", "Burkina Faso", "Burundi","Cape Verde",
  "Cameroon", "Central African Republic", "Chad", 
  "Democratic Republic of Congo", "Congo", "Cote d'Ivoire", "Djibouti", 
  "Ethiopia", "Gabon", "Gambia", "Ghana", "Guinea", "Kenya", 
  "Lesotho", "Liberia", "Madagascar", "Malawi", "Mali", "Mauritania", 
  "Mauritius", "Mozambique", "Namibia", "Niger", "Nigeria", "Rwanda", 
  "Senegal", "Seychelles", "Sierra Leone", "Somalia", "South Africa", 
  "South Sudan", "Sudan", "Eswatini", "Tanzania", "Togo", "Uganda", 
  "Zimbabwe"
)
latin_america <- c(
  "Argentina", "Bolivia", "Brazil", "Chile", "Colombia", "Ecuador",
  "El Salvador", "Guatemala", "Haiti", "Honduras", "Jamaica", "Mexico",
  "Nicaragua", "Panama", "Paraguay", "Peru", "Uruguay", "Venezuela"
)
asia <- c(
  "Afghanistan", "Bangladesh", "Bhutan", "Brunei", "Cambodia", "India", 
  "Indonesia", "Laos", "Malaysia", "Mongolia", "Myanmar", "Nepal", "Pakistan", 
  "Philippines", "Russia", "Solomon Islands", "South Korea", "Sri Lanka",
  "Thailand", "Timor-Leste", "Vanuatu", "Vietnam", "China"
)
indus <- c(
  "Australia", "Austria", "Belgium", "Canada", "Denmark", "France", "Germany",
  "Greece", "Ireland", "Italy", "Japan", "Netherlands", "Spain", "Sweden", 
  "United Kingdom", "United States"
)

list_countries <- 
  tibble(country = mena, group = "MENA") |> 
  bind_rows(tibble(country = ssa, group = "Sub-Saharan Africa")) |> 
  bind_rows(tibble(country = latin_america, group = "Latin America")) |> 
  bind_rows(tibble(country = asia, group = "Asia")) |> 
  bind_rows(tibble(country = indus, group = "Industrialized"))

list_countries
# A tibble: 115 × 2
   country group
   <chr>   <chr>
 1 Algeria MENA 
 2 Bahrain MENA 
 3 Egypt   MENA 
 4 Iran    MENA 
 5 Iraq    MENA 
 6 Israel  MENA 
 7 Jordan  MENA 
 8 Kuwait  MENA 
 9 Lebanon MENA 
10 Morocco MENA 
# ℹ 105 more rows
save(list_countries, file = "./data/output/list_countries.rda")

We can also create a variable with the name of all countries:

names_countries <- list_countries$country |> unique()

The data can be obtained directly from GitHub:

# Run once
df_oxford <- str_c(
  "https://raw.githubusercontent.com/OxCGRT/covid-policy-tracker/master/data/",
  "OxCGRT_latest.csv"
) |>
  read.csv()
# Save the raw data
save(df_oxford, file = "./data/raw/df_oxford.rda")

To avoid downloading the data each time we run the program, we can save them and only load the previously downloaded data as follows:

load("./data/raw/df_oxford.rda")

We want to focus only on data at the country level: RegionName must be equal to the empty string.

confirmed_df <- 
  df_oxford |> 
  filter(RegionName == "") |> 
  select(
    country = CountryName, country_code = CountryCode,
    date = Date,
    value = ConfirmedCases, stringency_index = StringencyIndex) |> 
  filter(country %in% names_countries) |> 
  as_tibble() |> 
  mutate(
    date = ymd(date),
    days_since_2020_01_22 = lubridate::interval(
      lubridate::ymd("2020-01-22"), date
    ) / lubridate::ddays(1)
    )

The cumulative number of cases is contained in value. There are some bizarre values corresponding to statistical adjustments. We can smooth the series of the number of cases using a 7 days rolling windows. To do so, we compute the first difference of the cumulative number of cases to get daily cases, then smooth the data. Lastly, we compute the cumulative number of cases from the smoothed values.

confirmed_df <- 
  confirmed_df |> 
  group_by(country, country_code) |> 
  mutate(c = value - lag(value)) |> 
  mutate(c_7d = zoo::rollmean(c, k = 7, fill = NA)) |> 
  mutate(c_7d = ifelse(c_7d < 0, yes = 0, no = c_7d)) |> 
  group_by(country, country_code) |> 
  mutate(cumul = ifelse(is.na(c_7d), yes = 0, no = c_7d),
         cumul = cumsum(cumul),
         cumul = ifelse(is.na(value), yes = NA, no = cumul)) |> 
  ungroup()

Then, we replace cumulative number of confirmed cases with the value obtained using a moving average:

confirmed_df <- 
  confirmed_df |> 
  mutate(value = cumul)

The trailing NA values can be removed as follows:

confirmed_df <- 
  confirmed_df |> 
  group_by(country) |> 
  filter(rm_NA_headtail(value)) |> 
  ungroup()

The series can be plotted. For Mena countries:

split_chunks <- function(x,n) {
  split(x, cut(seq_along(x), n, labels = FALSE))
}

g <- split_chunks(1:length(mena), n = 2)

for (i in 1:length(g)) {
  p <- 
    ggplot(
      data = confirmed_df |> 
        filter(country %in% mena[g[[i]]]), 
      mapping = aes(x = date, y = value, colour = country)
    ) +
    geom_line() +
    scale_colour_discrete(NULL) +
    scale_x_date(
      breaks = ymd(pretty_dates(confirmed_df$date, n = 5)), 
      date_labels = "%Y-%m"
    ) +
    labs(
      x = NULL, y = NULL, 
      title = str_c(
        "Cumulative Number of Covid-19 cases [MENA (", i, "/", length(g), ")]"
      ),
      subtitle = "Source: JHU"
    ) +
    theme_paper()
  print(p)
}

For Sub-Saharan countries:

g <- split_chunks(1:length(ssa), n = 5)

for (i in 1:length(g)) {
  p <- ggplot(
    data = confirmed_df |> filter(country %in% ssa[g[[i]]]),
    mapping = aes(x = date, y = value, colour = country)
  ) +
    geom_line() +
    scale_colour_discrete(NULL) +
    scale_x_date(
      breaks = ymd(pretty_dates(confirmed_df$date, n = 5)), 
      date_labels = "%Y-%m"
    ) +
    labs(
      x = NULL, y = NULL, 
      title = str_c(
        "Cumulative Number of Covid-19 cases [Sub-Saharan Africa (",  i, "/", 
        length(g), ")]"
      ),
      subtitle = "Source: JHU") +
    theme_paper()
  print(p)
}

For Latin America:

g <- split_chunks(1:length(latin_america), n = 2)

for (i in 1:length(g)) {
  p <- ggplot(
    data = confirmed_df |> filter(country %in% latin_america[g[[i]]]),
    mapping = aes(x = date, y = value, colour = country)
  ) +
    geom_line() +
    scale_colour_discrete(NULL) +
    scale_x_date(breaks = ymd(pretty_dates(confirmed_df$date, n = 5)), 
                 date_labels = "%Y-%m") +
    labs(
      x = NULL, y = NULL, 
      title = str_c(
        "Cumulative Number of Covid-19 cases [Latin America (", 
        i, "/", length(g), ")]"
      ),
      subtitle = "Source: JHU"
    ) +
    theme_paper()
  print(p)
}

For Asia:

g <- split_chunks(1:length(asia), n = 2)

for (i in 1:length(g)) {
  p <- ggplot(
      data = confirmed_df |> filter(country %in% asia[g[[i]]]),
      mapping = aes(x = date, y = value, colour = country)
  ) +
    geom_line() +
    scale_colour_discrete(NULL) +
    scale_x_date(
      breaks = ymd(pretty_dates(confirmed_df$date, n = 5)), 
      date_labels = "%Y-%m"
    ) +
    labs(
      x = NULL, y = NULL, 
      title = str_c(
        "Cumulative Number of Covid-19 cases [Asia (", i, "/", length(g), ")]"
      ),
      subtitle = "Source: JHU") +
    theme_paper()
  print(p)
}

And for industrialized countries:

g <- split_chunks(1:length(indus), n = 2)

for (i in 1:length(g)) {
  p <- ggplot(
    data = confirmed_df |> filter(country %in% indus[g[[i]]]),
    mapping = aes(x = date, y = value, colour = country)
  ) +
    geom_line() +
    scale_colour_discrete(NULL) +
    scale_x_date(
      breaks = ymd(pretty_dates(confirmed_df$date, n = 5)), 
      date_labels = "%Y-%m"
    ) +
    labs(
      x = NULL, y = NULL, 
      title = str_c(
        "Cumulative Number of Covid-19 cases [Industrialized countries (", 
        i, "/", length(g), ")]"
      ),
      subtitle = "Source: JHU"
    ) +
    theme_paper()
  print(p)
}

Let us save the dataset:

covid_cases <- confirmed_df |> 
  select(country, date, country_code, value, stringency_index)
save(covid_cases, file = "./data/output//covid_cases.rda")

1.1.2 Socio-Economic Data

The socio-economic data come from five sources:

1.1.2.1 Creation of List of Variables to Keep

The variables that are kept in the study are divided in 6 categories, 5 of which that concern the WDI database. The 6th (Policy variables and governance) is extracted from the Oxford COVID-19Government Response Tracker.

# S1: Healthcare infrastructure
variables_to_keep_S1 <- c(
  "Physicians (per 1,000 people)",
  "Hospital beds (per 1,000 people)",
  "Nurses and midwives (per 1,000 people)",
  "Domestic general government health expenditure (% of general government expenditure)",
  "Domestic private health expenditure per capita (current US$)",
  "Domestic private health expenditure per capita, PPP (current international $)"
)

# S2: Vulnerability to comorbidities
variables_to_keep_S2 <- c(
  "Incidence of malaria (per 1,000 population at risk)",
  "Incidence of HIV, all (per 1,000 uninfected population)",
  "Incidence of tuberculosis (per 100,000 people)",
  "Diabetes prevalence (% of population ages 20 to 79)",
  "Mortality from CVD, cancer, diabetes or CRD between exact ages 30 and 70 (%)"
)

# S3: Vulnerability to natural environment
variables_to_keep_S3 <- c(
  "Mortality rate attributed to household and ambient air pollution, age-standardized (per 100,000 population)",
  "PM2.5 air pollution, mean annual exposure (micrograms per cubic meter)",
  "PM2.5 pollution, population exposed to levels exceeding WHO Interim Target-1 value (% of total)",
  "PM2.5 pollution, population exposed to levels exceeding WHO Interim Target-2 value (% of total)",
  "PM2.5 pollution, population exposed to levels exceeding WHO Interim Target-3 value (% of total)",
  "Air transport, passengers carried",
  "International tourism, number of arrivals",
  "Ecological Footprint"
)

# S4: Living conditions
variables_to_keep_S4 <- c(
  # "Multidimensional poverty index (scale 0-1)",# No data...
  "People using at least basic drinking water services (% of population)",
  "People using at least basic sanitation services (% of population)",
  "Prevalence of undernourishment (% of population)",
  "Prevalence of anemia among women of reproductive age (% of women ages 15-49)",
  "Poverty headcount ratio at $3.65 a day (2017 PPP) (% of population)",
  "Poverty headcount ratio at $6.85 a day (2017 PPP) (% of population)",
  "Poverty headcount ratio at national poverty lines (% of population)",
  "GDP per capita (current US$)",
  "GDP per capita, PPP (current international $)",
  "Urban population", # Will not be kept in PCA
  "Urban land area (sq. km)", # Will not be kept in PCA
  "Urban density"
)

#S5: Economic and societal characteristics
variables_to_keep_S5 <- c(
  "Population, total",
  "GDP per capita growth (annual %)",
  "International migrant stock (% of population)",
  "Population ages 65 and above (% of total population)",
  "Individuals using the Internet (% of population)",
  "Mobile cellular subscriptions (per 100 people)",
  "Gini index (World Bank estimate)", # Will no be kept in PCA
  "Gini index (CIA estimate)",
  "Shadow size Economy"
)

# c("Shadow size Economy", "Ecological Footprint", "Gini index (CIA estimate)")


#S6: Policy variables and governance
variables_to_keep_S6 <- c(
  "GovernmentResponseIndex",
  "ContainmentHealthIndex", 
  "StringencyIndex",
  "EconomicSupportIndex"
)

variables_to_keep <- c(
  variables_to_keep_S1, variables_to_keep_S2, variables_to_keep_S3,
  variables_to_keep_S4, variables_to_keep_S5, variables_to_keep_S6
)

For convenience, a table that contains the list of variable and their type can be created:

variables_to_keep_df <- 
  tibble(variable = variables_to_keep_S1, type = "S1") |> 
  bind_rows(tibble(variable = variables_to_keep_S2, type = "S2")) |> 
  bind_rows(tibble(variable = variables_to_keep_S3, type = "S3")) |> 
  bind_rows(tibble(variable = variables_to_keep_S4, type = "S4")) |> 
  bind_rows(tibble(variable = variables_to_keep_S5, type = "S5")) |> 
  bind_rows(tibble(variable = variables_to_keep_S6, type = "S6"))
variables_to_keep_df
# A tibble: 44 × 2
   variable                                                                type 
   <chr>                                                                   <chr>
 1 Physicians (per 1,000 people)                                           S1   
 2 Hospital beds (per 1,000 people)                                        S1   
 3 Nurses and midwives (per 1,000 people)                                  S1   
 4 Domestic general government health expenditure (% of general governmen… S1   
 5 Domestic private health expenditure per capita (current US$)            S1   
 6 Domestic private health expenditure per capita, PPP (current internati… S1   
 7 Incidence of malaria (per 1,000 population at risk)                     S2   
 8 Incidence of HIV, all (per 1,000 uninfected population)                 S2   
 9 Incidence of tuberculosis (per 100,000 people)                          S2   
10 Diabetes prevalence (% of population ages 20 to 79)                     S2   
# ℹ 34 more rows

1.1.2.2 WDI data

The study relies on socio-economic data. Most variables are extracted from the World Development Indicators (WDI) from the World Bank.

We downloaded a bulk CSV file version of the database. The data can then easily be loaded into R:

wdi_df <- read_csv("./data/raw/WDI_csv.zip")
Multiple files in zip: reading 'WDIData.csv'
New names:
Rows: 392882 Columns: 68
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr  (4): Country Name, Country Code, Indicator Name, Indicator Code
dbl (63): 1960, 1961, 1962, 1963, 1964, 1965, 1966, 1967, 1968, 1969, 1970, ...
lgl  (1): ...68

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

Since the last column contains only NA values, it should be removed:

last_col_name <- last(colnames(wdi_df))
wdi_df <- 
  wdi_df |> select(-!!last_col_name)

Let us extract the variable names:

variable_names <- wdi_df$`Indicator Name` |> 
  unique() |> 
  sort()
head(variable_names)
[1] "Access to clean fuels and technologies for cooking (% of population)"             
[2] "Access to clean fuels and technologies for cooking, rural (% of rural population)"
[3] "Access to clean fuels and technologies for cooking, urban (% of urban population)"
[4] "Access to electricity (% of population)"                                          
[5] "Access to electricity, rural (% of rural population)"                             
[6] "Access to electricity, urban (% of urban population)"                             

We have selected a few variables from the database. To be able to correctly identify the names of the variables we are interested in, we used the following piece of code (here is an example to identify variables that contain the word ‘poverty’):

variable_names[str_detect(variable_names, regex("poverty", ignore_case = TRUE))]

Let us rename some countries in the WDI dataset so that their names match those that can be found in the Oxford dataset.

wdi_df <- 
  wdi_df |> 
  mutate(
    `Country Name` = recode(
      `Country Name`,
      # old = new
      "Brunei Darussalam" = "Brunei",
      "Cabo Verde" = "Cape Verde",
      "Congo, Dem. Rep." = "Democratic Republic of Congo",
      "Congo, Rep." = "Congo",
      "Egypt, Arab Rep." = "Egypt",
      "Gambia, The" = "Gambia",
      "Hong Kong SAR, China" = "Hong Kong",
      "Iran, Islamic Rep." = "Iran",
      "Korea, Rep." = "South Korea",
      "Lao PDR" = "Laos",
      "Russian Federation" = "Russia",
      "Syrian Arab Republic"  = "Syria",
      "Venezuela, RB" = "Venezuela",
      "Yemen, Rep." = "Yemen"
    )
  )

1.1.2.3 Ecological Footprint

We extract the ecological footprint data from the Global Footprint Network National Footprint and Biocapacity Accounts. It measures, in gobal hectares, ‘how much area of biologically productive land and water an individual, population, or activity requires to produce all the resources it consumes and to absorb the waste it generates, using prevailing technology and resource management practices.’

To get the data, we made some queries to the API, after creating an account (for free). This notebook will present the way to get the data, but to avoid making unecessary requests to the API, the data are loaded from a previous call to the API.

library(rvest)
library(httr)

The username and API key need to be sent at each call.

username <- "replace_with_your_username"
key <- "replace_with_your_key"

We created a simple function, as explained in the R vignette of {httr}, to make a call to the API:

#' Request made to footprintnetwork.org API
#' 
#' @param url endpoint
#' url <- "http://api.footprintnetwork.org/v1/types"
request_api <- function(url) {
  resp <- GET(url, authenticate(username, key))
  
  if (http_type(resp) != "application/json") {
    stop("API did not return json", call. = FALSE)
  }
  
  jsonlite::fromJSON(content(resp, "text"), simplifyVector = TRUE)
}# End of request_api()

We defined a function that prepares the URL depending on the country code, the year and the type of data:

#' Fetch data from footprintnetwork.org API
#' @param country_code code of the country (e.g., 68 for France, or all for all countries)
#' @param year year
#' @param data_type Record type taken from types list or 'all' (e.g., pop)
get_data_country <- function(country_code, year, data_type) {
  url <- str_c(
    "http://api.footprintnetwork.org/v1/data/", 
    country_code, "/", year, "/", data_type
  )
  request_api(url)
}

Then, we make a simple call to the API to list of variables that can be asked:

types_footprint <- request_api("http://api.footprintnetwork.org/v1/types")
save(types_footprint, file = "./data/raw/footprint/types_footprint.rda")
load("./data/raw/footprint/types_footprint.rda")
if (knitr::is_html_output()) {
 types_footprint |> 
  DT::datatable(options = list(
    searching = TRUE,
    pageLength = 10,
    lengthMenu = c(5, 10, 15, 20)
  )) 
}

Another call is made to get the list of countries:

list_countries_footprint <- request_api("http://api.footprintnetwork.org/v1/countries")
save(
  list_countries_footprint,
  file = "./data/raw/footprint/list_countries_footprint.rda"
)
load("./data/raw/footprint/list_countries_footprint.rda")
if (knitr::is_html_output()) {
  list_countries_footprint |> 
  DT::datatable(options = list(
    searching = TRUE,
    pageLength = 10,
    lengthMenu = c(5, 10, 15, 20)
  ))
}

Then, we get the data from 2009, 2010 (for later use with the Influenza H1N1 epidemic), as well as the 2017 data (latest available).

footprint_2009 <- get_data_country(
  country_code = "all", year = 2009, data_type = "EFCpc"
)
footprint_2010 <- get_data_country(
  country_code = "all", year = 2010, data_type = "EFCpc"
)
footprint_2009_2010 <- 
  footprint_2009 |> 
  bind_rows(footprint_2010) |> 
  tibble()

footprint_2017 <- get_data_country(
  country_code = "all", year = 2017, data_type = "EFCpc"
)
footprint_2017 <- tibble(footprint_2017)

save(footprint_2009_2010, file = "./data/raw/footprint/footprint_2009_2010.rda")
save(footprint_2017, file = "./data/raw/footprint/footprint_2017.rda")
load("./data/raw/footprint/footprint_2017.rda")
if (knitr::is_html_output()) {
  footprint_2017 |> 
    DT::datatable(options = list(
      searching = TRUE,
      pageLength = 10,
      lengthMenu = c(5, 10, 15, 20)
    ))
}

Let us only keep the following variables:

df_footprint <-
  footprint_2017 |>
  select(countryName, year, value)

Some countries need to be renamed to match those of the Oxford dataset:

df_footprint <- 
  df_footprint |> 
  mutate(
    country = countryName |> 
      recode(
        "Brunei Darussalam" = "Brunei",
        "Côte d'Ivoire" = "Cote d'Ivoire",
        "Congo, Democratic Republic of" = "Democratic Republic of Congo",
        "Cabo Verde" = "Cape Verde",
        "Iran, Islamic Republic of" = "Iran",
        "Korea, Republic of" = "South Korea",
        "Lao People's Democratic Republic" = "Laos",
        "Russian Federation" = "Russia",
        # "" = "South Sudan",# Missing
        # "" = "Seychelles",#Missing
        "Syrian Arab Republic" = "Syria",
        "Tanzania, United Republic of" = "Tanzania",
        # "" = "Uruguay",# Missing
        "United States of America" = "United States",
        "Venezuela, Bolivarian Republic of" = "Venezuela",
        "Viet Nam" = "Vietnam"
        # "" = "Vanuatu"# Missing
      )
  )

1.1.2.4 Size of the Shadow Economy

We rely on the estimates of the size of the Shadow Economy reported in Medina and Schneider (2019). The data reported in table A.1 from the Appendix of their paper have been manually extracted and saved in a CSV file.

df_shadow_economy <- read_csv2("./data/raw/shadow_economy.csv", skip = 3)
ℹ Using "','" as decimal and "'.'" as grouping mark. Use `read_delim()` for more control.
Rows: 157 Columns: 2
── Column specification ────────────────────────────────────────────────────────
Delimiter: ";"
chr (1): ISO
num (1): shadow_size

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
if (knitr::is_html_output()) {
  df_shadow_economy |> 
  DT::datatable(options = list(
    searching = TRUE,
    pageLength = 10,
    lengthMenu = c(5, 10, 15, 20)
  ))
}

There will be some missing values:

unique(confirmed_df$country_code)[!unique(confirmed_df$country_code) %in% df_shadow_economy$ISO]
 [1] "AFG" "DJI" "IRQ" "PAN" "SDN" "SOM" "SSD" "SYC" "TLS" "VUT"

1.1.2.5 Gini Index

The WDI dataset does not give estimates for the Gini Index for many countries. We turn to data from the World Factbook, by the CIA.

df_cia <- 
  read_csv("./data/raw/gini_index_cia.csv") |> 
  select(name, date_of_information, value)
Rows: 173 Columns: 6
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr (4): name, slug, date_of_information, region
dbl (2): value, ranking

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

Some countries need to be renamed to match those of the Oxford dataset:

df_cia <- df_cia |> 
  mutate(
    country = name |> 
      recode(
        # "" = "Bahrain",#Missing
        # "" = "Brunei",#Missing
        "Congo, Democratic Republic of the" = "Democratic Republic of Congo",
        "Congo, Republic of the" = "Congo",
        "Cabo Verde" = "Cape Verde",
        # "" = "Eritrea",#Missing
        "Gambia, The" = "Gambia",
        "Korea, South" = "South Korea"
        # "" = "Kuwait",#Missing
        # "" = "Myanmar",# Mssing
        # "" = "Oman",#Missin
        # "" = "Somalia",#Missing
        # "" = "Syria"# Missing
      )
  )
if (knitr::is_html_output()) {
  df_cia |> 
    DT::datatable(
      options = list(
        searching = TRUE,
        pageLength = 10,
        lengthMenu = c(5, 10, 15, 20)
      )
    )
}

1.1.2.6 Policy Variables

We extract policy variables from Hale et al. (2021).

df_oxford <- read_csv("./data/raw/OxCGRT_latest.csv") |> 
  select(country = CountryName,`Date`,all_of(variables_to_keep_S6))
Rows: 177528 Columns: 51
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr  (5): CountryName, CountryCode, RegionName, RegionCode, Jurisdiction
dbl (45): Date, C1_School closing, C1_Flag, C2_Workplace closing, C2_Flag, C...
lgl  (1): M1_Wildcard

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
df_oxford
# A tibble: 177,528 × 6
   country    Date GovernmentResponseIn…¹ ContainmentHealthIndex StringencyIndex
   <chr>     <dbl>                  <dbl>                  <dbl>           <dbl>
 1 Aruba    2.02e7                      0                      0               0
 2 Aruba    2.02e7                      0                      0               0
 3 Aruba    2.02e7                      0                      0               0
 4 Aruba    2.02e7                      0                      0               0
 5 Aruba    2.02e7                      0                      0               0
 6 Aruba    2.02e7                      0                      0               0
 7 Aruba    2.02e7                      0                      0               0
 8 Aruba    2.02e7                      0                      0               0
 9 Aruba    2.02e7                      0                      0               0
10 Aruba    2.02e7                      0                      0               0
# ℹ 177,518 more rows
# ℹ abbreviated name: ¹​GovernmentResponseIndex
# ℹ 1 more variable: EconomicSupportIndex <dbl>

We filter the data to keep the period from January 2020 to January 2021 and compute an average of the variables of interes (variables_to_keep_S6) over the period:

# Aggregation by year
df_oxford_year <- 
  df_oxford |> 
  mutate(Date = ymd(Date)) |> 
  filter(Date >= "2020-01-01" & Date <"2021-01-01") |>
  pivot_longer(cols = variables_to_keep_S6, names_to = "Indicator Name") |>
  group_by(country, `Indicator Name`) |> 
  summarise(value = mean(value, na.rm = T)) |> 
  mutate(type = "2020") |> 
  pivot_wider(names_from = "Indicator Name", values_from = c("value","type")) |> 
  select(
    country,
    str_c(
      rep(c("value_","type_"), length(variables_to_keep_S6)), 
      rep(variables_to_keep_S6, each = 2)
    ),
  )
Warning: Using an external vector in selections was deprecated in tidyselect 1.1.0.
ℹ Please use `all_of()` or `any_of()` instead.
  # Was:
  data %>% select(variables_to_keep_S6)

  # Now:
  data %>% select(all_of(variables_to_keep_S6))

See <https://tidyselect.r-lib.org/reference/faq-external-vector.html>.
`summarise()` has grouped output by 'country'. You can override using the
`.groups` argument.

The resulting averages can be shown as follows:

if (knitr::is_html_output()) {
 df_oxford_year |> 
  DT::datatable(options = list(
    searching = TRUE,
    pageLength = 10,
    lengthMenu = c(5, 10, 15, 20)
  )) 
}

1.1.2.7 Creation of the Datasets

We create a function that allows us to create and export a database for each group of countries. This database contains the 2020 value for each variable. When the 2020 value was missing in the data, we took the average value over the last 5 years. For the remaining missing values, we took the 10 years average, or the 15 years average. When there was no data available in the selected years, we replaced the missing value by the average of the region.

#'@param countries group of countries
#'@param file export database
get_out_countries <- function(countries, file) {
  
  df_countries_wdi <- 
    wdi_df |> 
    # Keeping countries of interest
    filter(`Country Name` %in% !!countries) |> 
    # And variables of interest
    filter(`Indicator Name` %in% !!variables_to_keep) |> 
    pivot_longer(cols = `1960`:`2020`, names_to = "year") |> 
    select(ISO = `Country Code`, country = `Country Name`, variable = `Indicator Name`, year, value)
  
  df_countries_wdi <- 
    df_countries_wdi |> 
    mutate(year = as.numeric(year)) |> 
    filter(year >= 2005) |> 
    mutate(
      value_2020    = ifelse(year == 2020, yes = value, no = NA),
      value_last_5  = ifelse(year %in% seq(2015,2020), yes = value, no = NA),
      value_last_10 = ifelse(year %in% seq(2010,2020), yes = value, no = NA),
      value_last_15 = ifelse(year %in% seq(2005,2020), yes = value, no = NA)
    ) |> 
    pivot_longer(
      cols = value_2020:value_last_15, 
      names_to = "var", 
      values_to = "replacement_val"
    ) |> 
    group_by(ISO, country, variable, var) |> 
    summarise(
      # nb_val = sum(!is.na(replacement_val)),
      replacement_val = mean(replacement_val, na.rm = TRUE)
    ) |> 
    pivot_wider(names_from = var, values_from = replacement_val) |> 
    mutate(
      # If missing value: average of the 5 previous years
      type = ifelse(
        is.na(value_2020) | is.nan(value_2020), 
        yes = "last 5", no = "value 2020"
      ),
      value = ifelse(
        is.na(value_2020) | is.nan(value_2020), 
        yes = value_last_5, no = value_2020
      )
    ) |> 
    mutate(
      # If missing value: average of the 10 previous years
      type = ifelse(
        is.na(value) | is.nan(value), 
        yes = "last 10", no = type
      ),
      value = ifelse(
        is.na(value) | is.nan(value), 
        yes = value_last_10, no = value
      )
    ) |> 
    mutate(
      # If missing value: average of the 15 previous years
      type = ifelse(
        is.na(value) | is.nan(value), 
        yes = "last 15", no = type
      ),
      value = ifelse(
        is.na(value) | is.nan(value), 
        yes = value_last_15, no = value
      )
    ) |> 
    mutate(
      # If missing value: NA
      type = ifelse(is.na(value) | is.nan(value), yes = "flag", no = type),
      value = ifelse(is.na(value) | is.nan(value), yes = NA, no = value)
    ) |> 
    select(-value_2020, -value_last_5, -value_last_10, -value_last_15)
  
  # Malaria set to 0 when it is NA, in 2020
  df_countries_wdi <- 
    df_countries_wdi |> 
    mutate(
      value = ifelse(
        variable == "Incidence of malaria (per 1,000 population at risk)" & 
          is.na(value),
        yes = 0, no = value)
    )
  
  # Ecological Footprint
  df_countries_footprint <- 
    # corresp_iso |> 
    # filter(country %in% countries) |> 
    # left_join(df_footprint, by = c("ISO", "country")) |> 
    df_footprint |> 
    filter(country %in% !!countries) |> 
    mutate(variable = "Ecological Footprint") |> 
    mutate(type = str_c(year, " est.")) |> 
    select(-year, -countryName)
  
  # Gini
  df_countries_gini <- 
    # corresp_iso |> 
    # filter(country %in% countries) |> 
    # left_join(df_cia, by = c("ISO", "country")) |> 
    df_cia |> 
    filter(country %in% !!countries) |> 
    rename(type = date_of_information) |> 
    select(country, value, type) |> 
    mutate(variable = "Gini index (CIA estimate)")
  
  
  # Shadow Size of the epidemic
  df_shadow_size <- 
    confirmed_df |> 
    select(country, country_code) |> 
    unique() |> 
    filter(country %in% countries) |> 
    left_join(
      df_shadow_economy, by = c("country_code" = "ISO")
    ) |> 
    rename(value = shadow_size) |> 
    mutate(
      variable = "Shadow size Economy",
      type = "2015 est."
    ) |> 
    select(country, value, type, variable)
  
  
  df_countries <- 
    df_countries_wdi |> ungroup() |> select(-ISO) |> 
    bind_rows(df_countries_footprint) |> 
    bind_rows(df_countries_gini) |> 
    bind_rows(df_shadow_size) |> 
    ungroup()
  
  df_countries <- 
    df_countries |> 
    complete(country, variable)
  
  df_countries <- 
    df_countries |> 
    mutate(type = ifelse(is.na(type), yes = "flag", no = type))
  
  # Average within the zone
  group_average <- 
    df_countries |> 
    group_by(variable) |> 
    summarise(value_group = mean(value, na.rm=TRUE))
  
  
  # Replacing missing values with group average
  df_countries <- 
    df_countries |> 
    left_join(group_average, by = "variable") |> 
    mutate(
      # If missing value: group average
      type = ifelse(
        is.na(value) | is.nan(value), yes = "group average", no = type
      ),
      value = ifelse(
        is.na(value) | is.nan(value), yes = value_group, no = value
      )
    ) |> 
    select(-value_group)
  
  # Variables in columns
  df_countries <- 
    df_countries |> 
    pivot_wider(names_from = variable, values_from = c(type, value))
  # select(
  #   ISO, country, str_c(c("value_", "type_"), rep(variables_to_keep, each = 2))
  # )
  
  df_countries <- 
    df_countries |> 
    # Computing Urban density
    mutate(`value_Urban density` = `value_Urban population` / `value_Urban land area (sq. km)`,
           `type_Urban density` = `type_Urban population`)
  
  # Adding Oxford indicators
  df_countries_excel <- 
    df_countries |>
    left_join(df_oxford_year, by = "country")
  
  # Table with summary of missing data
  df_missing <- 
    df_countries_excel |> 
    select(country, starts_with("type_")) |> 
    pivot_longer(cols = -c(country), names_to = "variable") |> 
    group_by(variable, value) |> 
    summarise(n = n()) |> 
    group_by(variable) |> 
    mutate(freq = scales::percent(n / sum(n), accuracy = 0.1)) |> 
    select(-n) |> 
    pivot_wider(names_from = value, values_from = freq, values_fill = "") |> 
    mutate(variable = str_remove(variable, "^type_"),
           variable = factor(variable, levels = variables_to_keep)) |> 
    arrange(variable)
  
  # reordering the columns
  vn <- c("variable", "2020", "last 5", "last 10", "last 15", "group average")
  vn <- vn[vn %in% colnames(df_missing)]
  vn2 <- sort(colnames(df_missing)[!colnames(df_missing)%in%vn])
  
  df_missing <- 
    df_missing |> 
    select(!!vn, !!vn2)  
  
  write_excel_csv(df_countries_excel, file = file)
  
  list(df_countries_excel = df_countries_excel, df_missing = df_missing)
} # End of get_out_countries()
1.1.2.7.1 Mena

The data for MENA:

mena
 [1] "Algeria"      "Bahrain"      "Egypt"        "Iran"         "Iraq"        
 [6] "Israel"       "Jordan"       "Kuwait"       "Lebanon"      "Morocco"     
[11] "Oman"         "Qatar"        "Saudi Arabia" "Syria"        "Tunisia"     
[16] "Yemen"       
file <- "./data/output/covid/df_mena.csv"

res_mena <- get_out_countries(countries=mena, file=file)
`summarise()` has grouped output by 'ISO', 'country', 'variable'. You can
override using the `.groups` argument.
`summarise()` has grouped output by 'variable'. You can override using the
`.groups` argument.
df_mena_missing <- res_mena$df_missing
df_mena <- res_mena$df_countries_excel
if (knitr::is_html_output()) {
  df_mena_missing |> 
  DT::datatable(options = list(
    searching = TRUE,
    pageLength = 10,
    lengthMenu = c(5, 10, 15, 20)
  ))
}
1.1.2.7.2 Sub-Saharan Africa
ssa
 [1] "Angola"                       "Benin"                       
 [3] "Burkina Faso"                 "Burundi"                     
 [5] "Cape Verde"                   "Cameroon"                    
 [7] "Central African Republic"     "Chad"                        
 [9] "Democratic Republic of Congo" "Congo"                       
[11] "Cote d'Ivoire"                "Djibouti"                    
[13] "Ethiopia"                     "Gabon"                       
[15] "Gambia"                       "Ghana"                       
[17] "Guinea"                       "Kenya"                       
[19] "Lesotho"                      "Liberia"                     
[21] "Madagascar"                   "Malawi"                      
[23] "Mali"                         "Mauritania"                  
[25] "Mauritius"                    "Mozambique"                  
[27] "Namibia"                      "Niger"                       
[29] "Nigeria"                      "Rwanda"                      
[31] "Senegal"                      "Seychelles"                  
[33] "Sierra Leone"                 "Somalia"                     
[35] "South Africa"                 "South Sudan"                 
[37] "Sudan"                        "Eswatini"                    
[39] "Tanzania"                     "Togo"                        
[41] "Uganda"                       "Zimbabwe"                    
file <- "./data/output/covid/df_sub_saharian_africa.csv"

res_ssa <- get_out_countries(countries=ssa, file=file)
`summarise()` has grouped output by 'ISO', 'country', 'variable'. You can
override using the `.groups` argument.
`summarise()` has grouped output by 'variable'. You can override using the
`.groups` argument.
df_ssa_missing <- res_ssa$df_missing
df_ssa <- res_ssa$df_countries_excel
if (knitr::is_html_output()) {
  df_ssa_missing |> 
    DT::datatable(options = list(
      searching = TRUE,
      pageLength = 10,
      lengthMenu = c(5, 10, 15, 20)
    )) 
}
1.1.2.7.3 Latin America
latin_america
 [1] "Argentina"   "Bolivia"     "Brazil"      "Chile"       "Colombia"   
 [6] "Ecuador"     "El Salvador" "Guatemala"   "Haiti"       "Honduras"   
[11] "Jamaica"     "Mexico"      "Nicaragua"   "Panama"      "Paraguay"   
[16] "Peru"        "Uruguay"     "Venezuela"  
file <- "./data/output/covid/df_latin_america.csv"

res_la <- get_out_countries(countries=latin_america, file=file)
`summarise()` has grouped output by 'ISO', 'country', 'variable'. You can
override using the `.groups` argument.
`summarise()` has grouped output by 'variable'. You can override using the
`.groups` argument.
df_la_missing <- res_la$df_missing
df_la <- res_la$df_countries_excel
if (knitr::is_html_output()) {
  df_la_missing |> 
    DT::datatable(options = list(
      searching = TRUE,
      pageLength = 10,
      lengthMenu = c(5, 10, 15, 20)
    )) 
}
1.1.2.7.4 Asia
asia
 [1] "Afghanistan"     "Bangladesh"      "Bhutan"          "Brunei"         
 [5] "Cambodia"        "India"           "Indonesia"       "Laos"           
 [9] "Malaysia"        "Mongolia"        "Myanmar"         "Nepal"          
[13] "Pakistan"        "Philippines"     "Russia"          "Solomon Islands"
[17] "South Korea"     "Sri Lanka"       "Thailand"        "Timor-Leste"    
[21] "Vanuatu"         "Vietnam"         "China"          
file <- "./data/output/covid/df_asia.csv"

res_asia <- get_out_countries(countries=asia, file=file)
`summarise()` has grouped output by 'ISO', 'country', 'variable'. You can
override using the `.groups` argument.
`summarise()` has grouped output by 'variable'. You can override using the
`.groups` argument.
df_asia_missing <- res_asia$df_missing
df_asia <- res_asia$df_countries_excel
if (knitr::is_html_output()) {
  df_asia_missing |> 
    DT::datatable(options = list(
      searching = TRUE,
      pageLength = 10,
      lengthMenu = c(5, 10, 15, 20)
    ))
}
1.1.2.7.5 Industrialized Countries
indus
 [1] "Australia"      "Austria"        "Belgium"        "Canada"        
 [5] "Denmark"        "France"         "Germany"        "Greece"        
 [9] "Ireland"        "Italy"          "Japan"          "Netherlands"   
[13] "Spain"          "Sweden"         "United Kingdom" "United States" 
file <- "./data/output/df_industrialized_countries.csv"

res_indus <- get_out_countries(countries=indus, file=file)
`summarise()` has grouped output by 'ISO', 'country', 'variable'. You can
override using the `.groups` argument.
`summarise()` has grouped output by 'variable'. You can override using the
`.groups` argument.
df_indus_missing <- res_indus$df_missing
df_indus <- res_indus$df_countries_excel
if (knitr::is_html_output()) {
  df_indus_missing |> 
    DT::datatable(options = list(
      searching = TRUE,
      pageLength = 10,
      lengthMenu = c(5, 10, 15, 20)
    ))
}
1.1.2.7.6 All countries

Let us aggregate the results in a single table:

df_all_covid <- 
  df_mena |> mutate(group = "MENA") |> 
  bind_rows(df_ssa |> mutate(group = "SSA")) |> 
  bind_rows(df_la |> mutate(group = "Latin America")) |> 
  bind_rows(df_asia |> mutate(group = "Asia")) |> 
  bind_rows(df_indus |> mutate(group = "Industrialized"))

save(df_all_covid, file = "./data/output/covid/df_all_countries.rda")

1.1.3 Socio-Economic Data

1.2 Population Data

For the epidemiological models, we need data on the population size of the countries. Let us extract this information from the WDI data

population <-
  wdi_df |>
  filter(`Indicator Name` == "Population, total") |>
  select(country = `Country Name`, ISO = `Country Code`,`2015`:`2019`) |>
  pivot_longer(cols = -c(country, ISO)) |>
  group_by(country, ISO) |>
  summarise(pop = mean(value, na.rm=TRUE)) |>
  ungroup()
`summarise()` has grouped output by 'country'. You can override using the
`.groups` argument.
save(population, file = "./data/output/population.rda")
if (knitr::is_html_output()) {
  population |> 
    DT::datatable(options = list(
      searching = TRUE,
      pageLength = 10,
      lengthMenu = c(5, 10, 15, 20)
    ))  
}

1.3 Summary

File Description Source
./data/output/covid_cases.rda Daily cumulative number of Covid-19 cases JHU
./data/output/covid/df_all_countries.csv Socio-Economic variables for countries for the Covid-19 period World Development Indicators, Global Footprint Network National Footprint and Biocapacity Accounts, Medina and Schneider (2019), CIA, Hale et al. (2021)
./data/output/population.rda Population World Development Indicators