5  Merging the files

In this chapter, the formatted data from the previous chapters are aggregated to produce the data table used in the local projections.

library(tidyverse)
── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ dplyr     1.1.4     ✔ readr     2.1.4
✔ forcats   1.0.0     ✔ stringr   1.5.0
✔ ggplot2   3.4.4     ✔ tibble    3.2.1
✔ lubridate 1.9.3     ✔ tidyr     1.3.0
✔ purrr     1.0.2     
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(cli)

5.1 Load Intermediate Files

The weather data (Chapter 1) can be loaded:

load("../data/output/weather/weather_regions_df.rda")

The agricultural data (Chapter 2):

load("../data/output/minagri/dataset_agri_2001_2015.rda")

The macroeconomic data and commodity prices (Chapter 3):

load("../data/output/macro/df_macro.rda")
load("../data/output/macro/df_int_prices.rda")

The share of natural regions and the El Niño–Southern Oscillation (Chapter 4):

load("../data/output/natural_region_dep.rda")
load("../data/output/weather/ONI_temp.rda")

5.2 Merge the Files

We add ENSO data to the weather dataset:

Weather <- weather_regions_df |> 
  # Add ENSO data
  left_join(
    ONI_temp |> 
      mutate(
        Year = as.numeric(Year),
        month = as.numeric(month)
      ) |> 
      rename(
        enso_start = date_start,
        enso_end = date_end
      ),
    by = c(
      "year" = "Year",
      "month" = "month"
    )
  ) |> 
  group_by(IDDPTO, month, State) |> 
  mutate( 
    temp_min_dev_ENSO   = temp_min - mean(temp_min),
    temp_max_dev_ENSO   = temp_max - mean(temp_max),
    temp_mean_dev_ENSO  = temp_mean - mean(temp_mean),
    precip_sum_dev_ENSO = precip_sum - mean(precip_sum)
  ) |> 
  ungroup() |> 
  labelled::set_variable_labels(
    temp_min_dev_ENSO   = "Deviation of Min. Temperature from ENSO Normals",
    temp_max_dev_ENSO   = "Deviation of Max. Temperature from ENSO Normals",
    temp_mean_dev_ENSO  = "Deviation of Mean Temperature from ENSO Normals",
    precip_sum_dev_ENSO = "Deviation of Total Rainfall from ENSO Normals",
  )
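
As a quick illustration of what these new columns contain, here is a sanity check one could run (hypothetical, not part of the original script): within each (IDDPTO, month, State) cell, the deviations should average to numerically zero.

# Hypothetical check: deviations from the ENSO normals should average
# to ~0 within each (IDDPTO, month, State) group
Weather |> 
  group_by(IDDPTO, month, State) |> 
  summarise(mean_dev = mean(temp_mean_dev_ENSO), .groups = "drop") |> 
  summarise(max_abs_dev = max(abs(mean_dev)))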

Let us merge all these datasets into a single one:

data_total <- 
  data_total |> 
  # Add weather data and ENSO 
  left_join(
    Weather |> 
      dplyr::select(-IDDPTO),
    by = c(
      "year" = "year",
      "month" = "month",
      "region" = "DEPARTAMEN", 
      "date" = "date"
      )
  ) |> 
  # Add macroeconomic data
  left_join(
    df_macro |> rename(gdp = y),
    by = "date"
  ) |> 
  # Add commodity prices data
  left_join(
    int_prices,
    by =  c(
      "date", "product", "product_eng")
  ) |> 
  # Add share of each type of region
  left_join(
    natural_region_dep,
    by = "region"
  )
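
Since all these merges are left joins, the number of rows of data_total should be unchanged. A quick check one might add (hypothetical, not in the original script) is to verify that the keys still identify rows uniquely:

# Hypothetical check: each (region, product, date) combination should
# appear exactly once after the joins
data_total |> 
  count(region, product, date) |> 
  filter(n > 1)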

Here are the first rows of that tibble:

data_total
# A tibble: 21,960 × 67
   region_id region   product       date       ln_prices ln_produc  year month
       <int> <chr>    <chr>         <date>         <dbl>     <dbl> <dbl> <dbl>
 1         1 AMAZONAS MAÍZ AMILÁCEO 2001-01-01     0          0     2001     1
 2         1 AMAZONAS MAÍZ AMILÁCEO 2001-02-01     0.536      6.58  2001     2
 3         1 AMAZONAS MAÍZ AMILÁCEO 2001-03-01    NA          6.26  2001     3
 4         1 AMAZONAS MAÍZ AMILÁCEO 2001-04-01     0.560      6.26  2001     4
 5         1 AMAZONAS MAÍZ AMILÁCEO 2001-05-01     0.565      6.04  2001     5
 6         1 AMAZONAS MAÍZ AMILÁCEO 2001-06-01    NA          7.00  2001     6
 7         1 AMAZONAS MAÍZ AMILÁCEO 2001-07-01     0.531      7.00  2001     7
 8         1 AMAZONAS MAÍZ AMILÁCEO 2001-08-01     0.525      8.37  2001     8
 9         1 AMAZONAS MAÍZ AMILÁCEO 2001-09-01    NA          6.53  2001     9
10         1 AMAZONAS MAÍZ AMILÁCEO 2001-10-01     0.519      6.53  2001    10
# ℹ 21,950 more rows
# ℹ 59 more variables: Value_prod <dbl>, surf_m <dbl>, Value_surfR <dbl>,
#   Value_prices <dbl>, campaign <dbl>, campaign_plain <chr>,
#   month_campaign <dbl>, surf_lag_calend <dbl>, product_eng <chr>,
#   perc_product <dbl>, perc_product_mean <dbl>, diff_plant_harv <dbl>,
#   exposition <dbl>, exposition_trend <dbl>, exposition_detrended <dbl>,
#   exposition_norm <dbl>, temp_min <dbl>, temp_max <dbl>, temp_mean <dbl>, …

Some descriptive statistics are shown in Chapter 6.

Lastly, the dataset can be saved for later use.

save(data_total, file = "../data/output/dataset_2001_2015.rda")
write_csv(data_total, file = "../data/output/dataset_2001_2015.csv")

5.3 Dataset for the Local Projections

Now, let us create the dataset specifically used to estimate the models.

Let us make sure that the region data are encoded as a factor.

data_total <- 
  data_total |> 
  mutate(region_id = factor(region_id))

The crops we focus on:

crops <- c("Rice", "Dent corn", "Potato", "Cassava")

The number of observations with a null or negative production value, for each crop in each region:

data_total |> 
  group_by(product_eng, region_id) |> 
  summarise(n = sum(Value_prod <= 0)) |> 
  arrange(desc(n))
`summarise()` has grouped output by 'product_eng'. You can override using the
`.groups` argument.
# A tibble: 122 × 3
# Groups:   product_eng [6]
   product_eng     region_id     n
   <chr>           <fct>     <int>
 1 Rice            14          171
 2 Cassava         17          161
 3 Wheat           18          154
 4 Wheat           20          135
 5 Amylaceous corn 18          129
 6 Rice            20          128
 7 Wheat           10          125
 8 Dent corn       20          123
 9 Amylaceous corn 20          122
10 Wheat           7           115
# ℹ 112 more rows

5.3.1 Detrending of the agricultural production

This section outlines a two-step procedure for detrending agricultural production data at the regional level for a specific crop and month. The procedure involves handling missing values and then performing the detrending process.

  • Step 1: Handling Missing Values

    In the first step, we address missing values by linear interpolation. This approach helps us estimate the missing values by considering the neighboring data points.

    • Step 1.1: Imputing missing values with linear interpolation.

      Missing values are replaced by linear interpolation. However, runs of more than three consecutive missing values are not interpolated. Instead, the series for the specific crop in the given region is split at the remaining missing values, the segment with the largest number of consecutive non-missing values is retained, and the other segments are discarded.

    • Step 1.2: Dropping Series with Remaining Missing Values

      After imputing missing values using linear interpolation, we check whether any missing values remain in the dataset. If any remain for a particular series, we exclude that series from further analysis. By doing so, we ensure that the subsequent detrending is performed only on reliable and complete data.

  • Step 2: Detrending the data

    Once we have addressed the missing values, we proceed to the second step, which involves detrending the data. Detrending aims to remove the long-term trend or seasonality from the dataset, leaving behind the underlying fluctuations and patterns. To remove the trend from the data, we follow a three-step process.

    • Step 2.1: Demeaning

      First, we normalize each data point, denoted \(y_{c,i,m,t}^{demeaned}\), by dividing the raw value \(y_{c,i,m,t}^{raw}\) by its average over the entire time period \(T_c\), for the given crop \(c\), region \(i\), and calendar month \(m\): \[y_{c,i,m,t}^{demeaned}=\frac{y^{raw}_{c,i,m,t}}{\frac{1}{n_{T_c}}\sum_{t=1}^{T_c}y^{raw}_{c,i,m,t}}\] Here, \(n_{T_c}\) represents the total number of data points for the given crop, region, and month.

    • Step 2.2: Quadratic Trend Estimation

      Next, we estimate a quadratic trend using ordinary least squares (OLS) regression. We model the demeaned data points \(y_{c,i,m,t}^{demeaned}\) as a quadratic function of time \(t\), without an intercept (consistent with the estimation code below): \[y^{demeaned}_{c,i,m,t}=\beta_{c,i,m} t + \gamma_{c,i,m} t^{2} + \varepsilon_{c,i,m,t}\] In this equation, \(\varepsilon_{c,i,m,t}\) represents the error term, assumed to follow a normal distribution.

    • Step 2.3: Detrending

      Once we have estimated the coefficients \(\beta_{c,i,m}\) and \(\gamma_{c,i,m}\) through OLS regression, we can remove the quadratic trend from the data. This is done by calculating the detrended data \(y_{c,i,m,t}^{d}\), which simply consist of the residuals \(e_{c,i,m,t}\): \[y^{d}_{c,i,m,t} = e_{c,i,m,t}\] The resulting detrended data represent the original data with the quadratic trend component removed. We also allow the results to be expressed in logarithms. A minimal sketch of the full procedure is given right after this list.
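
Before turning to the full implementation, here is a minimal toy sketch of the two steps on a single made-up series (the values, and the single crop/region/month setup, are illustrative assumptions; the actual code below loops over crops, regions, and calendar months):

y_raw <- c(10, 12, NA, 15, 18, 20, 23, 27)
# Step 1: fill short gaps (up to three consecutive NAs) by linear interpolation
y_filled <- imputeTS::na_interpolation(y_raw, maxgap = 3)
# Step 2.1: divide by the mean over the period
y_demeaned <- y_filled / mean(y_filled)
# Steps 2.2 and 2.3: fit a quadratic trend without intercept, keep the residuals
t <- seq_along(y_demeaned)
fit <- lm(y_demeaned ~ -1 + t + I(t^2))
y_detrended <- residuals(fit)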

Let us implement this process in R. First, we need to define two functions to handle the missing values:

  • The get_index_longest_non_na() function retrieves the indices of the longest consecutive sequence without missing values from a given input vector. It helps us identify the positions of elements in that sequence.

  • The keep_values_longest_non_na() function uses the obtained indices to create a logical vector. Each element of this vector indicates whether the corresponding element in the input vector belongs to the longest consecutive sequence of non-missing values. This allows us to filter the data and retain only the values from the longest consecutive sequence without missing values.

These two functions combined help us handle missing data in the weather series and ensure that we work with the most complete sequences for each region and crop.

The first function:

#' Returns the index of the longest sequence of non NA values in a vector
#'
#' @param y vector of numerical values
#' @export
get_index_longest_non_na <- function(y) {
  split_indices <- which(is.na(y))
  nb_obs <- length(y)

  if (length(split_indices) == 0) {
    res <- seq_len(nb_obs)
  } else {
    idx_beg <- c(1, split_indices)
    if (idx_beg[length(idx_beg)] != nb_obs) {
      idx_beg <- c(idx_beg, nb_obs)
    }
    lengths <- diff(idx_beg)
    ind_max <- which.max(lengths)
    index_beginning <- idx_beg[ind_max]
    if (index_beginning != 1 || is.na(y[index_beginning])) {
      index_beginning <- index_beginning + 1
    }
    index_end <- idx_beg[ind_max] + lengths[ind_max]
    if (is.na(y[index_end])) {
      index_end <- index_end - 1
    }
    res <- seq(index_beginning, index_end)
  }
  res
}

The second one:

#' Returns a logical vector that identifies the longest sequence of non NA
#' values within the input vector
#' 
#' @param y numeric vector
keep_values_longest_non_na <- function(y) {
  ids_to_keep <- get_index_longest_non_na(y)
  ids <- seq(1, length(y))
  ids %in% ids_to_keep
}
Note

Those two functions are defined in weatherperu/R/utils.R.
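
A quick sanity check of these helpers on a toy vector (the values are illustrative):

y <- c(NA, 2, 3, NA, 5, 6, 7, NA)
get_index_longest_non_na(y)   # 5 6 7
keep_values_longest_non_na(y) # FALSE FALSE FALSE FALSE TRUE TRUE TRUE FALSE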

We define a function, detrend_production(), that takes the data frame as input, as well as a crop name and a region ID. It returns the detrended production for that region, in a tibble. In the resulting tibble, the detrended values are in a column named y.

#' Detrends the production data, using OLS
#'
#' @param df data
#' @param crop_name name of the crop
#' @param region_id id of the region
#' @param in_log if TRUE, the detrended values are expressed in log
#'
#' @returns data frame with the product, the region id, the date, and the
#'   detrended value for the production. If `in_log = TRUE` and there are zeros,
#'   the function returns `NULL`. Similarly, if there are more than two
#'   contiguous zeros in the series, the function returns `NULL`.
#' @export
#' @importFrom dplyr filter arrange mutate select row_number group_by
#' @importFrom tidyr nest unnest
#' @importFrom purrr map
#' @importFrom imputeTS na_interpolation
#' @importFrom stats lm predict residuals
detrend_production <- function(df,
                               crop_name,
                               region_id,
                               in_log = FALSE) {
  # The current data
  df_current <-
    df |>
    filter(
      product_eng == !!crop_name,
      region_id == !!region_id
    ) |>
    arrange(date)

  ## Dealing with missing values ----
  # Look for negative production values
  df_current <-
    df_current |>
    mutate(
      y_new = ifelse(Value_prod < 0, NA, Value_prod)
    )

  if (any(is.na(df_current$y_new))) {

    # Replacing NAs by interpolation
    # If there are more than three contiguous NAs, they are not replaced
    df_current <-
      df_current |>
      mutate(
        y_new = imputeTS::na_interpolation(y_new, maxgap = 3)
      )

    # Removing obs at the beginning/end if they are still missing
    df_current <-
      df_current |>
      mutate(
        row_to_keep = !(is.na(y_new) & row_number() %in% c(1:2, (n()-1):(n())))
      ) |>
      filter(row_to_keep) |>
      select(-row_to_keep)

    # Keeping the longest series of continuous non-NA values
    df_current <-
      df_current |>
      mutate(
        row_to_keep = keep_values_longest_non_na(y_new)
      ) |>
      filter(row_to_keep) |>
      select(-row_to_keep)
  }

  rle_y_new <- rle(df_current$y_new)
  check_contiguous_zeros <- rle_y_new$lengths[rle_y_new$values == 0]

  # If there are more than 2 contiguous 0, the series is discarded
  if (any(check_contiguous_zeros > 2)) {
    resul <- NULL
  } else {
    ## Detrending
    df_current <-
      df_current |>
      group_by(month) |>
      mutate(
        y_new_normalized = y_new / mean(y_new)
      )

    resul <-
      df_current |>
      select(product_eng, region_id, date, month, y_new, y_new_normalized) |>
      group_by(month) |>
      arrange(date) |>
      mutate(t = row_number()) |>
      ungroup() |>
      nest(.by = c(product_eng, region_id, month)) |>
      # distinct OLS per month
      mutate(
        ols_fit   = map(data, ~ lm(y_new_normalized ~ -1 + t + I(t^2), data = .x)),
        resid     = map(ols_fit, residuals),
        fitted    = map(ols_fit, predict)
      ) |>
      unnest(cols = c(data, resid, fitted)) |>
      group_by(month) |>
      mutate(
        y = resid
      ) |>
      select(product_eng, region_id, month, date, y_new, y) |>
      ungroup() |>
      arrange(date)
    
    if (in_log) {
      resul <- resul |>
        mutate(y = log(y))
    }
  }
  resul
}

For example, for potatoes in the region with id 1:

detrend_production(
  df = data_total, 
  crop_name = "Potato", 
  region_id = 1
)
# A tibble: 180 × 6
   product_eng region_id month date       y_new     y
   <chr>       <fct>     <dbl> <date>     <dbl> <dbl>
 1 Potato      1             1 2001-01-01 2360  0.435
 2 Potato      1             2 2001-02-01 2841  0.362
 3 Potato      1             3 2001-03-01 4585  0.628
 4 Potato      1             4 2001-04-01 4585  0.627
 5 Potato      1             5 2001-05-01 6065  0.946
 6 Potato      1             6 2001-06-01 3998. 0.434
 7 Potato      1             7 2001-07-01 3998. 0.453
 8 Potato      1             8 2001-08-01 5921  0.695
 9 Potato      1             9 2001-09-01 4751  0.508
10 Potato      1            10 2001-10-01 4751  0.642
# ℹ 170 more rows

We can apply this function to all crops of interest, in each region. Let us define a table that contains all the observed combinations of crop and region:

product_and_regions <- 
  data_total |> 
  filter(product_eng %in% crops) |> 
  select(product_eng, region_id) |> 
  unique()

We will not transform the detrended values by applying the logarithm function:

# Should the detrended value be transformed in log?
in_log <- FALSE

Then we apply the detrend_production() function to all these different cases, and store the results in a list named df_detrended_production:

df_detrended_production <- vector(mode = "list", length = nrow(product_and_regions))
cli_progress_bar(total = nrow(product_and_regions))
for(i in 1:nrow(product_and_regions)){
  df_detrended_production[[i]] <- detrend_production(
    df = data_total, 
    crop_name = product_and_regions$product_eng[i], 
    region_id = product_and_regions$region_id[i],
    in_log = in_log
  )
  cli_progress_update(set = i)
}
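
As an aside, the same list could be built with purrr instead of an explicit loop, at the cost of the progress bar. A sketch of an equivalent formulation (not the author's code):

# Equivalent alternative using purrr::map2() (sketch)
df_detrended_production <- purrr::map2(
  product_and_regions$product_eng,
  product_and_regions$region_id,
  ~ detrend_production(
    df = data_total,
    crop_name = .x,
    region_id = .y,
    in_log = in_log
  )
)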

The elements of the list are all tibbles, with the same column names. We can bind them into a single tibble.

df_detrended_production <- bind_rows(df_detrended_production)

We can have a look at the number of months with a 0 value for the agricultural production. Recall that the series in which more than two contiguous zeros were observed were discarded from the result.

df_detrended_production |> 
  group_by(product_eng, region_id) |> 
  summarise(nb_0 = sum(y_new == 0)) |> 
  arrange(desc(nb_0))
`summarise()` has grouped output by 'product_eng'. You can override using the
`.groups` argument.
# A tibble: 54 × 3
# Groups:   product_eng [4]
   product_eng region_id  nb_0
   <chr>       <fct>     <int>
 1 Rice        19           45
 2 Cassava     23           19
 3 Dent corn   17           18
 4 Potato      22           12
 5 Dent corn   24            9
 6 Potato      5             9
 7 Potato      17            3
 8 Dent corn   19            2
 9 Dent corn   23            2
10 Potato      8             2
# ℹ 44 more rows

We discard the series with 10 or more months of zero production:

df_detrended_production <- 
  df_detrended_production |> 
  group_by(product_eng, region_id) |> 
  mutate(nb_0 = sum(y_new == 0)) |> 
  filter(nb_0 < 10) |> 
  ungroup() |> 
  select(-nb_0)

Now, let us add the other columns to the tibble that contains the detrended data:

df <- df_detrended_production |> 
  left_join(
    data_total,
    join_by(product_eng, region_id, month, date)
  )
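
Since this is a left join of the detrended series on data_total, it should neither add nor drop rows. A hypothetical check:

# Hypothetical check: the join should preserve the rows of the detrended data
nrow(df) == nrow(df_detrended_production)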

Let us also impute missing values for the weather variables.

weather_variables <- 
  weather_regions_df |> 
  select(where(is.numeric)) |> 
  select(-year, -month) |> 
  colnames()

The current number of missing values:

df |> 
  summarise(
    across(
      .cols = all_of(weather_variables),
      .fns = ~ sum(is.na(.x)),
      .names = "{.col}_nb_na"
    )
  ) |> 
  unlist()
         temp_min_nb_na          temp_max_nb_na         temp_mean_nb_na 
                      0                       0                       0 
       precip_sum_nb_na perc_gamma_precip_nb_na      temp_min_dev_nb_na 
                      0                       0                       0 
     temp_max_dev_nb_na     temp_mean_dev_nb_na    precip_sum_dev_nb_na 
                      0                       0                       0 
            spi_1_nb_na             spi_3_nb_na             spi_6_nb_na 
                      0                       0                       0 
           spi_12_nb_na            spei_1_nb_na            spei_3_nb_na 
                      0                       0                       0 
           spei_6_nb_na           spei_12_nb_na 
                      0                       0 

In case of missing values, we use linear interpolation to replace them:

df <- 
  df |> 
  mutate(
    across(
      .cols = all_of(weather_variables),
      .fns = ~ imputeTS::na_interpolation(.x, maxgap = 3)
    )
  )

The number of remaining missing values:

df |> 
  summarise(
    across(
      .cols = all_of(weather_variables),
      .fns = ~ sum(is.na(.x)),
      .names = "{.col}_nb_na"
    )
  ) |> 
  unlist()
         temp_min_nb_na          temp_max_nb_na         temp_mean_nb_na 
                      0                       0                       0 
       precip_sum_nb_na perc_gamma_precip_nb_na      temp_min_dev_nb_na 
                      0                       0                       0 
     temp_max_dev_nb_na     temp_mean_dev_nb_na    precip_sum_dev_nb_na 
                      0                       0                       0 
            spi_1_nb_na             spi_3_nb_na             spi_6_nb_na 
                      0                       0                       0 
           spi_12_nb_na            spei_1_nb_na            spei_3_nb_na 
                      0                       0                       0 
           spei_6_nb_na           spei_12_nb_na 
                      0                       0 

5.3.2 Saving the file

The dataset used to estimate the impact of weather shocks on agricultural production can now be saved in the data output folder:

save(df, file = "../data/output/df_lp.rda")