2  Agricultural Data

This section provides the code to download and format Peruvian agricultural data from the monthly reports “El Agro en Cifras” produced by the Ministry of Agriculture and Irrigation of Peru (MINAGRI). The downloaded reports cover the period from 2001 to 2019. However, starting from 2016, the observations provided are no longer monthly but quarterly. Therefore, they are excluded from the analysis.

Multiple R packages are required.

Tip

If you previously loaded the {raster} package, there will be conflicts between dplyr::select() and raster::select() functions. It is recommended to run the code in this chapter in a clear R session.

library(rJava)
options(java.parameters = "-Xmx2048m")
library(tidyverse)
library(rvest)
library(readxl)
library(xlsx)
library(data.table)
library(lubridate, warn.conflicts = FALSE)
library(cowplot)
library(jtools)
library(huxtable)
library(xtable)
library(texreg)
library(pdftools)
library(timetk)
library(seasonal)

2.1 Functions

In this section, we define several functions to download and format various datasets. These functions will be utilized later in Section 2.3 for importing and formatting the agricultural data in R.

2.1.1 Downloading Data

#' Directly download the Excels versions of the monthly reports available on the
#' Peruvian website MINAGRI.
#' 
#' @param y
download.data <- function(y){

  # HOME PAGE
  page <- str_c(
    "https://www.midagri.gob.pe/portal/",
    "boletin-estadistico-mensual-el-agro-en-cifras?start=",
    19-y
  )

  # EXTRACTION OF THE LINKS
  text_rvest <- read_html(page)
  results <- text_rvest |> html_nodes(".mainbody")
  first_result <- results[1]

  # LIST OF THE MONTHS
  list_of_months <- first_result |>
    html_nodes(".cabecera") |>
    html_text(trim = TRUE)
  list_of_months <- list_of_months[str_detect(list_of_months, "[aeiou]")]

  # LIST OF THE urls (=liens)
  liens <-first_result |> html_nodes("a[href]") |> html_attr("href")
  liens <- liens[(str_detect(liens, "zip"))]
  if (sum(as.numeric(str_detect(liens, "\\.pdf"))) > 0) {
    liens <- liens[-which(str_detect(liens, "\\.pdf"))]
  }
  if (sum(as.numeric(str_detect(liens, "pdf\\.zip"))) > 0) {
    liens <- liens[-which(str_detect(liens, "pdf\\.zip"))]
  }
  if ((y == 13) | (y == 14) | (y ==15)) {
    liens <- liens[which(str_detect(liens, "cuadros"))]
  }
  if (y == 16) {
    liens <- liens[which(str_detect(liens, "data"))]
  }
  if (y == 14) {
    if (sum(as.numeric(str_detect(liens, "abril"))) > 1) {
      line <- liens[which(str_detect(liens, "abril"))[1]]
      liens <- liens[-which(str_detect(liens, "abril"))]
    }
  }

  if (sum(as.numeric(duplicated(liens))) > 0) {
    liens <- liens[-which(duplicated(liens) == T)]
  }
  if (y == 14) {
    liens <- c(liens[1:8], line, liens[9:11])
  }

  annee <-  as.numeric(str_c("20", ifelse(y <10 , str_c("0",y), y)))

  # CONDITION 1: Checking the number of urls
  if (length(liens) == length(list_of_months)) {

    if (y == 7) {
      list_of_months <- list_of_months[-8]
      liens <- liens[-8]
    }

    dir.create(str_c("../data/", annee))

    # LOOP 1 - On months for downloading and extracting the ZIP files
    for (m in 1:length(liens)) {
      mois_Lettres <- list_of_months[m]

      mois <- case_when(
        str_detect(mois_Lettres, "iciembre") ~ "12",
        str_detect(mois_Lettres, "oviembre") ~ "11",
        str_detect(mois_Lettres, "ctubre")   ~ "10",
        str_detect(mois_Lettres, "tiembre")  ~ "09",
        str_detect(mois_Lettres, "osto")     ~ "08",
        str_detect(mois_Lettres, "ulio")     ~ "07",
        str_detect(mois_Lettres, "unio")     ~ "06",
        str_detect(mois_Lettres, "ayo")      ~ "05",
        str_detect(mois_Lettres, "bril")     ~ "04",
        str_detect(mois_Lettres, "arzo")     ~ "03",
        str_detect(mois_Lettres, "brero")    ~ "02",
        str_detect(mois_Lettres, "nero")     ~ "01"
      )

      # DOWNLOADING THE FILE
      AdresseFichier <- str_c("../data/raw/minagri/", annee, "/", mois, ".zip")
      download.file(
        url = str_c("https://www.midagri.gob.pe/", liens[m]),
        destfile = AdresseFichier
      )

      # Names of the files in the archive
      files <- unzip(zipfile = AdresseFichier, list = TRUE)
      folder_name <- files$Name[str_which(
        files$Name, regex("agr.*la", ignore_case = TRUE)
      )] |>
        str_extract("^(.*)/")

      # Unzip on Mac OS when files in the archive contain accents in names
      system(str_c("open ", AdresseFichier))
      print(str_c("FILE ", mois, "-", annee, " UNZIPED"))

      folder_name <- files$Name[str_which(
        files$Name, regex("agr.*la", ignore_case = TRUE)
      )] |>
        str_extract("^(.*)/")

      # Special case to handle year 2005
      if (annee == 2005) {
        AdresseFichier <- str_c("../data/raw/minagri/", annee, "/", mois)
        fichierZip <- list.files(AdresseFichier)[str_detect(list.files(AdresseFichier), ".zip")]
        for (i in 1:length(fichierZip)) {
          unzip(
            zipfile = str_c(AdresseFichier, "/", fichierZip[i]),
            exdir = AdresseFichier
          )
        }
      }

    }# End of LOOP 1

    # MANUAL TREATMENT FOR 2007
    if (annee == 2007) {
      list_of_months <- c(list_of_months[1:7], "Mayo", list_of_months[8:11])
      dir.create(str_c("../data/raw/minagri/", annee, "/05"))
      file.copy(
        from = "./MAYO_2007_NO_DELETE/AGRICOLA.xls" ,
        to = str_c("../data/raw/minagri/",annee,"/05/AGRICOLA.xls")
      )
    }

    # LOOP 2 - On months for creating the Excel files
    for (m in 1:length(list_of_months)) {

      if (y == 15 & m == 7) { next } else {
        # Checking the appopriate name of each month
        mois_Lettres <- list_of_months[m]
        mois <- case_when(
          str_detect(mois_Lettres, "iciembre") ~ "12",
          str_detect(mois_Lettres, "oviembre") ~ "11",
          str_detect(mois_Lettres, "ctubre")   ~ "10",
          str_detect(mois_Lettres, "tiembre")  ~ "09",
          str_detect(mois_Lettres, "osto")     ~ "08",
          str_detect(mois_Lettres, "ulio")     ~ "07",
          str_detect(mois_Lettres, "unio")     ~ "06",
          str_detect(mois_Lettres, "ayo")      ~ "05",
          str_detect(mois_Lettres, "bril")     ~ "04",
          str_detect(mois_Lettres, "arzo")     ~ "03",
          str_detect(mois_Lettres, "brero")    ~ "02",
          str_detect(mois_Lettres, "nero")     ~ "01"
        )

        # Selecting the right Zip file
        AdresseFichier <- str_c("../data/raw/minagri", annee, "/", mois, ".zip")

        if ((y == 12 & m == 10) | (y == 7 & m == 8)) {
          files <- ifelse(
            m == 10,
            yes = data.table(Name = list.files(
              "../data/raw/minagri/2012/cuadros-marzo12"
            )),
            no = data.table(Name = list.files(
              str_c("../data/raw/minagri/", annee, "/05/")
            ))
          )
        } else {
          files <- unzip(zipfile = AdresseFichier, list = TRUE)
        }

        if (y == 06 & (m == 10 | m == 12)) {
          for (i in 1:dim(files)[1]) {
            name <- files[i, 1]
            unzip(
              zipfile = str_c(
                "../data/raw/minagri/", annee, "/", mois, "/", name
              ),
              exdir = str_c(
                "../data/raw/minagri/", annee, "/", mois
              )
            )
          }
        }

        if (y == 16 & m < 12) {
          N_1 <- str_c("../data/raw/minagri/", annee, "/", files[1, 1])
          N <- str_c(N_1,"x")
          file.copy(from = N_1, to = N)

          if (m >= 6) {
            N <- str_c("../data/raw/minagri/", annee, "/", files[1, 1])
          } else {
            unlink(N_1, recursive = TRUE)
          }
        } else {
          folder_name <- files$Name[str_which(
            files$Name,
            regex("agr.*la", ignore_case = TRUE)
          )] |>
            str_extract("^(.*)/")

          if (is.na(folder_name[1])) {folder_name <- mois}
          if (y == 12 & m == 10) {folder_name <- "cuadros-marzo12"}

          # Adding .xls to the 2008-07 file
          if (y == 08 & m == 6) {
            N <- list.files(
              str_c("../data/raw/minagri/", annee, "/", folder_name),
              full.names = TRUE
            ) |>
              {\(x) str_which(x, regex("clima", ignore_case = TRUE))}()
            file.rename(from = N, to = str_c(N,".xls"))
          }

          # CHANGING "HIDRO.." FILES INTO "CLIMA"
          if (
            any(
              str_detect(
                list.files(
                  str_c("../data/raw/minagri/",annee,"/",folder_name),
                  full.names = TRUE
                ),
                regex("hidro.*",ignore_case = TRUE)
              )
            ) == TRUE) {
            N <- list.files(
              str_c("../data/raw/minagri/",annee,"/",folder_name),
              full.names = TRUE
            ) |>
              {\(x) str_which(w, regex("hidro.*", ignore_case = TRUE))}()
            file.rename(
              from = N,
              to = str_c( substr(N,0,nchar(N)-4),"clima.xls")
            )
          }

          # Final list of files in the unziped file
          N <- list.files(
            str_c("../data/raw/minagri/", annee, "/", folder_name),
            full.names = TRUE,
            pattern = "\\.xlsx?",
            ignore.case = TRUE
          )
          print(N)

          # SELECTING THE FILE FOR AGRICULTURAL DATA
          N1 <- N[str_which(N, regex("agr.*la", ignore_case = TRUE))]

          if (sum(as.numeric(duplicated(N1))) > 0) {
            N1 <- N1[-which(duplicated(N1) == T)]
          }
          if (length(N1) > 1) {
            N1 <- case_when(
              (y == 07 & m == 10) ~ N1[1],
              (y == 08 & m == 1)  ~ N1[2],
              (y == 08 & m == 8)  ~ N1[1],
              (y == 15 & m == 4)  ~ N1[2]
            )
          }
          print(N1)

          # SELECTING THE FILE FOR PRICES
          if (y < 14 | (y == 14 & m > 1) ) {
            N2 <- N[str_which(N, regex("pre.*os", ignore_case = TRUE))]
          } else {
            N2 <- N[str_which(N, regex("agr.*la", ignore_case = TRUE))]
          }

          if (y == 16) { N2 <- N }

          if (sum(as.numeric(duplicated(N2))) > 0) {
            N2 <- N2[-which(duplicated(N2)==T)]
          }
          if (length(N2) > 1) {
            N2 <- case_when(
              (y == 07 & m == 10) ~ N2[1],
              (y == 08 & m == 1)  ~ N2[2],
              (y == 08 & m == 8)  ~ N2[1],
              (y == 15 & m == 4)  ~ N2[2]
            )
          }
          print(N2)
          if (y == 10 & m == 3) {
            file.copy(
              from = N2 ,
              to = str_c(
                "../data/raw/minagri/2010/",
                "BEAM SETIEMBRE_DEF_2010/BEAM SETIEMBRE//Precios_oct.xls"
              )
            )
          }

          # SELECTING THE FILE FOR CLIMATE DATA
          N3 <- N[str_which(N, regex("c.*ima", ignore_case = TRUE))]
          if (y == 16) {N3 <- N}
          if (sum(as.numeric(duplicated(N3))) > 0){
            N3 <- N3[-which(duplicated(N3) == T)]
          }
          if (sum(as.numeric(str_detect(N3, regex("mp.*t", ignore_case = TRUE)))) > 0) {
            N3 <- N3[-which(str_detect(N3, regex("mp.*t", ignore_case = TRUE)))]
          }
          if (is_empty(N3)) {
            N3 <- N[str_which(N, regex("Bem.*", ignore_case = TRUE))]
            N3 <- N3[-which(str_detect(N3, regex("mp.*t", ignore_case = TRUE)))]
          }
          if (length(N3) > 1) {
            N3 <- case_when(
              (y == 07 & m == 10) ~ N3[1],
              (y == 08 & m == 1)  ~ N3[1],
              (y == 08 & m == 8)  ~ N3[7],
              (y == 08 & m == 9)  ~ N3[2],
              (y == 10 & m == 12) ~ N3[1],
              (y == 15 & m == 4)  ~ N3[2]
            )
          }
          print(N3)
          if (is_empty(N3)) {N3 <- ""}

          # Special attribution for the 2006_01 files
          if (y == 06 & m == 12) {
            N1 <- str_c(
              "../data/raw/minagri/", annee, "/", mois, "/AGRICOLA/AGRICOLA.xls"
            )
            N2 <- str_c(
              "../data/raw/minagri/", annee, "/", mois, "/PRECIOS/PRECIOS.xls"
            )
            N3 <- str_c(
              "../data/raw/minagri/", annee, "/", mois, "/CLIMA/Bemene2006.xls"
            )
          }

          type_xl <- "xls"
          if (y == 15) {
            type_xl <- "xlsx"
            if (m == 2) {type_xl <- "xls"}
          }

          if (y == 5) {
            folder_name <- files$Name[str_which(
              files$Name,
              regex("agri.*zip", ignore_case = TRUE)
            )]
            unzip(
              str_c("../data/raw/minagri/", annee, "/", mois, "/", folder_name),
              exdir = str_c("../data/raw/minagri/", annee,"/", mois)
            )
            N1 <- list.files(
              str_c("../data/raw/minagri/", annee, "/", mois),
              full.names = TRUE,
              pattern = "\\.xlsx?",
              ignore.case = TRUE
            )
          }
        }

        mois <- case_when(
          (y == 12 & mois == "04") ~ "05",
          (y == 12 & mois == "05") ~ "04",
          (y == 14 & mois == "09") ~ "10",
          (y == 14 & mois == "10") ~ "09",
          TRUE ~ as.character(mois)
        )

        # FILES FOR 2005 - 2013
        if (y <= 13) {
          # PRODUCTION
          name <- str_c(
            "../data/raw/minagri/Production/Production_", annee, ".xlsx"
          )
          for (i in 1:2) {
            if (y < 11) {
              table <- ifelse(i == 1, "c-26", "c-27")
            } else {
              table <- ifelse(i == 1, "c-28", "c-29")
            }
            if (y == 11 & m == 9 ) {table <- ifelse(i == 1, "c-27", "c-28")}
            if (y == 11 & m > 9 ) {table <- ifelse(i == 1, "c-26", "c-27")}
            Tableau <- read_excel(path = N1, sheet = table)

            write.xlsx(
              Tableau,
              name,
              sheetName = str_c(mois, annee, i),
              append = TRUE,
              showNA = FALSE
            )
            print(table)
          }

          # PLANTED AREAS
          name <- str_c(
            "../data/raw/minagri/Surface/Superficies_", annee, ".xlsx"
          )
          for (i in 1:2) {
            if (y < 11) {
              table <- ifelse(i == 1, "c-19", "c-20")
            } else {
              table <- ifelse(i == 1, "c-21", "c-22")
            }
            if( y == 11 & m > 9 ) {table <- ifelse(i == 1, "c-19", "c-20")}
            if( y == 11 & m == 9 ) {table <- ifelse(i == 1, "c-20", "c-21")}

            Tableau <- read_excel(path = N1, sheet = table, col_types = "text")
            write.xlsx(
              Tableau,
              name,
              sheetName = str_c(mois, annee, i),
              append = TRUE,
              showNA = FALSE
            )
          }

          # HARVESTED SUPERFICIES
          name <- str_c(
            "../data/raw/minagri/Surface_R/Superficies_R_", annee, ".xlsx"
          )
          for (i in 1:2) {
            if (y < 11) {
              table <- ifelse(i == 1, "c-23", "c-24")
            } else {
              table <- ifelse(i == 1, "c-25", "c-26")
            }
            if (y == 11 & m > 9 ) {table <- ifelse(i == 1, "c-23" ,"c-24")}
            if (y == 11 & m == 9 ) {table <- ifelse(i == 1, "c-24" ,"c-25")}

            Tableau <- read_excel(path = N1, sheet = table, col_types = "text")

            write.xlsx(
              Tableau,
              name,
              sheetName = str_c(mois, annee, i),
              append = TRUE,
              showNA = FALSE
            )
          }

          # PRICES
          # Name of the file
          name <- str_c("../data/raw/minagri/Prices/Prices_", annee, ".xlsx")

          # Selecting the names of the sheets
          if (y <= 10) {
            table <- "C-65"
          } else {
            table <- case_when(
              (annee == 2011 & m < 9  )   ~ "C-69",
              (annee == 2011 & m  == 9  ) ~ "C-68",
              (annee == 2011 & m >= 10 )  ~ "C-65",
              (annee > 2011 )             ~ "C-69"
            )
          }
          if (y == 13 & m == 1) {table <- regex("C-76", ignore_case = TRUE)}
          if (y == 10 & m == 4) {
            N2 <-  str_c(
              "../data/raw/minagri/2010/BEAM SETIEMBRE_DEF_2010/",
              "BEAM SETIEMBRE//Precios_oct.xls"
            )
          }
          if (y == 07 & m == 8) {
            N2 <-  "../data/raw/minagri/2007/04/PRECIOS.xls"
          }

          Tableau <- read_excel(path = N2, sheet = table, col_types = "text")
          write.xlsx(
            Tableau,
            name,
            sheetName = str_c(mois, annee),
            append = TRUE,
            showNA = FALSE
          )
        } else {
          # FILES FOR 2014 - 2018
          if (y == 16) {
            N1 <- N
            N2 <- N
            N3 <- N
          }
          if (y == 16 & m == 12) {
            N1 <- N[2]
            N2 <- N[2]
            N3 <- N[2]
          }
          # PRODUCTION
          name <- str_c(
            "../data/raw/minagri/Production/Production_", annee, ".xlsx"
          )
          if(y == 14) {
            table <- case_when(
              m ==1  ~ "c-12",
              m > 1 ~ "c-26"
            )
          }
          if (y == 15) {table <-"c-11"}
          if (y == 18) {table <-"c-18"}
          if (y == 15 & m ==  2) {table <-"C-11"}
          if (y == 15 & m ==  4) {table <-"C-11"}
          if (y == 15 & m ==  6) {table <-"C-11"}
          if (y == 15 & m == 10) {table <-"C-11"}
          if (y == 16 & m <   7) {table <-"C.12"}
          if (y == 16 & m ==  7) {table <-"C. 12"}
          if (y == 16 & m >=  8) {table <-"C.12"}
          if (y == 16 & m == 12) {table <-"c-11"}
          if (y == 14 & m >7) {
            for (i in 1:2) {
              table <- ifelse(i == 1, "c-28", "c-29")
              if (m == 11) {table <- ifelse(i == 1, "C-28", "C-29")}
              if (m == 8) {table <- ifelse(i == 1, "c-26", "c-26-a")}
              Tableau <- read_excel(path = N1, sheet = table)
              write.xlsx(
                Tableau,
                name,
                sheetName = str_c(mois, annee, i),
                append = TRUE,
                showNA = FALSE
              )
            }
          } else {
            Tableau <- read_excel(
              path = N1,
              sheet = regex(table, ignore_case = TRUE)
            )
            write.xlsx(
              Tableau,
              name,
              sheetName = str_c(mois, annee),
              append = TRUE,
              showNA = FALSE
            )
          }

          # PLANTED SUPERFICIES
          name <- str_c(
            "../data/raw/minagri/Surface/Superficies_",annee,".xlsx"
          )
          if(y ==14) {
            table <- case_when(
              m ==1 ~ "c-7",
              m > 1 ~ "c-21"
            )
          }
          if (y ==15) {table <-"c-6"}
          if (y ==18) {table <-"c-16"}
          if (y == 16) {table <-"C.9"}
          if (y == 15 & m == 2) {table <-"C-6"}
          if (y == 15 & m == 4) {table <-"C-6"}
          if (y == 15 & m == 6) {table <-"C-6"}
          if (y == 15 & m == 10) {table <-"C-6"}
          if (y == 16 & m ==  7) {table <-"C. 7"}
          if (y == 16 & m %in% c(8, 9, 10, 11, 12)) {table <-"C.6"}

          if (y == 14 & m > 7 ) {
            for (i in 1:2) {
              table <- ifelse(i == 1, "c-21", "c-22")
              if(m == 8 ) {table <- ifelse(i == 1, "c-21", "c-21-a")}
              Tableau <- read_excel(path = N1, sheet = table)
              write.xlsx(
                Tableau,
                name,
                sheetName = str_c(mois, annee, i),
                append = TRUE,
                showNA = FALSE
              )
            }
          } else {
            Tableau <- read_excel(path = N1, sheet = table, col_types = "text")
            write.xlsx(
              Tableau,
              name,
              sheetName = str_c(mois, annee),
              append = TRUE,
              showNA = FALSE
            )
          }

          # HARVESTED SUPERFICIES
          name <- str_c(
            "../data/raw/minagri/Surface_R/Superficies_R_", annee, ".xlsx"
          )
          if (y ==14) {
            table <- case_when(
              m ==1 ~ "c-10",
              m > 1 ~ "c-24"
            )
          }
          if (y ==15) {table <-"c-9"}
          if (y ==18) {table <-"c-16"}
          if (y == 16) {table <-"C.9"} #FALSE : Table do not exist in the 2016 file
          if (y == 15 & m == 2) {table <-"C-9"}
          if (y == 15 & m == 4) {table <-"C-9"}
          if (y == 15 & m == 6) {table <-"C-9"}
          if (y == 15 & m == 10) {table <-"c-9"}
          if (y == 16 & m ==  7) {table <-"C. 7"} #FALSE : Table do not exist in the 2016 file
          if (y == 16 & m %in% c(8, 9, 10, 11, 12)) {table <-"C.9"}

          if (y == 14 & m > 7) {
            for (i in 1:2) {
              table <- ifelse(i == 1,"c-25", "c-26")
              if (m == 11) {table <- ifelse(i == 1, "C25", "C26")}
              if (m == 8 ) {table <- ifelse(i == 1, "c-24", "c-24-a")}
              Tableau <- read_excel(path = N1, sheet = table)
              write.xlsx(
                Tableau,
                name,
                sheetName = str_c(mois, annee, i),
                append = TRUE,
                showNA = FALSE
              )
            }
          } else {
            Tableau <- read_excel(path = N1, sheet = table, col_types = "text")
            write.xlsx(
              Tableau,
              name,
              sheetName = str_c(mois, annee),
              append = TRUE,
              showNA = FALSE
            )
          }
          # PRICES
          # Name of the file
          name <- str_c(
            "../data/raw/minagri/Prices/Prices_", annee, ".xlsx"
          )

          # Selecting the names of the sheets
          table <- case_when(
            (annee == 2014 & m == 1)            ~ as.character(regex("c-17", ignore_case = TRUE)),
            (annee == 2014 & m == 2)            ~ as.character(regex("C-80", ignore_case = TRUE)),
            (annee == 2014 & m == 3)            ~ as.character(regex("C-72", ignore_case = TRUE)),
            (annee == 2014 & m == 4)            ~ as.character(regex("C-80", ignore_case = TRUE)),
            (annee == 2014 & m %in% 4:8)        ~ as.character(regex("C-72", ignore_case = TRUE)),
            (annee == 2014 & m >8)              ~ as.character(regex("C-76", ignore_case = TRUE)),
            (annee == 2015 & ! m %in% c(2,4,6)) ~ as.character(regex("c-16", ignore_case = TRUE)),
            (annee == 2015 & m %in% c(2,4,6))   ~ as.character(regex("C-16", ignore_case = TRUE)),
            (annee == 2016 & m <= 6)            ~ as.character(regex("C.15", ignore_case = TRUE)),
            (annee == 2016 & m >  6)            ~ as.character(regex("C.13", ignore_case = TRUE)),
            (annee == 2016 & m == 7)            ~ as.character(regex("C. 13", ignore_case = TRUE)),
          )
          if (y == 16 & m == 7) {table <-"C. 13"}

          Tableau <- read_excel(path = N2, sheet = table, col_types = "text")
          write.xlsx(
            Tableau,
            name,
            sheetName = str_c(mois, annee),
            append = TRUE,
            showNA = FALSE
          )
        }
        if (y == 16 & m < 12) {
          unlink(N, recursive = TRUE)
        } else {
          unlink(
            str_c("../data/raw/minagri/", annee, "/", folder_name),
            recursive = TRUE
          )
        }
      }
    }# End of LOOP 2
    # END OF CONDITION 1
  } else {
    print("ERROR - NB MOIS =! NB LIENS")
  }
}

We define a function to extract data from the PDFs:

#' Extract data from the PDF monthly reports from MINAGRI
#' 
#' @param annee year of the report
#' @param mois month of the report
#' @param adresse url of the report on www.midagri.gob.pe
#' @param page page number to extract
#' @param Cell1 name of the first cell to import
extract_pdf_data <- function(annee,
                             mois,
                             adresse,
                             page,
                             Cell1) {

  ## Defining the number of cultures in the table----
  PageDeDonnes <- pdf_data(adresse)[[page]]
  if (annee == 2 & mois == 2) {
    PageDeDonnes[which((PageDeDonnes[,"text"] == "Año.?")),"x"] <-
      as.numeric(PageDeDonnes[which((PageDeDonnes[,"text"] == Cell1)),"x"])
  }

  # Determining the begining of the table
  x_Cell1 <- as.numeric(PageDeDonnes[which((PageDeDonnes[,"text"] == Cell1)), "x"])
  y_Cell1 <- as.numeric(PageDeDonnes[which((PageDeDonnes[,"text"] == Cell1)), "y"])

  # Looking for the lines with the same position as Cell1 or above
  AboveLine1 <- PageDeDonnes[which((PageDeDonnes[,"x"] <= x_Cell1 + 1)), ]

  ## Special case for 01/2003 ----
  if ((annee == 3 & mois == 1) | (annee == 2 & mois < 8)) {
    AboveLine1 <- PageDeDonnes[which((PageDeDonnes[, "x"] <= x_Cell1 + 10)), ]
    x_Cell1 <- max(AboveLine1$x)}
  # end of special case

  x <-  which(
    str_detect(AboveLine1$text, "Año.?") |
      str_detect(AboveLine1$text, "paña")
  )
  AboveLine1[x, "x"] <- x_Cell1
  if (any((AboveLine1[,"text"] == "Mensual"))) {
    AboveLine1 <- AboveLine1[- which(AboveLine1[, "text"] == "Mensual"), ]
  }

  if (any((AboveLine1[, "x"] > x_Cell1))) {
    AboveLine1[which(AboveLine1[,"x"]>x_Cell1) ,"x"] <- x_Cell1
  }

  AboveLine1 <- AboveLine1[order(AboveLine1$x, -AboveLine1$y), ]
  positions  <- which((AboveLine1[, "text"] == Cell1))
  x <- as.numeric(AboveLine1[positions, "x"])

  # Line with the culture list
  Cultures <- AboveLine1[AboveLine1$x == x, ]
  if (dim(Cultures)[1] < 3) {
    Cultures <- AboveLine1[AboveLine1$x %in% c(x,x-1), ]
  }

  if ((annee == 3 & mois == 1) | (annee ==2 & mois < 6)) {
    colnames(Cultures) <- c(
      "width_1", "height_1", "x_1", "y_1", "space_1", "text_1"
    )
    Cultures_1_0103 <- Cultures
  }

  # Selection of the previous line to determine the complete name
  x_1 <- unique(AboveLine1$x)
  x_1 <- x_1[-which((x_1 == x))]
  Cultures_1 <- AboveLine1[AboveLine1$x == max(x_1), ]
  if ((annee == 3 & mois == 1) | (annee ==2 & mois < 6)) {
    Cultures_0103 <- Cultures_1
  }
  colnames(Cultures_1) <- c("width_1","height_1","x_1","y_1","space_1","text_1")

  # Row matching to obtain full crop names
  if ((annee == 3 & mois == 1) | (annee ==2 & mois < 6)) {
    Cultures <- merge(
      Cultures_0103,
      Cultures_1_0103,
      by.x = "y",
      by.y = "y_1",
      all.x = TRUE,
      all.y=TRUE
    )
  } else {
    Cultures <- merge(
      Cultures,
      Cultures_1,
      by.x = "y",
      by.y = "y_1",
      all.x = TRUE,
      all.y=TRUE
    )
  }

  for (ii in 1:dim(Cultures)[1]) {
    Cultures$nomcomplet[ii]  <- if (is.na(Cultures$text_1[ii])) Cultures$text[ii] else paste(Cultures$text_1[ii], Cultures$text[ii])
    if (annee == 3 & mois == 1 ) {Cultures$nomcomplet[ii]  <- if (is.na(Cultures$text[ii])) Cultures$text_1[ii] else paste(Cultures$text_1[ii], Cultures$text[ii])}
    Cultures$nomcomplet[ii]  <- str_replace_all(Cultures$nomcomplet[ii], "- ", "")
    Cultures$nomcomplet[ii]  <- if (Cultures$nomcomplet[ii] == "de azúcar") paste("Caña", Cultures$nomcomplet[ii]) else Cultures$nomcomplet[ii]
    Cultures$nomcomplet[ii]  <- if (Cultures$nomcomplet[ii] == "loctao") paste("Frijol", Cultures$nomcomplet[ii]) else Cultures$nomcomplet[ii]
    Cultures$nomcomplet[ii]  <- if (Cultures$nomcomplet[ii] == "palo") paste("Frijol de", Cultures$nomcomplet[ii]) else Cultures$nomcomplet[ii]
    Cultures$nomcomplet[ii]  <- if (Cultures$nomcomplet[ii] == "grano seco**") paste("Frijol", Cultures$nomcomplet[ii]) else Cultures$nomcomplet[ii]
    Cultures$nomcomplet[ii]  <- if (Cultures$nomcomplet[ii] == "amiláceo") paste("Maíz", Cultures$nomcomplet[ii]) else Cultures$nomcomplet[ii]
    Cultures$nomcomplet[ii]  <- if (Cultures$nomcomplet[ii] == "hua") paste("Cañi" ,Cultures$nomcomplet[ii], sep="") else Cultures$nomcomplet[ii]
    Cultures$nomcomplet[ii]  <- if (Cultures$nomcomplet[ii] == "cáscara") paste("Arroz", Cultures$nomcomplet[ii]) else Cultures$nomcomplet[ii]
    Cultures$nomcomplet[ii]  <- if (Cultures$nomcomplet[ii] == "rama") paste("Algodón", Cultures$nomcomplet[ii]) else Cultures$nomcomplet[ii]

    if (! is.na(Cultures$text[ii]) & is.na(Cultures$text_1[ii]) & ii > 1 & ii < dim(Cultures)[1]) {
      Cultures$nomcomplet[ii] <- if(Cultures$y[ii] == Cultures$y[ii-1] + 1) paste(Cultures$text_1[ii-1], Cultures$text[ii]) else Cultures$nomcomplet[ii]
      Cultures$nomcomplet[ii] <- if(Cultures$y[ii] == Cultures$y[ii+1] - 1) paste(Cultures$text_1[ii+1], Cultures$text[ii]) else Cultures$nomcomplet[ii]
    }
  }

  # Discarding unmatched rows
  ii = which(is.na(Cultures$x))
  if (is.integer(ii) && length(ii) != 0) {Cultures <- Cultures[-ii, ]}

  ii = which(Cultures$text == "a.")
  if(is.integer(ii) && length(ii) != 0) {Cultures <- Cultures[-ii, ]}

  ii = which(Cultures$text == "de")
  if(is.integer(ii) && length(ii) != 0) {Cultures <- Cultures[-ii, ]}

  isPlatano <-any(str_detect(PageDeDonnes$text, "Plátano"))
  isFrijol <-any(str_detect(PageDeDonnes$text, "Frijol |seco"))

  ## Special case for 01/2003
  if ((annee == 3 & mois == 1) | annee == 2) {
    if (page %in% c(30, 31)) {
      Cultures <- as.data.table(
        c("Departamento","Campaña", "Total","Arroz cáscara", "Maíz amiláceo",
          "Frijol grano seco**","Frijol castlla", "Pallar","Zarandaja",
          "Frijol de palo", "Garbanzo","Frijol loctao", "Lentaja", "Papa",
          "Trigo", "Algodón rama", "Maíz duro", "Soya", "Sorgo grano",
          "Marigold")
      )
      colnames(Cultures) <- "nomcomplet"
    }

    if (page %in% c(32, 33)) {
      Cultures <- as.data.table(
        c("Departamento", "Campaña", "Total", "Cebada grano", "Quinua",
          "Cañihua", "Kiwicha","Haba grano", "Arveja grano", "Chocho tarhui",
          "Olluco", "Oca", "Mashua", "Camote", "Yuca", "Cebolla", "Ajo",
          "Tomate")
      )
      colnames(Cultures) <- "nomcomplet"
    }

    if (page %in% c(38, 39)) {
      Cultures <- as.data.table(
        c("Departamento","Años", "Total","Arroz cáscara", "Maíz amiláceo",
          "Frijol grano seco**","Frijol castlla", "Pallar","Zarandaja",
          "Frijol de palo", "Garbanzo","Frijol loctao", "Lentaja", "Papa",
          "Trigo", "Maíz duro","Soya", "Sorgo grano", "Marigold")
      )
      colnames(Cultures) <- "nomcomplet"
    }

    if (page %in% c(40,41)) {
      Cultures <- as.data.table(
        c("Departamento", "Años", "Cebada grano", "Quinua","Cañihua","Kiwicha",
          "Haba grano", "Arveja grano", "Chocho tarhui", "Olluco", "Oca",
          "Mashua", "Camote", "Yuca", "Cebolla", "Ajo", "Tomate")
      )
      colnames(Cultures) <- "nomcomplet"
    }

    if (page %in% c(44, 45)) {
      Cultures <- as.data.table(
        c("Departamento","Años", "Arroz cáscara", "Maíz amiláceo",
          "Frijol grano seco**","Frijol castlla", "Pallar","Zarandaja",
          "Frijol de palo", "Garbanzo","Frijol loctao", "Lentaja", "Papa",
          "Trigo","Plátano", "Algodón rama", "Maíz duro", "Soya",
          "Sorgo grano", "Caña de azúcar", "Café", "Espárrago", "Marigold")
      )
      colnames(Cultures) <- "nomcomplet"
    }

    if (page %in% c(46, 47)) {
      Cultures <- as.data.table(
        c("Departamento", "Años", "Cebada grano", "Quinua","Cañihua",
          "Kiwicha","Haba grano", "Arveja grano", "Chocho tarhui", "Olluco",
          "Oca", "Mashua", "Camote", "Yuca", "Cebolla", "Ajo", "Tomate")
      )
      colnames(Cultures) <- "nomcomplet"
    }

    if (page %in% c(112,113)) {
      Cultures <- as.data.table(
        c("Departamento", "Año", "Arroz cáscara", "Maíz amiláceo","Trigo",
          "Camote", "Papa", "Yuca", "Algodón rama", "Espárrago", "Maíz duro",
          "Marigold", "Soya", "Café")
      )
      colnames(Cultures) <- "nomcomplet"
    }

    if (page %in% c(114,115)) {
      Cultures <- as.data.table(
        c("Departamento", "Año", "Ajo", "Cebolla", "Maíz Choclo", "Tomate",
          "Arveja grano", "Haba grano","Limón", "Mandarina", "Manzana",
          "Naranja", "Papaya", "Palta", "Piña")
      )
      colnames(Cultures) <- "nomcomplet"
    }
  } else {
    Cultures <- Cultures[order(-Cultures$y), ]
    Cultures$nomcomplet[2] <-
      if (str_detect(Cultures$nomcomplet[2],"ña")) "Campaña" else Cultures$nomcomplet[2]

    # Selection des donnees d'interet
    Cultures <-Cultures[, c("x","y","nomcomplet")]  |>
      mutate(
        nomcomplet = case_when(
          str_detect(nomcomplet,"Cañi")      ~ "Cañihua",
          str_detect(nomcomplet,"seco")      ~ "Frijol grano seco**",
          str_detect(nomcomplet,"paña")      ~ "Campaña",
          str_detect(nomcomplet,"daja")      ~ "Zarandaja",
          str_detect(nomcomplet,"duro")      ~  "Maíz duro",
          str_detect(nomcomplet,"banzo")     ~  "Garbanzo",
          str_detect(nomcomplet,"de azúcar") ~  "Caña de azúcar",
          str_detect(nomcomplet,"Kiwi")      ~  "Kiwicha",
          TRUE ~ nomcomplet
        )
      ) |>
      mutate(n = duplicated(nomcomplet)) |>
      filter(! n ==T ) |>
      select(-n)

  }

  if (y == 4 & m > 6 & page %in% c(38, 39) ) {
    Cultures <- as.data.table(
      c("Departamento", "Años","Total", "Arroz cáscara",    "Maíz amiláceo",
        "Frijol grano seco**",  "Frijol castlla",   "Pallar",   "Zarandaja",
        "Frijol de palo",   "Garbanzo", "Frijol loctao",    "Lenteja",  "Papa",
        "Trigo",    "Maíz duro",    "Soya", "Sorgo grano",  "Caña de azúcar" )
    )
    colnames(Cultures) <- "nomcomplet"
  }

  if (page > 100) {
    if (page %in% c(108, 109, 110, 112, 113)) {
      if (isFrijol == T) {
        Cultures <- as.data.table(
          c("Departamento", "Año", "Arroz cáscara", "Maíz amiláceo",
            "Trigo", "Frijol grano seco", "Camote", "Papa", "Yuca",
            "Algodón rama", "Espárrago", "Maíz duro",  "Marigold", "Soya",
            "Café")
        )
        colnames(Cultures) <- "nomcomplet"
      } else {
        Cultures <- as.data.table(
          c("Departamento", "Año", "Arroz cáscara", "Maíz amiláceo","Trigo",
            "Camote", "Papa", "Yuca", "Algodón rama", "Espárrago", "Maíz duro",
            "Marigold", "Soya", "Café")
        )
        colnames(Cultures) <- "nomcomplet"
      }
    }

    if (page %in% c(111, 114, 115)) {
      if (isPlatano == T) {
        Cultures <- as.data.table(
          c("Departamento", "Año", "Ajo", "Cebolla", "Maíz Choclo", "Tomate",
            "Arveja grano", "Haba grano","Limón", "Mandarina", "Manzana",
            "Naranja", "Papaya", "Palta", "Piña","Plátano")
        )
        colnames(Cultures) <- "nomcomplet"
      } else {
        Cultures <- as.data.table(
          c("Departamento", "Año", "Ajo", "Cebolla", "Maíz Choclo", "Tomate",
            "Arveja grano", "Haba grano","Limón", "Mandarina", "Manzana",
            "Naranja", "Papaya", "Palta", "Piña")
        )
        colnames(Cultures) <- "nomcomplet"
      }
    }

    if (annee == 2 & mois < 8 & page %in% c(111, 112)) {
      if (isPlatano == T) {
        Cultures <- as.data.table(
          c("Departamento", "Año", "Ajo", "Cebolla", "Maíz Choclo", "Tomate",
            "Arveja grano", "Haba grano","Limón", "Mandarina", "Manzana",
            "Naranja", "Papaya", "Palta", "Piña","Plátano")
        )
        colnames(Cultures) <- "nomcomplet"
      } else {
        Cultures <- as.data.table(
          c("Departamento", "Año", "Ajo", "Cebolla", "Maíz Choclo", "Tomate",
            "Arveja grano", "Haba grano","Limón","Mandarina", "Manzana",
            "Naranja", "Papaya", "Palta", "Piña")
        )
        colnames(Cultures) <- "nomcomplet"}
    }

    if (annee == 2 & mois == 7 & page == 110) {
      Cultures <- as.data.table(
        c("Departamento", "Año", "Ajo", "Cebolla", "Maíz Choclo", "Tomate",
          "Arveja grano", "Haba grano","Limón", "Mandarina", "Manzana",
          "Naranja", "Papaya", "Palta", "Piña","Plátano")
      )
      colnames(Cultures) <- "nomcomplet"
    }
  }

  ## Downloading the table----
  tx <- pdf_text(adresse)[[page]]
  tx2 <- unlist(str_split(tx, "[\\r\\n]+"))
  tx3 <- as.data.frame(
    str_split_fixed(str_trim(tx2), "\\s{2,}", dim(Cultures)[1])
  )

  # Replacing the name of the row "Cultures"
  n <-  which(str_detect(tx3$V1, Cell1))

  tx3[n, ] <- Cultures$nomcomplet
  if(str_detect(tx3[n + 1, 2], "[:digit:]{4}|[:digit:]{2}[:punct:][:digit:]{2}") == F) {
    tx3 <- tx3[-(n + 1), ]
  }
  tx3 <- tx3[-(n - 1), ]

  if (any(str_detect(tx3$V1, "Estadística"))) {
    tx3 <- tx3[-which(str_detect(tx3$V1, "Estadística")), ]
  }

  if (any(str_detect(tx3$V1, "cáscara"))) {
    n <- which(str_detect(tx3$V1, "cáscara"))

    if (sum(str_detect(tx3[n-1,],"Arroz")) > 0) {
      tx3[n-1,which(str_detect(tx3[n - 1,],"Arroz"))[1]] <- "Arroz cáscara"
    }
    if (sum(str_detect(tx3[n-1,],"Maíz")) > 0) {
      tx3[n-1,which(str_detect(tx3[n - 1,],"Maíz"))[1]] <- "Maíz amiláceo"
    }
    if (sum(str_detect(tx3[n-1,],"Maíz")) > 1) {
      tx3[n-1,which(str_detect(tx3[n - 1, ],"Maíz"))[2]] <- "Maíz duro"
    }
    if (sum(str_detect(tx3[n-1,],"Frijol")) > 0) {
      tx3[n-1,which(str_detect(tx3[n - 1, ],"Frijol"))[1]] <- "Frijol grano seco"
    }
    if (sum(str_detect(tx3[n-1,],"Frijol")) > 1) {
      tx3[n-1,which(str_detect(tx3[n - 1, ],"Frijol"))[2]] <- "Frijol castlla"
    }
    if (sum(str_detect(tx3[n-1,],"Frijol")) > 2) {
      tx3[n-1,which(str_detect(tx3[n - 1, ],"Frijol"))[3]] <- "Frijol de palo"
    }
    if (sum(str_detect(tx3[n-1,],"Frijol")) > 3) {
      tx3[n-1,which(str_detect(tx3[n - 1, ],"Frijol"))[4]] <- "Frijol loctao"
    }
    if (sum(str_detect(tx3[n-1,],"Algodón")) > 0) {
      tx3[n-1,which(str_detect(tx3[n - 1, ],"Algodón"))[1]] <- "Algodón rama"
    }
    if (sum(str_detect(tx3[n-1,],"Sorgo")) > 0) {
      tx3[n-1,which(str_detect(tx3[n - 1, ],"Sorgo"))[1]] <- "Sorgo grano"
    }
    if (sum(str_detect(tx3[n-1,],"Caña")) > 0) {
      tx3[n-1,which(str_detect(tx3[n - 1, ],"Caña"))[1]] <- "Caña de azúcar"
    }
    tx3 <- tx3[-n, ]
  }
  if (annee == 2 & mois == 1 ) {
    tx3 <- tx3 |>
      mutate(
        V1 = replace(V1, V1== 2001, 2002),
        V1 = replace(V1, V1== 2000, 2001),
        V2 = replace(V2, V2== 2001, 2002),
        V2 = replace(V2, V2== 2000, 2001)
      )
  }
  annee <-  ifelse(
    annee < 10,
    yes = paste("200",annee, sep=""),
    no = paste("20",annee, sep="")
  )

  if (any(str_detect(tx3$V1, c("Total nacional 200|Andahuaylas 200")))) {
    n <- str_which(
      tx3$V1,
      "(Total nacional) |(Huancavelica) |(Madre de Dios) |(Andahuaylas) [[:digit:]]{4}"
    )
    for (ii in n) {
      for (j in dim(Cultures)[1]:3) {
        tx3[ii, j] <- tx3[ii, j - 1]
      }
      text <- as.data.frame(str_split(tx3[ii,1], "200"))
      tx3[ii,1] <- text[1,]
      tx3[ii,2] <- paste("200", text[2,], sep = "")
    }
  }

  n <- which(
    tx3$V1 == as.character(as.numeric(annee)-1) | tx3$V1 == as.character(as.numeric(annee)) |
      tx3$V1 == paste(str_sub(as.numeric(annee) - 2, 3, 4), "-", str_sub(as.numeric(annee) - 1, 3, 4), sep = "") |
      tx3$V1 == paste(str_sub(as.numeric(annee) - 1, 3, 4), "-", str_sub(as.numeric(annee), 3, 4), sep = "") |
      tx3$V1 == paste(str_sub(as.numeric(annee), 3, 4), "-", str_sub(as.numeric(annee) + 1, 3, 4), sep = "")
  )

  for (ii in n) {
    for (j in dim(Cultures)[1]:2) {
      tx3[ii, j] <- tx3[ii, j - 1]
    }
    ifelse(tx3[ii - 1, 1] == Cell1, tx3[ii, 1] <-
             tx3[ii + 1, 1],  tx3[ii, 1] <- tx3[ii - 1, 1])
  }
  tx3
}
#' Importing data where table is missing
#'
#' @param y last digit of the year (20..)
#' @param mm month (numeric)
#' @param type type of variable ("Production" for production, "Superficies_R"
#'   for harvested surface, "Superficies" for planted surface)
missing_table <- function(y,
                          mm,
                          type = c("Production", "Superficies_R",
                                   "Superficies")) {
  data_mm <- NULL

  for (ii in 0:1) {
    if (mm == 12) {
      m_1 <- mm - 1
      y1  <- y + 1
      year <- 2000 + y
      year1 <- 2000 + y + 1

      if (type == "Production") {
        ## Production, m==12----
        # File name of the next year
        file_name <- str_c(
          "../data/raw/minagri/", type, "/", type, "_", year1, ".xlsx"
        )
        sheet <- str_c(mm, year1, ii + 1)
        data_m_y1 <- import_monthly_regional_values_P_year_P(
          sheet_name = sheet,
          file = file_name,
          anneesup = 0
        ) |>
          mutate(year = str_remove(year, "p/?") |> as.numeric()) |>
          mutate(value_m_y1 = as.numeric(gsub(" ", "", value))) |>
          select(-value)

        sheet <- str_c(m_1, year1, ii + 1)
        data_m_1_y1 <- import_monthly_regional_values_P_year_P(
          sheet_name = sheet,
          file = file_name,
          anneesup = 0
        ) |>
          mutate(year = str_remove(year, "p/?") |> as.numeric()) |>
          mutate(value_m_1_y1 = as.numeric(gsub(" ", "", value))) |>
          rename(month_1 = month) |>
          select(-value)

        # Current year
        file_name <- str_c(
          "../data/raw/minagri/", type, "/", type, "_", year, ".xlsx"
        )
        sheet <- str_c(m_1, year, ii + 1)
        data_m_1_y <- import_monthly_regional_values_P_year_P(
          sheet_name = sheet,
          file = file_name,
          anneesup = 0
        ) |>
          mutate(year = str_remove(year, "p/?") |> as.numeric()) |>
          mutate(value_m_1_y = as.numeric(gsub(" ", "", value))) |>
          select(-value)

        data_mm_y <- merge(data_m_y1,data_m_1_y1) |>
          mutate(diff = value_m_y1 - value_m_1_y1) |>
          select(region, product, diff) |>
          merge(data_m_1_y) |>
          mutate(value_num = value_m_1_y + diff) |>
          select(-diff, - value_m_1_y) |>
          mutate(month = as.numeric(month + 1)) |>
          relocate(product, .after = month)

        data_mm <- rbind(data_mm, data_mm_y)
      }

      if (type == "Superficies_R") {
        ## Superficies_R, m==12----
        # Following year
        file_name <- str_c(
          "../data/raw/minagri/Surface_R/", type, "_", year1, ".xlsx"
        )
        sheet <- str_c(mm,year1, ii + 1)
        data_m_y1 <- import_monthly_regional_values_year_SR(
          sheet_name = sheet,
          file = file_name,
          anneesup = 0
        ) |>
          mutate(year = str_remove(year, "p/?") |> as.numeric()) |>
          mutate(value_m_y1 = as.numeric(gsub(" ", "", value))) |>
          select(-value)

        sheet <- str_c(m_1, year1, ii + 1)
        data_m_1_y1 <- import_monthly_regional_values_year_SR(
          sheet_name = sheet,
          file = file_name,
          anneesup = 0
        ) |>
          mutate(year = str_remove(year, "p/?") |> as.numeric()) |>
          mutate(value_m_1_y1 = as.numeric(gsub(" ", "", value))) |>
          rename(month_1 = month) |>
          select(-value)

        # Current year
        file_name <- str_c(
          "../data/raw/minagri/Surface_R/", type, "_", year, ".xlsx"
        )
        sheet <- str_c(m_1, year, ii + 1)
        data_m_1_y <- import_monthly_regional_values_year_SR(
          sheet_name = sheet,
          file = file_name,
          anneesup = 0)  |>
          mutate(year = str_remove(year, "p/?") |> as.numeric()) |>
          mutate(value_m_1_y = as.numeric(gsub(" ", "", value))) |>
          select(-value)

        data_mm_y <- merge(data_m_y1, data_m_1_y1) |>
          mutate(diff = value_m_y1 - value_m_1_y1) |>
          select(region, product, diff) |>
          merge(data_m_1_y) |>
          mutate(value_num = value_m_1_y + diff) |>
          select(-diff, - value_m_1_y) |>
          mutate(month = as.numeric(month + 1)) |>
          relocate(product, .after = month)

        data_mm <- rbind(data_mm, data_mm_y)
      }

      if (type == "Superficies") {
        ## Superficies, m==12----
        # Following year
        file_name <- str_c(
          "../data/raw/minagri/Surface/", type, "_", year1, ".xlsx"
        )

        sheet <- str_c(mm, year1, ii + 1)
        data_m_y1 <- import_monthly_regional_values_year_S(
          sheet_name = sheet,
          file = file_name,
          anneesup = 0
        ) |>
          mutate(year = str_remove(year, "p/?") |> as.numeric()) |>
          mutate(value_m_y1 = as.numeric(gsub(" ", "", value))) |>
          select(-value)

        sheet <- str_c(m_1, year1, ii + 1)
        data_m_1_y1 <- import_monthly_regional_values_year_S(
          sheet_name = sheet,
          file = file_name,
          anneesup = 0
        ) |>
          mutate(year = str_remove(year, "p/?") |> as.numeric()) |>
          mutate(value_m_1_y1 = as.numeric(gsub(" ", "", value))) |>
          rename(month_1 = month) |>
          select(-value)

        # Current year
        file_name <- str_c(
          "../data/raw/minagri/Surface/", type, "_", year, ".xlsx"
        )
        sheet <- str_c(m_1, year, ii + 1)
        data_m_1_y <- import_monthly_regional_values_year_S(
          sheet_name = sheet,
          file = file_name,
          anneesup = 0
        )  |>
          mutate(year = str_remove(year, "p/?") |> as.numeric()) |>
          mutate(value_m_1_y = as.numeric(gsub(" ", "", value))) |>
          select(-value)

        data_mm_y <- merge(data_m_y1, data_m_1_y1) |>
          mutate(diff = value_m_y1 - value_m_1_y1) |>
          select(region, product, diff) |>
          merge(data_m_1_y) |>
          mutate(value_num = value_m_1_y + diff) |>
          select(-diff, - value_m_1_y) |>
          mutate(month = as.numeric(month + 1)) |>
          relocate(product, .after = month)

        data_mm <- rbind(data_mm, data_mm_y)
      }
    } else {
      # If m != 12
      m_1 <- mm - 1
      m1  <- mm + 1
      year <- 2000 + y

      if (type == "Production") {
        ## Production, m!=12----
        file_name <- str_c(
          "../data/raw/minagri/", type, "/", type, "_", year, ".xlsx"
        )

        # previous month
        sheet <- str_c(
          str_pad(m_1, width = 2, side = "left", pad = "0"),
          year,
          ii + 1
        )
        data_m_1 <- import_monthly_regional_values_P_year_P(
          sheet_name = sheet,
          file = file_name,
          anneesup = 0
        ) |>
          mutate(year = str_remove(year, "p/?") |> as.numeric()) |>
          mutate(value_m_1 = as.numeric(gsub(" ", "", value))) |>
          rename(month_1 = month) |>
          select(-value)

        # following month
        sheet <- str_c(
          str_pad(m1, width = 2, side = "left", pad = "0"),
          year,
          ii + 1
        )
        data_m1 <- import_monthly_regional_values_P_year_P(
          sheet_name = sheet,
          file = file_name,
          anneesup = 0
        ) |>
          mutate(year = str_remove(year, "p/?") |> as.numeric()) |>
          mutate(value_m1 = as.numeric(gsub(" ", "", value))) |>
          rename(month1 = month) |>
          select(-value)
      }

      if (type == "Superficies_R"){
        ## Superficies_R, m!=12----
        file_name <- str_c(
          "../data/raw/minagri/Surface_R/", type,"_", year, ".xlsx"
        )

        # previous month
        sheet <- str_c(
          str_pad(m_1, width = 2, side = "left", pad = "0"),
          year,
          ii + 1
        )
        data_m_1 <- import_monthly_regional_values_year_SR(
          sheet_name = sheet,
          file = file_name,
          anneesup = 0
        ) |>
          mutate(year = str_remove(year, "p/?") |> as.numeric()) |>
          mutate(value_m_1 = as.numeric(gsub(" ", "", value))) |>
          rename(month_1 = month) |>
          select(-value)

        # following month
        sheet <- str_c(
          str_pad(m1, width = 2, side = "left", pad = "0"),
          year,
          ii + 1
        )
        data_m1 <- import_monthly_regional_values_year_SR(
          sheet_name = sheet,
          file = file_name,
          anneesup = 0
        ) |>
          mutate(year = str_remove(year, "p/?") |> as.numeric()) |>
          mutate(value_m1 = as.numeric(gsub(" ", "", value))) |>
          rename(month1 = month) |>
          select(-value)
      }

      if (type == "Superficies") {
        ## Superficies, m!=12----
        file_name <- str_c(
          "../data/raw/minagri/Surface/", type, "_", year, ".xlsx"
        )

        # previous month
        sheet <- str_c(
          str_pad(m_1, width = 2, side = "left", pad = "0"),
          year,
          ii + 1
        )
        data_m_1 <- import_monthly_regional_values_year_S(
          sheet_name = sheet,
          file = file_name,
          anneesup = 0
        ) |>
          mutate(year = str_remove(year, "p/?") |> as.numeric()) |>
          mutate(value_m_1 = as.numeric(gsub(" ", "", value))) |>
          rename(month_1 = month) |>
          select(-value)

        # following month
        sheet <- str_c(
          str_pad(m1, width = 2, side = "left", pad = "0"),
          year,
          ii + 1
        )
        data_m1 <- import_monthly_regional_values_year_S(
          sheet_name = sheet,
          file = file_name,
          anneesup = 0
        ) |>
          mutate(year = str_remove(year, "p/?") |> as.numeric()) |>
          mutate(value_m1 = as.numeric(gsub(" ", "", value))) |>
          rename(month1 = month) |>
          select(-value)
      }

      data_mm_y <- merge(data_m1,data_m_1) |>
        mutate(
          month = as.numeric(month_1 + 1),
          value_num = (value_m1 - value_m_1) / 2 + value_m_1) |>
        select(-value_m1, -month1, -value_m_1, -month_1) |>
        relocate(product, .after = month)

      data_mm <- rbind(data_mm, data_mm_y)

    }
  }
  data_mm
}

2.1.2 Load Data

We define a function to format the header of the data.

#' Format header (removing unecessary values)
#' 
#' @param x (vector) of string
format_header <- function(x) {
  if (all(is.na(x))) {
    return("")
  }
  replace_na(x, "") |>
    str_replace_all("\\.{3}", "") |>
    str_remove("[[:digit:]]/") |>
    str_c(collapse = " ") |>
    str_to_lower() |>
    str_replace_all("[[:blank:]]{2,}", " ") |>
    str_remove("- ") |>
    str_trim()
}

2.1.2.1 Regional Production

#' Import monthly regional agricultural values from the Excel file,
#' for a given month
#' 
#' @param sheet_name name of the Excel sheet
#' @param file path to the Excel File
#' @param anneesup 
import_monthly_regional_values_P_year_P <- function(sheet_name,
                                                    file,
                                                    anneesup) {
  # The first two digits of the name: month
  # the next four: the four digits of the year
  # last digit: index of the sheet for a given month and
  # year (we will note use it)
  year  <- str_sub(sheet_name, 3, 6) |> as.numeric()
  month <- str_sub(sheet_name, 1, 2) |> as.numeric()
  tmp   <- suppressMessages(
    read_excel(
      path = file,
      sheet = sheet_name,
      col_types = "text", n_max = 15, col_names = FALSE
    )
  )

  # We can use the first occurrence of "Años" to determine
  # the beginning of the table
  ind_row_year <- str_which(tmp[[3]], regex("años?", ignore_case = TRUE)) |>
    first()
  if (year == 2005) {
    ind_row_year <- str_which(tmp[[2]], regex("años?", ignore_case = TRUE)) |>
      first()
  }

  # The first row of the header of the table is contained in the previous line
  skip_head <- ind_row_year

  # The body of the table
  prod_region_tmp <-
    suppressMessages(
      read_excel(
        path = file,
        sheet = sheet_name,
        col_types = "text", skip = skip_head, col_names = F
      )
    )

  header_1 <- suppressMessages(
    read_excel(
      path = file,
      sheet = sheet_name,
      skip = skip_head-2, n_max = 3, col_names = F
    )
  )
  if (year < 2005) {
    header_1 <- header_1[-nrow(header_1), ]
  }

  header_2 <- header_1 |>
    summarise(
      across(
        .cols = everything(),
        .fns = ~format_header(.x)
      )
    )

  colnames(prod_region_tmp) <- as.character(header_2)
  if (colnames(prod_region_tmp)[2] == "") {
    colnames(prod_region_tmp)[2] <- "departamento"
  }
  if (any(str_detect(colnames(prod_region_tmp), "departamento"), na.rm = T)) {
    ind_dep_current <- which(str_detect(colnames(prod_region_tmp), "departamento"))
    colnames(prod_region_tmp)[ind_dep_current] <- "departamento"
  }

  # Removing columns with no name
  ind <- !is.na(colnames(prod_region_tmp)) & (colnames(prod_region_tmp) != "")
  prod_region_tmp <-
    prod_region_tmp |>
    dplyr::select(!!!colnames(prod_region_tmp)[ind])

  # Removing rows with all NAs
  prod_region_tmp <-
    prod_region_tmp |>
    filter_all(any_vars(!is.na(.)))

  # Removing rows where "Continúa" is found
  prod_region_tmp <-
    prod_region_tmp |>
    filter_all(
      any_vars(
        !str_detect(., regex(pattern = "Continúa", ignore_case = TRUE))
      )
    )

  # Removing the first colomn if row index
  if (str_detect(colnames(prod_region_tmp)[1], "[aeiou]") == FALSE) {
    prod_region_tmp <- prod_region_tmp[-1]
  }

  # The name of the first column differs accross sheets (departemento or region)
  name_first_col <- colnames(prod_region_tmp)[1]

  if (year < 2015) {
    if (! name_first_col %in% c("departamento", "región")) {
      warning(str_c("Issue with sheet: ", sheet_name))
      return(NULL)
    }
  }

  prod_region_tmp <-
    prod_region_tmp |>
    rename(region := !!name_first_col)

  if (any(str_detect(colnames(prod_region_tmp), "^año$"))) {
    prod_region_tmp <-
      prod_region_tmp |> rename(year = año)
  } else {
    prod_region_tmp <-
      prod_region_tmp |> rename(year = años)
  }

  prod_region_tmp <-
    prod_region_tmp |>
    filter(!is.na(year), ! year %in% c("Años", "Año"))

  # The sheet may contain values for the department Cajamarca
  # AND the capital of that department named also Cajamarca
  prod_region_tmp <-
    prod_region_tmp |>
    group_by(region, year) |>
    mutate(
      region = ifelse(
        region == "Cajamarca" & row_number() == 1,
        yes = "Cajamarca_R",
        no = region
      )
    ) |>
    ungroup()

  prod_region_tmp <-
    prod_region_tmp |>
    mutate(
      region = ifelse(region == "Lima provincias", yes = "Lima", no = region)
    )


  # The production of each product is given in two rows, but the product name
  # is not repeated
  prod_region_tmp <-
    prod_region_tmp |>
    fill(region, .direction = "down")
  if (year == 2005) {
    prod_region_tmp$region[1]<- "Total Nacional"
  }

  # Removing sub-regional data
  prod_region_tmp <-
    prod_region_tmp |>
    filter(! region %in% c("Cajamarca", "Chota", "Jaén",
                           "Abancay","Andahuaylas")
    )

  if (year == 2015 & any(colnames(prod_region_tmp) == "región")) {
    q <- which(colnames(prod_region_tmp) == "región")
    if (q > 0) {
      prod_region_tmp <- prod_region_tmp |>
        dplyr::select(-all_of(q))
    }
  }

  resul <-
    prod_region_tmp |>
    pivot_longer(cols = -c(region, year), names_to = "product")

  if (anneesup == 1) {
    resul <-
      resul |>
      filter(year == max(year))
  } else {
    # Keeping only the last year available
    resul <-
      resul |>
      filter(year == min(year))
  }

  resul <-
    resul |>
    mutate(
      product = ifelse(product == "arveja gr. seco", yes = "arveja grano seco", no = product),
      product = ifelse(product == "frijol grano seco**", yes = "frijol grano seco", no = product),
      product = ifelse(product == "frijol palo", yes = "frijol de palo", no = product),
      product = ifelse(product == "frijol castlla", yes = "frijol castilla", no = product),
      product = ifelse(product == "lentaja", yes = "lenteja", no = product),
      product = ifelse(product == "frijol gr. seco", yes = "frijol grano seco", no = product),
      product = ifelse(product == "haba gr. seco", yes = "haba grano seco", no = product),
      product = ifelse(product == "maíz a. duro", yes = "maíz amarillo duro", no = product),
      product = ifelse(product == "arveja gr. verde", yes = "arveja grano verde", no = product),
      product = ifelse(product == "arveja gr. verde", yes = "arveja grano verde", no = product),
      product = ifelse(product == "haba gr. verde", yes = "haba grano verde", no = product),
      product = ifelse(product == "espá-rrago", yes = "espárrago", no = product),
      product = ifelse(product == "manda-rina", yes = "mandarina", no = product),
      product = ifelse(product == "maíz duro", yes = "maíz amarillo duro", no = product),
      product = ifelse(product == "maíz am. duro", yes = "maíz amarillo duro", no = product),
      product = ifelse(product == "pallar gr. seco", yes = "pallar grano seco", no = product),
      product = ifelse(product == "pallar seco", yes = "pallar grano seco", no = product),
      product = ifelse(product == "maíz amarillo duro", yes = "maíz amarillo duro", no = product)
    )

  resul |>
    dplyr::mutate(month = month) |>
    dplyr::select(region, year, month, product, value)
}# End of import_monthly_regional_values_P_year_P()
#' Import monthly regional agricultural values from the Excel file,
#' for all the months of the file.
#'
#' @param file path to the Excel File
#' @param anneesup 
#' @param timescale
import_monthly_regional_values_P <- function(file,
                                             anneesup,
                                             timescale) {

  sheet_names <- excel_sheets(file)
  prod_region_monthly <- map(
    sheet_names,
    import_monthly_regional_values_P_year_P,
    file = file,
    anneesup = anneesup
  ) |>
    list_rbind()

  # Cleaning years and changing type of values: from str to num
  prod_region_monthly <-
    prod_region_monthly |>
    mutate(
      year = str_remove(year, "p/?") |> as.numeric(),
      value_num = as.numeric(gsub(" ", "", value)),
      month = as.numeric(month)
    )

  # Values in 2014 to 2016 are expressed in thousands of tonnes
  # These need to be expressed in tonnes
  prod_region_monthly <-
    prod_region_monthly |>
    mutate(
      value_num = ifelse(year %in% 2014:2015, value_num * 10^3, value_num)
    )

  # In addition, for the `Production_2014.xlsx` file, months 10 to 12
  # report values in thousands of tonnes as well
  if (str_detect(file, "Production_2014.xlsx$")) {
    prod_region_monthly <-
      prod_region_monthly |>
      mutate(
        value_num = ifelse(
          month %in% c(10,11,12),
          yes = value_num * 10^3,
          no = value_num
        )
      )
  }


  # Removing the `value` columns to keep only the column with numerical values
  prod_region_monthly <-
    prod_region_monthly |>
    dplyr::select(-value)

  if (str_detect(file, "Production_2002.xlsx$")) {
    for (i in c(3,6,9,12)) {
      temp <- missing_table(y = 2, mm = i, type = "Production")
      prod_region_monthly <- rbind(prod_region_monthly, temp)
    }
  }

  if (str_detect(file, "Production_2003.xlsx$")) {
    temp <- missing_table(y = 3, mm = 3, type = "Production")
    prod_region_monthly <- rbind(prod_region_monthly, temp)

  }

  if (str_detect(file, "Production_2008.xlsx$")) {
    # Problem with value in June 2007 for Cassava: same value as in May
    # Let us put the value for June as NA
    # (For some regions)
    prod_region_monthly <-
      prod_region_monthly |>
      mutate(
        value_num = ifelse(
          month == 6 & product == "yuca" & region %in% c(
            "Apurímac", "Arequipa", "Ayacucho", "Cusco", "Huancavelica",
            "Junín", "Loreto", "Madre de Dios", "Moquegua", "Moquegua",
            "Puno", "San Martín", "Tacna", "Ucayali"),
          yes = NA,
          no = value_num
            )
      )
  }

  # Cleaning region names
  # prod_region_monthly$region |> unique() |> sort()
  prod_region_monthly <-
    prod_region_monthly |>
    mutate(region = replace(region, region=="Apurimac", "Apurímac")) |>
    mutate(region = replace(region, region=="Cajamarca_R", "Cajamarca")) |>
    mutate(region = replace(
      x = region,
      list = region %in% c("Nacional", "Total nacional", "TOTAL NACIONAL"),
      values = "Total Nacional")
    ) |>
    filter(! product == 0)


  if (str_detect(file, "Production_201(4|5).xlsx")) {
    prod_region_monthly <- prod_region_monthly |>
      mutate(
        product = case_when(
          str_detect(product, "arroz")            ~  "arroz cáscara",
          str_detect(product, "banano")           ~  "plátano",
          str_detect(product, "café")             ~  "café",
          str_detect(product, "banano")           ~  "plátano",
          str_detect(product, "caña")             ~  "caña de azúcar",
          str_detect(product, "chocho")           ~  "chocho",
          str_detect(product, "gar")              ~  "garbanzo",
          str_detect(product, "caña")             ~  "caña de azúcar",
          str_detect(product, "algodón")          ~  "algodón rama",
          str_detect(product, "maíz amilaceo")    ~  "maíz amiláceo",
          str_detect(product, "maíz amílaceo")    ~  "maíz amiláceo",
          str_detect(product, "caña de azúcar para azúcar")   ~  "caña de azúcar",
          str_detect(product, "arveja seca")      ~  "arveja grano seco",
          str_detect(product, "arveja verde")     ~  "arveja grano verde",
          str_detect(product, "loctao")           ~  "frijol loctao",
          str_detect(product, "frijol seco")      ~  "frijol grano seco",
          str_detect(product, "frijol castill")   ~  "frijol castilla",
          str_detect(product, "haba seca")        ~  "haba grano seco",
          str_detect(product, "haba seco")        ~  "haba grano seco",
          str_detect(product, "haba verde")       ~  "haba grano verde",
          str_detect(product, "limón sutil")      ~  "limón",
          str_detect(product, "haba verde")       ~  "haba grano verde",
          str_detect(product, "algodón rama")     ~  "algodón rama",
          str_detect(product, "café")             ~  "café",
          str_detect(product, "cañihua")          ~  "cañihua",
          str_detect(product, "espárrago")        ~  "espárrago",
          str_detect(product, "limón")            ~  "limón",
          str_detect(product, "maíz amiláceo")    ~  "maíz amiláceo",
          str_detect(product, "maíz choclo")      ~  "maíz choclo",
          str_detect(product, "piña")             ~  "piña",
          str_detect(product, "zaran-daja")       ~  "zarandaja",
          TRUE ~ product
        )
      )

  }

  Production <- prod_region_monthly |>
    arrange(region, product, desc(month)) |>
    mutate(value_num = ifelse(is.na(value_num), yes = 0, no = value_num))

  if (str_detect(file, "Production_2008.xlsx$")) {
    # Problem with value in June 2007 for Cassava: same value as in May
    # Let us put the value for June as NA
    # (For some regions)
    Production <-
      Production |>
      mutate(
        value_num = ifelse(
          month == 6 & product == "yuca" & region %in% c(
            "Apurímac", "Arequipa", "Ayacucho", "Cusco", "Huancavelica",
            "Junín", "Loreto", "Madre de Dios", "Moquegua", "Moquegua",
            "Puno", "San Martín", "Tacna", "Ucayali"),
          yes = NA,
          no = value_num
        )
      )
  }

  Production <- Production |>
    pivot_wider(names_from = month, values_from = value_num)

  if (timescale == 1) {
    # The production data in the file is a cumulative sum over the months
    Production <-
      cbind(
        Production[1:3],
        Production[4:14] - Production[5:15],
        Production[15]
      ) |>
      pivot_longer(cols = -c(region, year, product), names_to = "month")
    Production  <-
      Production[with(Production, order(region, year, as.numeric(month), product, value)), ]
  }

  if (timescale == 3) {
    Production <-
      cbind(
        Production[1:3],
        Production[4] - Production[7],
        Production[7] - Production[10],
        Production[10] - Production[13],
        Production[13]
      ) |>
      pivot_longer(cols = -c(region, year, product), names_to = "trim")
    Production  <-
      Production[with(Production, order(region, year, as.numeric(trim), product, value)), ]
  }

  if (timescale == 4) {
    Production <-
      cbind(
        Production[1:3],
        Production[4] - Production[8],
        Production[8] - Production[12],
        Production[12]
      ) |>
      pivot_longer(cols = -c(region, year, product), names_to = "quadrim")
    Production  <-
      Production[with(Production, order(region, year, as.numeric(quadrim), product, value)), ]
  }

  if (timescale == 6) {
    Production <-
      cbind(
        Production[1:3],
        Production[4] - Production[10],
        Production[10]
      ) |>
      pivot_longer(cols = -c(region, year, product), names_to = "biannual")
    Production  <-
      Production[with(Production, order(region, year, as.numeric(biannual), product, value)), ]
  }

  if (timescale == 12) {
    Production <-
      cbind(Production[1:3], Production[4]) |>
      rename(Prod_annual = "12")
  }

  if (str_detect(file, "Production_2015.xlsx$")) {
    if(timescale == 12) {
      Production <- Production |>
        filter(!is.na(Prod_annual))
    } else {
      Production <- Production |>
        filter(!is.na(value))
    }

  }
  Production
}# End of import_monthly_regional_values_P()

2.1.2.2 Regional Planted Surface

#' Import monthly regional agricultural values from the Excel file,
#' for a given month
#' 
#' @param sheet_name name of the Excel sheet
#' @param file path to the Excel File
#' @param anneesup
import_monthly_regional_values_year_S <- function(sheet_name,
                                                  file,
                                                  anneesup) {
  # The first two digits of the name: month
  # the next four: the four digits of the year
  # last digit: index of the sheet for a given month and year (we will note use it)

  month <- str_sub(sheet_name, 1, 2) |> as.numeric()
  year  <- str_sub(sheet_name, 3, 6) |> as.numeric()

  tmp   <- suppressMessages(
    read_excel(
      path = file,
      sheet = sheet_name,
      col_types = "text", n_max = 15, col_names = FALSE)
  )

  # We can use the first occurrence of "Campaña" to determine the beginning of the table
  ind_row_year <- str_which(tmp[[3]], regex("Cam?", ignore_case = TRUE)) |>
    first()
  if (sheet_name == "062015") {ind_row_year <-  7}

  # The first row of the header of the table is contained in the previous line
  skip_head <- ind_row_year

  # The body of the table
  sup_region_tmp <-
    suppressMessages(
      read_excel(
        path = file,
        sheet = sheet_name,
        col_types = "text", skip = skip_head, col_names = F
      )
    )

  header_1 <-
    suppressMessages(
      read_excel(
        path = file,
        sheet = sheet_name,
        skip = skip_head-2, n_max = 3, col_names = F
      )
    )

  if (sheet_name == "0420112") {
    header_1 <-
      suppressMessages(
        read_excel(
          path = file,
          sheet = sheet_name,
          skip = skip_head-2, n_max = 2, col_names = F
        )
      )
  }

  if (sheet_name == "062015") {
    header_1 <-
      suppressMessages(
        read_excel(
          path = file,
          sheet = sheet_name,
          skip = skip_head, n_max = 1, col_names = F
        )
      )
  }

  if (str_detect(file, "Superficies_2016.xlsx$")) {
    header_1 <-
      suppressMessages(
        read_excel(
          path = file,
          sheet = sheet_name,
          skip = skip_head-1, n_max = 1, col_names = F
        )
      )
  }

  if (year < 2005) {
    header_1 <- header_1[-dim(header_1)[1], ]
  }

  header_2 <-
    header_1 |>
    summarise(
      across(
        .cols = everything(),
        .fns = ~format_header(.x)
      )
    )

  colnames(sup_region_tmp) <- as.character(header_2)
  if (colnames(sup_region_tmp)[2] == ""){
    colnames(sup_region_tmp)[2] <- "departamento"
  }
  if (any(str_detect(colnames(sup_region_tmp), "departamento"), na.rm = T)) {
    ind_dep_current <- which(str_detect(colnames(sup_region_tmp), "departamento"))
    colnames(sup_region_tmp)[ind_dep_current] <- "departamento"
  }

  # Removing columns with no name
  ind <- !is.na(colnames(sup_region_tmp)) & (colnames(sup_region_tmp) != "")
  sup_region_tmp <-
    sup_region_tmp |>
    dplyr::select(!!!colnames(sup_region_tmp)[ind])


  if (str_detect(file, "Superficies_2016.xlsx$")) {
    sup_region_tmp <- sup_region_tmp |>
      mutate(
        campaña = ifelse(
          str_detect(campaña, regex(pattern = "Ago 14", ignore_case = TRUE)),
          yes = "14-15",
          no = campaña
        ),
        campaña = ifelse(
          str_detect(campaña, regex(pattern = "Ago 2014", ignore_case = TRUE)),
          yes = "14-15",
          no = campaña
        ),
        campaña = ifelse(
          str_detect(campaña, regex(pattern = "Ago 15", ignore_case = TRUE)),
          yes = "15-16",
          no = campaña
        ),
        campaña = ifelse(
          str_detect(campaña, regex(pattern = "Ago 2015", ignore_case = TRUE)),
          yes = "15-16",
          no = campaña
        ),
        campaña = ifelse(
          str_detect(campaña, regex(pattern = "Ago 16", ignore_case = TRUE)),
          yes = "16-17",
          no = campaña
        ),
        campaña = ifelse(
          str_detect(campaña, regex(pattern = "Ago 2016", ignore_case = TRUE)),
          yes = "16-17",
          no = campaña
        )
      )
  }

  # Removing rows with all NAs
  sup_region_tmp <-
    sup_region_tmp |>
    filter_all(any_vars(!is.na(.)))

  # Removing rows where "Continúa" is found
  sup_region_tmp <-
    sup_region_tmp |>
    filter_all(
      any_vars(
        !str_detect(., regex(pattern = "Continúa", ignore_case = TRUE))
      )
    )

  # Removing the first colomn if row index
  if (str_detect(colnames(sup_region_tmp)[1], "[aeiou]") == FALSE) {
    sup_region_tmp <-   sup_region_tmp[-1]
  }
  # The name of the first column differs accross sheets (departemento or region)
  name_first_col <- colnames(sup_region_tmp)[1]
  if (! name_first_col %in% c("departamento", "región")) {
    warning(str_c("Issue with sheet: ", sheet_name))
    return(NULL)
  }

  sup_region_tmp <-
    sup_region_tmp |>
    rename(region := !!name_first_col)

  if (sheet_name == "062015") {
    sup_region_tmp <-
      sup_region_tmp |>
      rename(campaña = colnames(sup_region_tmp)[2])
  }

  sup_region_tmp$campaña <-
    str_c("20",str_sub(sup_region_tmp$campaña, 1, 2)) |>
    as.numeric()

  sup_region_tmp <-
    sup_region_tmp |> rename(year = campaña)

  if (sheet_name == "062015") {
    p <- str_which(sup_region_tmp$region, "Cajamarca")
    sup_region_tmp$region[p] <- "Cajamarca_R"
  } else {
    p <- str_which(sup_region_tmp$region, "Cajamarca") |>
      first()
    sup_region_tmp$region[p] <- "Cajamarca_R"
  }

  sup_region_tmp <-
    sup_region_tmp |>
    filter(!is.na(year), ! year %in% "campaña")

  # The production of each product is given in two rows, but the product name is not repeated
  sup_region_tmp <-
    sup_region_tmp |>
    fill(region, .direction = "down")

  # Removing sub-regional data
  sup_region_tmp <-
    sup_region_tmp |>
    filter(
      ! region %in% c("Cajamarca", "Chota", "Jaén", "Abancay","Andahuaylas")
    )
  if (any(str_detect(colnames(sup_region_tmp), regex(pattern = "ago", ignore_case = TRUE)) == T)) {
    p <- which(str_detect(colnames(sup_region_tmp), regex(pattern = "ago", ignore_case = TRUE))==T)
    colnames(sup_region_tmp)[p] <- "TOTAL"
  }

  resul <-
    sup_region_tmp |>
    pivot_longer(cols = -c(region, year), names_to = "product")

  if (anneesup == 1) {
    resul <-
      resul |>
      filter(year == max(year))
  } else {
    resul <-
      resul |>
      filter(year == min(year))
  }

  resul <-
    resul |> rename(campana = year)

  resul <- resul |>
    mutate(month = month) |>
    dplyr::select(region, campana, month, product, value)

  resul <-
    resul |>
    mutate(
      product = ifelse(
        product == "arveja gr. seco",
        yes = "arveja grano seco",
        no = product
      ),
      product = ifelse(
        product == "frijol grano seco**",
        yes = "frijol grano seco",
        no = product
      ),
      product = ifelse(
        product == "frijol palo",
        yes = "frijol de palo",
        no = product
      ),
      product = ifelse(
        product == "frijol castlla",
        yes = "frijol castilla",
        no = product
      ),
      product = ifelse(
        product == "lentaja",
        yes = "lenteja",
        no = product
      ),
      product = ifelse(
        product == "frijol gr. seco",
        yes = "frijol grano seco",
        no = product
      ),
      product = ifelse(
        product == "haba gr. seco",
        yes = "haba grano seco",
        no = product
      ),
      product = ifelse(
        product == "maíz a. duro",
        yes = "maíz amarillo duro",
        no = product
      ),
      product = ifelse(
        product == "arveja gr. verde",
        yes = "arveja grano verde",
        no = product
      ),
      product = ifelse(
        product == "arveja gr. verde",
        yes = "arveja grano verde",
        no = product
      ),
      product = ifelse(
        product == "haba gr. verde",
        yes = "haba grano verde",
        no = product
      ),
      product = ifelse(
        product == "espá-rrago",
        yes = "espárrago",
        no = product
      ),
      product = ifelse(
        product == "maíz duro",
        yes = "maíz amarillo duro",
        no = product
      ),
    ) |>
    filter(! str_detect(product, "contin")) |>
    filter(! str_detect(product, "conclusi"))
  resul
}# End of import_monthly_regional_values_year_S ()
#' Import monthly regional agricultural values from the Excel file,
#' for all the months of the file.
#'
#' @param file path to the Excel File
#' @param anneesup
#' @param timescale
import_monthly_regional_values_S <- function(file,
                                             anneesup,
                                             timescale){

  sheet_names <- excel_sheets(file)
  surf_region_monthly <- map(
    sheet_names,
    import_monthly_regional_values_year_S,
    file = file,
    anneesup = anneesup) |>
    list_rbind()

  # Cleaning years and changing type of values: from str to num
  surf_region_monthly <-
    surf_region_monthly |>
    mutate(year = str_sub(sheet_names[1], 3, 6) |> as.numeric()) |>
    mutate(year = ifelse(anneesup == 0, year - 1,  year)) |>
    mutate(campana = str_remove(campana, "p/?") |> as.numeric()) |>
    mutate(value_num = as.numeric(gsub(" ", "", value)))

  # Values in 2014 to 2016 are expressed in thousands of tonnes
  # These need to be expressed in tonnes
  surf_region_monthly <-
    surf_region_monthly |>
    mutate(
      value_num = ifelse(year %in% 2014:2015, value_num * 10^3, value_num),
      value_num = ifelse(
        str_detect(!!file, "Superficies_2014.xlsx$") & month %in% c(10,11,12),
        yes = value_num * 10^3,
        no = value_num
      )
    )


  # Removing the `value` columns to keep only the column with numerical values
  surf_region_monthly <-
    surf_region_monthly |>
    dplyr::select(-value)


  if (str_detect(file, "Superficies_2002.xlsx$")) {
    for (i in c(3,6,9,12)) {
      temp <- missing_table(y = 2, mm = i, type = "Superficies") |>
        mutate(year = year - 1)
      surf_region_monthly <- rbind(surf_region_monthly, temp)
    }
  }

  if (str_detect(file, "Superficies_2003.xlsx$")) {
    temp <- missing_table(y = 3, mm = 3, type = "Superficies") |>
      mutate(year = year -1)
    surf_region_monthly <- rbind(surf_region_monthly, temp)

  }

  # Cleaning region names
  surf_region_monthly$region |> unique() |> sort()
  surf_region_monthly <-
    surf_region_monthly |>
    mutate(region = replace(region, region=="Apurimac", "Apurímac")) |>
    mutate(region = replace(region, region=="Cajamarca_R", "Cajamarca")) |>
    mutate(region = replace(
      x = region,
      list = region %in% c("Nacional", "Total nacional", "TOTAL NACIONAL",
                           "TOTAL  NACIONAL"),
      values = "Total Nacional")
    )

  surf_region_monthly <- surf_region_monthly |>
    arrange(region, product, desc(month))

  # Cleaning product names
  surf_region_monthly <-
    surf_region_monthly |>
    # Replace extra spacing with only one space
    mutate(product = str_replace_all(product, "[[:blank:]]{2,}", " "))

  surf_region_monthly <-
    surf_region_monthly |>
    mutate(
      product = ifelse(
        product == "arveja gr. seco",
        yes = "arveja grano seco",
        no = product
      ),
      product = ifelse(
        product == "frijol castlla",
        yes = "frijol castilla",
        no = product
      ),
      product = ifelse(
        product == "frijol gr. seco",
        yes = "frijol grano seco",
        no = product
      ),
      product = ifelse(
        product == "haba gr. seco",
        yes = "haba grano seco",
        no = product
      ),
      product = ifelse(
        product == "maíz a. duro",
        yes = "maíz amarillo duro",
        no = product
      ),
      product = ifelse(
        product == "algodón",
        yes = "algodón rama",
        no = product
      ),
      product = ifelse(
        product == "arroz",
        yes = "arroz cáscara",
        no = product
      ),
      product = ifelse(
        product == "arveja grano",
        yes = "arveja grano seco",
        no = product
      ),
      product = ifelse(
        product == "frijol",
        yes = "frijol grano seco",
        no = product
      ),
      product = ifelse(
        product == "haba grano",
        yes = "haba grano seco",
        no = product
      ),
      product = ifelse(
        product == "    maíz am duro",
        yes = "maíz amarillo duro",
        no = product
      )
    )


  if (str_detect(file, "Superficies_201(4|5).xlsx$")) {
    surf_region_monthly <- surf_region_monthly |>
      mutate(
        product = case_when(
          str_detect(product, "arroz")           ~  "arroz cáscara",
          str_detect(product, "banano")          ~  "plátano",
          str_detect(product, "café")            ~  "café",
          str_detect(product, "banano")          ~  "plátano",
          str_detect(product, "caña")            ~  "caña de azúcar",
          str_detect(product, "chocho")          ~  "chocho",
          str_detect(product, "gar")             ~  "garbanzo",
          str_detect(product, "caña")            ~  "caña de azúcar",
          str_detect(product, "algodón")         ~  "algodón rama",
          str_detect(product, "maíz a. duro")    ~  "maíz amarillo duro",
          str_detect(product, "maíz amilaceo")   ~  "maíz amiláceo",
          str_detect(product, "maíz amílaceo")   ~  "maíz amiláceo",
          str_detect(product, "maíz amiláceo")   ~  "maíz amiláceo",
          str_detect(product, "caña de azúcar para azúcar")   ~  "caña de azúcar",
          str_detect(product, "arveja seca")     ~  "arveja grano seco",
          str_detect(product, "arveja verde")    ~  "arveja grano verde",
          str_detect(product, "loctao")          ~  "frijol loctao",
          str_detect(product, "frijol seco")     ~  "frijol grano seco",
          str_detect(product, "frijol castill")  ~  "frijol castilla",
          str_detect(product, "haba seca")       ~  "haba grano seco",
          str_detect(product, "haba seco")       ~  "haba grano seco",
          str_detect(product, "haba verde")      ~  "haba grano verde",
          str_detect(product, "limón sutil")     ~  "limón",
          str_detect(product, "haba verde")      ~  "haba grano verde",
          str_detect(product, "algodón rama")    ~  "algodón rama",
          str_detect(product, "café")            ~  "café",
          str_detect(product, "cañihua")         ~  "cañihua",
          str_detect(product, "espárrago")       ~  "espárrago",
          str_detect(product, "limón")           ~  "limón",
          str_detect(product, "maíz amiláceo")   ~  "maíz amiláceo",
          str_detect(product, "maíz choclo")     ~  "maíz choclo",
          str_detect(product, "piña")            ~  "piña",
          str_detect(product, "zaran-daja")      ~  "zarandaja",
          TRUE ~ product
        )
      )
  }

  surf_region_monthly <-
    surf_region_monthly |>
    mutate(value_num = ifelse(is.na(value_num), yes = 0, no = value_num))|>
    mutate(date = str_c(year, month, sep = "-")) |>
    unique()

  if (timescale == 12){
    surf_region_monthly <- surf_region_monthly |>
      filter(month == 7 ) |>
      dplyr::select(region, product, year, value_num) |>
      rename(surf_annual = value_num)
  }
  surf_region_monthly
}# End of import_monthly_regional_values_S()

2.1.2.3 Regional Harvested Surface

#' Import monthly regional agricultural values from the Excel file,
#' for a given month
#' 
#' @param sheet_name name of the Excel sheet
#' @param file path to the Excel File
import_monthly_regional_values_year_SR <- function(sheet_name,
                                                   file,
                                                   anneesup) {
  # The first two digits of the name: month
  # the next four: the four digits of the year
  # last digit: index of the sheet for a given month and year (we will note use it)

  year  <- str_sub(sheet_name, 3, 6) |> as.numeric()
  month <- str_sub(sheet_name, 1, 2) |> as.numeric()
  tmp   <- suppressMessages(
    read_excel(
      path = file,
      sheet = sheet_name,
      col_types = "text", n_max = 15, col_names = FALSE
    )
  )

  # We can use the first occurrence of "Años" to determine the beginning of the table
  ind_row_year <- str_which(tmp[[3]], regex("años?", ignore_case = TRUE))
  if(length(ind_row_year) > 0) ind_row_year <- first(ind_row_year)
  name_camp <- 0
  if (year == 2005) {
    ind_row_year <- str_which(tmp[[2]], regex("años?", ignore_case = TRUE)) |>
      first()
  }
  if (length(ind_row_year) == 0) {
    ind_row_year <- str_which(tmp[[3]], regex("Campaña?", ignore_case = TRUE)) |>
      first()
    name_camp <- 1
  }
  # The first row of the header of the table is contained in the previous line
  skip_head <- ind_row_year

  # The body of the table
  surfR_region_tmp <-
    suppressMessages(
      read_excel(
        path = file,
        sheet = sheet_name,
        col_types = "text", skip = skip_head, col_names = F
      )
    )

  header_1 <-
    suppressMessages(
      read_excel(
        path = file,
        sheet = sheet_name,
        skip = skip_head-2, n_max = 3, col_names = F
      )
    )

  if (year < 2005) {
    header_1 <- header_1[-dim(header_1)[1], ]}

  header_2 <-
    header_1 |>
    summarise(across(.cols = everything(), .fns = ~format_header(.x)))

  colnames(surfR_region_tmp) <- as.character(header_2)
  if (colnames(surfR_region_tmp)[2] == "") {
    colnames(surfR_region_tmp)[2] <- "departamento"
  }

  # Removing columns with no name
  ind <- !is.na(colnames(surfR_region_tmp)) & (colnames(surfR_region_tmp) != "")
  surfR_region_tmp <-
    surfR_region_tmp |> dplyr::select(!!!colnames(surfR_region_tmp)[ind])

  # Removing rows with all NAs
  surfR_region_tmp <-
    surfR_region_tmp |>
    filter_all(any_vars(!is.na(.)))

  # Removing rows where "Continúa" is found
  surfR_region_tmp <-
    surfR_region_tmp |>
    filter_all(
      any_vars(!str_detect(., regex(pattern = "Continúa", ignore_case = TRUE)))
    )

  # Removing the first colomn if row index
  if (str_detect(colnames(surfR_region_tmp)[1], "[aeiou]") == FALSE) {
    surfR_region_tmp <-  surfR_region_tmp[-1]
  }
  # The name of the first column differs accross sheets (departemento or region)
  name_first_col <- colnames(surfR_region_tmp)[1]
  if (name_first_col %in% c("(ha) departamento","mes : enero 2002-2003* departamento")) {
    colnames(surfR_region_tmp)[1] <- "departamento"
    name_first_col <- colnames(surfR_region_tmp)[1]
  }
  if (! name_first_col %in% c("departamento", "región")) {
    warning(str_c("Issue with sheet: ", sheet_name))
    return(NULL)
    # stop("First column is not region")
  }

  surfR_region_tmp <-
    surfR_region_tmp |>
    rename(region := !!name_first_col)
  if (name_camp == 1) {
    surfR_region_tmp <-
      surfR_region_tmp |> rename(años = campaña)
  }
  if (any(str_detect(colnames(surfR_region_tmp), "^año$"))) {
    surfR_region_tmp <-
      surfR_region_tmp |> rename(year = año)
  } else {
    surfR_region_tmp <-
      surfR_region_tmp |> rename(year = años)
  }


  if (any(str_detect(colnames(surfR_region_tmp), "ene"))) {
    p <- which(str_detect(colnames(surfR_region_tmp), "ene"))
    surfR_region_tmp <- surfR_region_tmp |>
      dplyr::select(-p)
  }

  surfR_region_tmp <-
    surfR_region_tmp |>
    filter(!is.na(year), ! year %in% c("Años", "Año"))

  p <- str_which(surfR_region_tmp$region,"Cajamarca") |>
    first()
  surfR_region_tmp$region[p] <- "Cajamarca_R"

  # The production of each product is given in two rows, but the product name is not repeated
  surfR_region_tmp <-
    surfR_region_tmp |>
    fill(region, .direction = "down")
  if (year == 2005) {
    surfR_region_tmp$region[1]<- "Total Nacional"
  }

  # Removing sub-regional data
  surfR_region_tmp <-
    surfR_region_tmp |>
    filter(
      ! region %in% c("Cajamarca", "Chota", "Jaén", "Abancay","Andahuaylas")
    ) |>
    filter(! region == "Región")

  resul <-
    surfR_region_tmp |>
    pivot_longer(cols = -c(region, year), names_to = "product")

  if (sheet_name == "062015") {
    resul <- resul |>
      filter(! year %in% c("Continúa", "29"))
  }
  if (anneesup == 1) {
    resul <-
      resul |>
      filter(year == max(year))
  } else {
    resul <-
      resul |>
      filter(year == min(year))
  }


  resul <-
    resul |>
    mutate(
      product = ifelse(
        product == "arveja gr. seco",
        yes = "arveja grano seco",
        no = product
      ),
      product = ifelse(
        product == "frijol grano seco**",
        yes = "frijol grano seco",
        no = product
      ),
      product = ifelse(
        product == "frijol palo",
        yes = "frijol de palo",
        no = product
      ),
      product = ifelse(
        product == "frijol castlla",
        yes = "frijol castilla",
        no = product
      ),
      product = ifelse(
        product == "lentaja",
        yes = "lenteja",
        no = product
      ),
      product = ifelse(
        product == "frijol gr. seco",
        yes = "frijol grano seco",
        no = product
      ),
      product = ifelse(
        product == "haba gr. seco",
        yes = "haba grano seco",
        no = product
      ),
      product = ifelse(
        product == "maíz a. duro",
        yes = "maíz amarillo duro",
        no = product
      ),
      product = ifelse(
        product == "arveja gr. verde",
        yes = "arveja grano verde",
        no = product
      ),
      product = ifelse(
        product == "haba gr. verde",
        yes = "haba grano verde",
        no = product
      ),
      product = ifelse(
        product == "espá-rrago",
        yes = "espárrago",
        no = product
      ),
      product = ifelse(
        product == "maíz duro",
        yes = "maíz amarillo duro",
        no = product
      ),
    )

  resul |>
    mutate(month = month) |>
    dplyr::select(region, year, month, product, value) |>
    filter(! product == "total") |>
    filter(! str_detect(product, "contin")) |>
    filter(! str_detect(product, "conclusi"))

}# End of import_monthly_regional_values_P_year_P()
#' Import monthly regional agricultural values from the Excel file,
#' for all the months of the file.
#'
#' @param file path to the Excel File
#' @param anneesup
#' @param timescale
import_monthly_regional_values_SR<- function(file,
                                             anneesup,
                                             timescale) {
  sheet_names <- excel_sheets(file)
  surfR_region_monthly <- map(
    sheet_names,
    import_monthly_regional_values_year_SR,
    file = file,
    anneesup = anneesup) |>
    list_rbind()

  # Cleaning years and changing type of values: from str to num
  surfR_region_monthly <-
    surfR_region_monthly |>
    mutate(year = str_remove(year, "p/?") |> as.numeric()) |>
    mutate(value_num = as.numeric(gsub(" ", "", value)))


  # Values in 2014 to 2016 are expressed in thousands of tonnes
  # These need to be expressed in tonnes
  surfR_region_monthly <-
    surfR_region_monthly |>
    mutate(
      value_num = ifelse(year %in% 2014:2015, value_num * 10^3, value_num),
      value_num = ifelse(
        str_detect(file, "Superficies_R_2014.xlsx$") & month %in% c(10,11,12),
        yes = value_num * 10^3,
        no = value_num
      )
    )



  # Removing the `value` columns to keep only the column with numerical values
  surfR_region_monthly <-
    surfR_region_monthly |>
    dplyr::select(-value)


  if (str_detect(file, "Superficies_R_2002.xlsx$")) {
    for (i in c(3,6,9,12)) {
      temp <- missing_table(y = 2, mm = i, type = "Superficies_R")
      surfR_region_monthly <- rbind(surfR_region_monthly, temp)
    }
  }

  if (str_detect(file, "Superficies_R_2003.xlsx$")) {
    temp <- missing_table(y = 3, mm = 3, type = "Superficies_R")
    surfR_region_monthly <- rbind(surfR_region_monthly, temp)
  }
  # Cleaning region names
  surfR_region_monthly <-
    surfR_region_monthly |>
    unique() |>
    mutate(region = replace(region, region=="Apurimac", "Apurímac")) |>
    mutate(region = replace(region, region=="Cajamarca_R", "Cajamarca")) |>
    mutate(
      region = replace(
        x = region,
        list = region %in% c("Nacional", "Total nacional", "TOTAL NACIONAL"),
        values = "Total Nacional"
      )
    )


  surfR_region_monthly <-
    surfR_region_monthly |>
    mutate(
      product = ifelse(
        product == "arveja gr. seco",
        yes = "arveja grano seco",
        no = product
      ),
      product = ifelse(
        product == "frijol castlla",
        yes = "frijol castilla",
        no = product
      ),
      product = ifelse(
        product == "frijol gr. seco",
        yes = "frijol grano seco",
        no = product
      ),
      product = ifelse(
        product == "haba gr. seco",
        yes = "haba grano seco",
        no = product
      ),
      product = ifelse(
        product == "maíz a. duro",
        yes = "maíz amarillo duro",
        no = product
      ),
      product = ifelse(
        product == "algodón",
        yes = "algodón rama",
        no = product
      ),
      product = ifelse(
        product == "arroz",
        yes = "arroz cáscara",
        no = product
      ),
      product = ifelse(
        product == "arveja grano",
        yes = "arveja grano seco",
        no = product
      ),
      product = ifelse(
        product == "frijol",
        yes = "frijol grano seco",
        no = product
      ),
      product = ifelse(
        product == "haba grano",
        yes = "haba grano seco",
        no = product
      ),
      product = ifelse(
        product == "    maíz am duro",
        yes = "maíz amarillo duro",
        no = product
      )
    )


  if (str_detect(file, "Superficies_R_201(4|5).xlsx$")) {
    surfR_region_monthly <- surfR_region_monthly |>
      mutate(
        product = case_when(
          str_detect(product, "arroz")     ~  "arroz cáscara",
          str_detect(product, "banano")    ~  "plátano",
          str_detect(product, "café")      ~  "café",
          str_detect(product, "banano")    ~  "plátano",
          str_detect(product, "caña")      ~  "caña de azúcar",
          str_detect(product, "chocho")    ~  "chocho",
          str_detect(product, "gar")       ~  "garbanzo",
          str_detect(product, "caña")      ~  "caña de azúcar",
          str_detect(product, "algodón")   ~  "algodón rama",
          str_detect(product, "maíz amilaceo")   ~  "maíz amiláceo",
          str_detect(product, "maíz amílaceo")   ~  "maíz amiláceo",
          str_detect(product, "caña de azúcar para azúcar")   ~  "caña de azúcar",
          str_detect(product, "arveja seca")     ~  "arveja grano seco",
          str_detect(product, "arveja verde")    ~  "arveja grano verde",
          str_detect(product, "loctao")          ~  "frijol loctao",
          str_detect(product, "frijol seco")     ~  "frijol grano seco",
          str_detect(product, "frijol castill")  ~  "frijol castilla",
          str_detect(product, "haba seca")       ~  "haba grano seco",
          str_detect(product, "haba seco")       ~  "haba grano seco",
          str_detect(product, "haba verde")      ~  "haba grano verde",
          str_detect(product, "limón sutil")     ~  "limón",
          str_detect(product, "haba verde")      ~  "haba grano verde",
          str_detect(product, "algodón rama")    ~  "algodón rama",
          str_detect(product, "café")            ~  "café",
          str_detect(product, "cañihua")         ~  "cañihua",
          str_detect(product, "espárrago")       ~  "espárrago",
          str_detect(product, "limón")           ~  "limón",
          str_detect(product, "maíz amiláceo")   ~  "maíz amiláceo",
          str_detect(product, "maíz choclo")     ~  "maíz choclo",
          str_detect(product, "piña")            ~  "piña",
          str_detect(product, "zaran-daja")      ~  "zarandaja",
          TRUE ~ product
        )
      )
  }

  SurfaceR <- surfR_region_monthly |>
    arrange(region, product, desc(month)) |>
    mutate(value_num = ifelse(is.na(value_num), yes = 0, no = value_num))|>
    pivot_wider(names_from = month, values_from = value_num)

  if (timescale == 1) {
    SurfaceR <-
      cbind(SurfaceR[1:3], SurfaceR[4:14] - SurfaceR[5:15], SurfaceR[15]) |>
      pivot_longer(cols = -c(region, year, product), names_to = "month")
    SurfaceR  <- SurfaceR[with(SurfaceR, order(region, year, as.numeric(month), product, value)), ]
  }

  if (timescale == 3) {
    SurfaceR <-
      cbind(
        SurfaceR[1:3],
        SurfaceR[4] - SurfaceR[7],
        SurfaceR[7] - SurfaceR[10],
        SurfaceR[10] - SurfaceR[13],
        SurfaceR[13]
      ) |>
      pivot_longer(cols = -c(region, year, product), names_to = "trim")
    SurfaceR <- SurfaceR[with(SurfaceR, order(region, year, as.numeric(trim), product, value)), ]
  }

  if (timescale == 4) {
    SurfaceR <-
      cbind(
        SurfaceR[1:3],
        SurfaceR[4] - SurfaceR[8],
        SurfaceR[8] - SurfaceR[12],
        SurfaceR[12]
      ) |>
      pivot_longer(cols = -c(region, year, product), names_to = "quadrim")
    SurfaceR <- SurfaceR[with(SurfaceR, order(region, year, as.numeric(quadrim), product, value)), ]
  }

  if (timescale == 6) {
    SurfaceR <-
      cbind(
        SurfaceR[1:3],
        SurfaceR[4] - SurfaceR[10],
        SurfaceR[10]
      ) |>
      pivot_longer(cols = -c(region, year, product), names_to = "biannual")
    SurfaceR <- SurfaceR[with(SurfaceR, order(region, year, as.numeric(biannual), product, value)), ]
  }

  if (timescale == 12) {
    SurfaceR <-
      cbind(SurfaceR[1:3], SurfaceR[4]) |>
      rename(surf_R_annual = "12")
  }

  SurfaceR
}# End of import_monthly_regional_values_SR()

2.1.2.4 Regional Prices

#' Import monthly regional agricultural values from the Excel file,
#' for a given month
#' 
#' @param sheet_name name of the Excel sheet
#' @param file path to the Excel File
#' @param anneesup 
import_monthly_regional_values_year_Px <- function(sheet_name,
                                                   file,
                                                   anneesup) {
  # The first two digits of the name: month
  # the next four: the four digits of the year
  # last digit: index of the sheet for a given month and year (we will note use it)
  year  <- str_sub(sheet_name, 3, 6) |> as.numeric()
  month <- str_sub(sheet_name, 1, 2) |> as.numeric()
  tmp   <- suppressMessages(
    read_excel(
      path = file,
      sheet = sheet_name,
      col_types = "text",
      n_max = 15,
      col_names = FALSE
    )
  )

  # We can use the first occurrence of "Año" to determine the beginning of the table
  ind_row_year <- str_which(tmp[[4]], regex("año.?", ignore_case = TRUE))
  if (length(ind_row_year) == 0) {
    ind_row_year <- str_which(tmp[[3]], regex("año.?", ignore_case = TRUE)) |>
      first()
  } else {
    ind_row_year <- first(ind_row_year)
  }

  if (sheet_name == "062015") {ind_row_year <- 8}

  # Replacing column 2 into column 3
  if (y > 4) {
    tmp[1:ind_row_year+2, 3] <- tmp[1:ind_row_year + 2, 2]
    tmp[1:ind_row_year +2, 2] <- NA
  }

  # The first row of the header of the table is contained in the previous line
  skip_head <- ind_row_year

  # The body of the table
  prx_region_tmp <-
    suppressMessages(
      read_excel(
        path = file,
        sheet = sheet_name,
        col_types = "text", col_names = F
      )
    )

  positions <-
    which(prx_region_tmp[,4] == regex(pattern = "Año", ignore_case = TRUE))


  if (str_detect(file, "Prices_2015.xlsx$")) {
    positions <-
      which(prx_region_tmp[,3] == regex(pattern = "Año", ignore_case = TRUE))
  }
  if (length(positions) == 0) {
    positions <-
      which(prx_region_tmp[,3] == regex(pattern = "Año", ignore_case = TRUE))
  }

  resul <-NULL

  for (i in 1:length(positions)) {
    prx_region_tmp <-
      suppressMessages(
        read_excel(
          path = file,
          sheet = sheet_name,
          col_types = "text",
          col_names = F,
          skip = positions[i] - 1,
          n_max = ifelse(
            i == length(positions),
            yes = 1000,
            no = positions[i + 1] - positions[i]
          )-1
        )
      )

    header_1 <- prx_region_tmp[1:2, ]
    if (y < 6) {header_1 <- prx_region_tmp[1, ]}
    header_2 <-
      header_1 |>
      summarise(
        across(
          .cols = everything(),
          .fns = ~format_header(.x)
        )
      )
    if (str_detect(header_2[1,2], regex("Gobierno Regional", ignore_case = TRUE)) == T){
      header_2[2] <-  "departamento"
    }

    header_2[3] <- header_2[2]
    header_2[2] <- ""

    if (str_detect(file, "Prices_2015.xlsx$")) {
      header_2 <-
        header_1 |>
        summarise(
          across(
            .cols = everything(),
            .fns = ~format_header(.x)
          )
        )
    }

    colnames(prx_region_tmp) <- as.character(header_2)

    if (y < 6) {
      if (colnames(prx_region_tmp)[2] == "") {
        colnames(prx_region_tmp)[2] <- "departamento"
        colnames(prx_region_tmp)[3] <- "año"
      }
      prx_region_tmp <- prx_region_tmp[-1, ]
    } else {
      if (colnames(prx_region_tmp)[3] == "") {
        colnames(prx_region_tmp)[2] <- "departamento"
      }
      prx_region_tmp <- prx_region_tmp[-c(1:2), ]
    }

    cell <- ifelse(str_detect(file, "Prices_2015.xlsx$"), yes = 2, no = 3)

    if (!is.na(prx_region_tmp[1, 2]) &
        str_detect(prx_region_tmp[1, 2], regex("Promedio Nacional", ignore_case = TRUE)
        ) == T){
      prx_region_tmp[1, cell]  <- "PROMEDIO NACIONAL"
    }
    if (!is.na(prx_region_tmp[1, 2]) &
        str_detect(prx_region_tmp[1, 2], regex("Gobierno Regional", ignore_case = TRUE)) == T) {
      prx_region_tmp[1, cell]  <- "PROMEDIO NACIONAL"
    }

    if (sheet_name == "072007") {prx_region_tmp[, 3] <- prx_region_tmp[2]}
    # Removing columns with no name
    ind <- !is.na(colnames(prx_region_tmp)) & (colnames(prx_region_tmp) != "")
    prx_region_tmp <-
      prx_region_tmp |> dplyr::select(!!!colnames(prx_region_tmp)[ind]) |>
      {\(x) x[, -1]}()

    # Removing rows with all NAs
    prx_region_tmp <-
      prx_region_tmp |>
      filter_all(any_vars(!is.na(.)))

    # Removing rows where "Continúa" is found
    prx_region_tmp <-
      prx_region_tmp |>
      filter_all(
        any_vars(
          !str_detect(., regex(pattern = "Continúa", ignore_case = TRUE))
        )
      )

    # Removing the first colomn if row index
    if (str_detect(colnames(prx_region_tmp)[1], "[aeiou]") == FALSE) {
      prx_region_tmp <- prx_region_tmp[-1]
    }
    # The name of the first column differs accross sheets (departemento or region)
    name_first_col <- colnames(prx_region_tmp)[1]
    if (! name_first_col %in% c("departamento", "región")) {
      warning(str_c("Issue with sheet: ", sheet_name))
      return(NULL)
    }

    prx_region_tmp <-
      prx_region_tmp |>
      rename(region := !!name_first_col)

    prx_region_tmp <-
      prx_region_tmp |> rename(year = año)

    if (sheet_name == "062015") {
      p <- str_which(prx_region_tmp$region,"Cajamarca")
      prx_region_tmp$region[p] <- "Cajamarca_R"
    } else {
      p <- str_which(prx_region_tmp$region,"Cajamarca") |>
        first()
      prx_region_tmp$region[p] <- "Cajamarca_R"
    }

    # The production of each product is given in two rows, but the product name is not repeated
    prx_region_tmp <-
      prx_region_tmp |>
      fill(region, .direction = "down")

    # Removing sub-regional data
    prx_region_tmp <-
      prx_region_tmp |>
      filter(
        ! region %in% c("Cajamarca", "Chota", "Jaén", "Abancay","Andahuaylas")
      ) |>
      filter(! year %in% c(NA))

    resul_current <-
      prx_region_tmp |>
      pivot_longer(cols = -c(region, year), names_to = "product")

    resul <- resul |>
      bind_rows(resul_current)

    if(sheet_name == "062015"){
      resul <- resul |>
        filter(year %in% c(2014, 2015))
    }
  }

  # Correction for product names
  resul <-
    resul |>
    mutate(
      product = ifelse(
        product == "arveja gr. seco",
        yes = "arveja grano seco",
        no = product
      ),
      product = ifelse(
        product == "frijol grano seco**",
        yes = "frijol grano seco",
        no = product
      ),
      product = ifelse(
        product == "frijol palo",
        yes = "frijol de palo",
        no = product
      ),
      product = ifelse(
        product == "frijol castlla",
        yes = "frijol castilla",
        no = product
      ),
      product = ifelse(
        product == "lentaja",
        yes = "lenteja",
        no = product
      ),
      product = ifelse(
        product == "frijol gr. seco",
        yes = "frijol grano seco",
        no = product
      ),
      product = ifelse(
        product == "haba gr. seco",
        yes = "haba grano seco",
        no = product
      ),
      product = ifelse(
        product == "maíz a. duro",
        yes = "maíz amarillo duro",
        no = product
      ),
      product = ifelse(
        product == "arveja gr. verde",
        yes = "arveja grano verde",
        no = product
      ),
      product = ifelse(
        product == "haba gr. verde",
        yes = "haba grano verde",
        no = product
      ),
      product = ifelse(
        product == "espá-rrago",
        yes = "espárrago",
        no = product
      ),
      product = ifelse(
        product == "maíz duro",
        yes = "maíz amarillo duro",
        no = product
      ),
    )

  if (anneesup == 1) {
    resul <-
      resul |>
      filter(year == max(year))
  } else {
    # Keeping only the last year available
    resul <-
      resul |>
      filter(year == min(year))
  }

  resul |>
    mutate(month = month) |>
    dplyr::select(region, year, month, product, value)
}# End of import_monthly_regional_values_year_Px ()
#' Import monthly regional agricultural values from the Excel file,
#' for all the months of the file.
#'
#' @param file path to the Excel File
#' @param anneesup
#' @param timescale
import_monthly_regional_values_Px <- function(file,
                                              anneesup,
                                              timescale) {
  sheet_names <- excel_sheets(file)
  prx_region_monthly <- map(
    sheet_names,
    import_monthly_regional_values_year_Px,
    file = file,anneesup= anneesup
  ) |>
    list_rbind()
  # Cleaning years and changing type of values: from str to num
  prx_region_monthly <-
    prx_region_monthly |>
    mutate(year = str_remove(year, "p/?") |> as.numeric()) |>
    mutate(
      value_num = str_remove_all(value, pattern = " ") |>
        str_replace_all(pattern = ",", replacement = ".") |>
        as.numeric()
    )

  # Values in 2015 are expressed in thousands of tonnes
  # These need to be expressed in tonnes
  if (str_detect(file, "Prices_2015.xlsx$")) {
    prx_region_monthly <-
      prx_region_monthly |>
      mutate(
        value_num = value_num,
        value_num = ifelse(month > 4,
          yes = value_num / 10^3,
          no = value_num
        )
      )
  }


  # Removing the `value` columns to keep only the column with numerical values
  prx_region_monthly <-
    prx_region_monthly |>
    dplyr::select(-value)


  # Cleaning region names
  prx_region_monthly$region |> unique() |> sort()
  prx_region_monthly <-
    prx_region_monthly |>
    mutate(region = replace(region, region=="Apurimac", "Apurímac")) |>
    mutate(region = replace(region, region=="Cajamarca_R", "Cajamarca")) |>
    mutate(
      region = replace(
        x = region,
        list = region %in% c("Nacional", "Promedio nacional", "PROMEDIO NACIONAL"),
        values = "Total Nacional"
      )
    )

  # Cleaning product names
  prx_region_monthly <- prx_region_monthly
  prx_region_monthly <- prx_region_monthly |>
    # Replace extra spacing with only one space
    mutate(product = str_replace_all(product, "[[:blank:]]{2,}", " "))

  prx_region_monthly <-
    prx_region_monthly |>
    mutate(
      product = ifelse(
        product == "arveja gr. seco", yes = "arveja grano seco", no = product
      ),
      product = ifelse(
        product == "frijol castlla", yes = "frijol castilla", no = product
      ),
      product = ifelse(
        product == "frijol gr. seco", yes = "frijol grano seco", no = product
      ),
      product = ifelse(
        product == "haba gr. seco", yes = "haba grano seco", no = product
      ),
      product = ifelse(
        product == "maíz a. duro", yes = "maíz amarillo duro", no = product
      )
    )


  if (str_detect(file, "Prices_201(4|5).xlsx$")) {
    prx_region_monthly <- prx_region_monthly |>
      mutate(
        product = case_when(
          str_detect(product, "arroz")           ~  "arroz cáscara",
          str_detect(product, "banano")          ~  "plátano",
          str_detect(product, "café")            ~  "café",
          str_detect(product, "banano")          ~  "plátano",
          str_detect(product, "caña")            ~  "caña de azúcar",
          str_detect(product, "chocho")          ~  "chocho",
          str_detect(product, "gar")             ~  "garbanzo",
          str_detect(product, "caña")            ~  "caña de azúcar",
          str_detect(product, "algodón")         ~  "algodón rama",
          str_detect(product, "maíz amilaceo")   ~  "maíz amiláceo",
          str_detect(product, "maíz amílaceo")   ~  "maíz amiláceo",
          str_detect(product, "caña de azúcar para azúcar")   ~  "caña de azúcar",
          str_detect(product, "arveja seca")     ~  "arveja grano seco",
          str_detect(product, "arveja verde")    ~  "arveja grano verde",
          str_detect(product, "loctao")          ~  "frijol loctao",
          str_detect(product, "frijol seco")     ~  "frijol grano seco",
          str_detect(product, "frijol castill")  ~  "frijol castilla",
          str_detect(product, "haba seca")       ~  "haba grano seco",
          str_detect(product, "haba seco")       ~  "haba grano seco",
          str_detect(product, "haba verde")      ~  "haba grano verde",
          str_detect(product, "limón sutil")     ~  "limón",
          str_detect(product, "haba verde")      ~  "haba grano verde",
          str_detect(product, "algodón rama")    ~  "algodón rama",
          str_detect(product, "café")            ~  "café",
          str_detect(product, "cañihua")         ~  "cañihua",
          str_detect(product, "espárrago")       ~  "espárrago",
          str_detect(product, "limón")           ~  "limón",
          str_detect(product, "maíz amiláceo")   ~  "maíz amiláceo",
          str_detect(product, "maíz choclo")     ~  "maíz choclo",
          str_detect(product, "piña")            ~  "piña",
          str_detect(product, "zaran-daja")      ~  "zarandaja",
          TRUE ~ product
        )
      )
  }

  prx_region_monthly <-
    prx_region_monthly |>
    mutate(value_num = ifelse(is.na(value_num), yes = 0, no = value_num))|>
    mutate(date = str_c(year, month, sep = "-")) |>
    unique()

  prx_region_monthly
}# End of import_monthly_regional_values_Px ()

2.2 Download the Data

Once the functions to downloaded the data are defined, we can use them.

# Production 
data_P_TOTAL <- NULL
data_P_TOTAL_A <- NULL
# Planted surface
data_S_TOTAL <- NULL
data_S_TOTAL_A <- NULL
# Harvested surface 
data_SR_TOTAL <- NULL
data_SR_TOTAL_A <- NULL
# Prices
data_Px_TOTAL <- NULL
download_again <- FALSE
if (download_again) {
  for (y in c(2:14)) {
    year <-  ifelse( y < 10, paste("200", y, sep=""), paste("20", y, sep=""))
    print(year)
    # Code for years in PDF files
    if (y %in% c(2,3,4)) {
      for (m in 12:1) {
        print(m)
        #Months not available in PDF Files
        if ( m == 3 & y == 3) {next}
        if ( m == 3 & y == 2) {next}
        if ( m == 6 & y == 2) {next}
        if ( m == 9 & y == 2) {next}
        if ( m == 12 & y == 2) {next}
        
        # URL to download the pdf files
        link <- str_c(
          "https://www.midagri.gob.pe/portal/download/pdf",
          "/herramientas/boletines/boletineselectronicos/",
          "estadisticaagrariamensual/", year, "/EAM",
          str_sub(year, 3, 4),
          ifelse(m < 10, paste("0", m, sep = ""), m),
          ".pdf"
        )
        
        # # Production - Pages 44,45, 46 and 47 of the PDF Files
        name <- str_c(
          "../data/raw/minagri/Production/Production_", year, ".xlsx"
        )
        for (i in 0:1) {
          page1 <- extract_pdf_data(
            annee = y,
            mois = m,
            adresse = link,
            page = 44 + 2 * i, 
            Cell1 = "Departamento"
          )
          page2 <- extract_pdf_data(
            annee = y, 
            mois = m, 
            adresse = link,
            page = 45 + 2 * i,
            Cell1 = "Departamento"
          )
          Tableau <- rbind(page1, page2)
          
          write.xlsx(
            Tableau, 
            name, 
            sheetName = str_c(
              ifelse(m < 10, paste("0",m, sep=""), m),
              year,
              i + 1
            ),
            append = TRUE,
            showNA = FALSE
          )
          print(paste("Production", i, "ok "))
        }
        # Planted surface - Pages 30, 31, 32, 33 of the PDF Files
        name <- str_c(
          "../data/raw/minagri/Surface/Superficies_", year, ".xlsx"
        )
        for (i in 0:1) {
          page1 <- extract_pdf_data(
            annee = y,
            mois = m,
            adresse = link,
            page = 30 + 2 * i,
            Cell1 = "Departamento"
          )
          page2 <- extract_pdf_data(
            annee = y, 
            mois = m,
            adresse = link,
            page = 31 + 2 * i,
            Cell1 = "Departamento"
          )
          Tableau <-  rbind(page1, page2)
          write.xlsx(
            Tableau,
            name,
            sheetName = str_c(
              ifelse(m < 10, paste("0", m, sep = ""), m), 
              year,
              i + 1
            ), 
            append = TRUE, 
            showNA = FALSE
          )
          print(paste("Surface", i, "ok "))
        }
        
        # Harvested surface - Pages 38,45, 46 and 47 of the PDF Files
        name <- str_c(
          "../data/raw/minagri/Surface_R/Superficies_R_", year, ".xlsx"
        )
        for (i in 0:1) {
          page1 <- extract_pdf_data(
            annee = y,
            mois = m, 
            adresse = link, 
            page = 38 + 2 * i,
            Cell1 = "Departamento"
          )
          page2 <- extract_pdf_data(
            annee = y, 
            mois = m, 
            adresse = link, 
            page = 39 + 2 * i, 
            Cell1 = "Departamento"
          )
          Tableau <-  rbind(page1, page2)
          write.xlsx(
            Tableau,
            name,
            sheetName = str_c(
              ifelse(m < 10, paste("0", m, sep = ""), m),
              year,
              i + 1
            ), 
            append = TRUE,
            showNA = FALSE
          )
          print(paste("Surface R", i, "ok "))
        }
        
        # Prices - Pages 109,110, 111, 112, 113, 114 or 115 depending on the PDF Files
        name <- str_c(
          "../data/raw/minagri/Prices/Prices_", year, ".xlsx"
        )
        for (i in 0:1) {
          if (y < 3 & m < 8) {
            page2002 <- case_when(m == 7 ~ 4, TRUE ~ 3)
          } else {
            page2002 <- 0
          }
          page1 <- extract_pdf_data(
            annee = y,
            mois = m,
            adresse = link, 
            page = 112 +2 * i - page2002, 
            Cell1 = "Departamento"
          )
          page2 <- extract_pdf_data(
            annee = y,
            mois = m,
            adresse = link, 
            page = 113 + 2 * i - page2002,
            Cell1 = "Departamento"
          )
          
          if((ncol(page1) != ncol(page2)) & 
             any(str_detect(page2[,ncol(page2)], "Plátano"))) {
            page2 <- page2[,-ncol(page2)]
          }
          Tableau <-  rbind(page1, page2)
          write.xlsx(
            Tableau, 
            name,
            sheetName = str_c(
              ifelse(m < 10, paste("0", m, sep = ""), m),
              year,
              i + 1
            ),
            append = TRUE,
            showNA = FALSE
          )
        }
      }
    } else {
      # Code for years in Excel files - 2006 and above
      # Remark : 2005 data obtained from the 2006 files
      if(y == 5) {
        next
      } else {
        # Downloading and extracting the excel sheets of interest
        download.data(y, out_folder = "../data/raw/minagri/")
      }
    }
  }
}

2.3 Import Data in R

We loop over the years to import the downloaded data.

for (y in 2:16) {
  if (y %in% c(5, 16)) {
    # Extracting the data of year y (only for the last year)
    # last digits of the year
    fn_digits_year <- ifelse(y < 10, str_c("0", y - 1), y - 1)
    
    data_P  <- import_monthly_regional_values_P(
      file = str_c(
        "../data/raw/minagri/Production/Production_20", 
        fn_digits_year, ".xlsx"
      ),
      anneesup = 1,
      timescale = 1
    )
    data_Px <- import_monthly_regional_values_Px(
      file = str_c(
        "../data/raw/minagri/Prices/Prices_20", 
        fn_digits_year, ".xlsx"
      ),
      anneesup = 1,
      timescale = 1
    )
    data_S  <- import_monthly_regional_values_S(
      file = str_c(
        "../data/raw/minagri/Surface/Superficies_20", 
        fn_digits_year, ".xlsx"
      ),
      anneesup = 1, timescale = 1
    )
    data_SR <- import_monthly_regional_values_SR(
      file = str_c(
        "../data/raw/minagri/Surface_R/Superficies_R_20", 
        fn_digits_year, ".xlsx"
      ),
      anneesup = 1, 
      timescale = 1
    )
    
    data_P_annual  <- import_monthly_regional_values_P(
      file = str_c(
        "../data/raw/minagri/Production/Production_20",
        fn_digits_year, ".xlsx"
      ),
      anneesup = 1,
      timescale = 12
    )
    data_S_annual  <- import_monthly_regional_values_S(
      file = str_c(
        "../data/raw/minagri/Surface/Superficies_20",
        fn_digits_year, ".xlsx"
      ),
      anneesup = 1,
      timescale = 12
    )
    data_SR_annual <- import_monthly_regional_values_SR(
      file = str_c(
        "../data/raw/minagri/Surface_R/Superficies_R_20",
        fn_digits_year, ".xlsx"
      ),
      anneesup = 1,
      timescale = 12
    )
  } else {
    # Extracting the data of year y-1
    fn_digits_year <- str_pad(y, width = 2, side = "left", pad = 0)
    data_P  <- import_monthly_regional_values_P(
      file = str_c(
        "../data/raw/minagri/Production/Production_20", 
        fn_digits_year, ".xlsx"
      ),
      anneesup = 0,
      timescale = 1
    )
    data_S  <- import_monthly_regional_values_S(
      file = str_c(
        "../data/raw/minagri/Surface/Superficies_20", 
        fn_digits_year, ".xlsx"
      ),
      anneesup = 0,
      timescale = 1
    )
    data_SR <- import_monthly_regional_values_SR(
      file = str_c(
        "../data/raw/minagri/Surface_R/Superficies_R_20",
        fn_digits_year, ".xlsx"
      ),
      anneesup = 0, 
      timescale = 1
    )
    data_Px <- import_monthly_regional_values_Px(
      file = str_c(
        "../data/raw/minagri/Prices/Prices_20",
        fn_digits_year, ".xlsx"
      ),
      anneesup = 0,
      timescale = 1
    )
    
    data_P_TRIM <- import_monthly_regional_values_P(
      file = str_c(
        "../data/raw/minagri/Production/Production_20",
        fn_digits_year, ".xlsx"
      ),
      anneesup = 0,
      timescale = 4
    )
    data_P_annual  <- import_monthly_regional_values_P(
      file = str_c(
        "../data/raw/minagri/Production/Production_20",
        fn_digits_year, ".xlsx"
      ),
      anneesup = 0,
      timescale = 12
    )
    data_S_annual  <- import_monthly_regional_values_S(
      file = str_c(
        "../data/raw/minagri/Surface/Superficies_20",
        fn_digits_year, ".xlsx"
      ),
      anneesup = 0,
      timescale = 12
    )
    data_SR_annual <- import_monthly_regional_values_SR(
      file = str_c(
        "../data/raw/minagri/Surface_R/Superficies_R_20",
        fn_digits_year, ".xlsx"
      ),
      anneesup = 0,
      timescale = 12
    )
  }
  
  print(str_c(y," ok"))
  
  data_P <-
    data_P |>
    mutate(date = lubridate::ymd(str_c(year, month, "01", sep = "-")))
  colnames(data_P) <- c(
    "region", "year", "product", "month", "Value_prod", "date"
  )

  data_S <-
    data_S |>
    mutate(date = lubridate::ymd(str_c(year, month, "01", sep = "-")))
  colnames(data_S) <- c(
    "region", "campaign", "month", "product", "year","Value_surf", "date"
  )
  
  data_SR <-
    data_SR |>
    mutate(date = lubridate::ymd(str_c(year, month, "01", sep = "-")))
  colnames(data_SR) <- c(
    "region", "year", "product", "month", "Value_surfR", "date"
  )
  
  data_Px <-
    data_Px |>
    mutate(date = lubridate::ymd(str_c(year, month, "01", sep = "-")))
  colnames(data_Px) <- c(
    "region", "year", "month", "product", "Value_prices", "date"
  )
  
  # # Adding the new year to the global mensual files
  data_P_TOTAL[[y_ix]]  <- data_P
  data_S_TOTAL[[y_ix]]  <- data_S
  data_SR_TOTAL[[y_ix]] <- data_SR
  data_Px_TOTAL[[y_ix]] <- data_Px
  
  rm(data_P, data_S, data_SR, data_Px)
} # End of Loop 1

All the year-elements in a single tibble:

data_P_TOTAL    <- list_rbind(data_P_TOTAL)
data_S_TOTAL    <- list_rbind(data_S_TOTAL)
data_SR_TOTAL   <- list_rbind(data_SR_TOTAL)
data_Px_TOTAL   <- list_rbind(data_Px_TOTAL)

2.3.1 Campaign Data

We add the agricultural campaign data to the Planted surfaces data.

data_S_TOTAL <- data_S_TOTAL |> 
  mutate(
    campaign = as.numeric(str_sub(campaign, 3, 4)),
    campaign_plain = str_c(campaign,"/",campaign + 1)) |> 
  unique() |> 
  # Retrieving the campaign month (starting in August)
  mutate(
    month_campaign = case_when(
      month == 1  ~ 6,
      month == 2  ~ 7,
      month == 3  ~ 8,
      month == 4  ~ 9,
      month == 5  ~ 10,
      month == 6  ~ 11,
      month == 7  ~ 12,
      month == 8  ~ 1,
      month == 9  ~ 2,
      month == 10 ~ 3,
      month == 11 ~ 4,
      month == 12 ~ 5
    )
  ) |> 
  mutate(
    product = ifelse(
      product == "maíz duro",
      yes =  "maíz amarillo duro",
      no = product
    )
  ) |> 
  group_by(region, product) |> 
  # Harmonizing the cumulative values of surface (if the lead and the lag are equals)
  mutate(
    Value_surf = ifelse(
      lead(Value_surf) == lag(Value_surf, default = 0) & 
        Value_surf < lag(Value_surf, default = 0),
      yes = lag(Value_surf),
      no = Value_surf
    )
  )

Then, we check whether there are some errors:

# Checking for errors  
dup_surf <- data_S_TOTAL |>
  filter(! product == "total") |> 
  filter(! product == "TOTAL") |> 
  group_by(region, date, product) |> 
  mutate(n = n()) |> 
  filter(n > 1) |> 
  select(date) |> 
  unique()

2.3.2 Growth Duration

Let us determine the growth duration and the corresponding lags from the agricultural calendars.[REFERENCE NEEDED HERE]

2.3.2.1 Calendar

The Excel files with the agricultural calendar for specific crops are in the data folder, within the Calendario agricola sub-folder. Note: soya seems to be missing.

N <- list.files(
  path = "../data/raw/Calendario agricola/", 
  pattern = "xls$",
  full.names = TRUE
)

Let us define a small function that imports a specific calendar.

#' Import calendar from the Excel files
#' 
#' @param x full path to a Calendar (Excel file)
import_calendar <- function(x) {
  region <- str_extract(x, "//cal_(.*)\\.xls") |> 
    str_remove("//cal_") |> 
    str_remove("\\.xls")
  
  df_cal_1 <- read_excel(x)
  
  row_prod_mes <- str_which(
    df_cal_1$`CALENDARIO AGRICOLA NACIONAL`, "Producto/Mes"
  )
  ind_first <- first(row_prod_mes)
  ind_last <- last(row_prod_mes)
  
  df_cal_planting <- read_excel(
    x, 
    skip = ind_first, 
    n_max = ind_last-ind_first - 3
  )
  df_cal_planting <- 
    df_cal_planting |> 
    pivot_longer(
      cols = -`Producto/Mes`,
      names_to = "month_spanish",
      values_to = "pct"
    ) |> 
    mutate(region = region, period = "planting")
  
  df_cal_harvest <- read_excel(x, skip = ind_last)
  df_cal_harvest <- 
    df_cal_harvest |> 
    filter(!is.na(`Producto/Mes`)) |> 
    mutate(across(-`Producto/Mes`, ~as.numeric(.))) |> 
    pivot_longer(
      cols = -`Producto/Mes`,
      names_to = "month_spanish",
      values_to = "pct"
    ) |> 
    mutate(region = region, period = "harvest")
  
  df_cal_planting |> 
    bind_rows(df_cal_harvest)
}

All the calendars can then be imported:

calendar <- map(N, import_calendar) |> 
  list_rbind()

The dates are written using Spanish month names. Let us create a table with the corresponding month numbers.

calendar <- 
  calendar |> 
  filter(month_spanish != "...14")

spanish_months <- 
  tibble(
    month_spanish = c(
      "Ene", "Feb", "Mar", "Abr", "May", "Jun",
      "Jul", "Ago", "Set", "Oct", "Nov", "Dic"
    ),
    month = 1:12
  )

The corresponding month numbers can then be associated with the data in the calendars.

calendar <- 
  calendar |> 
  left_join(spanish_months) |> 
  select(-month_spanish)

Some renaming for the regions:

calendar <- calendar |> 
  mutate(
    region = ifelse(region == "lalibertad", "la libertad", region),
    region = ifelse(region == "madrededios", "madre de dios", region),
    region = ifelse(region == "sanmartin", "san martin", region)
  ) |> 
  mutate(region = str_to_upper(region))

When no data is filled in the Excel file, we assume it corresponds to no production.

calendar <- 
  calendar |> 
  rename(product = `Producto/Mes`) |> 
  mutate(pct = ifelse(is.na(pct), 0, pct))

The calendar for planting dates:

calendar1 <- calendar |> 
  filter(period == "planting") |> 
  group_by(region, product) |> 
  mutate(val_max = max(pct)) |> 
  ungroup() |> 
  slice(which(pct == val_max)) |> 
  select(-val_max) 

For the harvest season:

calendar2 <- calendar |> 
  filter(period == "harvest") |> 
  group_by(region, product) |> 
  mutate(val_max = max(pct)) |> 
  ungroup() |> 
  slice(which(pct == val_max)) |> 
  slice(-2) |> 
  slice(-163) |> 
  select(-val_max) |> 
  full_join(calendar1, by = c("region","product")) |> 
  mutate(
    growth_duration = month.x - month.y,
    growth_duration = ifelse(
      growth_duration < 0, 
      yes = month.x + 13 - month.y, 
      no = growth_duration
    ),
    growth_duration = ifelse(growth_duration == 0,12, growth_duration)
  ) |> 
  slice(-which(is.na(growth_duration))) |> 
  mutate(
    region = toupper(iconv(region, to = "ASCII//TRANSLIT")),
    product = toupper(product)
  )
calendar3 <- calendar |> 
  filter(period == "harvest") |> 
  mutate(month =   str_c("month", month)) |> 
  pivot_wider(names_from = month, values_from = pct) |> 
  mutate(
    cum_sum1  = month1, 
    cum_sum2  = month2 + cum_sum1, 
    cum_sum3  = month3 + cum_sum2, 
    cum_sum4  = month4 + cum_sum3, 
    cum_sum5  = month5 + cum_sum4, 
    cum_sum6  = month6 + cum_sum5, 
    cum_sum7  = month7 + cum_sum6, 
    cum_sum8  = month8 + cum_sum7, 
    cum_sum9  = month9 + cum_sum8, 
    cum_sum10 = month10 + cum_sum9, 
    cum_sum11 = month11 + cum_sum10, 
    cum_sum12 = month12 + cum_sum11) |> 
  select(
    product, region,
    cum_sum1, cum_sum2, cum_sum3, cum_sum4,cum_sum5, cum_sum6,
    cum_sum7, cum_sum8, cum_sum9, cum_sum10, cum_sum11, cum_sum12
  ) |> 
  pivot_longer(
    cols = c(
      cum_sum1, cum_sum2, cum_sum3, cum_sum4,cum_sum5, cum_sum6,
      cum_sum7, cum_sum8, cum_sum9, cum_sum10, cum_sum11, cum_sum12
    ), 
    names_to = "month"
  ) |> 
  rename(perc_cum_harv = value) |> 
  mutate(
    month = case_when(
      month == "cum_sum1"  ~ 1, 
      month == "cum_sum2"  ~ 2, 
      month == "cum_sum3"  ~ 3, 
      month == "cum_sum4"  ~ 4, 
      month == "cum_sum5"  ~ 5, 
      month == "cum_sum6"  ~ 6, 
      month == "cum_sum7"  ~ 7, 
      month == "cum_sum8"  ~ 8, 
      month == "cum_sum9"  ~ 9, 
      month == "cum_sum10" ~ 10, 
      month == "cum_sum11" ~ 11, 
      month == "cum_sum12" ~ 12, 
    )
  )

The planting season:

calendar4 <- calendar |> 
  filter(period == "planting") |> 
  mutate(month =   str_c("month", month)) |> 
  pivot_wider(names_from = month, values_from = pct) |> 
  mutate(
    cum_sum1  = month8, 
    cum_sum2  = month9 + cum_sum1, 
    cum_sum3  = month10 + cum_sum2, 
    cum_sum4  = month11 + cum_sum3, 
    cum_sum5  = month12 + cum_sum4, 
    cum_sum6  = month1 + cum_sum5, 
    cum_sum7  = month2 + cum_sum6, 
    cum_sum8  = month3 + cum_sum7, 
    cum_sum9  = month4 + cum_sum8, 
    cum_sum10 = month5 + cum_sum9, 
    cum_sum11 = month6 + cum_sum10, 
    cum_sum12 = month7 + cum_sum11
  ) |> 
  select(
    product, region, 
    cum_sum1, cum_sum2, cum_sum3, cum_sum4,cum_sum5, cum_sum6, 
    cum_sum7, cum_sum8, cum_sum9, cum_sum10, cum_sum11, cum_sum12
  ) |> 
  pivot_longer(
    cols = c(
      cum_sum1, cum_sum2, cum_sum3, cum_sum4,cum_sum5, cum_sum6, 
      cum_sum7, cum_sum8, cum_sum9, cum_sum10, cum_sum11, cum_sum12
    ), names_to = "month") |> 
  rename(perc_cum_plan = value) |> 
  mutate(
    month = case_when(
      month == "cum_sum1"  ~ 8, 
      month == "cum_sum2"  ~ 9, 
      month == "cum_sum3"  ~ 10, 
      month == "cum_sum4"  ~ 11, 
      month == "cum_sum5"  ~ 12, 
      month == "cum_sum6"  ~ 1, 
      month == "cum_sum7"  ~ 2, 
      month == "cum_sum8"  ~ 3, 
      month == "cum_sum9"  ~ 4, 
      month == "cum_sum10" ~ 5, 
      month == "cum_sum11" ~ 6, 
      month == "cum_sum12" ~ 7, 
    )
  )

We save those calendars

save(calendar, file = "../data/output/Calendario agricola/calendar.rda")
save(calendar2, file = "../data/output/Calendario agricola/calendar2.rda")
save(calendar3, file = "../data/output/Calendario agricola/calendar3.rda")
save(calendar4, file = "../data/output/Calendario agricola/calendar4.rda")
calendar2 <- calendar2 |> 
  mutate(
    region = str_replace_all(region, "á", "a"),
    region = str_replace_all(region, "í", "i"),
    region = str_replace_all(region, "é", "e"),
    region = str_replace_all(region, "ó", "o"),
    region = str_replace_all(region, "ú", "u"),
    region = str_replace_all(region, "ñ", "n")
  ) |> 
  mutate(
    region = toupper(region), 
    product = toupper(product),
    product = ifelse(str_detect(product, "ARROZ"), "ARROZ CÁSCARA", product)
  )
calendar2
# A tibble: 142 × 9
   product  pct.x region period.x month.x pct.y period.y month.y growth_duration
   <chr>    <dbl> <chr>  <chr>      <int> <dbl> <chr>      <int>           <dbl>
 1 ARROZ C…  15.5 AMAZO… harvest        8  17.3 planting       3               5
 2 FRIJOL …  24.6 AMAZO… harvest        7  31.5 planting      11               9
 3 MAÍZ AM…  15.4 AMAZO… harvest        6  15.7 planting       2               4
 4 MAÍZ AM…  20.9 AMAZO… harvest        8  27.1 planting      11              10
 5 PAPA      11.8 AMAZO… harvest        4  12.8 planting      11               6
 6 TRIGO     27.5 AMAZO… harvest        8  22.2 planting       3               5
 7 YUCA      13.3 AMAZO… harvest       11  14.4 planting      11              12
 8 ALGODÓN   19.2 ANCASH harvest        3  25.5 planting       8               8
 9 ARROZ C…  25.4 ANCASH harvest        3  32.5 planting      11               5
10 CEBADA …  62.2 ANCASH harvest        7  44.6 planting       1               6
# ℹ 132 more rows
data_S_TOTAL <- data_S_TOTAL |>
  group_by(region, product, campaign) |>
  arrange(region, product, campaign, month_campaign) |> 
  mutate(
    surf_m = case_when(
      month == "8" ~ Value_surf,
      month != "8" ~ Value_surf - lag(Value_surf))
  ) |>
  ungroup() |>  
  group_by(region, product) |>
  mutate(id = row_number()) |> 
  ungroup() |> 
  mutate(
    region = str_replace_all(region, "á", "a"),
    region = str_replace_all(region, "í", "i"),
    region = str_replace_all(region, "é", "e"),
    region = str_replace_all(region, "ó", "o"),
    region = str_replace_all(region, "ú", "u"),
    region = str_replace_all(region, "ñ", "n")
  ) |> 
  mutate(
    region = toupper(iconv(region, to = "ASCII//TRANSLIT")),
    product = toupper(product)
  ) |>
  left_join(calendar2, by = c("region","product"))

Let us add a variable with the lagged surface:

data_S_TOTAL <- 
  data_S_TOTAL |> 
  arrange(region, product, id) |> 
  group_by(region, product) |>
  mutate(
    surf_lag_calend = case_when(
      growth_duration == 1  & !is.na(lag(surf_m, 1))  ~ lag(surf_m, 1),
      growth_duration == 2  & !is.na(lag(surf_m, 2))  ~ lag(surf_m, 2),
      growth_duration == 3  & !is.na(lag(surf_m, 3))  ~ lag(surf_m, 3),
      growth_duration == 4  & !is.na(lag(surf_m, 4))  ~ lag(surf_m, 4),
      growth_duration == 5  & !is.na(lag(surf_m, 5))  ~ lag(surf_m, 5),
      growth_duration == 6  & !is.na(lag(surf_m, 6))  ~ lag(surf_m, 6),
      growth_duration == 7  & !is.na(lag(surf_m, 7))  ~ lag(surf_m, 7),
      growth_duration == 8  & !is.na(lag(surf_m, 8))  ~ lag(surf_m, 8),
      growth_duration == 9  & !is.na(lag(surf_m, 9))  ~ lag(surf_m, 9),
      growth_duration == 10 & !is.na(lag(surf_m, 10)) ~ lag(surf_m, 10),
      growth_duration == 11 & !is.na(lag(surf_m, 11)) ~ lag(surf_m, 11),
      growth_duration == 12 & !is.na(lag(surf_m, 12)) ~ lag(surf_m, 12),
      TRUE ~ NA
    )
  )

Harmonizing the data:

data_S_TOTAL <- data_S_TOTAL |>  
  mutate(
    surf_lag_calend = ifelse(id <= growth_duration, NA, surf_lag_calend)
  ) |> 
  filter(! product %in% c("TOTAL", "total")) |> 
  rename(
    "gr_duration_calend" = "growth_duration", 
    "month_plan_calend" = "month.y", 
    "month_harv_calend" = "month.x"
  ) |> 
  select(- period.x, - pct.x, -pct.y, -period.y)

rm(calendar2)

2.3.3 Saving the raw data

We can save the raw data:

save(data_P_TOTAL, file = "../data/output/minagri/data_P_TOTAL.rda")
save(data_S_TOTAL, file = "../data/output/minagri/data_S_TOTAL.rda")
save(data_SR_TOTAL, file = "../data/output/minagri/data_SR_TOTAL.rda")
save(data_Px_TOTAL, file = "../data/output/minagri/data_Px_TOTAL.rda")

2.4 Aggregation of Agricultural Datasets

We can load the datasets obtained previously:

load("../data/output/minagri/data_P_TOTAL.rda")
load("../data/output/minagri/data_S_TOTAL.rda")
load("../data/output/minagri/data_SR_TOTAL.rda")
load("../data/output/minagri/data_Px_TOTAL.rda")

2.4.1 Harmonization

Harmonizing the production, surfaces and prices datasets.

Production prices:

data_Px_TOTAL <- data_Px_TOTAL |> 
  mutate(
    region = str_replace_all(region, "á", "a"),
    region = str_replace_all(region, "í", "i"),
    region = str_replace_all(region, "é", "e"),
    region = str_replace_all(region, "ó", "o"),
    region = str_replace_all(region, "ú", "u"),
    region = str_replace_all(region, "ñ", "n")
  ) |> 
  mutate(region = toupper(iconv(region, to = "ASCII//TRANSLIT"))) |> 
  mutate(
    region = toupper(region), 
    product = toupper(product), 
    product = ifelse(
      product == "FRIJOL GRANO SECO**",
      yes = "FRIJOL GRANO SECO", 
      no = product
    )
  )

Harvested surface:

data_SR_TOTAL <- data_SR_TOTAL |> 
  mutate(
    region = str_replace_all(region, "á", "a"),
    region = str_replace_all(region, "í", "i"),
    region = str_replace_all(region, "é", "e"),
    region = str_replace_all(region, "ó", "o"),
    region = str_replace_all(region, "ú", "u"),
    region = str_replace_all(region, "ñ", "n")
  ) |> 
  mutate(region = toupper(iconv(region, to = "ASCII//TRANSLIT"))) |> 
  mutate(region = toupper(region), 
         product = toupper(product), 
         product = ifelse(
           product == "FRIJOL GRANO SECO**",
           yes = "FRIJOL GRANO SECO",
           no = product)
  )

2.4.2 Aggregation in a single tibble

data <- data_P_TOTAL |> 
  mutate(
    region = str_replace_all(region, "á", "a"),
    region = str_replace_all(region, "í", "i"),
    region = str_replace_all(region, "é", "e"),
    region = str_replace_all(region, "ó", "o"),
    region = str_replace_all(region, "ú", "u"),
    region = str_replace_all(region, "ñ", "n")) |> 
  mutate(region = toupper(iconv(region, to = "ASCII//TRANSLIT"))
  ) |> 
  mutate(
    region = toupper(region), 
    product = toupper(product), 
    month = as.numeric(month)
  ) |> 
  full_join(
    data_S_TOTAL |> 
      select(-c(year,month)), 
    by = c("region","product","date")
  ) |>  
  filter(! is.na(year)) |> 
  # No consequences on the aggregation because the missing data are not in the
  # product sample used hereafter. (except for Lima but without production value)
  full_join(
    data_Px_TOTAL |> 
      select(-c(year,month)), 
    by = c("region","product","date")
  ) |>  
  filter(product %in% c(
    "PAPA", "CEBADA GRANO", "MAÍZ AMARILLO DURO", "MAÍZ AMILÁCEO",
    "ARROZ CÁSCARA", "SORGO GRANO", "YUCA", "TRIGO")
  ) |> 
  select(
    region, product, year, month, date, 
    Value_prod, surf_m, surf_lag_calend, Value_prices, 
    campaign, month_campaign, campaign_plain, id
  ) |> 

  full_join(
    data_SR_TOTAL |> 
      select(- month, - year), 
    by = c("region","product","date")
  ) |>
  filter(product %in% c(
    "PAPA", "CEBADA GRANO", "MAÍZ AMARILLO DURO", "MAÍZ AMILÁCEO",
    "ARROZ CÁSCARA", "SORGO GRANO", "YUCA", "TRIGO")
  ) |>
  filter(! region %in% c(
    "Lima Metropolitana", "Callao", "LIMA METROPOLITANA",
    "CALLAO", "PROMEDIO NACIONAL")
  ) |>
  filter(! is.na(region)) |> 
  filter(! product %in% c("SORGO GRANO", "CEBADA GRANO"))

Some values are missing. Let us deal with those.

missing_values <- readxl::read_excel(
  path = "../data/raw/Macro/Datos_INEI1.xlsx",
  sheet = "SELECTED2",
  col_types = "text"
) |> 
  mutate(
    region = str_replace_all(region, "á", "a"),
    region = str_replace_all(region, "í", "i"),
    region = str_replace_all(region, "é", "e"),
    region = str_replace_all(region, "ó", "o"),
    region = str_replace_all(region, "ú", "u"),
    region = str_replace_all(region, "ñ", "n")
  ) |> 
  mutate(
    region = toupper(iconv(region, to = "ASCII//TRANSLIT")),
    region = toupper(region), 
    product = toupper(product)
  ) |> 
  select(-Indicador) |> 
  pivot_longer(cols = c(NOV,DEC), names_to = "name_month") |> 
  mutate(
    year = 2008, 
    month = ifelse(name_month == "NOV", 11, 12)
  ) |> 
  select(-name_month)
data_agri <- data |> 
  left_join(
    missing_values,
    by = c("region" ,"product" ,"month","year")
  ) |> 
  mutate(
    Value_prod = ifelse(
      # The price data in Nov. 2008 are the same as that of Oct...
      year == 2008 & month %in% c(11,12),
      yes = as.numeric(value), 
      no = Value_prod
    )
  ) |> 
  select(-value) |> 
  select(-id) |> 
  relocate(
    region, product, year, month, date, 
    Value_prod, surf_m,
    Value_surfR, Value_prices, 
    campaign, campaign_plain, month_campaign
  ) |> 
  filter(
    ! is.na(year), 
    ! region %in% c(
      "TOTAL NACIONAL", "CALLAO", "LIMA METROPOLITANA",
      "PROMEDIO NACIONAL", "LIMA PROVINCIAS"
    )
  )
rm(data)

Let us save that table:

save(data_agri, file = "../data/output/minagri/data_agri.rda")

2.5 Final Aggregation

load("../data/output/minagri/data_agri.rda")
length(unique(data_agri$region)) * 
  length(unique(data_agri$product)) *
  length(unique(data_agri$date))
[1] 25920

Let us fill the table if some rows are missing for the triplet region x product x date:

data_agri <- 
  data_agri |> 
  complete(region, product, date)

Are there any duplicated triplet?

data_agri |> 
  group_by(region, product, date) |> 
  count() |> 
  filter(n>1)
# A tibble: 0 × 4
# Groups:   region, product, date [0]
# ℹ 4 variables: region <chr>, product <chr>, date <date>, n <int>
data_total <-  
  data_agri |> 
  unique() |> 
  filter(! region == "TOTAL NACIONAL") |> 
    # Adding the english names of the selected crops
  mutate(
    product_eng = case_when(
      product == "ARROZ CÁSCARA"      ~ "Rice",
      product == "MAÍZ AMARILLO DURO" ~ "Dent corn",
      product == "MAÍZ AMILÁCEO"      ~ "Amylaceous corn", 
      product == "PAPA"               ~ "Potato", 
      product == "TRIGO"              ~ "Wheat", 
      product == "YUCA"               ~ "Cassava", 
      TRUE ~ product)
  ) |> 
  group_by(region, product, year) |> 
  # Computing the share of the annual production harvested at month m 
  mutate(
    perc_product = ifelse(Value_prod == 0, NA, Value_prod) / 
      sum( ifelse(Value_prod == 0, NA, Value_prod), na.rm = T)
  ) |> 
  ungroup() |> 
  group_by(region, product, month) |> 
  # Computing the average share of the annual production harvested at month m
  mutate(perc_product_mean = mean(perc_product, na.rm = T)) |> 
   ungroup()
data_total <- data_total |>
  arrange(region, product, date) |> 
  group_by(region) |> 
  mutate(region_id = cur_group_id()) |> 
  ungroup() |> 
  # Computing the log of the quantities and prices
  mutate(
    ln_prices = log(Value_prices + 1), 
    ln_produc = log(Value_prod + 1)
  ) |> 
  relocate(region_id, region, product, date, ln_prices, ln_produc)
Warning: There was 1 warning in `mutate()`.
ℹ In argument: `ln_produc = log(Value_prod + 1)`.
Caused by warning in `log()`:
! NaNs produced

Let us determine the start of the harvest season:

data_total <- 
  data_total |> 
  arrange(region, product_eng, date) |> 
  # Computing the difference between planted and harvested surfaces at time t 
  mutate(diff_plant_harv = surf_m - Value_surfR) |> 
  group_by(region, product_eng) |> 
 # Computing the cumulative difference and normalizing the detrended component
  mutate(exposition = cumsum(replace_na(diff_plant_harv, 0))) |> 
  mutate(exposition_trend = as.vector(
    mFilter::hpfilter(
      exposition, freq = 14400, type = "lambda", drift = FALSE
    )$trend)
  ) |> 
  mutate(exposition_detrended = exposition - exposition_trend) |> 
  mutate(
    exposition_norm = (exposition_detrended - min(exposition_detrended)) / 
      (max(exposition_detrended) - min(exposition_detrended))
  ) |> 
  ungroup() 

We add labels to each column:

data_total <- 
  data_total |> 
  labelled::set_variable_labels(
    region_id = "Region numerical ID",
    region = "Name of the region",
    product = "Name of the crop (in Spanish)",
    date = "Date (YYYY-MM-DD)",
    ln_prices = "Product price (log)",
    ln_produc = "Production (log of tons)",
    year = "Year (YYYY)",
    month = "Month (MM)",
    Value_prod = "Production (tons)",
    surf_m = "Planted Surface during the current month (hectares)",
    surf_lag_calend = "Planted Surface laggued by the growth duration computed from the caledars (hectares)",
    Value_surfR = "Harvested Surface (hectares)",
    Value_prices = "Unit Price (Pesos)",
    campaign = "ID of the planting campaing (starting in August)",
    campaign_plain = "Years of the planting campaing (starting in August)",
    month_campaign = "Month of the planting campaing (August = 1)",
    product_eng = "Name of the Product (in English)",
    perc_product = "Share of the annual production harvested at month m",
    perc_product_mean = "Average share of the annual production harvested at month m",
    diff_plant_harv = "Difference between planted and harvested surfaces during month m",
    exposition = "Cumulative difference between planted and harvested surfaces",
    exposition_trend = "Trend of the exposition using HP filter",
    exposition_detrended = "Difference between the exposition and its trend",
    exposition_norm = "Normalisation of the detrended exposition"
  )

And lastly, let us save the resulting data

save(data_total, file = "../data/output/minagri/dataset_agri_2001_2015.rda")
write.csv(data_total, "../data/output/minagri/dataset_agri_2001_2015.rda")

2.6 Content of the dataset

Table 2.1: Variables in the dataset_agri_2001_2015.rda file
Variable name Type Description
region_id integer Region numerical ID
region character Name of the region
product character Name of the crop (in Spanish)
date Date Date (YYYY-MM-DD)
ln_prices numeric Product price (log)
ln_produc numeric Production (log of tons)
year numeric Year (YYYY)
month numeric Month (MM)
Value_prod numeric Production (tons)
surf_m numeric Planted Surface during the current month (hectares)
Value_surfR numeric Harvested Surface (hectares)
Value_prices numeric Unit Price (Pesos)
campaign numeric ID of the planting campaing (starting in August)
campaign_plain character Years of the planting campaing (starting in August)
month_campaign numeric Month of the planting campaing (August = 1)
surf_lag_calend numeric Planted Surface laggued by the growth duration computed from the caledars (hectares)
product_eng character Name of the Product (in English)
perc_product numeric Share of the annual production harvested at month m
perc_product_mean numeric Average share of the annual production harvested at month m
diff_plant_harv numeric Difference between planted and harvested surfaces during month m
exposition numeric Cumulative difference between planted and harvested surfaces
exposition_trend numeric Trend of the exposition using HP filter
exposition_detrended numeric Difference between the exposition and its trend
exposition_norm numeric Normalisation of the detrended exposition