The calibration is first tested by using the first two weeks of the data and using the next 40 days for verification as described in the Supplementary information of the paper.

The results have been calculated by the scripts test_calibration.r and test_calibration_ml.r given the computational time required. The results are stored in output/calibration/test_2weeks/.

source("utilities/utilities.R")
source("utilities/nested_models.r")
source("utilities/variables.r")

input_folder <- "../output/calibration/test_2weeks/"

Data preparation

file_list <-
  list.files(
    path = input_folder,
    pattern = "res_.*",
    full.names = T,
    recursive = T
  )

# Parse the result files.
res <- data.frame()
res_bus <- data.frame()
for (file in file_list) {
  name <- strsplit(file, "/")[[1]][5]
  scenario <- sub(".rds", "", strsplit(name, "_")[[1]][[5]])
  method <- strsplit(name, "_")[[1]][[2]]
  dataset_type <- strsplit(name, "_")[[1]][[3]]
  tmp <- readRDS(file)
  model <- tmp$res
  model$scenario <- scenario
  model$dataset_type <- dataset_type
  res <- bind_rows(res, model)
  
  model <- tmp$res_bus
  model$scenario <- scenario
  model$dataset_type <- dataset_type
  res_bus <- bind_rows(res_bus, model)
}

res <- res %>%
  mutate(method = str_split(method, " ", simplify = T)[, 1])
res_bus <- res_bus %>%
  mutate(method = str_split(method, " ", simplify = T)[, 1])

# Format the names of the methods
res$method <-
  fct_recode(
    res$method,
    `GBM` = "gbm",
    `GBM+RH` = "gbmrh",
    `Koehler_mass+LR` = "koehlermasslr",
    `Koehler_mass+OLS` = "koehlermassols",
    `Koehler_size+LR` = "koehlersizelr",
    `Koehler_size+OLS` =  "koehlersizeols",
    `Laulainen+LR` = "laulainenlr",
    `Laulainen+OLS` = "laulainenols",
    `MLR_part.` =  "lmpart",
    `MLR_part.+RH` =   "lmpartrh",
    `LR` = "lr",
    `MLR_RH` = "mlr1" ,
    `OLS` =  "ols",
    `RLM` = "rlmhuber",
    `RLM_part.+RH`  = "rlmpartrh",
    `RLM+RH` = "rlmrh",
    `SVM` = "svm",
    `SVM+RH` = "svmrh",
    `Koehler_mass_RH80` = "koehlermassrh80" ,
    `Koehler_mass_RH80+OLS` = "koehlermassrh80ols",
    `Koehler_size` = "koehlersize",
    `Koehler_size_RH80` = "koehlersizerh80",
    `Koehler_size_RH80+OLS` = "koehlersizerh80ols" ,
    `Laulainen` = "laulainen",
    `SVM_part.` = "svmpart",
    `SVM_part.+RH` = "svmpartrh",
    `GBM_part.` = "gbmpart",
    `GBM_part.+RH` = "gbmpartrh",
    `Koehler_mass` = "koehlermass",
    `RLM_part.` = "rlmpart"
  )

# Order the names of the methods
res$method <- fct_relevel(
  res$method,
  "Initial",
  "OLS",
  "LR",
  "MLR_RH",
  "MLR_part.",
  "MLR_part.+RH",
  "RLM",
  "RLM+RH",
  "RLM_part.",
  "RLM_part.+RH",
  "Koehler_mass",
  "Koehler_mass+LR",
  "Koehler_mass+OLS",
  "Koehler_mass_RH80" ,
  "Koehler_mass_RH80+OLS",
  "Koehler_size",
  "Koehler_size+LR",
  "Koehler_size+OLS",
  "Koehler_size_RH80",
  "Koehler_size_RH80+OLS",
  "Laulainen",
  "Laulainen+LR",
  "Laulainen+OLS",
  "GBM",
  "GBM+RH",
  "GBM_part.",
  "GBM_part.+RH",
  "SVM",
  "SVM+RH",
  "SVM_part.",
  "SVM_part.+RH"
)


res_bus$method <-
  fct_recode(
    res_bus$method,
    `GBM` = "gbm",
    `GBM+RH` = "gbmrh",
    `Koehler_mass+LR` = "koehlermasslr",
    `Koehler_mass+OLS` = "koehlermassols",
    `Koehler_size+LR` = "koehlersizelr",
    `Koehler_size+OLS` =  "koehlersizeols",
    `Laulainen+LR` = "laulainenlr",
    `Laulainen+OLS` = "laulainenols",
    `MLR_part.` =  "lmpart",
    `MLR_part.+RH` =   "lmpartrh",
    `LR` = "lr",
    `MLR_RH` = "mlr1" ,
    `OLS` =  "ols",
    `RLM` = "rlmhuber",
    `RLM_part.+RH`  = "rlmpartrh",
    `RLM+RH` =      "rlmrh",
    `SVM` = "svm",
    `SVM+RH` = "svmrh" ,
    `Koehler_mass_RH80` = "koehlermassrh80" ,
    `Koehler_mass_RH80+OLS` = "koehlermassrh80ols",
    `Koehler_size` = "koehlersize",
    `Koehler_size_RH80` = "koehlersizerh80",
    `Koehler_size_RH80+OLS` = "koehlersizerh80ols" ,
    `Laulainen` = "laulainen",
    `SVM_part.` = "svmpart",
    `SVM_part.+RH` = "svmpartrh",
    `GBM_part.` = "gbmpart",
    `GBM_part.+RH` = "gbmpartrh" ,
    `Koehler_mass` = "koehlermass",
    `RLM_part.` = "rlmpart"
  )


res_bus$method <-
  fct_relevel(
    res_bus$method,
    "Initial",
    "OLS",
    "LR",
    "MLR_RH",
    "MLR_part.",
    "MLR_part.+RH",
    "RLM",
    "RLM+RH",
    "RLM_part.",
    "RLM_part.+RH",
    "Koehler_mass",
    "Koehler_mass+LR",
    "Koehler_mass+OLS",
    "Koehler_mass_RH80" ,
    "Koehler_mass_RH80+OLS",
    "Koehler_size",
    "Koehler_size+LR",
    "Koehler_size+OLS",
    "Koehler_size_RH80",
    "Koehler_size_RH80+OLS",
    "Laulainen",
    "Laulainen+LR",
    "Laulainen+OLS",
    "GBM",
    "GBM+RH",
    "GBM_part.",
    "GBM_part.+RH",
    "SVM",
    "SVM+RH",
    "SVM_part.",
    "SVM_part.+RH"
  )


res$sensor_type <- ifelse(grepl("SPS", res$sensor), "SPS", "PMS")

# remove air quality monitor Nocs-2 that stopped working in the middle of the 
# study
res <- res[!grepl("N2", res$sensor), ]
res_bus <- res_bus[!grepl("N2", res_bus$combination), ]

Results

All results

p <- res %>%
  dplyr::filter(type == "Full") %>%
  dplyr::mutate(dataset_type = ifelse(dataset_type == "train", "Training", "Evaluation")) %>%
  ggplot(aes(x = method, y = RMSE)) +
  stat_boxplot(geom = 'errorbar', width = 0.5, size = 0.1) +
  geom_boxplot(
    fatten = 0.8,
    outlier.size = 0.8,
    outlier.alpha = 0.5,
    lwd = 0.2
  ) +
  custom_theme +
  ylab(quickText("RMSE (ug/m3)")) +
  xlab("Calibration method") +
  theme(axis.text.x = element_text(
    angle = 90,
    hjust = 0.95,
    vjust = 0.2
  )) +
  facet_grid(sensor_type ~ dataset_type, scales = "free_y")
p

The machine learning methods worked very well for the training dataset but much less so for the calibration datasets.

For the PMS, all methods improved the results compared to the initial state. I will not take the >80RH for Koehler corrections as they do not bring improvement compared to the Koehler correction on the full data.

RMSE for paper

In this graph, I removed the Koehler correction with RH>80 to simplify the view given the number of methods involved in the comparison.

p <- res %>%
  dplyr::filter(!(
    method %in% c(
      "Koehler_mass",
      "Koehler_size",
      "Laulainen",
      "Koehler_mass+OLS",
      "Koehler_size+OLS",
      "Laulainen+OLS"
    )
  )) %>%
  dplyr::filter(type == "Full") %>%
  dplyr::mutate(dataset_type = ifelse(dataset_type == "train", "Training", "Evaluation")) %>%
  dplyr::filter(!grepl("RH80", method)) %>%
  ggplot(aes(x = method, y = RMSE)) +
  stat_boxplot(geom = 'errorbar', width = 0.5, size = 0.1) +
  geom_boxplot(
    fatten = 0.8,
    outlier.size = 0.8,
    outlier.alpha = 0.5,
    lwd = 0.2
  ) +
  custom_theme +
  ylab(quickText("RMSE (ug/m3)")) +
  xlab("Calibration method") +
  theme(axis.text.x = element_text(
    angle = 90,
    hjust = 0.95,
    vjust = 0.2
  )) +
  # ggtitle("Training") +
  facet_grid(sensor_type ~ dataset_type, scales = "free_y")

p

p %>%
  ggsave2(
    filename = paste0(PLOT_OUTPUT_FOLDER, "test_calibration_rmse.png"),
    height = 140,
    units = "mm"
  )

Between unit uncertainty for paper

p <- res_bus %>%
  dplyr::filter(!(
    method %in% c(
      "Koehler_mass",
      "Koehler_size",
      "Laulainen",
      "Koehler_mass+OLS",
      "Koehler_size+OLS",
      "Laulainen+OLS"
    )
  )) %>%
  mutate(sensor_type = ifelse(grepl("SPS", combination), "SPS30", "PMS5003")) %>%
  dplyr::filter(type == "Full") %>%
  dplyr::filter(!grepl("RH80", method)) %>%
  dplyr::mutate(dataset_type = ifelse(dataset_type == "train", "Training", "Evaluation")) %>%
  ggplot(aes(x = method, y = usb)) +
  stat_boxplot(geom = 'errorbar', width = 0.5, size = 0.1) +
  geom_boxplot(
    fatten = 0.8,
    outlier.size = 0.8,
    outlier.alpha = 0.5,
    lwd = 0.2
  ) +
  custom_theme +
  theme(axis.text.x = element_text(
    angle = 90,
    hjust = 0.95,
    vjust = 0.2
  )) +
  xlab("Calibration method") +
  ylab("Between unit uncertainty (ug/m3)") +
  facet_grid(sensor_type ~ dataset_type)
p

p %>%
  ggsave2(
    filename = paste0(PLOT_OUTPUT_FOLDER, "test_calibration_buu.png"),
    height = 140,
    units = "mm"
  )
#ggplotly(p)

Clipping of the Machine Learning methods

Data preparation

file_list <-
  list.files(
    path = input_folder,
    pattern = "df_.*",
    full.names = T,
    recursive = T
  )


#fInitialize data.frames
df_cal_pms <- data.frame()
df_train_pms <- data.frame()
df_cal_sps <- data.frame()
df_train_sps <- data.frame()

file <- file_list[1]

for (file in file_list) {
  tmp <- readRDS(file)
  name <- strsplit(file, "/")[[1]][5]
  if (grepl("initial", name)) {
    next
  }
  if (!grepl("train", name)) {
    #message(file)
    if (grepl("SPS", name)) {
      if (dim(df_cal_sps)[1] == 0) {
       # message("Fist sps cal")
        df_cal_sps <- tmp
      }
      else{
       # message("other sps cal")
        df_cal_sps <-
          inner_join(df_cal_sps, tmp, by = c("date", "sensor"))
      }
    }
    else{
      if (dim(df_cal_pms)[1] == 0) {
       # message("Fist pms cal")
        df_cal_pms <- tmp
      }
      else{
       # message("other pms cal")
        df_cal_pms <-
          inner_join(df_cal_pms, tmp, by = c("date", "sensor"))
      }
    }
  }
  else{
    #message("train")
    if (grepl("SPS", name)) {
      if (dim(df_train_sps)[1] == 0) {
        #message("first sps train")
        df_train_sps <- tmp
      }
      else{
       # message("other sps train")
        df_train_sps <-
          inner_join(df_train_sps, tmp, by = c("date", "sensor"))
      }
    }
    else{
      if (dim(df_train_pms)[1] == 0) {
        df_train_pms <- tmp
      }
      else{
        df_train_pms <-
          inner_join(df_train_pms, tmp, by = c("date", "sensor"))
      }
    }
  }
  
  
}
df_2min <- readRDS(DF_JOINED)

Clipping

df_tmp <-
  df_2min[df_2min$sensor == "SPS-21N4" &
            df_2min$date >= as.POSIXct("2020-08-09") &
            df_2min$date <= as.POSIXct("2020-08-14"), ]

df_cal_sps_short <-
  df_cal_sps[df_cal_sps$sensor == "SPS-21N4" &
               df_cal_sps$date >= as.POSIXct("2020-08-09") &
               df_cal_sps$date <= as.POSIXct("2020-08-14"), ] %>%
  pivot_longer(
    cols = starts_with("PM25_corr"),
    names_to = "method",
    values_to = "PM25"
  ) %>%
  dplyr::filter(!grepl("koehler_size", method)) %>%
  dplyr::filter(grepl("gbm|svm", method))

df_cal_sps_short$method <-
  fct_recode(
    df_cal_sps_short$method,
    `GBM+RH` = "PM25_corr_gbm_rh",
    `GBM` = "PM25_corr_gbm",
    `SVM` = "PM25_corr_svm",
    `SVM+RH` = "PM25_corr_svm_rh",
    `GBM_part.` = "PM25_corr_gbm_part",
    `GBM_part.+RH` = "PM25_corr_gbm_part_rh",
    `SVM_part.` = "PM25_corr_svm_part",
    `SVM_part.+RH` = "PM25_corr_svm_part_rh",
    `SVM+RH` = "PM25_corr_svmrh"
  )
df_cal_sps_short$method <-
  fct_relevel(
    df_cal_sps_short$method,
    "GBM",
    "GBM+RH",
    "GBM_part.",
    "GBM_part.+RH",
    "SVM",
    "SVM+RH",
    "SVM_part.",
    "SVM_part.+RH"
  )

p <- df_cal_sps_short %>%
  ggplot() +
  geom_line(aes(x = date, y = PM25, colour = "SPS-21N4 corrected"), size =
              0.1) +
  geom_line(
    data = df_tmp,
    aes(x = date, y = median_PM25, colour = "SPS-21N4 uncorrected"),
    alpha = 0.5,
    size = 0.1
  ) +
  geom_line(
    data = df_tmp,
    aes(x = date, y = PM2.5, colour = "Fidas 200S"),
    alpha = 0.5,
    size = 0.1
  ) +
  custom_theme +
  ylab(quickText("PM2.5 (ug/m3)")) +
  labs(colour = "Instrument") +
  facet_wrap( ~ method, nrow =  2)  +
  theme(axis.text.x = element_text(
    angle = 45,
    vjust = 1,
    hjust = 1
  ))
p

p %>%
  ggsave2(
    filename = paste0(PLOT_OUTPUT_FOLDER, "clipping_machine_learning_sps21.png"),
    height = 150,
    units = "mm"
  )

ggplotly(p, dynamicTicks = T)

Interactive version of expanded uncertainty graphs for training and calibration

p <- res %>%
  dplyr::filter(grepl("PMS", sensor)) %>%
  dplyr::filter(type == "Full") %>%
  dplyr::filter(dataset_type == "cal") %>%
  ggplot() +
  geom_boxplot(aes(x = method, y = expanded_uncertainty, colour = method)) +
  custom_theme +
  theme(axis.text.x = element_text(
    angle = 90,
    hjust = 0.95,
    vjust = 0.2
  ))  +
  ggtitle("Calibration")
ggplotly(p, dynamicTicks = T)
p <- res %>%
  
  dplyr::filter(grepl("PMS", sensor)) %>%
  dplyr::filter(type == "Full") %>%
  dplyr::filter(dataset_type == "train") %>%
  ggplot() +
  geom_boxplot(aes(x = method, y = expanded_uncertainty, colour = method)) +
  custom_theme +
  theme(axis.text.x = element_text(
    angle = 90,
    hjust = 0.95,
    vjust = 0.2
  )) +
  ggtitle("Training")
ggplotly(p, dynamicTicks = T)