The calibration is first tested by using the first two weeks of the data and using the next 40 days for verification as described in the Supplementary information of the paper.
The results have been calculated by the scripts
test_calibration.r
and test_calibration_ml.r
given the computational time required. The results are stored in
output/calibration/test_2weeks/
.
source("utilities/utilities.R")
source("utilities/nested_models.r")
source("utilities/variables.r")
input_folder <- "../output/calibration/test_2weeks/"
file_list <-
list.files(
path = input_folder,
pattern = "res_.*",
full.names = T,
recursive = T
)
# Parse the result files.
res <- data.frame()
res_bus <- data.frame()
for (file in file_list) {
name <- strsplit(file, "/")[[1]][5]
scenario <- sub(".rds", "", strsplit(name, "_")[[1]][[5]])
method <- strsplit(name, "_")[[1]][[2]]
dataset_type <- strsplit(name, "_")[[1]][[3]]
tmp <- readRDS(file)
model <- tmp$res
model$scenario <- scenario
model$dataset_type <- dataset_type
res <- bind_rows(res, model)
model <- tmp$res_bus
model$scenario <- scenario
model$dataset_type <- dataset_type
res_bus <- bind_rows(res_bus, model)
}
res <- res %>%
mutate(method = str_split(method, " ", simplify = T)[, 1])
res_bus <- res_bus %>%
mutate(method = str_split(method, " ", simplify = T)[, 1])
# Format the names of the methods
res$method <-
fct_recode(
res$method,
`GBM` = "gbm",
`GBM+RH` = "gbmrh",
`Koehler_mass+LR` = "koehlermasslr",
`Koehler_mass+OLS` = "koehlermassols",
`Koehler_size+LR` = "koehlersizelr",
`Koehler_size+OLS` = "koehlersizeols",
`Laulainen+LR` = "laulainenlr",
`Laulainen+OLS` = "laulainenols",
`MLR_part.` = "lmpart",
`MLR_part.+RH` = "lmpartrh",
`LR` = "lr",
`MLR_RH` = "mlr1" ,
`OLS` = "ols",
`RLM` = "rlmhuber",
`RLM_part.+RH` = "rlmpartrh",
`RLM+RH` = "rlmrh",
`SVM` = "svm",
`SVM+RH` = "svmrh",
`Koehler_mass_RH80` = "koehlermassrh80" ,
`Koehler_mass_RH80+OLS` = "koehlermassrh80ols",
`Koehler_size` = "koehlersize",
`Koehler_size_RH80` = "koehlersizerh80",
`Koehler_size_RH80+OLS` = "koehlersizerh80ols" ,
`Laulainen` = "laulainen",
`SVM_part.` = "svmpart",
`SVM_part.+RH` = "svmpartrh",
`GBM_part.` = "gbmpart",
`GBM_part.+RH` = "gbmpartrh",
`Koehler_mass` = "koehlermass",
`RLM_part.` = "rlmpart"
)
# Order the names of the methods
res$method <- fct_relevel(
res$method,
"Initial",
"OLS",
"LR",
"MLR_RH",
"MLR_part.",
"MLR_part.+RH",
"RLM",
"RLM+RH",
"RLM_part.",
"RLM_part.+RH",
"Koehler_mass",
"Koehler_mass+LR",
"Koehler_mass+OLS",
"Koehler_mass_RH80" ,
"Koehler_mass_RH80+OLS",
"Koehler_size",
"Koehler_size+LR",
"Koehler_size+OLS",
"Koehler_size_RH80",
"Koehler_size_RH80+OLS",
"Laulainen",
"Laulainen+LR",
"Laulainen+OLS",
"GBM",
"GBM+RH",
"GBM_part.",
"GBM_part.+RH",
"SVM",
"SVM+RH",
"SVM_part.",
"SVM_part.+RH"
)
res_bus$method <-
fct_recode(
res_bus$method,
`GBM` = "gbm",
`GBM+RH` = "gbmrh",
`Koehler_mass+LR` = "koehlermasslr",
`Koehler_mass+OLS` = "koehlermassols",
`Koehler_size+LR` = "koehlersizelr",
`Koehler_size+OLS` = "koehlersizeols",
`Laulainen+LR` = "laulainenlr",
`Laulainen+OLS` = "laulainenols",
`MLR_part.` = "lmpart",
`MLR_part.+RH` = "lmpartrh",
`LR` = "lr",
`MLR_RH` = "mlr1" ,
`OLS` = "ols",
`RLM` = "rlmhuber",
`RLM_part.+RH` = "rlmpartrh",
`RLM+RH` = "rlmrh",
`SVM` = "svm",
`SVM+RH` = "svmrh" ,
`Koehler_mass_RH80` = "koehlermassrh80" ,
`Koehler_mass_RH80+OLS` = "koehlermassrh80ols",
`Koehler_size` = "koehlersize",
`Koehler_size_RH80` = "koehlersizerh80",
`Koehler_size_RH80+OLS` = "koehlersizerh80ols" ,
`Laulainen` = "laulainen",
`SVM_part.` = "svmpart",
`SVM_part.+RH` = "svmpartrh",
`GBM_part.` = "gbmpart",
`GBM_part.+RH` = "gbmpartrh" ,
`Koehler_mass` = "koehlermass",
`RLM_part.` = "rlmpart"
)
res_bus$method <-
fct_relevel(
res_bus$method,
"Initial",
"OLS",
"LR",
"MLR_RH",
"MLR_part.",
"MLR_part.+RH",
"RLM",
"RLM+RH",
"RLM_part.",
"RLM_part.+RH",
"Koehler_mass",
"Koehler_mass+LR",
"Koehler_mass+OLS",
"Koehler_mass_RH80" ,
"Koehler_mass_RH80+OLS",
"Koehler_size",
"Koehler_size+LR",
"Koehler_size+OLS",
"Koehler_size_RH80",
"Koehler_size_RH80+OLS",
"Laulainen",
"Laulainen+LR",
"Laulainen+OLS",
"GBM",
"GBM+RH",
"GBM_part.",
"GBM_part.+RH",
"SVM",
"SVM+RH",
"SVM_part.",
"SVM_part.+RH"
)
res$sensor_type <- ifelse(grepl("SPS", res$sensor), "SPS", "PMS")
# remove air quality monitor Nocs-2 that stopped working in the middle of the
# study
res <- res[!grepl("N2", res$sensor), ]
res_bus <- res_bus[!grepl("N2", res_bus$combination), ]
p <- res %>%
dplyr::filter(type == "Full") %>%
dplyr::mutate(dataset_type = ifelse(dataset_type == "train", "Training", "Evaluation")) %>%
ggplot(aes(x = method, y = RMSE)) +
stat_boxplot(geom = 'errorbar', width = 0.5, size = 0.1) +
geom_boxplot(
fatten = 0.8,
outlier.size = 0.8,
outlier.alpha = 0.5,
lwd = 0.2
) +
custom_theme +
ylab(quickText("RMSE (ug/m3)")) +
xlab("Calibration method") +
theme(axis.text.x = element_text(
angle = 90,
hjust = 0.95,
vjust = 0.2
)) +
facet_grid(sensor_type ~ dataset_type, scales = "free_y")
p
The machine learning methods worked very well for the training dataset but much less so for the calibration datasets.
For the PMS, all methods improved the results compared to the initial state. I will not take the >80RH for Koehler corrections as they do not bring improvement compared to the Koehler correction on the full data.
In this graph, I removed the Koehler correction with RH>80 to simplify the view given the number of methods involved in the comparison.
p <- res %>%
dplyr::filter(!(
method %in% c(
"Koehler_mass",
"Koehler_size",
"Laulainen",
"Koehler_mass+OLS",
"Koehler_size+OLS",
"Laulainen+OLS"
)
)) %>%
dplyr::filter(type == "Full") %>%
dplyr::mutate(dataset_type = ifelse(dataset_type == "train", "Training", "Evaluation")) %>%
dplyr::filter(!grepl("RH80", method)) %>%
ggplot(aes(x = method, y = RMSE)) +
stat_boxplot(geom = 'errorbar', width = 0.5, size = 0.1) +
geom_boxplot(
fatten = 0.8,
outlier.size = 0.8,
outlier.alpha = 0.5,
lwd = 0.2
) +
custom_theme +
ylab(quickText("RMSE (ug/m3)")) +
xlab("Calibration method") +
theme(axis.text.x = element_text(
angle = 90,
hjust = 0.95,
vjust = 0.2
)) +
# ggtitle("Training") +
facet_grid(sensor_type ~ dataset_type, scales = "free_y")
p
p %>%
ggsave2(
filename = paste0(PLOT_OUTPUT_FOLDER, "test_calibration_rmse.png"),
height = 140,
units = "mm"
)
p <- res_bus %>%
dplyr::filter(!(
method %in% c(
"Koehler_mass",
"Koehler_size",
"Laulainen",
"Koehler_mass+OLS",
"Koehler_size+OLS",
"Laulainen+OLS"
)
)) %>%
mutate(sensor_type = ifelse(grepl("SPS", combination), "SPS30", "PMS5003")) %>%
dplyr::filter(type == "Full") %>%
dplyr::filter(!grepl("RH80", method)) %>%
dplyr::mutate(dataset_type = ifelse(dataset_type == "train", "Training", "Evaluation")) %>%
ggplot(aes(x = method, y = usb)) +
stat_boxplot(geom = 'errorbar', width = 0.5, size = 0.1) +
geom_boxplot(
fatten = 0.8,
outlier.size = 0.8,
outlier.alpha = 0.5,
lwd = 0.2
) +
custom_theme +
theme(axis.text.x = element_text(
angle = 90,
hjust = 0.95,
vjust = 0.2
)) +
xlab("Calibration method") +
ylab("Between unit uncertainty (ug/m3)") +
facet_grid(sensor_type ~ dataset_type)
p
p %>%
ggsave2(
filename = paste0(PLOT_OUTPUT_FOLDER, "test_calibration_buu.png"),
height = 140,
units = "mm"
)
#ggplotly(p)
file_list <-
list.files(
path = input_folder,
pattern = "df_.*",
full.names = T,
recursive = T
)
#fInitialize data.frames
df_cal_pms <- data.frame()
df_train_pms <- data.frame()
df_cal_sps <- data.frame()
df_train_sps <- data.frame()
file <- file_list[1]
for (file in file_list) {
tmp <- readRDS(file)
name <- strsplit(file, "/")[[1]][5]
if (grepl("initial", name)) {
next
}
if (!grepl("train", name)) {
#message(file)
if (grepl("SPS", name)) {
if (dim(df_cal_sps)[1] == 0) {
# message("Fist sps cal")
df_cal_sps <- tmp
}
else{
# message("other sps cal")
df_cal_sps <-
inner_join(df_cal_sps, tmp, by = c("date", "sensor"))
}
}
else{
if (dim(df_cal_pms)[1] == 0) {
# message("Fist pms cal")
df_cal_pms <- tmp
}
else{
# message("other pms cal")
df_cal_pms <-
inner_join(df_cal_pms, tmp, by = c("date", "sensor"))
}
}
}
else{
#message("train")
if (grepl("SPS", name)) {
if (dim(df_train_sps)[1] == 0) {
#message("first sps train")
df_train_sps <- tmp
}
else{
# message("other sps train")
df_train_sps <-
inner_join(df_train_sps, tmp, by = c("date", "sensor"))
}
}
else{
if (dim(df_train_pms)[1] == 0) {
df_train_pms <- tmp
}
else{
df_train_pms <-
inner_join(df_train_pms, tmp, by = c("date", "sensor"))
}
}
}
}
df_2min <- readRDS(DF_JOINED)
df_tmp <-
df_2min[df_2min$sensor == "SPS-21N4" &
df_2min$date >= as.POSIXct("2020-08-09") &
df_2min$date <= as.POSIXct("2020-08-14"), ]
df_cal_sps_short <-
df_cal_sps[df_cal_sps$sensor == "SPS-21N4" &
df_cal_sps$date >= as.POSIXct("2020-08-09") &
df_cal_sps$date <= as.POSIXct("2020-08-14"), ] %>%
pivot_longer(
cols = starts_with("PM25_corr"),
names_to = "method",
values_to = "PM25"
) %>%
dplyr::filter(!grepl("koehler_size", method)) %>%
dplyr::filter(grepl("gbm|svm", method))
df_cal_sps_short$method <-
fct_recode(
df_cal_sps_short$method,
`GBM+RH` = "PM25_corr_gbm_rh",
`GBM` = "PM25_corr_gbm",
`SVM` = "PM25_corr_svm",
`SVM+RH` = "PM25_corr_svm_rh",
`GBM_part.` = "PM25_corr_gbm_part",
`GBM_part.+RH` = "PM25_corr_gbm_part_rh",
`SVM_part.` = "PM25_corr_svm_part",
`SVM_part.+RH` = "PM25_corr_svm_part_rh",
`SVM+RH` = "PM25_corr_svmrh"
)
df_cal_sps_short$method <-
fct_relevel(
df_cal_sps_short$method,
"GBM",
"GBM+RH",
"GBM_part.",
"GBM_part.+RH",
"SVM",
"SVM+RH",
"SVM_part.",
"SVM_part.+RH"
)
p <- df_cal_sps_short %>%
ggplot() +
geom_line(aes(x = date, y = PM25, colour = "SPS-21N4 corrected"), size =
0.1) +
geom_line(
data = df_tmp,
aes(x = date, y = median_PM25, colour = "SPS-21N4 uncorrected"),
alpha = 0.5,
size = 0.1
) +
geom_line(
data = df_tmp,
aes(x = date, y = PM2.5, colour = "Fidas 200S"),
alpha = 0.5,
size = 0.1
) +
custom_theme +
ylab(quickText("PM2.5 (ug/m3)")) +
labs(colour = "Instrument") +
facet_wrap( ~ method, nrow = 2) +
theme(axis.text.x = element_text(
angle = 45,
vjust = 1,
hjust = 1
))
p
p %>%
ggsave2(
filename = paste0(PLOT_OUTPUT_FOLDER, "clipping_machine_learning_sps21.png"),
height = 150,
units = "mm"
)
ggplotly(p, dynamicTicks = T)