This notebook explores the impact of the following factors on the readings of the low-cost sensors (PMS5003, SPS30 and OPCR1):

  • RH

  • number of UFP

  • number of particles below 0.8um

  • number of particles above 0.8um

  • total particle number

  • source of PM

This is done only on the peak concentrations, using all the data from each experiment. Aggregating the results per experiment yields data sets that are too small to be meaningful.

The target is the PNC reported by the sensors.

This is done using three different feature selection algorithms: Boruta, ridge regression, and recursive feature elimination with an SVM.

Data preparation

# Project-local scripts. The file paths and helpers used below (e.g.
# `sensors_file`, `get_sensor_type()`) are not defined in this notebook, so
# they presumably come from these two files.
source("variables.r")
source("utilities.r")
#source("../../../AQ_data_preparation/utilities.r")
# library() stops immediately when a package is missing, whereas require()
# only returns FALSE and lets the notebook fail later with an obscure error.
library(relaimpo)
library(Boruta)
library(lubridate)

# Read the three serialized inputs: sensor readings (tagged with their sensor
# type by get_sensor_type()), the OPS data and the nanotracer data.
df_sensors <- get_sensor_type(readRDS(sensors_file))
df_ops <- readRDS(ops_file_transformed)
df_nanotracer <- readRDS(nanotracer_file)


# Average every numeric sensor column within each combination of timestamp,
# variation, source, experiment and sensor type (missing values are ignored).
df_sensors_ft <- df_sensors %>%
  ungroup() %>%
  group_by(date, variation, source, exp, sensor_type) %>%
  summarise(
    across(where(is.numeric), ~ mean(.x, na.rm = TRUE)),
    # Same grouping as summarise()'s default for one-row summaries, stated
    # explicitly so the result is predictable and no message is emitted.
    .groups = "drop_last"
  )


# Snap the nanotracer timestamps to the nearest 10 s (in place) so they can be
# matched on `date` in the joins below, then keep only the timestamp and the
# `N.1.` reading.
# NOTE(review): if two raw timestamps round to the same 10 s mark, the later
# inner join will duplicate rows - confirm the nanotracer logging interval.
df_nanotracer[["date"]] <- round_date(df_nanotracer[["date"]], unit = "10 s")
df_nanotracer_ft <- dplyr::select(df_nanotracer, date, N.1.)

# Derive three particle-count features from the OPS size bins:
#   lw08um                - Bin.1 to Bin.5 plus the share of Bin.6 attributed
#                           to sizes below 0.8 um,
#   gr08um                - the remaining share of Bin.6 plus Bin.7 to Bin.17,
#   total_particle_number - the sum of all 17 bins.
# Bin.6 is split linearly between 0.721 and 0.897 um, so lw08um + gr08um
# equals total_particle_number. The experiment descriptors (variation, exp,
# source) are dropped afterwards.
# NOTE(review): the labels given to these columns later in this notebook
# (Bin.5_0.721um, Bin.6_0.897um) suggest that 0.721 and 0.897 um are the lower
# edges of Bin.5 and Bin.6. If so, the 0.8 um cut falls inside Bin.5 and the
# split should be applied to Bin.5 instead of Bin.6 - confirm the bin edges.
df_ops_ft <- df_ops %>%
  mutate(lw08um = Bin.1 + Bin.2 + Bin.3 + Bin.4 + Bin.5 + Bin.6*(0.8 - 0.721)/(0.897 - 0.721),
         gr08um = Bin.6*(0.897 - 0.8)/(0.897 - 0.721) + Bin.7 + Bin.8 + Bin.9 + Bin.10 + Bin.11 + Bin.12 + Bin.13 + Bin.14 + Bin.15 + Bin.16 + Bin.17,
         total_particle_number = Bin.1 + Bin.2 + Bin.3 + Bin.4 + Bin.5 + Bin.6 + Bin.7 + Bin.8 + Bin.9 + Bin.10 + Bin.11 + Bin.12 + Bin.13 + Bin.14 + Bin.15 + Bin.16 + Bin.17) %>%
  dplyr::select( -variation, -exp, -source)


# Combine the averaged sensor readings with the OPS features and the
# nanotracer readings; only timestamps present in all three tables survive.
df_joined <- ungroup(df_sensors_ft) %>%
  inner_join(df_ops_ft, by = "date") %>%
  inner_join(df_nanotracer_ft, by = "date")

head(df_joined)

# Keep the "Peaks" rows only, derive the relative humidity from the experiment
# label and encode the particle source as 1 (Incense) / 0 (anything else).
# NOTE(review): any label without "54%", "69%", "72%" or "76%" is assigned
# rh = 79 - confirm that no other humidity level can occur.
df_joined_ft <- df_joined %>%
  #dplyr::filter(exp != "") %>%
  dplyr::filter(variation == "Peaks") %>%
  mutate(
    rh = dplyr::case_when(
      grepl("54%", exp) ~ 54,
      grepl("69%", exp) ~ 69,
      grepl("72%", exp) ~ 72,
      grepl("76%", exp) ~ 76,
      TRUE ~ 79
    ),
    source = as.numeric(source == "Incense")
  )
   

# Append a size label (in um) to each OPS bin column name.
# The lookup is written as new_name = "old_name".
df_joined_ft <- rename(
  df_joined_ft,
  all_of(c(
    Bin.1_0.3um = "Bin.1",
    Bin.2_0.374um = "Bin.2",
    Bin.3_0.465um = "Bin.3",
    Bin.4_0.579um = "Bin.4",
    Bin.5_0.721um = "Bin.5",
    Bin.6_0.897um = "Bin.6",
    Bin.7_1.117um = "Bin.7",
    Bin.8_1.391um = "Bin.8",
    Bin.9_1.732um = "Bin.9",
    Bin.10_2.156um = "Bin.10",
    Bin.11_2.685um = "Bin.11",
    Bin.12_3.343um = "Bin.12",
    Bin.13_4.162um = "Bin.13",
    Bin.14_5.182um = "Bin.14",
    Bin.15_6.451um = "Bin.15",
    Bin.16_8.031um = "Bin.16",
    Bin.17_10um = "Bin.17"
  ))
)



# Default model formula; every section below reassigns `frmla` for its own
# target variable.
frmla <- gr03um ~ source + lw08um + N.1. + total_particle_number + rh


#frmla <- bias ~  source + lw08um + N.1. + total_particle_number + rh 
#frmla_inter <- bias ~  (source + lw08um + N.1. + total_particle_number + rh )^2

# library() fails fast when the package is missing (require() only warns).
library(caret)
# Resampling scheme for train(): 10-fold cross-validation repeated 10 times,
# with a grid search over the tuning parameters.
trControl <- trainControl(
  method = "repeatedcv",
  number = 10,
  search = "grid",
  repeats = 10
)

library(doParallel)
# Leave one core free, but always keep at least one worker: detectCores() can
# return 1 (single-core machine) or NA (unknown), and makePSOCKcluster() fails
# when asked for zero or NA workers.
no_cores <- max(1L, parallel::detectCores() - 1L, na.rm = TRUE)
cluster <- makePSOCKcluster(no_cores)
registerDoParallel(cluster)
# NOTE(review): no stopCluster(cluster) is visible in this part of the
# notebook - make sure the workers are released once the models are fitted.

Feature selection

Boruta

PMS5003

gr03um

# Boruta feature selection for the PMS5003 `gr03um` channel against the five
# candidate drivers. No seed is set in this chunk, so results can vary.
frmla <- gr03um ~ source + lw08um + N.1. + total_particle_number + rh

boruta_output <- df_joined_ft %>%
  dplyr::filter(sensor_type == "PMS5003") %>%
  Boruta(frmla, data = ., doTrace = 0)

boruta_output
## Boruta performed 9 iterations in 0.804271 secs.
##  5 attributes confirmed important: lw08um, N.1., rh, source,
## total_particle_number;
##  No attributes deemed unimportant.

# Resolve attributes still marked Tentative, then list the retained ones.
roughFixMod <- TentativeRoughFix(boruta_output)
boruta_signif <- getSelectedAttributes(roughFixMod)
print(boruta_signif)
## [1] "rh"                    "total_particle_number" "N.1."                 
## [4] "lw08um"                "source"

# Importance statistics of the non-rejected attributes, highest mean first.
imps <- attStats(roughFixMod)
imps2 <- imps[imps$decision != "Rejected", c("meanImp", "decision")]
head(imps2[order(-imps2$meanImp), ])
plot(boruta_output, cex.axis = .7, las = 2, xlab = "",
     main = "Variable Importance - gr03um")

gr05um

# Boruta feature selection for the PMS5003 `gr05um` channel against the five
# candidate drivers. No seed is set in this chunk, so results can vary.
frmla <- gr05um ~ source + lw08um + N.1. + total_particle_number + rh

boruta_output <- df_joined_ft %>%
  dplyr::filter(sensor_type == "PMS5003") %>%
  Boruta(frmla, data = ., doTrace = 0)

boruta_output
## Boruta performed 9 iterations in 0.53075 secs.
##  5 attributes confirmed important: lw08um, N.1., rh, source,
## total_particle_number;
##  No attributes deemed unimportant.

# Resolve attributes still marked Tentative, then list the retained ones.
roughFixMod <- TentativeRoughFix(boruta_output)
boruta_signif <- getSelectedAttributes(roughFixMod)
print(boruta_signif)
## [1] "rh"                    "total_particle_number" "N.1."                 
## [4] "lw08um"                "source"

# Importance statistics of the non-rejected attributes, highest mean first.
imps <- attStats(roughFixMod)
imps2 <- imps[imps$decision != "Rejected", c("meanImp", "decision")]
head(imps2[order(-imps2$meanImp), ])
plot(boruta_output, cex.axis = .7, las = 2, xlab = "",
     main = "Variable Importance -  gr05um")

gr10um

# Boruta feature selection for the PMS5003 `gr10um` channel against the five
# candidate drivers. No seed is set in this chunk, so results can vary.
frmla <- gr10um ~ source + lw08um + N.1. + total_particle_number + rh

boruta_output <- df_joined_ft %>%
  dplyr::filter(sensor_type == "PMS5003") %>%
  Boruta(frmla, data = ., doTrace = 0)

boruta_output
## Boruta performed 9 iterations in 0.5618711 secs.
##  5 attributes confirmed important: lw08um, N.1., rh, source,
## total_particle_number;
##  No attributes deemed unimportant.

# Resolve attributes still marked Tentative, then list the retained ones.
roughFixMod <- TentativeRoughFix(boruta_output)
boruta_signif <- getSelectedAttributes(roughFixMod)
print(boruta_signif)
## [1] "rh"                    "total_particle_number" "N.1."                 
## [4] "lw08um"                "source"

# Importance statistics of the non-rejected attributes, highest mean first.
imps <- attStats(roughFixMod)
imps2 <- imps[imps$decision != "Rejected", c("meanImp", "decision")]
head(imps2[order(-imps2$meanImp), ])
plot(boruta_output, cex.axis = .7, las = 2, xlab = "",
     main = "Variable Importance - gr10um")

gr25um

# Boruta feature selection for the PMS5003 `gr25um` channel against the five
# candidate drivers. No seed is set in this chunk, so results can vary.
frmla <- gr25um ~ source + lw08um + N.1. + total_particle_number + rh

boruta_output <- df_joined_ft %>%
  dplyr::filter(sensor_type == "PMS5003") %>%
  Boruta(frmla, data = ., doTrace = 0)

boruta_output
## Boruta performed 9 iterations in 0.4402349 secs.
##  5 attributes confirmed important: lw08um, N.1., rh, source,
## total_particle_number;
##  No attributes deemed unimportant.

# Resolve attributes still marked Tentative, then list the retained ones.
roughFixMod <- TentativeRoughFix(boruta_output)
boruta_signif <- getSelectedAttributes(roughFixMod)
print(boruta_signif)
## [1] "rh"                    "total_particle_number" "N.1."                 
## [4] "lw08um"                "source"

# Importance statistics of the non-rejected attributes, highest mean first.
imps <- attStats(roughFixMod)
imps2 <- imps[imps$decision != "Rejected", c("meanImp", "decision")]
head(imps2[order(-imps2$meanImp), ])
plot(boruta_output, cex.axis = .7, las = 2, xlab = "",
     main = "Variable Importance - gr25um")

SPS30

n05

# Boruta feature selection for the `n05` channel of the rows whose
# sensor_type is "SPS030". No seed is set in this chunk, so results can vary.
frmla <- n05 ~ source + lw08um + N.1. + total_particle_number + rh

boruta_output <- df_joined_ft %>%
  dplyr::filter(sensor_type == "SPS030") %>%
  Boruta(frmla, data = ., doTrace = 0)

boruta_output
## Boruta performed 9 iterations in 0.5585799 secs.
##  5 attributes confirmed important: lw08um, N.1., rh, source,
## total_particle_number;
##  No attributes deemed unimportant.

# Resolve attributes still marked Tentative, then list the retained ones.
roughFixMod <- TentativeRoughFix(boruta_output)
boruta_signif <- getSelectedAttributes(roughFixMod)
print(boruta_signif)
## [1] "rh"                    "total_particle_number" "N.1."                 
## [4] "lw08um"                "source"

# Importance statistics of the non-rejected attributes, highest mean first.
imps <- attStats(roughFixMod)
imps2 <- imps[imps$decision != "Rejected", c("meanImp", "decision")]
head(imps2[order(-imps2$meanImp), ])
plot(boruta_output, cex.axis = .7, las = 2, xlab = "",
     main = "Variable Importance - n05")

n1

# Boruta feature selection for the `n1` channel of the rows whose
# sensor_type is "SPS030". No seed is set in this chunk, so results can vary.
frmla <- n1 ~ source + lw08um + N.1. + total_particle_number + rh

boruta_output <- df_joined_ft %>%
  dplyr::filter(sensor_type == "SPS030") %>%
  Boruta(frmla, data = ., doTrace = 0)

boruta_output
## Boruta performed 9 iterations in 0.5134611 secs.
##  5 attributes confirmed important: lw08um, N.1., rh, source,
## total_particle_number;
##  No attributes deemed unimportant.

# Resolve attributes still marked Tentative, then list the retained ones.
roughFixMod <- TentativeRoughFix(boruta_output)
boruta_signif <- getSelectedAttributes(roughFixMod)
print(boruta_signif)
## [1] "rh"                    "total_particle_number" "N.1."                 
## [4] "lw08um"                "source"

# Importance statistics of the non-rejected attributes, highest mean first.
imps <- attStats(roughFixMod)
imps2 <- imps[imps$decision != "Rejected", c("meanImp", "decision")]
head(imps2[order(-imps2$meanImp), ])
plot(boruta_output, cex.axis = .7, las = 2, xlab = "",
     main = "Variable Importance - n1")

n25

# Boruta feature selection for the `n25` channel of the rows whose
# sensor_type is "SPS030". No seed is set in this chunk, so results can vary.
frmla <- n25 ~ source + lw08um + N.1. + total_particle_number + rh

boruta_output <- df_joined_ft %>%
  dplyr::filter(sensor_type == "SPS030") %>%
  Boruta(frmla, data = ., doTrace = 0)

boruta_output
## Boruta performed 9 iterations in 0.5500081 secs.
##  5 attributes confirmed important: lw08um, N.1., rh, source,
## total_particle_number;
##  No attributes deemed unimportant.

# Resolve attributes still marked Tentative, then list the retained ones.
roughFixMod <- TentativeRoughFix(boruta_output)
boruta_signif <- getSelectedAttributes(roughFixMod)
print(boruta_signif)
## [1] "rh"                    "total_particle_number" "N.1."                 
## [4] "lw08um"                "source"

# Importance statistics of the non-rejected attributes, highest mean first.
imps <- attStats(roughFixMod)
imps2 <- imps[imps$decision != "Rejected", c("meanImp", "decision")]
head(imps2[order(-imps2$meanImp), ])
plot(boruta_output, cex.axis = .7, las = 2, xlab = "",
     main = "Variable Importance - n25")

OPCR1

Bin0 - 0.35/0.7um

# Boruta feature selection for the OPCR1 `Bin0` channel against the five
# candidate drivers. No seed is set in this chunk, so results can vary.
frmla <- Bin0 ~ source + lw08um + N.1. + total_particle_number + rh

boruta_output <- df_joined_ft %>%
  dplyr::filter(sensor_type == "OPCR1") %>%
  Boruta(frmla, data = ., doTrace = 0)

boruta_output
## Boruta performed 9 iterations in 0.5618341 secs.
##  5 attributes confirmed important: lw08um, N.1., rh, source,
## total_particle_number;
##  No attributes deemed unimportant.

# Resolve attributes still marked Tentative, then list the retained ones.
roughFixMod <- TentativeRoughFix(boruta_output)
boruta_signif <- getSelectedAttributes(roughFixMod)
print(boruta_signif)
## [1] "rh"                    "total_particle_number" "N.1."                 
## [4] "lw08um"                "source"

# Importance statistics of the non-rejected attributes, highest mean first.
imps <- attStats(roughFixMod)
imps2 <- imps[imps$decision != "Rejected", c("meanImp", "decision")]
head(imps2[order(-imps2$meanImp), ])
plot(boruta_output, cex.axis = .7, las = 2, xlab = "",
     main = "Variable Importance -  bin0")

Bin1 - 0.7/1.1um

# Boruta feature selection for the OPCR1 `Bin1` channel against the five
# candidate drivers. No seed is set in this chunk, so results can vary.
frmla <- Bin1 ~ source + lw08um + N.1. + total_particle_number + rh

boruta_output <- df_joined_ft %>%
  dplyr::filter(sensor_type == "OPCR1") %>%
  Boruta(frmla, data = ., doTrace = 0)

boruta_output
## Boruta performed 9 iterations in 0.5594571 secs.
##  5 attributes confirmed important: lw08um, N.1., rh, source,
## total_particle_number;
##  No attributes deemed unimportant.

# Resolve attributes still marked Tentative, then list the retained ones.
roughFixMod <- TentativeRoughFix(boruta_output)
boruta_signif <- getSelectedAttributes(roughFixMod)
print(boruta_signif)
## [1] "rh"                    "total_particle_number" "N.1."                 
## [4] "lw08um"                "source"

# Importance statistics of the non-rejected attributes, highest mean first.
imps <- attStats(roughFixMod)
imps2 <- imps[imps$decision != "Rejected", c("meanImp", "decision")]
head(imps2[order(-imps2$meanImp), ])
plot(boruta_output, cex.axis = .7, las = 2, xlab = "",
     main = "Variable Importance -  bin1")

Bin2 - 1.1/1.5um

# Boruta feature selection for the OPCR1 `Bin2` channel against the five
# candidate drivers. No seed is set in this chunk, so results can vary.
frmla <- Bin2 ~ source + lw08um + N.1. + total_particle_number + rh

boruta_output <- df_joined_ft %>%
  dplyr::filter(sensor_type == "OPCR1") %>%
  Boruta(frmla, data = ., doTrace = 0)

boruta_output
## Boruta performed 9 iterations in 0.5615249 secs.
##  5 attributes confirmed important: lw08um, N.1., rh, source,
## total_particle_number;
##  No attributes deemed unimportant.

# Resolve attributes still marked Tentative, then list the retained ones.
roughFixMod <- TentativeRoughFix(boruta_output)
boruta_signif <- getSelectedAttributes(roughFixMod)
print(boruta_signif)
## [1] "rh"                    "total_particle_number" "N.1."                 
## [4] "lw08um"                "source"

# Importance statistics of the non-rejected attributes, highest mean first.
imps <- attStats(roughFixMod)
imps2 <- imps[imps$decision != "Rejected", c("meanImp", "decision")]
head(imps2[order(-imps2$meanImp), ])
plot(boruta_output, cex.axis = .7, las = 2, xlab = "",
     main = "Variable Importance -  bin2")

Bin3 - 1.5/1.9um

# Boruta feature selection for the OPCR1 `Bin3` channel against the five
# candidate drivers. No seed is set in this chunk, so results can vary.
frmla <- Bin3 ~ source + lw08um + N.1. + total_particle_number + rh

boruta_output <- df_joined_ft %>%
  dplyr::filter(sensor_type == "OPCR1") %>%
  Boruta(frmla, data = ., doTrace = 0)

boruta_output
## Boruta performed 9 iterations in 0.5959251 secs.
##  5 attributes confirmed important: lw08um, N.1., rh, source,
## total_particle_number;
##  No attributes deemed unimportant.

# Resolve attributes still marked Tentative, then list the retained ones.
roughFixMod <- TentativeRoughFix(boruta_output)
boruta_signif <- getSelectedAttributes(roughFixMod)
print(boruta_signif)
## [1] "rh"                    "total_particle_number" "N.1."                 
## [4] "lw08um"                "source"

# Importance statistics of the non-rejected attributes, highest mean first.
imps <- attStats(roughFixMod)
imps2 <- imps[imps$decision != "Rejected", c("meanImp", "decision")]
head(imps2[order(-imps2$meanImp), ])
plot(boruta_output, cex.axis = .7, las = 2, xlab = "",
     main = "Variable Importance -  bin3")

Bin4 - 1.9/2.4um

# Boruta feature selection for the OPCR1 `Bin4` channel against the five
# candidate drivers. No seed is set in this chunk, so results can vary.
frmla <- Bin4 ~ source + lw08um + N.1. + total_particle_number + rh

boruta_output <- df_joined_ft %>%
  dplyr::filter(sensor_type == "OPCR1") %>%
  Boruta(frmla, data = ., doTrace = 0)

boruta_output
## Boruta performed 9 iterations in 0.4809341 secs.
##  5 attributes confirmed important: lw08um, N.1., rh, source,
## total_particle_number;
##  No attributes deemed unimportant.

# Resolve attributes still marked Tentative, then list the retained ones.
roughFixMod <- TentativeRoughFix(boruta_output)
boruta_signif <- getSelectedAttributes(roughFixMod)
print(boruta_signif)
## [1] "rh"                    "total_particle_number" "N.1."                 
## [4] "lw08um"                "source"

# Importance statistics of the non-rejected attributes, highest mean first.
imps <- attStats(roughFixMod)
imps2 <- imps[imps$decision != "Rejected", c("meanImp", "decision")]
head(imps2[order(-imps2$meanImp), ])
plot(boruta_output, cex.axis = .7, las = 2, xlab = "",
     main = "Variable Importance -  bin4")

Recursive feature elimination - SVM

PMS5003

gr03um

# Recursive feature elimination with a radial-kernel SVM for PMS5003 `gr03um`.
# `frmla` is kept in step with the other sections; rfe() itself is given
# `gr03um ~ .` over the columns selected into `x`.
frmla <-
  gr03um ~ source + lw08um + N.1. + total_particle_number + rh

# Only five candidate predictors exist, so subset sizes stop at 5.
subsets <- 1:5
x <- df_joined_ft %>%
  dplyr::filter(sensor_type == "PMS5003") %>%
  dplyr::select(source, lw08um, total_particle_number, rh, N.1., gr03um)

svmProfile <- rfe(
  gr03um ~ .,
  data = x,
  sizes = subsets,
  rfeControl = rfeControl(
    functions = caretFuncs,
    method = "repeatedcv",
    number = 10,
    # Stated explicitly: with `repeats` omitted, the run recorded below
    # reports a single repeat of the 10-fold cross-validation.
    repeats = 1
  ),
  ## pass options to train()
  method = "svmRadial"
)

svmProfile
## 
## Recursive feature selection
## 
## Outer resampling method: Cross-Validated (10 fold, repeated 1 times) 
## 
## Resampling performance over subset size:
## 
##  Variables  RMSE Rsquared   MAE RMSESD RsquaredSD MAESD Selected
##          1 909.4   0.5705 601.2 111.05    0.09217 58.41         
##          2 861.5   0.6052 569.7 109.15    0.09635 64.31         
##          3 711.6   0.7356 473.0  86.43    0.06754 50.23         
##          4 673.9   0.7559 453.6  91.66    0.07884 42.26         
##          5 579.8   0.8160 401.0  87.42    0.05513 44.21        *
## 
## The top 5 variables (out of 5):
##    lw08um, total_particle_number, N.1., source, rh
predictors(svmProfile)
## [1] "lw08um"                "total_particle_number" "N.1."                 
## [4] "source"                "rh"
varImp(svmProfile)
# Mean of the importance scores recorded in svmProfile$variables, per variable.
data.frame(
  Overall = svmProfile$variables$Overall,
  var = svmProfile$variables$var
) %>%
  group_by(var) %>%
  summarise(mean_overall = mean(Overall, na.rm = TRUE))

gr05um

# Recursive feature elimination with a radial-kernel SVM for PMS5003 `gr05um`.
# `frmla` is kept in step with the other sections; rfe() itself is given
# `gr05um ~ .` over the columns selected into `x`.
frmla <-
  gr05um ~ source + lw08um + N.1. + total_particle_number + rh

x <- df_joined_ft %>%
  dplyr::filter(sensor_type == "PMS5003") %>%
  dplyr::select(source, lw08um, total_particle_number, rh, N.1., gr05um)

svmProfile <- rfe(
  gr05um ~ .,
  data = x,
  # Only five candidate predictors exist, so subset sizes stop at 5.
  sizes = 1:5,
  rfeControl = rfeControl(
    functions = caretFuncs,
    method = "repeatedcv",
    number = 10,
    # Stated explicitly: with `repeats` omitted, the run recorded below
    # reports a single repeat of the 10-fold cross-validation.
    repeats = 1
  ),
  ## pass options to train()
  method = "svmRadial"
)

svmProfile
## 
## Recursive feature selection
## 
## Outer resampling method: Cross-Validated (10 fold, repeated 1 times) 
## 
## Resampling performance over subset size:
## 
##  Variables  RMSE Rsquared   MAE RMSESD RsquaredSD MAESD Selected
##          1 252.9   0.5793 168.6  34.77    0.09532 21.60         
##          2 242.5   0.6124 161.9  38.43    0.07686 23.43         
##          3 205.8   0.7228 136.6  38.46    0.06195 22.35         
##          4 190.5   0.7611 129.4  37.02    0.04556 20.24         
##          5 159.7   0.8302 111.8  27.40    0.02869 16.66        *
## 
## The top 5 variables (out of 5):
##    lw08um, total_particle_number, N.1., source, rh
predictors(svmProfile)
## [1] "lw08um"                "total_particle_number" "N.1."                 
## [4] "source"                "rh"
varImp(svmProfile)
# Mean of the importance scores recorded in svmProfile$variables, per variable.
data.frame(
  Overall = svmProfile$variables$Overall,
  var = svmProfile$variables$var
) %>%
  group_by(var) %>%
  summarise(mean_overall = mean(Overall, na.rm = TRUE))

gr10um

# Recursive feature elimination with a radial-kernel SVM for PMS5003 `gr10um`.
# `frmla` is kept in step with the other sections; rfe() itself is given
# `gr10um ~ .` over the columns selected into `x`.
frmla <- gr10um ~ source + lw08um + N.1. + total_particle_number + rh

x <- df_joined_ft %>%
  dplyr::filter(sensor_type == "PMS5003") %>%
  dplyr::select(source, lw08um, total_particle_number, rh, N.1., gr10um)

svmProfile <- rfe(
  gr10um ~ .,
  data = x,
  # Only five candidate predictors exist, so subset sizes stop at 5.
  sizes = 1:5,
  rfeControl = rfeControl(
    functions = caretFuncs,
    method = "repeatedcv",
    number = 10,
    # Stated explicitly: with `repeats` omitted, the run recorded below
    # reports a single repeat of the 10-fold cross-validation.
    repeats = 1
  ),
  ## pass options to train()
  method = "svmRadial"
)

svmProfile
## 
## Recursive feature selection
## 
## Outer resampling method: Cross-Validated (10 fold, repeated 1 times) 
## 
## Resampling performance over subset size:
## 
##  Variables  RMSE Rsquared   MAE RMSESD RsquaredSD MAESD Selected
##          1 44.70   0.6841 31.07  6.787    0.08745 3.269         
##          2 44.21   0.6788 30.72  6.665    0.10598 3.792         
##          3 47.37   0.6366 31.47  9.427    0.12995 4.139         
##          4 45.76   0.6729 30.53 10.368    0.11254 3.433         
##          5 39.50   0.7513 27.31  7.697    0.06212 2.334        *
## 
## The top 5 variables (out of 5):
##    total_particle_number, lw08um, N.1., source, rh
predictors(svmProfile)
## [1] "total_particle_number" "lw08um"                "N.1."                 
## [4] "source"                "rh"
varImp(svmProfile)
# Mean of the importance scores recorded in svmProfile$variables, per variable.
data.frame(
  Overall = svmProfile$variables$Overall,
  var = svmProfile$variables$var
) %>%
  group_by(var) %>%
  summarise(mean_overall = mean(Overall, na.rm = TRUE))

gr25um

# Recursive feature elimination with a radial-kernel SVM for PMS5003 `gr25um`.
# `frmla` is kept in step with the other sections; rfe() itself is given
# `gr25um ~ .` over the columns selected into `x`.
frmla <- gr25um ~ source + lw08um + N.1. + total_particle_number + rh

x <- df_joined_ft %>%
  dplyr::filter(sensor_type == "PMS5003") %>%
  dplyr::select(source, lw08um, total_particle_number, rh, N.1., gr25um)

svmProfile <- rfe(
  gr25um ~ .,
  data = x,
  # Only five candidate predictors exist, so subset sizes stop at 5.
  sizes = 1:5,
  rfeControl = rfeControl(
    functions = caretFuncs,
    method = "repeatedcv",
    number = 10,
    # Stated explicitly: with `repeats` omitted, the run recorded below
    # reports a single repeat of the 10-fold cross-validation.
    repeats = 1
  ),
  ## pass options to train()
  method = "svmRadial"
)

svmProfile
## 
## Recursive feature selection
## 
## Outer resampling method: Cross-Validated (10 fold, repeated 1 times) 
## 
## Resampling performance over subset size:
## 
##  Variables  RMSE Rsquared   MAE RMSESD RsquaredSD  MAESD Selected
##          1 8.123   0.4875 5.534  2.238    0.13557 1.1442         
##          2 6.728   0.6531 4.524  2.183    0.09998 0.8110         
##          3 6.345   0.6770 4.284  1.871    0.09933 0.7271         
##          4 7.115   0.5921 4.630  2.212    0.08658 0.7146         
##          5 6.033   0.7094 3.997  2.049    0.08132 0.7526        *
## 
## The top 5 variables (out of 5):
##    total_particle_number, lw08um, source, N.1., rh
predictors(svmProfile)
## [1] "total_particle_number" "lw08um"                "source"               
## [4] "N.1."                  "rh"
varImp(svmProfile)
# Mean of the importance scores recorded in svmProfile$variables, per variable.
data.frame(
  Overall = svmProfile$variables$Overall,
  var = svmProfile$variables$var
) %>%
  group_by(var) %>%
  summarise(mean_overall = mean(Overall, na.rm = TRUE))

SPS30

n05

# Recursive feature elimination with a radial-kernel SVM for the `n05` channel
# of the rows whose sensor_type is "SPS030".
# `frmla` is kept in step with the other sections; rfe() itself is given
# `n05 ~ .` over the columns selected into `x`.
frmla <- n05 ~ source + lw08um + N.1. + total_particle_number + rh

# Only five candidate predictors exist, so subset sizes stop at 5.
subsets <- 1:5
x <- df_joined_ft %>%
  dplyr::filter(sensor_type == "SPS030") %>%
  dplyr::select(source, lw08um, total_particle_number, rh, N.1., n05)

svmProfile <- rfe(
  n05 ~ .,
  data = x,
  sizes = subsets,
  rfeControl = rfeControl(
    functions = caretFuncs,
    method = "repeatedcv",
    number = 10,
    # Stated explicitly: with `repeats` omitted, the run recorded below
    # reports a single repeat of the 10-fold cross-validation.
    repeats = 1
  ),
  ## pass options to train()
  method = "svmRadial"
)

svmProfile
## 
## Recursive feature selection
## 
## Outer resampling method: Cross-Validated (10 fold, repeated 1 times) 
## 
## Resampling performance over subset size:
## 
##  Variables  RMSE Rsquared   MAE RMSESD RsquaredSD MAESD Selected
##          1 23.02   0.6830 16.18  4.815    0.08228 2.280         
##          2 22.37   0.6992 15.99  4.561    0.07431 2.299         
##          3 25.05   0.6296 17.19  5.626    0.07987 2.530         
##          4 24.27   0.6522 16.64  6.074    0.08615 2.567         
##          5 20.31   0.7593 14.25  3.894    0.03027 1.602        *
## 
## The top 5 variables (out of 5):
##    lw08um, total_particle_number, N.1., source, rh
predictors(svmProfile)
## [1] "lw08um"                "total_particle_number" "N.1."                 
## [4] "source"                "rh"
varImp(svmProfile)
# Mean of the importance scores recorded in svmProfile$variables, per variable.
data.frame(
  Overall = svmProfile$variables$Overall,
  var = svmProfile$variables$var
) %>%
  group_by(var) %>%
  summarise(mean_overall = mean(Overall, na.rm = TRUE))

n1

# Recursive feature elimination with a radial-kernel SVM for the `n1` channel
# of the rows whose sensor_type is "SPS030".
# `frmla` is kept in step with the other sections; rfe() itself is given
# `n1 ~ .` over the columns selected into `x`.
frmla <- n1 ~ source + lw08um + N.1. + total_particle_number + rh
x <- df_joined_ft %>%
  dplyr::filter(sensor_type == "SPS030") %>%
  dplyr::select(source, lw08um, total_particle_number, rh, N.1., n1)

svmProfile <- rfe(
  n1 ~ .,
  data = x,
  # Only five candidate predictors exist, so subset sizes stop at 5.
  sizes = 1:5,
  rfeControl = rfeControl(
    functions = caretFuncs,
    method = "repeatedcv",
    number = 10,
    # Stated explicitly: with `repeats` omitted, the run recorded below
    # reports a single repeat of the 10-fold cross-validation.
    repeats = 1
  ),
  ## pass options to train()
  method = "svmRadial"
)

svmProfile
## 
## Recursive feature selection
## 
## Outer resampling method: Cross-Validated (10 fold, repeated 1 times) 
## 
## Resampling performance over subset size:
## 
##  Variables  RMSE Rsquared   MAE RMSESD RsquaredSD MAESD Selected
##          1 26.92   0.6883 19.84  4.309    0.08547 2.224         
##          2 27.52   0.6791 19.82  5.526    0.11932 3.062         
##          3 30.54   0.6017 21.15  4.754    0.10714 2.138         
##          4 29.30   0.6270 20.33  5.689    0.12256 2.645         
##          5 24.46   0.7479 17.48  4.418    0.07750 2.140        *
## 
## The top 5 variables (out of 5):
##    total_particle_number, lw08um, N.1., source, rh
predictors(svmProfile)
## [1] "total_particle_number" "lw08um"                "N.1."                 
## [4] "source"                "rh"
varImp(svmProfile)
# Mean of the importance scores recorded in svmProfile$variables, per variable.
data.frame(
  Overall = svmProfile$variables$Overall,
  var = svmProfile$variables$var
) %>%
  group_by(var) %>%
  summarise(mean_overall = mean(Overall, na.rm = TRUE))

n25

# Recursive feature elimination with a radial-kernel SVM for the `n25` channel
# of the rows whose sensor_type is "SPS030".
# `frmla` is kept in step with the other sections; rfe() itself is given
# `n25 ~ .` over the columns selected into `x`.
frmla <- n25 ~ source + lw08um + N.1. + total_particle_number + rh

x <- df_joined_ft %>%
  dplyr::filter(sensor_type == "SPS030") %>%
  dplyr::select(source, lw08um, total_particle_number, rh, N.1., n25)

svmProfile <- rfe(
  n25 ~ .,
  data = x,
  # Only five candidate predictors exist, so subset sizes stop at 5.
  sizes = 1:5,
  rfeControl = rfeControl(
    functions = caretFuncs,
    method = "repeatedcv",
    number = 10,
    # Stated explicitly: with `repeats` omitted, the run recorded below
    # reports a single repeat of the 10-fold cross-validation.
    repeats = 1
  ),
  ## pass options to train()
  method = "svmRadial"
)

svmProfile
## 
## Recursive feature selection
## 
## Outer resampling method: Cross-Validated (10 fold, repeated 1 times) 
## 
## Resampling performance over subset size:
## 
##  Variables  RMSE Rsquared   MAE RMSESD RsquaredSD MAESD Selected
##          1 28.83   0.6525 20.80  3.519    0.10302 1.345         
##          2 29.34   0.6395 21.00  4.177    0.08297 1.652         
##          3 31.59   0.5908 22.06  5.618    0.10668 2.401         
##          4 28.51   0.6731 19.77  6.365    0.07988 3.511         
##          5 26.61   0.7130 19.06  4.344    0.07928 2.171        *
## 
## The top 5 variables (out of 5):
##    total_particle_number, lw08um, N.1., rh, source
predictors(svmProfile)
## [1] "total_particle_number" "lw08um"                "N.1."                 
## [4] "rh"                    "source"
varImp(svmProfile)
# Mean of the importance scores recorded in svmProfile$variables, per variable.
data.frame(
  Overall = svmProfile$variables$Overall,
  var = svmProfile$variables$var
) %>%
  group_by(var) %>%
  summarise(mean_overall = mean(Overall, na.rm = TRUE))

OPCR1

Bin0 - 0.35/0.7um

# Recursive feature elimination with a radial-kernel SVM for OPCR1 `Bin0`.
# `frmla` is kept in step with the other sections; rfe() itself is given
# `Bin0 ~ .` over the columns selected into `x`.
frmla <- Bin0 ~ source + lw08um + N.1. + total_particle_number + rh

# Only five candidate predictors exist, so subset sizes stop at 5.
subsets <- 1:5
x <- df_joined_ft %>%
  dplyr::filter(sensor_type == "OPCR1") %>%
  dplyr::select(source, lw08um, total_particle_number, rh, N.1., Bin0)

svmProfile <- rfe(
  Bin0 ~ .,
  data = x,
  sizes = subsets,
  rfeControl = rfeControl(
    functions = caretFuncs,
    method = "repeatedcv",
    number = 10,
    # Stated explicitly: with `repeats` omitted, the run recorded below
    # reports a single repeat of the 10-fold cross-validation.
    repeats = 1
  ),
  ## pass options to train()
  method = "svmRadial"
)

svmProfile
## 
## Recursive feature selection
## 
## Outer resampling method: Cross-Validated (10 fold, repeated 1 times) 
## 
## Resampling performance over subset size:
## 
##  Variables  RMSE Rsquared    MAE RMSESD RsquaredSD MAESD Selected
##          1 202.7   0.5574 129.26  45.02    0.08475 19.10         
##          2 166.0   0.7200  93.81  49.24    0.07139 16.42         
##          3 180.8   0.6569 100.09  63.45    0.13875 19.22         
##          4 168.1   0.7024  92.32  65.96    0.13918 17.79         
##          5 141.3   0.8013  80.83  49.46    0.07384 14.76        *
## 
## The top 5 variables (out of 5):
##    total_particle_number, lw08um, N.1., source, rh
predictors(svmProfile)
## [1] "total_particle_number" "lw08um"                "N.1."                 
## [4] "source"                "rh"
varImp(svmProfile)
# Mean of the importance scores recorded in svmProfile$variables, per variable.
data.frame(
  Overall = svmProfile$variables$Overall,
  var = svmProfile$variables$var
) %>%
  group_by(var) %>%
  summarise(mean_overall = mean(Overall, na.rm = TRUE))

Bin1 - 0.7/1.1um

# Recursive feature elimination with a radial-kernel SVM for OPCR1 `Bin1`.
# `frmla` is kept in step with the other sections; rfe() itself is given
# `Bin1 ~ .` over the columns selected into `x`.
frmla <- Bin1 ~ source + lw08um + N.1. + total_particle_number + rh

x <- df_joined_ft %>%
  dplyr::filter(sensor_type == "OPCR1") %>%
  dplyr::select(source, lw08um, total_particle_number, rh, N.1., Bin1)

svmProfile <- rfe(
  Bin1 ~ .,
  data = x,
  # Only five candidate predictors exist, so subset sizes stop at 5.
  sizes = 1:5,
  rfeControl = rfeControl(
    functions = caretFuncs,
    method = "repeatedcv",
    number = 10,
    # Stated explicitly: with `repeats` omitted, the run recorded below
    # reports a single repeat of the 10-fold cross-validation.
    repeats = 1
  ),
  ## pass options to train()
  method = "svmRadial"
)

svmProfile
## 
## Recursive feature selection
## 
## Outer resampling method: Cross-Validated (10 fold, repeated 1 times) 
## 
## Resampling performance over subset size:
## 
##  Variables  RMSE Rsquared    MAE RMSESD RsquaredSD MAESD Selected
##          1 365.3   0.1607 221.63  87.99    0.09025 44.31         
##          2 203.8   0.7492  97.96  88.77    0.13251 26.16         
##          3 186.7   0.7877  91.44  90.92    0.14621 24.45         
##          4 159.4   0.8564  83.61  68.53    0.06941 23.57        *
##          5 176.7   0.8171  84.52  77.23    0.09884 20.09         
## 
## The top 4 variables (out of 4):
##    total_particle_number, lw08um, source, rh
predictors(svmProfile)
## [1] "total_particle_number" "lw08um"                "source"               
## [4] "rh"
varImp(svmProfile)
# Mean of the importance scores recorded in svmProfile$variables, per variable.
data.frame(
  Overall = svmProfile$variables$Overall,
  var = svmProfile$variables$var
) %>%
  group_by(var) %>%
  summarise(mean_overall = mean(Overall, na.rm = TRUE))

Bin2 - 1.1/1.5um

# Recursive feature elimination with a radial-kernel SVM for OPCR1 `Bin2`.
# `frmla` is kept in step with the other sections; rfe() itself is given
# `Bin2 ~ .` over the columns selected into `x`.
frmla <- Bin2 ~ source + lw08um + N.1. + total_particle_number + rh

x <- df_joined_ft %>%
  dplyr::filter(sensor_type == "OPCR1") %>%
  dplyr::select(source, lw08um, total_particle_number, rh, N.1., Bin2)

svmProfile <- rfe(
  Bin2 ~ .,
  data = x,
  # Only five candidate predictors exist, so subset sizes stop at 5.
  sizes = 1:5,
  rfeControl = rfeControl(
    functions = caretFuncs,
    method = "repeatedcv",
    number = 10,
    # Stated explicitly: with `repeats` omitted, the run recorded below
    # reports a single repeat of the 10-fold cross-validation.
    repeats = 1
  ),
  ## pass options to train()
  method = "svmRadial"
)

svmProfile
## 
## Recursive feature selection
## 
## Outer resampling method: Cross-Validated (10 fold, repeated 1 times) 
## 
## Resampling performance over subset size:
## 
##  Variables   RMSE Rsquared   MAE RMSESD RsquaredSD MAESD Selected
##          1 105.05   0.3727 55.01  26.35    0.06515 8.756         
##          2  54.63   0.8308 27.04  19.54    0.06567 6.295         
##          3  48.01   0.8742 24.57  15.53    0.03954 4.627         
##          4  58.24   0.8066 29.20  17.51    0.06594 5.912         
##          5  46.64   0.8723 25.83  12.58    0.07040 5.672        *
## 
## The top 5 variables (out of 5):
##    source, total_particle_number, lw08um, N.1., rh
predictors(svmProfile)
## [1] "source"                "total_particle_number" "lw08um"               
## [4] "N.1."                  "rh"
varImp(svmProfile)
# Mean of the importance scores recorded in svmProfile$variables, per variable.
data.frame(
  Overall = svmProfile$variables$Overall,
  var = svmProfile$variables$var
) %>%
  group_by(var) %>%
  summarise(mean_overall = mean(Overall, na.rm = TRUE))

Bin3 - 1.5/1.9um

# Recursive feature elimination with a radial-kernel SVM for OPCR1 `Bin3`.
# `frmla` is kept in step with the other sections; rfe() itself is given
# `Bin3 ~ .` over the columns selected into `x`.
frmla <- Bin3 ~ source + lw08um + N.1. + total_particle_number + rh

x <- df_joined_ft %>%
  dplyr::filter(sensor_type == "OPCR1") %>%
  dplyr::select(source, lw08um, total_particle_number, rh, N.1., Bin3)

svmProfile <- rfe(
  Bin3 ~ .,
  data = x,
  # Only five candidate predictors exist, so subset sizes stop at 5.
  sizes = 1:5,
  rfeControl = rfeControl(
    functions = caretFuncs,
    method = "repeatedcv",
    number = 10,
    # Stated explicitly: with `repeats` omitted, the run recorded below
    # reports a single repeat of the 10-fold cross-validation.
    repeats = 1
  ),
  ## pass options to train()
  method = "svmRadial"
)

svmProfile
## 
## Recursive feature selection
## 
## Outer resampling method: Cross-Validated (10 fold, repeated 1 times) 
## 
## Resampling performance over subset size:
## 
##  Variables  RMSE Rsquared   MAE RMSESD RsquaredSD MAESD Selected
##          1 43.31   0.3501 22.07  9.315    0.06363 2.252         
##          2 25.24   0.7698 12.00  9.679    0.10826 2.919         
##          3 21.43   0.8440 10.70  8.383    0.07335 2.533         
##          4 23.95   0.7966 11.89  9.286    0.09222 2.579         
##          5 20.53   0.8525 11.00  6.735    0.05661 2.496        *
## 
## The top 5 variables (out of 5):
##    source, total_particle_number, lw08um, N.1., rh
predictors(svmProfile)
## [1] "source"                "total_particle_number" "lw08um"               
## [4] "N.1."                  "rh"
varImp(svmProfile)
# Mean of the importance scores recorded in svmProfile$variables, per variable.
data.frame(
  Overall = svmProfile$variables$Overall,
  var = svmProfile$variables$var
) %>%
  group_by(var) %>%
  summarise(mean_overall = mean(Overall, na.rm = TRUE))

Bin4 - 1.9/2.4um

frmla <- Bin4  ~  source + lw08um + N.1. + total_particle_number + rh 

x <- df_joined_ft %>%
  dplyr::filter(sensor_type == "OPCR1") %>%
  dplyr::select(source, lw08um, total_particle_number, rh, N.1., Bin4)

svmProfile <- rfe(Bin4~., data = x,
                  sizes = c(1:6),
                  rfeControl = rfeControl(functions = caretFuncs,
                                          number = 10,
                                          method = "repeatedcv"),
                  ## pass options to train()
                  method = "svmRadial")



svmProfile
## 
## Recursive feature selection
## 
## Outer resampling method: Cross-Validated (10 fold, repeated 1 times) 
## 
## Resampling performance over subset size:
## 
##  Variables  RMSE Rsquared    MAE RMSESD RsquaredSD MAESD Selected
##          1 23.86   0.3637 12.503  7.634    0.09004 2.504         
##          2 13.53   0.8004  6.672  4.938    0.06495 1.391         
##          3 11.65   0.8612  5.877  4.719    0.05962 1.358        *
##          4 13.71   0.7936  6.761  5.527    0.07574 1.499         
##          5 12.23   0.8364  6.328  3.878    0.06008 1.276         
## 
## The top 3 variables (out of 3):
##    source, total_particle_number, lw08um
# Predictors kept in the subset selected by the RFE run above.
predictors(svmProfile)
## [1] "source"                "total_particle_number" "lw08um"
# Importance table reported by caret for the fitted profile.
varImp(svmProfile)
# Average the `Overall` importance of each predictor over every row stored in
# svmProfile$variables. The summary column is named explicitly so the header
# does not depend on how the expression is deparsed, and TRUE is spelled out
# because T is an ordinary, reassignable variable.
data.frame(
  Overall = svmProfile$variables$Overall,
  var = svmProfile$variables$var
) %>%
  group_by(var) %>%
  summarise(mean_overall = mean(Overall, na.rm = TRUE))

Ridge

PMS5003

gr03um

# Ridge fit for the PMS5003 gr03um channel against the common predictor set.
frmla <- gr03um ~ source + lw08um + N.1. + total_particle_number + rh

# Resampling scheme: 10-fold cross-validation repeated 10 times. This object
# is reused by the later glmnet fits in this notebook.
trControl <- trainControl(
  method = "repeatedcv",
  number = 10,
  repeats = 10,
  search = "grid"
)

# alpha = 0 restricts glmnet to the ridge penalty; lambda is tuned over 100
# evenly spaced values between 0.0001 and 1. `length.out` is written in full
# rather than relying on partial matching of `length`.
tuneGrid <- expand.grid(
  alpha = 0,
  lambda = seq(0.0001, 1, length.out = 100)
)

# Train on PMS5003 rows only; predictors are centred and scaled first.
glmnet.fit <- train(
  frmla,
  data = df_joined_ft[df_joined_ft$sensor_type == "PMS5003", ],
  method = "glmnet",
  metric = "RMSE",
  preProcess = c("center", "scale"),
  trControl = trControl,
  tuneGrid = tuneGrid
)

# Selected tuning parameters, followed by the predictor importance ranking.
glmnet.fit$bestTune
varImp(glmnet.fit)
## glmnet variable importance
## 
##                       Overall
## N.1.                   100.00
## lw08um                  78.08
## total_particle_number   68.10
## source                  31.22
## rh                       0.00

gr05um

# Ridge fit for the PMS5003 gr05um channel against the common predictor set.
frmla <- gr05um ~ source + lw08um + N.1. + total_particle_number + rh

# Train on PMS5003 rows only; predictors are centred and scaled, and the
# resampling scheme and tuning grid defined earlier are reused.
glmnet.fit <- train(
  frmla,
  data = df_joined_ft[df_joined_ft[["sensor_type"]] == "PMS5003", ],
  metric = "RMSE",
  method = "glmnet",
  tuneGrid = tuneGrid,
  trControl = trControl,
  preProcess = c("center", "scale")
)

# Selected tuning parameters, followed by the predictor importance ranking.
glmnet.fit$bestTune
varImp(glmnet.fit)
## glmnet variable importance
## 
##                       Overall
## N.1.                   100.00
## lw08um                  86.62
## total_particle_number   75.77
## source                  32.35
## rh                       0.00

gr10um

# Ridge fit for the PMS5003 gr10um channel against the common predictor set.
frmla <- gr10um ~ source + lw08um + N.1. + total_particle_number + rh

# Train on PMS5003 rows only; predictors are centred and scaled, and the
# resampling scheme and tuning grid defined earlier are reused.
glmnet.fit <- train(
  frmla,
  data = df_joined_ft[df_joined_ft[["sensor_type"]] == "PMS5003", ],
  metric = "RMSE",
  method = "glmnet",
  tuneGrid = tuneGrid,
  trControl = trControl,
  preProcess = c("center", "scale")
)

# Selected tuning parameters, followed by the predictor importance ranking.
glmnet.fit$bestTune
varImp(glmnet.fit)
## glmnet variable importance
## 
##                       Overall
## total_particle_number 100.000
## lw08um                 87.317
## N.1.                   20.084
## source                  9.541
## rh                      0.000

gr25um

# Ridge fit for the PMS5003 gr25um channel against the common predictor set.
frmla <- gr25um ~ source + lw08um + N.1. + total_particle_number + rh

# Train on PMS5003 rows only; predictors are centred and scaled, and the
# resampling scheme and tuning grid defined earlier are reused.
glmnet.fit <- train(
  frmla,
  data = df_joined_ft[df_joined_ft[["sensor_type"]] == "PMS5003", ],
  metric = "RMSE",
  method = "glmnet",
  tuneGrid = tuneGrid,
  trControl = trControl,
  preProcess = c("center", "scale")
)

# Selected tuning parameters, followed by the predictor importance ranking.
glmnet.fit$bestTune
varImp(glmnet.fit)
## glmnet variable importance
## 
##                       Overall
## total_particle_number  100.00
## source                  70.56
## lw08um                  41.69
## rh                      18.09
## N.1.                     0.00

SPS30

n05

# Ridge fit for the n05 channel against the common predictor set.
frmla <- n05 ~ source + lw08um + N.1. + total_particle_number + rh

# Train on rows labelled "SPS030" only; predictors are centred and scaled,
# and the resampling scheme and tuning grid defined earlier are reused.
glmnet.fit <- train(
  frmla,
  data = df_joined_ft[df_joined_ft[["sensor_type"]] == "SPS030", ],
  metric = "RMSE",
  method = "glmnet",
  tuneGrid = tuneGrid,
  trControl = trControl,
  preProcess = c("center", "scale")
)

# Selected tuning parameters, followed by the predictor importance ranking.
glmnet.fit$bestTune
varImp(glmnet.fit)
## glmnet variable importance
## 
##                       Overall
## lw08um                100.000
## total_particle_number  99.445
## N.1.                   26.235
## rh                      8.059
## source                  0.000

n1

# Ridge fit for the n1 channel against the common predictor set.
frmla <- n1 ~ source + lw08um + N.1. + total_particle_number + rh

# Train on rows labelled "SPS030" only; predictors are centred and scaled,
# and the resampling scheme and tuning grid defined earlier are reused.
glmnet.fit <- train(
  frmla,
  data = df_joined_ft[df_joined_ft[["sensor_type"]] == "SPS030", ],
  metric = "RMSE",
  method = "glmnet",
  tuneGrid = tuneGrid,
  trControl = trControl,
  preProcess = c("center", "scale")
)

# Selected tuning parameters, followed by the predictor importance ranking.
glmnet.fit$bestTune
varImp(glmnet.fit)
## glmnet variable importance
## 
##                       Overall
## total_particle_number 100.000
## lw08um                 88.994
## N.1.                   10.317
## rh                      1.104
## source                  0.000

n25

# Ridge fit for the n25 channel against the common predictor set.
frmla <- n25 ~ source + lw08um + N.1. + total_particle_number + rh

# Train on rows labelled "SPS030" only; predictors are centred and scaled,
# and the resampling scheme and tuning grid defined earlier are reused.
glmnet.fit <- train(
  frmla,
  data = df_joined_ft[df_joined_ft[["sensor_type"]] == "SPS030", ],
  metric = "RMSE",
  method = "glmnet",
  tuneGrid = tuneGrid,
  trControl = trControl,
  preProcess = c("center", "scale")
)

# Selected tuning parameters, followed by the predictor importance ranking.
glmnet.fit$bestTune
varImp(glmnet.fit)
## glmnet variable importance
## 
##                       Overall
## total_particle_number 100.000
## lw08um                 82.450
## source                  5.122
## N.1.                    3.637
## rh                      0.000

OPCR1

Bin0 - 0.35/0.7um

# Ridge fit for the OPCR1 Bin0 channel against the common predictor set.
frmla <- Bin0 ~ source + lw08um + N.1. + total_particle_number + rh

# Train on OPCR1 rows only; predictors are centred and scaled, and the
# resampling scheme and tuning grid defined earlier are reused.
glmnet.fit <- train(
  frmla,
  data = df_joined_ft[df_joined_ft[["sensor_type"]] == "OPCR1", ],
  metric = "RMSE",
  method = "glmnet",
  tuneGrid = tuneGrid,
  trControl = trControl,
  preProcess = c("center", "scale")
)

# Selected tuning parameters, followed by the predictor importance ranking.
glmnet.fit$bestTune
varImp(glmnet.fit)
## glmnet variable importance
## 
##                       Overall
## total_particle_number 100.000
## source                 56.552
## lw08um                 47.453
## rh                      3.423
## N.1.                    0.000

Bin1 - 0.7/1.1um

# Ridge fit for the OPCR1 Bin1 channel against the common predictor set.
frmla <- Bin1 ~ source + lw08um + N.1. + total_particle_number + rh

# Train on OPCR1 rows only; predictors are centred and scaled, and the
# resampling scheme and tuning grid defined earlier are reused.
glmnet.fit <- train(
  frmla,
  data = df_joined_ft[df_joined_ft[["sensor_type"]] == "OPCR1", ],
  metric = "RMSE",
  method = "glmnet",
  tuneGrid = tuneGrid,
  trControl = trControl,
  preProcess = c("center", "scale")
)

# Selected tuning parameters, followed by the predictor importance ranking.
glmnet.fit$bestTune
varImp(glmnet.fit)
## glmnet variable importance
## 
##                       Overall
## source                100.000
## total_particle_number  98.080
## rh                     31.458
## N.1.                    2.025
## lw08um                  0.000

Bin2 - 1.1/1.5um

# Ridge fit for the OPCR1 Bin2 channel against the common predictor set.
frmla <- Bin2 ~ source + lw08um + N.1. + total_particle_number + rh

# Train on OPCR1 rows only; predictors are centred and scaled, and the
# resampling scheme and tuning grid defined earlier are reused.
glmnet.fit <- train(
  frmla,
  data = df_joined_ft[df_joined_ft[["sensor_type"]] == "OPCR1", ],
  metric = "RMSE",
  method = "glmnet",
  tuneGrid = tuneGrid,
  trControl = trControl,
  preProcess = c("center", "scale")
)

# Selected tuning parameters, followed by the predictor importance ranking.
glmnet.fit$bestTune
varImp(glmnet.fit)
## glmnet variable importance
## 
##                       Overall
## source                 100.00
## total_particle_number   91.27
## lw08um                  20.15
## rh                      15.16
## N.1.                     0.00

Bin3 - 1.5/1.9um

# Ridge fit for the OPCR1 Bin3 channel against the common predictor set.
frmla <- Bin3 ~ source + lw08um + N.1. + total_particle_number + rh

# Train on OPCR1 rows only; predictors are centred and scaled, and the
# resampling scheme and tuning grid defined earlier are reused.
glmnet.fit <- train(
  frmla,
  data = df_joined_ft[df_joined_ft[["sensor_type"]] == "OPCR1", ],
  metric = "RMSE",
  method = "glmnet",
  tuneGrid = tuneGrid,
  trControl = trControl,
  preProcess = c("center", "scale")
)

# Selected tuning parameters, followed by the predictor importance ranking.
glmnet.fit$bestTune
varImp(glmnet.fit)
## glmnet variable importance
## 
##                       Overall
## source                 100.00
## total_particle_number   95.44
## lw08um                  28.61
## rh                      14.28
## N.1.                     0.00

Bin4 - 1.9/2.4um

# Ridge fit for the OPCR1 Bin4 channel against the common predictor set.
frmla <- Bin4 ~ source + lw08um + N.1. + total_particle_number + rh

# Train on OPCR1 rows only; predictors are centred and scaled, and the
# resampling scheme and tuning grid defined earlier are reused.
glmnet.fit <- train(
  frmla,
  data = df_joined_ft[df_joined_ft[["sensor_type"]] == "OPCR1", ],
  metric = "RMSE",
  method = "glmnet",
  tuneGrid = tuneGrid,
  trControl = trControl,
  preProcess = c("center", "scale")
)

# Selected tuning parameters, followed by the predictor importance ranking.
glmnet.fit$bestTune
varImp(glmnet.fit)
## glmnet variable importance
## 
##                       Overall
## source                 100.00
## total_particle_number   98.62
## lw08um                  31.89
## rh                      10.37
## N.1.                     0.00