library(tidyverse) # for basic data manipulations library(sf) # for spatial data analysis library(mgcv) # for generalized additive models library(automap) # for automatic variogram selection in kriging models library(gstat) # for spatial models (kriging and inverse distance weighting) # Packages that may need to be installed manually: library(remap) # (included with materials) the remap package for # regional modeling library(snowload) # (included with materials) implementation of the PRISM # mapping program # General prediction and evaluation functions # ============================================================================== # Create a prediction function that caps GAM predictions at a maximum and # minimum value to avoid runaway extrapolation. predict.cut_model <- function(object, data, ...) { data$ELEVATION[data$ELEVATION < object$cuts$elev[[1]]] <- object$cuts$elev[[1]] data$ELEVATION[data$ELEVATION > object$cuts$elev[[2]]] <- object$cuts$elev[[2]] data$MCMT[data$MCMT < object$cuts$mcmt[[1]]] <- object$cuts$mcmt[[1]] data$MCMT[data$MCMT > object$cuts$mcmt[[2]]] <- object$cuts$mcmt[[2]] data$logPPTWT[data$logPPTWT < object$cuts$logpptwt[[1]]] <- object$cuts$logpptwt[[1]] data$logPPTWT[data$logPPTWT > object$cuts$logpptwt[[2]]] <- object$cuts$logpptwt[[2]] preds <- predict(object$model, data, ...) preds[preds < object$cuts$load[[1]]] <- object$cuts$load[[1]] preds[preds > object$cuts$load[[2]]] <- object$cuts$load[[2]] return(preds) } # Function to print cross-validation accuracy numbers to desired number # of significant digits. eval <- function(pred, obs) { c( MAE = signif(mean(abs(pred - obs)), 4), MedAE = signif(median(abs(pred - obs)), 4), MSE = signif(mean((pred - obs)^2), 4), Bias = signif(mean(pred / obs), 4), COV = signif(sd(pred / obs) / mean(pred / obs), 4) ) } # Runs cross validation for a specified function (fun) using the remap # framework. local_cv <- function(fun, alt_pred = NULL) { preds <- rep(as.numeric(NA), nrow(rtsl)) for (k in 1:max(as.numeric(rtsl$cv))) { print(paste("Fold", k)) pred_index <- rtsl$cv == k models <- remap::remap(rtsl[!pred_index, ], regions = eco3_final, region_id = NA_L3CODE, model_function = fun, buffer = 50, min_n = 150, distances = eco3_dist[!pred_index, ]) if (is.null(alt_pred)) { preds[pred_index] <- exp(predict(models, rtsl[pred_index, ], smooth = 25, distances = eco3_dist[pred_index, ])) } else { preds[pred_index] <- exp(predict(models, alt_pred[pred_index, ], smooth = 25)) } } if (is.null(alt_pred)) { return(eval(preds, rtsl$RC_II)) } else { return(preds) } } # Runs cross validation for "global" models that do not use remap. global_cv <- function(fun) { preds <- rep(as.numeric(NA), nrow(rtsl)) for (k in 1:max(as.numeric(rtsl$cv))) { print(paste("Fold", k)) pred_index <- rtsl$cv == k model <- fun(rtsl[!pred_index, ]) preds[pred_index] <- exp(predict(model, rtsl[pred_index, ])) } return(eval(preds, rtsl$RC_II)) } # GAM model functions # ============================================================================== gam_function <- function(data) { fml <- log(RC_II) ~ s(ELEVATION, k = 15) + s(MCMT, k = 15) + s(logPPTWT, k = 15) + s(LATITUDE, LONGITUDE, bs = "sos", k = 75) model <- mgcv::gam(fml, data = data) cuts <- list( elev = quantile(data$ELEVATION, c(0.01, 0.99)), mcmt = quantile(data$MCMT, c(0.01, 0.99)), logpptwt = quantile(data$logPPTWT, c(0.01, 0.99)), RC_II = log(quantile(data$RC_II, c(0.01, 0.99))) ) out <- list(model = model, cuts = cuts) class(out) <- "cut_model" return(out) } # Linear model functions # ============================================================================== lm_function <- function(data) { fml <- log(RC_II) ~ ELEVATION + MCMT + logPPTWT model <- lm(fml, data = data) cuts <- list( elev = quantile(data$ELEVATION, c(0.01, 0.99)), mcmt = quantile(data$MCMT, c(0.01, 0.99)), logpptwt = quantile(data$logPPTWT, c(0.01, 0.99)), load = log(quantile(data$RC_II, c(0.01, 0.99))) ) out <- list(model = model, cuts = cuts) class(out) <- "cut_model" return(out) } # Kriging model functions # ============================================================================== projection <- sf::st_crs("+proj=laea +x_0=0 +y_0=0 +lon_0=-100 +lat_0=45") krig_function <- function(data, load) { formula <- log(RC_II) ~ ELEVATION + MCMT + logPPTWT data <- data %>% sf::st_transform(projection) %>% sf::as_Spatial() model <- list(data = data, formula = formula) class(model) <- "krig" cuts <- list( elev = quantile(data$ELEVATION, c(0.01, 0.99)), mcmt = quantile(data$MCMT, c(0.01, 0.99)), logpptwt = quantile(data$logPPTWT, c(0.01, 0.99)), load = log(quantile(data$RC_II, c(0.01, 0.99))) ) out <- list(model = model, cuts = cuts) class(out) <- "cut_model" return(out) } predict.krig <- function(object, data) { if (nrow(data) == 0) return(NULL) data <- data %>% sf::st_transform(projection) %>% sf::as_Spatial() variogram_object <- automap::autofitVariogram( formula = object$formula, input_data = object$data, model = "Sph") k <- gstat::krige(formula = object$formula, locations = object$data, newdata = data, model = variogram_object$var_model, debug.level = 0) return(k$var1.pred) } # PRISM functions # ============================================================================== prism_function <- function(data) { fml <- log(RC_II) ~ ELEVATION + MCMT + logPPTWT model <- list(data = sf::as_Spatial(data), fml = fml) class(model) <- "prism" cuts <- list( elev = quantile(data$ELEVATION, c(0.01, 0.99)), mcmt = quantile(data$MCMT, c(0.01, 0.99)), logpptwt = quantile(data$logPPTWT, c(0.01, 0.99)), load = log(quantile(data$RC_II, c(0.01, 0.99))) ) out <- list(model = model, cuts = cuts) class(out) <- "cut_model" return(out) } predict.prism <- function(object, data) { if (nrow(data) != 0) { data <- sf::as_Spatial(data) snowload::prism(formula = object$fml, newdata = data, locations = object$data) } else { return(NULL) } } # IDW functions # ============================================================================== idw <- function(data) { data$ELEVATION[data$ELEVATION == 0] <- 1 data$NGSL <- data$RC_II / data$ELEVATION model <- list(data = sf::as_Spatial(data)) class(model) <- "idw" cuts <- list( elev = quantile(data$ELEVATION, c(0.01, 0.99)), mcmt = quantile(data$MCMT, c(0.01, 0.99)), logpptwt = quantile(data$logPPTWT, c(0.01, 0.99)), load = log(quantile(data$RC_II, c(0.01, 0.99))) ) out <- list(model = model, cuts = cuts) class(out) <- "cut_model" return(out) } predict.idw <- function(object, data) { if (nrow(data) != 0) { data$ELEVATION[data$ELEVATION <= 0] <- 1 data$NGSL <- data$RC_II / data$ELEVATION data <- sf::as_Spatial(data) out <- gstat::idw(NGSL ~ 1, locations = object$data, newdata = data, idp = 2)$var1.pred out[out <= 0] <- 1 return(log(out * data$ELEVATION)) } else { return(NULL) } } # Load data # ============================================================================== set.seed(432) # Set seed for reproducibility. rtsl <- read_csv("rtsl.csv") %>% dplyr::mutate(cv = sample(1:10, dplyr::n(), replace = TRUE), logPPTWT = log(PPTWT), x = LONGITUDE, y = LATITUDE) %>% sf::st_as_sf(coords = c("x", "y"), crs = 4326) load("eco3_simp.RData") eco3_final <- sf::st_as_sf(eco3_final, crs = 4326) %>% sf::st_transform(crs = 4326) # Make distance matrix and evaluation data frame # ============================================================================== eco3_dist <- remap::redist(rtsl, regions = eco3_final, region_id = NA_L3CODE) # Cross validation # ============================================================================== model_types <- c("GAM", "OLS", "Kriging", "Prism", "IDW") functions <- list(gam_function, lm_function, krig_function, prism_function, idw) cv_rtsl <- data.frame( Model = rep(model_types, each = 2), Fit = rep(c("Global", "Regional"), length(model_types)), MAE = rep(NA_real_, 2 * length(model_types)), MedAE = rep(NA_real_, 2 * length(model_types)), MSE = rep(NA_real_, 2 * length(model_types)), Bias = rep(NA_real_, 2 * length(model_types)), COV = rep(NA_real_, 2 * length(model_types)) ) # NOTE: This line of code can take quite a while (~ hour or so) to run. for (i in 1:length(functions)) { print(model_types[[i]]) cv_rtsl[(i * 2 - 1), 3:7] <- global_cv(functions[[i]]) cv_rtsl[(i * 2), 3:7] <- local_cv(functions[[i]]) } cv_rtsl