# Estimated time: ~1.5 hr
# In this tutorial, we train a random forest (RF) model that encodes positional
# information without accounting for surrounding conditions.
# Inputs: per-year spatiotemporal data frames saved as "stfdf_<year>.rda"
# Outputs: trained RF models, training/testing prediction data, and a
# computation-time log
# --- Environment setup ----
# NOTE(review): setwd() in scripts is fragile — prefer running from the project
# root or building paths with file.path(); kept to preserve current behavior.
setwd("C:/Users/raza/project")
base_path <- "C:/Users/raza/project/plot"

library(doParallel)
library(ranger)
library(caret) # createDataPartition()
library(CAST)  # CreateSpacetimeFolds() — used below but was never attached
cores <- detectCores()
registerDoParallel(cores = cores)

# Year ranges to process. Currently a single range; extend the vector
# (e.g. c("2001-05", "2006-10", ...)) to process more.
year_ranges <- "2001-05"

# Early-stopping state for the hyperparameter search: stop once
# `max_iterations_without_improvement` consecutive combinations fail to
# improve the cross-validated RMSE.
lowest_rmse <- Inf
iterations_since_improvement <- 0
max_iterations_without_improvement <- 20
# Main processing block: for each year range, (1) tune ranger hyperparameters
# with spatial (leave-location-out) cross-validation and early stopping,
# (2) fit a final quantile RF on a coordinate-based train/test split, and
# (3) save the model, predictions, and timings.
# NOTE(review): the year loop's closing brace sits immediately after the
# early-exit check below (before the final-model section), so everything from
# the final fit onward runs ONCE after the loop, using the last year's data.
# With a single year range this is equivalent to being inside the loop —
# confirm the intended brace placement before adding more year ranges.
{
for (year in year_ranges) {
start_time <- Sys.time()
cat("Processing year:", year, "\n")
file_name <- paste0("stfdf_", year, ".rda")
# Load the spatiotemporal data frame for the current year; this creates
# `stfdf` in the calling environment.
load(file = file_name)
# Flatten to a plain data frame suitable for modeling
GWD_df <- as.data.frame(stfdf)
# Response (GWD = groundwater depth, per the response formula below) plus
# terrain/climate/vegetation covariates.
covariates <- c("GWD", "Lat", "Lon", "SLOPE", "ASP", "DEM", "LNF", "TPI", "TWI", "TRI", "PREC", "TEMP", "NDVI", "ET", "EVI")
GWD_df <- GWD_df[, c("lon", "lat", "sp.ID", "time", covariates)]
# Drop rows with any missing value
GWD_df = GWD_df[complete.cases(GWD_df), ]
# Derived time metadata (not used further in this block)
time = sort(unique(GWD_df$time))
daysNum = length(time)
days = gsub("-", "", time, fixed = TRUE)
# --- Hyperparameter search space ----
min.node.size <- 2:10
sample.fraction <- seq(1, 0.832, -0.025) # 0.632 without / 1 with replacement
ntree <- 250 # 500
mtry <- 1:9
# Leave-location-out folds keyed on station ID (sp.ID), k = 5.
# NOTE(review): CreateSpacetimeFolds() is exported by the CAST package, which
# is not attached by the library() calls above — verify library(CAST) runs
# before this line.
indices <- CreateSpacetimeFolds(GWD_df, spacevar = "sp.ID", k = 5, seed = 42)
hp <- expand.grid(min.node.size = min.node.size, mtry = mtry, sf = sample.fraction)
# Random subsample of 50 combinations, then ordered by mtry.
# NOTE(review): there is no set.seed() before this sample(), so the searched
# subset differs between runs — confirm whether that is intended.
hp <- hp[sample(nrow(hp), 50), ]
hp <- hp[order(hp$mtry), ]
rmse_hp <- rep(NA, nrow(hp))
# --- Grid search with early stopping on cross-validated RMSE ----
for (h in 1:nrow(hp)) {
comb <- hp[h, ]
print(paste("combination: ", h, sep = ""))
print(comb)
# Pooled out-of-fold observations and predictions across all spatial folds
fold_obs <- c()
fold_pred <- c()
for (f in 1:length(indices$index)) {
print(paste("fold: ", f, sep = ""))
# Development (training) and validation rows for this fold
dev_df1 <- GWD_df[indices$index[[f]], covariates]
val_df1 <- GWD_df[indices$indexOut[[f]], covariates]
model <- ranger(GWD ~ ., data = dev_df1, importance = "none", seed = 42,
num.trees = ntree, mtry = comb$mtry,
splitrule = "extratrees", #variance extratrees
min.node.size = comb$min.node.size,
sample.fraction = comb$sf,
oob.error = FALSE)
fold_obs <- c(fold_obs, val_df1$GWD)
fold_pred <- c(fold_pred, predict(model, val_df1)$predictions)
}
# Cross-validated RMSE pooled over all folds for this combination
rmse_hp[h] <- round(sqrt(mean((fold_obs - fold_pred)^2, na.rm = TRUE)), 3)
print(paste("rmse: ", rmse_hp[h], sep = ""))
# Track the best RMSE seen so far.
# NOTE(review): lowest_rmse is initialized outside the year loop, so the
# early-stop counter carries over between years — confirm this is intended
# if multiple year ranges are ever processed.
if (rmse_hp[h] < lowest_rmse) {
lowest_rmse <- rmse_hp[h]
iterations_since_improvement <- 0
} else {
iterations_since_improvement <- iterations_since_improvement + 1
}
# Stop the grid search after too many combinations without improvement
if (iterations_since_improvement >= max_iterations_without_improvement) {
cat("No improvement in RMSE for", max_iterations_without_improvement, "iterations. Ending the process.\n")
break
}
}
# Propagate the early stop out of the year loop as well
if (iterations_since_improvement >= max_iterations_without_improvement) {
cat("Exiting early due to lack of improvement.\n")
break
}
}
# --- Final model: best hyperparameters from the grid search ----
# (Runs once after the year loop; `year`, `GWD_df`, `start_time` etc. persist
# from the last iteration because R loop variables leak into this scope.)
dev_parameters <- hp[which.min(rmse_hp), ]
print(dev_parameters)
GWD_df <- GWD_df[, covariates]
colnames(GWD_df)
# Split by unique coordinate pairs so that train and test share no locations
unique_coords <- unique(GWD_df[, c("Lat", "Lon")])
# Set a seed for reproducibility of the train/test split
set.seed(123)
# 80/20 partition over the coordinate index
index <- createDataPartition(1:nrow(unique_coords), p = 0.8, list = FALSE)
# Use the index to split the unique coordinates into training and testing sets
train_coords <- unique_coords[index, ]
test_coords <- unique_coords[-index, ]
# Attach all observations belonging to each coordinate set
train_data <- merge(GWD_df, train_coords, by = c("Lat", "Lon"))
test_data <- merge(GWD_df, test_coords, by = c("Lat", "Lon"))
# Recompute the covariate list without the response
covariates <- colnames(GWD_df)
covariates <- covariates[covariates != "GWD"] # Assuming "GWD" is the response variable
# Subset the data frames with selected covariates (response first)
train_data <- train_data[, c("GWD", covariates)]
test_data <- test_data[, c("GWD", covariates)]
# Final quantile random forest with the tuned hyperparameters
rf_model <- ranger(GWD ~ ., data = train_data, importance = "impurity", seed = 42,
num.trees = ntree, mtry = dev_parameters$mtry,
splitrule = "extratrees", #variance
min.node.size = dev_parameters$min.node.size,
sample.fraction = dev_parameters$sf,
quantreg = TRUE)
# Save the trained model for the current year (../models relative to the wd)
save(rf_model, file = paste0("../models/RF_", year, ".rda"))
# Training-set diagnostics: rf_model$predictions are the out-of-bag
# predictions on the training rows (ranger's default oob.error = TRUE here)
plot_data <- data.frame(Lat = train_data$Lat, Lon = train_data$Lon, Observations = train_data$GWD, Predictions = rf_model$predictions)
# NOTE(review): backslash separators mixed with the forward-slash base_path —
# works on Windows only; consider file.path() for portability
plot_file <- paste0(base_path, (paste0("\\model_training\\rf_training_", year, ".csv")))
#write.csv(plot_data, plot_file)
# Predict on the held-out locations with the trained model
predictions <- predict(rf_model, data = test_data, type = "response")
# Add the predictions to the test_data dataframe
test_data$predicted_GWD <- predictions$predictions
plot_data_test <- data.frame(Lat = test_data$Lat, Lon = test_data$Lon, GWD = test_data$GWD, predicted_GWD = test_data$predicted_GWD)
# Save test_data with predictions as a CSV file (currently disabled)
test_file <- paste0(base_path, (paste0("\\model_testing\\rf_test_", year, ".csv")))
#write.csv(plot_data_test, test_file)
end_time <- Sys.time()
# NOTE(review): this difftime auto-selects units (secs/mins/hours), so the
# logged number is ambiguous — consider difftime(end_time, start_time,
# units = "secs") to pin the unit
computation_time <- end_time - start_time
cat("Computation time for year", year, ":", computation_time, "\n")
# Append dev_parameters, best RMSE, and computation_time to the results log
result_file <- paste0(base_path,"\\computation_times_rf.txt")
write(paste("Year:", year), file = result_file, append = TRUE)
write(paste("dev_parameters:", toString(dev_parameters)), file = result_file, append = TRUE)
write(paste("RMSE:", min(rmse_hp, na.rm = TRUE)), file = result_file, append = TRUE)
write(paste("Computation time:", computation_time), file = result_file, append = TRUE)
write("\n", file = result_file, append = TRUE)
}