Section 5

Conventional ML – RFsp

Estimated Time: ~40 minutes

In this tutorial, we will train an RFsp model that encodes positional information by using buffer distance maps from the observation points as covariates, thereby accounting for surrounding spatial conditions. Random Forest for Spatial Data (RFsp) (Hengl et al., 2018) is an adaptation of the traditional RF method to account for spatial dependencies in data.

Inputs:

  • Grid based MODIS ET, NDVI, EVI values
  • Temperature and precipitation data from German weather stations (DWD, Deutscher Wetterdienst)
  • Coordinates
  • Buffer distance

Outputs:

  • Trained RFsp model
  • Trained model predictions on the testing data set
  • Tuned set of model parameters
  • Computational time

# --- Setup ----
# NOTE(review): hard-coded absolute Windows paths and setwd() make this script
# non-portable; the relative paths used later (load()/save()) depend on this
# working directory, so it is kept as-is. Consider here::here() or file.path()
# with a configurable root in a future revision.
setwd("C:/Users/raza/project")
base_path <- "C:/Users/raza/project/plot"  # root for CSV/txt outputs written below
library(devtools)
#install_github("envirometrix/landmap")  # one-time install of landmap from GitHub
library(landmap)  # provides buffer.dist() for the buffer-distance covariates

# Year ranges to process; each must have a matching "stfdf_<range>.rda" file
# in the working directory.
year_ranges <- c("2001-05")

# Train, tune, and evaluate one RFsp model per year range.
# Side effects per iteration: saves the fitted ranger model (.rda), writes
# training- and test-prediction CSVs, and appends tuning parameters plus
# computation time to a text log under base_path.
for (year in year_ranges) {
  # Load spatiotemporal data frame for the current year.
  # NOTE(review): the .rda is assumed to provide at least `stfdf` (an STFDF-like
  # object with an @sp slot) and apparently also `GWD_df`, which is used below
  # (L64) without any visible assignment in this script — confirm the file's
  # contents.
  start_time <- Sys.time()
  cat("Processing year:", year, "\n")
  load(file = paste0("stfdf_", year, ".rda"))

  # Response ("GWD" = groundwater depth, presumably) plus terrain, climate,
  # and remote-sensing covariates expected as columns of GWD_df.
  covariates <- c("GWD", "Lat", "Lon", "SLOPE", "ASP", "DEM", "LNF", "TPI", "TWI", "TRI", "PREC", "TEMP", "NDVI", "ET", "EVI")

  ### Create BBOX ###
  # Bounding box of the observation points, padded by 1000 map units on each
  # side (coordinates appear to be projected — see the 1 km cell size below —
  # despite the "lon"/"lat" column names).
  xmin <- min(stfdf@sp@coords[, "lon"]) - 1000
  xmax <- max(stfdf@sp@coords[, "lon"]) + 1000
  ymin <- min(stfdf@sp@coords[, "lat"]) - 1000
  ymax <- max(stfdf@sp@coords[, "lat"]) + 1000
  bbox <- extent(xmin, xmax, ymin, ymax)
  bbox <- as(bbox, "SpatialPolygons")
  # NOTE(review): `utm33` is not defined anywhere in this script — it must come
  # from the loaded .rda or the calling environment; verify it holds a valid
  # proj4 string (e.g. UTM zone 33).
  bbox@proj4string <- CRS(utm33)  #"+proj=longlat +datum=WGS84"
  # Prediction grid ("fishnet") at 1 x 1 km resolution over the bbox.
  fishnet <- raster(bbox)
  res(fishnet) <- c(1000, 1000)  # 1 x 1 km
  # Cell values are placeholder random numbers; only the grid geometry matters
  # for the buffer-distance computation below.
  fishnet[] <- runif(length(fishnet), -10, 10)
  fishnet <- as(fishnet, "SpatialPixelsDataFrame")
  st <- as(fishnet, "SpatialPoints")
  # Quick visual / console sanity checks (no effect on results).
  plot(fishnet)
  #plot(border, add = TRUE)
  extent(fishnet)
  extent(stfdf@sp["staid"])
  class(fishnet)

  ## Geographic distances only:
  # One buffer-distance layer per station (RFsp "geographical covariates",
  # Hengl et al. 2018): grid.distP gets columns layer.1 ... layer.n.
  grid.distP <- landmap::buffer.dist(stfdf@sp["staid"], fishnet[1], as.factor(1:nrow(stfdf@sp)))
  #print(grid.distP)

  # NOTE(review): temp_df is never used after this line — dead code?
  temp_df <- as.data.frame(stfdf)

  # Join the per-station buffer distances onto the observation table by
  # station id.
  GWD_df$staid <- as.integer(as.character(GWD_df$staid))
  ov_toma <- do.call(cbind, list(stfdf@sp@data, over(stfdf@sp, grid.distP)))
  GWD_df <- plyr::join(GWD_df, ov_toma, by = "staid")

  # Keep response + covariates + the buffer-distance columns ("layer.<staid>").
  GWD_df <- GWD_df[, c(covariates, paste("layer.", sort(unique(as.integer(as.character(GWD_df$staid)))), sep = ""))]

  # Drop rows with any missing value (RFsp/ranger cannot handle NAs).
  GWD_df <- GWD_df[complete.cases(GWD_df), ]
  #colnames(GWD_df)

  # NOTE(review): `time`, `daysNum`, and `days` are computed but never used
  # afterwards; also GWD_df$time was dropped by the column subset above, so
  # `time` is likely NULL here — verify or remove.
  time <- sort(unique(GWD_df$time))
  daysNum <- length(time)
  days <- gsub("-", "", time, fixed = TRUE)
  nrow(GWD_df)

  ntree <- 250  # 500   # reduced from 500 to cut tuning/training time

  # tune RFsp as Hengl et al. 2018 did
  # Tuning is done on a 12,000-row random subsample (assumes nrow(GWD_df) >=
  # 12000 — TODO confirm) to keep tuneRanger tractable.
  rt <- makeRegrTask(data = GWD_df[sample(nrow(GWD_df), 12000), ], target = "GWD")

  # Print an estimate of how long tuning will take on all available cores.
  estimateTimeTuneRanger(rt, num.threads = detectCores())

  # Time-consuming >> do on number cruncher
  set.seed(42)
  # Tune mtry, min.node.size, and sample.fraction; the final model is built
  # separately below with ranger(), hence build.final.model = FALSE.
  t.rfsp <- tuneRanger(rt, num.trees = ntree, iters = 120, build.final.model = FALSE)

  dev_parameters <- t.rfsp$recommended.pars
  print(dev_parameters)

  # --- Spatial train/test split ---
  # Split by station location (unique Lat/Lon pairs) rather than by row, so
  # all observations from one station end up on the same side of the split.
  # Identify unique combinations of latitude and longitude
  unique_coords <- unique(GWD_df[, c("Lat", "Lon")])

  # Set a seed for reproducibility
  set.seed(123)

  # Create an index for splitting the data (80% of stations for training)
  index <- createDataPartition(1:nrow(unique_coords), p = 0.8, list = FALSE)

  # Use the index to split the unique coordinates into training and testing sets
  train_coords <- unique_coords[index, ]
  test_coords <- unique_coords[-index, ]

  # Merge the original data with the training and testing coordinates
  train_data <- merge(GWD_df, train_coords, by = c("Lat", "Lon"))
  test_data <- merge(GWD_df, test_coords, by = c("Lat", "Lon"))

  # Select covariates
  # NOTE(review): `covariates` is reassigned here to ALL columns except the
  # response (now including the buffer-distance layers), shadowing the vector
  # defined at the top of the loop.
  covariates <- colnames(GWD_df)
  covariates <- covariates[covariates != "GWD"]  # Assuming "GWD" is the response variable

  # Subset the data frames with selected covariates (response column first)
  train_data <- train_data[, c("GWD", covariates)]
  test_data <- test_data[, c("GWD", covariates)]

  # Fit the final RFsp model with the tuned hyperparameters.
  rfsp_model <- ranger(GWD ~ ., data = train_data, importance = "impurity", seed = 42,
                       num.trees = ntree, mtry = dev_parameters$mtry,
                       splitrule = "variance",
                       min.node.size = dev_parameters$min.node.size,
                       sample.fraction = dev_parameters$sample.fraction,
                       quantreg = TRUE)  ### quantreg??? -- NOTE(review): quantile
                       # regression forest mode; only needed if prediction
                       # intervals are wanted later — confirm intent.
  # Save the trained model for the current year
  save(rfsp_model, file = paste0("../models/RFsp_", year, ".rda"))

  # Create and save plot_data: observed vs out-of-bag predictions on the
  # training stations (rfsp_model$predictions are ranger's OOB predictions).
  plot_data <- data.frame(Lat = train_data$Lat, Lon = train_data$Lon, Observations = train_data$GWD, Predictions = rfsp_model$predictions)
  plot_file <- paste0(base_path, (paste0("\\model_training\\rfsp_training_", year, ".csv")))
  write.csv(plot_data, plot_file)

  # Make predictions on the test data using the trained model
  predictions <- predict(rfsp_model, data = test_data, type = "response")

  # Add the predictions to the test_data dataframe
  test_data$predicted_GWD <- predictions$predictions

  # Keep only the columns needed for evaluation/plotting.
  test_data <- test_data[, c("GWD", "predicted_GWD", "Lat", "Lon")]

  # Save test_data with predictions as a CSV file
  test_file <- paste0(base_path, (paste0("\\model_testing\\rfsp_test_", year, ".csv")))
  write.csv(test_data, test_file)

  # NOTE(review): difftime units are whatever difftime auto-selects (secs,
  # mins, ...), so logged values across years may not be directly comparable.
  end_time <- Sys.time()
  computation_time <- end_time - start_time
  #cat("dev_parameters for year", year, ":", dev_parameters, "\n")
  cat("Computation time for year", year, ":", computation_time, "\n")

  # Save dev_parameters, RMSE, and computation_time to a text file (appended,
  # so re-runs accumulate entries).
  result_file <- paste0(base_path, "\\computation_times_rfsp.txt")
  write(paste("Year:", year), file = result_file, append = TRUE)
  write(paste("dev_parameters:", toString(dev_parameters)), file = result_file, append = TRUE)
  #write(paste("RMSE:", min(rmse_hp, na.rm = TRUE)), file = result_file, append = TRUE)
  write(paste("Computation time:", computation_time), file = result_file, append = TRUE)
  write("\n", file = result_file, append = TRUE)

}