# Estimated time: ~1.5 hr
# In this tutorial, we train a random forest (RF) model that encodes positional
# information without accounting for surrounding conditions.
# Inputs: per-year spatiotemporal data frames saved as "stfdf_<year>.rda"
# Outputs: trained RF models, training/testing prediction data, and a
# computation-time log
# --- Environment setup ----
# NOTE(review): setwd() in scripts is fragile — prefer running from the project
# root or building paths with file.path(); kept to preserve current behavior.
setwd("C:/Users/raza/project")
base_path <- "C:/Users/raza/project/plot"

library(doParallel)
library(ranger)
library(caret) # createDataPartition()
library(CAST)  # CreateSpacetimeFolds() — used below but was never attached
cores <- detectCores()
registerDoParallel(cores = cores)

# Year ranges to process. Currently a single range; extend the vector
# (e.g. c("2001-05", "2006-10", ...)) to process more.
year_ranges <- "2001-05"

# Early-stopping state for the hyperparameter search: stop once
# `max_iterations_without_improvement` consecutive combinations fail to
# improve the cross-validated RMSE.
lowest_rmse <- Inf
iterations_since_improvement <- 0
max_iterations_without_improvement <- 20
# Main processing block: for each year range, (1) tune ranger hyperparameters
# with spatial (leave-location-out) cross-validation and early stopping,
# (2) fit a final quantile RF on a coordinate-based train/test split, and
# (3) save the model, predictions, and timings.
# NOTE(review): the year loop's closing brace sits immediately after the
# early-exit check below (before the final-model section), so everything from
# the final fit onward runs ONCE after the loop, using the last year's data.
# With a single year range this is equivalent to being inside the loop —
# confirm the intended brace placement before adding more year ranges.
{
for (year in year_ranges) {
start_time <- Sys.time()
cat("Processing year:", year, "\n")
file_name <- paste0("stfdf_", year, ".rda")
# Load the spatiotemporal data frame for the current year; this creates
# `stfdf` in the calling environment.
load(file = file_name)
# Flatten to a plain data frame suitable for modeling
GWD_df <- as.data.frame(stfdf)
# Response (GWD = groundwater depth, per the response formula below) plus
# terrain/climate/vegetation covariates.
covariates <- c("GWD", "Lat", "Lon", "SLOPE", "ASP", "DEM", "LNF", "TPI", "TWI", "TRI", "PREC", "TEMP", "NDVI", "ET", "EVI")
GWD_df <- GWD_df[, c("lon", "lat", "sp.ID", "time", covariates)]
# Drop rows with any missing value
GWD_df = GWD_df[complete.cases(GWD_df), ]
# Derived time metadata (not used further in this block)
time = sort(unique(GWD_df$time))
daysNum = length(time)
days = gsub("-", "", time, fixed = TRUE)
# --- Hyperparameter search space ----
min.node.size <- 2:10
sample.fraction <- seq(1, 0.832, -0.025) # 0.632 without / 1 with replacement
ntree <- 250 # 500
mtry <- 1:9
# Leave-location-out folds keyed on station ID (sp.ID), k = 5.
# NOTE(review): CreateSpacetimeFolds() is exported by the CAST package, which
# is not attached by the library() calls above — verify library(CAST) runs
# before this line.
indices <- CreateSpacetimeFolds(GWD_df, spacevar = "sp.ID", k = 5, seed = 42)
hp <- expand.grid(min.node.size = min.node.size, mtry = mtry, sf = sample.fraction)
# Random subsample of 50 combinations, then ordered by mtry.
# NOTE(review): there is no set.seed() before this sample(), so the searched
# subset differs between runs — confirm whether that is intended.
hp <- hp[sample(nrow(hp), 50), ]
hp <- hp[order(hp$mtry), ]
rmse_hp <- rep(NA, nrow(hp))
# --- Grid search with early stopping on cross-validated RMSE ----
for (h in 1:nrow(hp)) {
comb <- hp[h, ]
print(paste("combination: ", h, sep = ""))
print(comb)
# Pooled out-of-fold observations and predictions across all spatial folds
fold_obs <- c()
fold_pred <- c()
for (f in 1:length(indices$index)) {
print(paste("fold: ", f, sep = ""))
# Development (training) and validation rows for this fold
dev_df1 <- GWD_df[indices$index[[f]], covariates]
val_df1 <- GWD_df[indices$indexOut[[f]], covariates]
model <- ranger(GWD ~ ., data = dev_df1, importance = "none", seed = 42,
num.trees = ntree, mtry = comb$mtry,
splitrule = "extratrees", #variance extratrees
min.node.size = comb$min.node.size,
sample.fraction = comb$sf,
oob.error = FALSE)
fold_obs <- c(fold_obs, val_df1$GWD)
fold_pred <- c(fold_pred, predict(model, val_df1)$predictions)
}
# Cross-validated RMSE pooled over all folds for this combination
rmse_hp[h] <- round(sqrt(mean((fold_obs - fold_pred)^2, na.rm = TRUE)), 3)
print(paste("rmse: ", rmse_hp[h], sep = ""))
# Track the best RMSE seen so far.
# NOTE(review): lowest_rmse is initialized outside the year loop, so the
# early-stop counter carries over between years — confirm this is intended
# if multiple year ranges are ever processed.
if (rmse_hp[h] < lowest_rmse) {
lowest_rmse <- rmse_hp[h]
iterations_since_improvement <- 0
} else {
iterations_since_improvement <- iterations_since_improvement + 1
}
# Stop the grid search after too many combinations without improvement
if (iterations_since_improvement >= max_iterations_without_improvement) {
cat("No improvement in RMSE for", max_iterations_without_improvement, "iterations. Ending the process.\n")
break
}
}
# Propagate the early stop out of the year loop as well
if (iterations_since_improvement >= max_iterations_without_improvement) {
cat("Exiting early due to lack of improvement.\n")
break
}
}
# --- Final model: best hyperparameters from the grid search ----
# (Runs once after the year loop; `year`, `GWD_df`, `start_time` etc. persist
# from the last iteration because R loop variables leak into this scope.)
dev_parameters <- hp[which.min(rmse_hp), ]
print(dev_parameters)
GWD_df <- GWD_df[, covariates]
colnames(GWD_df)
# Split by unique coordinate pairs so that train and test share no locations
unique_coords <- unique(GWD_df[, c("Lat", "Lon")])
# Set a seed for reproducibility of the train/test split
set.seed(123)
# 80/20 partition over the coordinate index
index <- createDataPartition(1:nrow(unique_coords), p = 0.8, list = FALSE)
# Use the index to split the unique coordinates into training and testing sets
train_coords <- unique_coords[index, ]
test_coords <- unique_coords[-index, ]
# Attach all observations belonging to each coordinate set
train_data <- merge(GWD_df, train_coords, by = c("Lat", "Lon"))
test_data <- merge(GWD_df, test_coords, by = c("Lat", "Lon"))
# Recompute the covariate list without the response
covariates <- colnames(GWD_df)
covariates <- covariates[covariates != "GWD"] # Assuming "GWD" is the response variable
# Subset the data frames with selected covariates (response first)
train_data <- train_data[, c("GWD", covariates)]
test_data <- test_data[, c("GWD", covariates)]
# Final quantile random forest with the tuned hyperparameters
rf_model <- ranger(GWD ~ ., data = train_data, importance = "impurity", seed = 42,
num.trees = ntree, mtry = dev_parameters$mtry,
splitrule = "extratrees", #variance
min.node.size = dev_parameters$min.node.size,
sample.fraction = dev_parameters$sf,
quantreg = TRUE)
# Save the trained model for the current year (../models relative to the wd)
save(rf_model, file = paste0("../models/RF_", year, ".rda"))
# Training-set diagnostics: rf_model$predictions are the out-of-bag
# predictions on the training rows (ranger's default oob.error = TRUE here)
plot_data <- data.frame(Lat = train_data$Lat, Lon = train_data$Lon, Observations = train_data$GWD, Predictions = rf_model$predictions)
# NOTE(review): backslash separators mixed with the forward-slash base_path —
# works on Windows only; consider file.path() for portability
plot_file <- paste0(base_path, (paste0("\\model_training\\rf_training_", year, ".csv")))
#write.csv(plot_data, plot_file)
# Predict on the held-out locations with the trained model
predictions <- predict(rf_model, data = test_data, type = "response")
# Add the predictions to the test_data dataframe
test_data$predicted_GWD <- predictions$predictions
plot_data_test <- data.frame(Lat = test_data$Lat, Lon = test_data$Lon, GWD = test_data$GWD, predicted_GWD = test_data$predicted_GWD)
# Save test_data with predictions as a CSV file (currently disabled)
test_file <- paste0(base_path, (paste0("\\model_testing\\rf_test_", year, ".csv")))
#write.csv(plot_data_test, test_file)
end_time <- Sys.time()
# NOTE(review): this difftime auto-selects units (secs/mins/hours), so the
# logged number is ambiguous — consider difftime(end_time, start_time,
# units = "secs") to pin the unit
computation_time <- end_time - start_time
cat("Computation time for year", year, ":", computation_time, "\n")
# Append dev_parameters, best RMSE, and computation_time to the results log
result_file <- paste0(base_path,"\\computation_times_rf.txt")
write(paste("Year:", year), file = result_file, append = TRUE)
write(paste("dev_parameters:", toString(dev_parameters)), file = result_file, append = TRUE)
write(paste("RMSE:", min(rmse_hp, na.rm = TRUE)), file = result_file, append = TRUE)
write(paste("Computation time:", computation_time), file = result_file, append = TRUE)
write("\n", file = result_file, append = TRUE)
}