Estimated Time: ~1hr
In this section, we explore and prepare the groundwater monitoring data set. We then conduct a data quality assessment to determine the total number of wells to be used in this study to exclude anthropogenic and extreme climate conditions. Several criteria were applied to this initial selection process: (1) the consistency of data at each well; (2) the observational uncertainty related to the head values in the well database (RMSE =< 0.25m); and (3) continuous functionality of the well.
Please make sure that you have installed all the packages mentioned in the previous sections.
Curate and harmonize groundwater time series; identify and address quality issues
folder_path <- "C:/Users/raza/project/ObsWell_based_data"
csv_files <- list.files(path = folder_path, pattern = "\\.csv$", full.names = TRUE)
num_files_meeting_condition <- 0
for (csv_file in csv_files) {
data <- read.csv(csv_file)
first_date <- dmy(data$Date[1])
if (!is.na(first_date) && year(first_date) >= 2000) {
num_files_meeting_condition <- num_files_meeting_condition + 1
cat("File:", basename(csv_file), "\n")
print(head(data$Date, 5))
# Remove file if needed
# unlink(csv_file)
}
}
cat("Number of files meeting condition:", num_files_meeting_condition, "\n")
source_folder <- folder_path
destination_folder <- "C:/Users/raza/project/monthly_data"
dir_create(destination_folder)
specific_years <- 2001:2005
filter_and_add_filename <- function(data, year, filename) {
data <- data %>%
mutate(Date = dmy(Date)) %>%
filter(year(Date) == year) %>%
mutate(ID = filename)
return(data)
}
csv_files <- dir_ls(source_folder, regexp = "\\.csv$")
for (yr in specific_years) {
data_list <- list()
for (csv_file in csv_files) {
data <- read.csv(csv_file, stringsAsFactors = FALSE)
filename <- substr(tools::file_path_sans_ext(basename(csv_file)), 1, 8)
data_list[[filename]] <- filter_and_add_filename(data, yr, filename)
}
combined_data <- bind_rows(data_list)
write.csv(combined_data, paste0(destination_folder, "/", yr, ".csv"), row.names = FALSE)
cat("Year", yr, "data saved.\n")
}
coordinate_file <- "C:/Users/raza/project/example/stammdaten_2023_07.csv"
coordinates_data <- read.csv(coordinate_file, stringsAsFactors = FALSE, fileEncoding = "UTF-8")
mean_monthly_folder <- "C:/Users/raza/project/mean_monthly_data"
file_list <- list.files(mean_monthly_folder, pattern = "\\.csv$", full.names = TRUE)
for (file in file_list) {
well_data <- read.csv(file, stringsAsFactors = FALSE)
merged_data <- merge(well_data, coordinates_data, by = "ID", all.x = TRUE)
write.csv(merged_data, paste0("merged_", basename(file)), row.names = FALSE)
}
monthly_files <- list.files(destination_folder, pattern = "\\.csv$", full.names = TRUE)
monthly_mean_list <- list()
for (file in monthly_files) {
data <- read.csv(file)
data$Date <- as.Date(data$Date)
data$Month <- format(data$Date, "%Y-%m")
monthly_mean <- data %>%
group_by(ID, Month) %>%
summarize(Mean_GWL = mean(GWL, na.rm = TRUE), .groups = "drop")
monthly_mean_list[[basename(file)]] <- monthly_mean
write.csv(monthly_mean, gsub(".csv", "_monthly_mean.csv", file), row.names = FALSE)
}
clean_folder <- "C:/Users/raza/project/mean_monthly_data"
file_list <- list.files(clean_folder, pattern = "*.csv", full.names = TRUE)
for (file in file_list) {
data <- read.csv(file)
data$Date <- as.yearmon(data$Month, "%Y-%m")
ids_to_remove <- data %>%
group_by(ID) %>%
summarize(Num_Unique_Months = n_distinct(Date)) %>%
filter(Num_Unique_Months < 12) %>%
pull(ID)
data_filtered <- data %>% filter(!ID %in% ids_to_remove)
clean_filename <- paste0("clean_", gsub("[^0-9]", "", basename(file)), ".csv")
write.csv(data_filtered, clean_filename, row.names = FALSE)
data_transposed <- pivot_wider(data_filtered, names_from = ID, values_from = Mean_GWL)
data_transposed <- data_transposed %>% select(where(~ !any(is.na(.))))
transposed_filename <- paste0("transposed_", gsub("[^0-9]", "", basename(file)), ".csv")
write.csv(data_transposed, transposed_filename, row.names = TRUE)
}
data <- read.csv("transposed_2001-05.csv")
data_num <- data[, -1] # remove first column if necessary
data_num <- sapply(data_num, as.numeric)
data_num[is.na(data_num)] <- mean(data_num, na.rm = TRUE)
z_scaled_data <- scale(data_num)
pca_result <- prcomp(z_scaled_data)
summary(pca_result)
# Scree plot
eigenvalues <- (pca_result$sdev)^2 / sum((pca_result$sdev)^2) * 100
eig_df <- data.frame(Dimension = 1:length(eigenvalues), Eigenvalue = eigenvalues)
eig_df_sub <- head(eig_df, 10)
p <- ggplot(eig_df_sub, aes(x = Dimension, y = Eigenvalue)) +
geom_bar(stat = "identity", fill = "skyblue", width = 0.5) +
geom_text(aes(label = round(Eigenvalue, 2)), vjust = -0.5, size = 6) +
labs(title = "Scree Plot", x = "Principal Component", y = "Variance (%)") +
theme_bw() +
theme(axis.text = element_text(size = 28), axis.title = element_text(size = 28), plot.title = element_text(size = 28))
ggsave("scree_plot_1.jpg", plot = p, width = 10, height = 6, units = "in", dpi = 200)
# Save PCA results
write.csv(pca_result$x, "scores_2001-05.csv")
write.csv(pca_result$rotation, "loadings_2001-05.csv")
pc_scores <- as.data.frame(pca_result$x)
regression_data <- cbind(data_num, pc_scores)
reference_hydrographs <- data.frame()
residuals_data <- data
for (well_col in 1:ncol(data_num)) {
lm_result <- lm(data_num[, well_col] ~ PC1 + PC2 + PC3 + PC4, data = regression_data)
reference <- predict(lm_result)
reference_hydrographs <- rbind(reference_hydrographs, data.frame(
Well = colnames(data_num)[well_col],
Reference_Hydrograph = reference
))
residuals_data[, well_col] <- data_num[, well_col] - reference
}
write.csv(reference_hydrographs, "reference_hydrographs.csv")
write.csv(residuals_data, "residuals_data.csv")
well_name <- "X32465430"
plot_data <- data.frame(
Date = as.Date(data$Month),
Groundwater_Head = data[[well_name]],
Reference_Hydrograph = reference_hydrographs$Reference_Hydrograph[reference_hydrographs$Well == well_name]
)
ggplot(plot_data, aes(x = Date)) +
geom_line(aes(y = Groundwater_Head, color = "Observed"), size = 1) +
geom_line(aes(y = Reference_Hydrograph, color = "Reference"), size = 1, linetype = "dashed") +
labs(title = paste("Well", well_name), x = "Year", y = "Groundwater Head") +
scale_color_manual(values = c("Observed" = "black", "Reference" = "red")) +
theme_bw()
well_rmses <- data.frame(Well = colnames(data_num), RMSE = NA)
for (i in seq_along(well_rmses$Well)) {
obs <- data_num[, i]
ref <- reference_hydrographs$Reference_Hydrograph[reference_hydrographs$Well == well_rmses$Well[i]]
well_rmses$RMSE[i] <- sqrt(mean((obs - ref)^2))
}
ranked_wells <- well_rmses[order(well_rmses$RMSE), ]
write.csv(well_rmses, "RMSE_2001-05.csv")
setwd("C:/Users/raza/project")
data <- read.csv("total.csv")
plot <- ggplot(data_melt, aes(x = factor(year), y = value, color = variable, group = variable)) +
geom_line(size = 1.2) + # Line plot with increased line width
labs(title = "",
x = "Year",
y = "Number of wells") +
theme_bw() +
theme(legend.position = "bottom",
axis.text.x = element_text(angle = 0, hjust = 1, size = 16, face = "bold"),
axis.text.y = element_text(size = 16, face = "bold"),
axis.title = element_text(size = 18),
legend.title = element_blank(),
legend.text = element_text(size = 14, face = "bold"),
plot.title = element_text(size = 14, face = "bold")) +
scale_color_manual(values = c("Total.Wells" = "#E41A1C", "Complete.Data" = "#377EB8",
"RMSE...0.25m" = "darkgreen", "RMSE....0.25m" = "darkgoldenrod",
"Missing.Data" = "orange"),
labels = c("Total Wells",
"Wells with complete data",
"RMSE <= 0.25m",
"Wells with missing data",
"RMSE > 0.25m")) +
scale_x_discrete(breaks = seq(2001, 2005, by = 1)) +
geom_vline(xintercept = c("2001", "2005"),
linetype = "dashed", color = "black", size = 0.45) +
guides(color = guide_legend(nrow = 2))
plot
# Save the plot as a PNG file
ggsave("total_v2.png", plot = plot, width = 10, height = 7, dpi = 100)