## ------------------------------------------------------- ##
# Silica WG - Combine Drivers
## ------------------------------------------------------- ##
# Written by:
## Nick J Lyon
# Purpose:
## Identify all extracted driver data (i.e., spatial data) and combine into a single file
## This makes the analysis phase easier on the WG as they can skip redundant joining steps
## ------------------------------------------------------- ##
# Housekeeping ----
## ------------------------------------------------------- ##
# Load needed libraries
# install.packages("librarian")
librarian::shelf(tidyverse, NCEAS/scicomptools, googledrive, readxl, magrittr, supportR, sf)
# Clear environment
rm(list = ls())
# Identify path to location of shared data
(path <- scicomptools::wd_loc(local = F, remote_path = file.path('/', "home", "shares", "lter-si", "si-watershed-extract")))
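# If working from a local copy instead, a hedged sketch (assumes `wd_loc` accepts a
# `local_path` argument and that the shared folder is synced to a hypothetical local directory):
# (path <- scicomptools::wd_loc(local = T, local_path = file.path("data", "si-watershed-extract")))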
# Load in shape area check
area_check <- read.csv(file.path(path, "shape_checks", "artisanal_shape_area_check_2.csv"))
# Check it out
dplyr::glimpse(area_check)
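# Note: `area_check` is only inspected here; it is not referenced again and is
# dropped when the environment is cleaned up below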
# Load in site names with lat/longs
sites <- readxl::read_excel(path = file.path(path, "site-coordinates",
                                             "silica-coords_RAW.xlsx")) %>%
  ## Pare down to minimum needed columns
  dplyr::select(LTER, Stream_Name, Discharge_File_Name, Shapefile_Name) %>%
  ## Drop duplicate rows (if any)
  dplyr::distinct()
# Check it out
dplyr::glimpse(sites)
# Grab the shapefiles the previous script (see PURPOSE section) created
sheds <- sf::st_read(dsn = file.path(path, "site-coordinates", "silica-watersheds.shp")) %>%
  # Expand names to what they were before
  dplyr::rename(Shapefile_Name = shp_nm,
                Stream_Name = Strm_Nm,
                expert_area_km2 = exp_area,
                shape_area_km2 = real_area)
## Combine `sites` and `sheds` to get ALL sheds (including HydroSHEDS) and their metadata (from the `sites` dataframe)
sheds <- sheds %>%
  dplyr::left_join(y = sites, by = c("LTER", "Shapefile_Name")) %>%
  # Remove any watersheds without a shapefile
  dplyr::filter(!is.na(Shapefile_Name) &
                  nchar(Shapefile_Name) != 0 &
                  !Shapefile_Name %in% c("?", "MISSING"))
# Coalesce the duplicate columns created by the join (".x" from `sheds`, ".y" from `sites`)
sheds$Stream_Name <- ifelse(!is.na(sheds$Stream_Name.x), sheds$Stream_Name.x, sheds$Stream_Name.y)
# Prefer the shapefile's (abbreviated) discharge file name where present
sheds$Discharge_File_Name <- ifelse(!is.na(sheds$Dsc_F_N), sheds$Dsc_F_N, sheds$Discharge_File_Name)
# Drop the now-redundant join, area, and HydroSHEDS columns
sheds <- sheds %>%
  dplyr::select(-c(Stream_Name.x, Stream_Name.y, expert_area_km2, shape_area_km2,
                   exp_are, hydrshd, real_ar, Dsc_F_N))
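# Optional sanity check (a sketch, not part of the original workflow): the join should
# not duplicate watersheds, i.e., each LTER + Shapefile_Name pair should appear once
# sum(duplicated(paste(sheds$LTER, sheds$Shapefile_Name))) # should return 0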
# Check that out
dplyr::glimpse(sheds)
# Clean up environment
rm(list = setdiff(ls(), c('path', 'sheds')))
## ------------------------------------------------------- ##
# Combine Extracted Data ----
## ------------------------------------------------------- ##
# List current extracted data
(extracted_data <- googledrive::drive_ls(googledrive::as_id("https://drive.google.com/drive/u/1/folders/1FBq2-FW6JikgIuGVMX5eyFRB6Axe2Hld"),
                                         pattern = ".csv") %>%
    dplyr::filter(name != "all-data_si-extract.csv"))
# Download all files
purrr::walk2(.x = extracted_data$id, .y = extracted_data$name,
             .f = ~ googledrive::drive_download(file = googledrive::as_id(.x),
                                                path = file.path(path, "extracted-data", .y),
                                                overwrite = T))
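# Quick optional check that every file landed in the expected folder
# list.files(path = file.path(path, "extracted-data"), pattern = ".csv")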
# Make an empty list
data_list <- list()
# Read the files into the list
for(file_name in extracted_data$name){
  data_list[[file_name]] <- read.csv(file = file.path(path, "extracted-data", file_name))
}
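# An equivalent purrr-based sketch (same result as the loop above, assuming all
# files live in the same "extracted-data" folder):
# data_list <- extracted_data$name %>%
#   purrr::set_names() %>%
#   purrr::map(.f = ~ read.csv(file = file.path(path, "extracted-data", .x)))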
# Duplicate the sheds object
driver_df <- sheds
# For each file
for(k in seq_along(data_list)){
  # Processing message
  message("Adding ", names(data_list)[k], " to output dataframe")
  # Left join onto the driver dataframe and overwrite the object
  driver_df %<>%
    dplyr::left_join(y = data_list[[k]],
                     by = c("LTER", "Stream_Name", "Discharge_File_Name", "Shapefile_Name"))
}
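# The loop above could also be written as a single fold via purrr::reduce (a sketch,
# assuming every extracted file shares the four join columns):
# driver_df <- purrr::reduce(.x = data_list, .f = dplyr::left_join,
#                            by = c("LTER", "Stream_Name", "Discharge_File_Name", "Shapefile_Name"),
#                            .init = sheds)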
# Glimpse what we wind up with
dplyr::glimpse(driver_df)
# Check all column names (should be *a lot*)
names(driver_df)
# Check for dropped rivers (i.e., rows)
## Stream names (chemistry river names)
supportR::diff_check(old = unique(sheds$Stream_Name), new = unique(driver_df$Stream_Name))
## Discharge file names (discharge river names)
supportR::diff_check(old = unique(sheds$Discharge_File_Name),
                     new = unique(driver_df$Discharge_File_Name))
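# `diff_check` reports entries found in only one of the two vectors; ideally nothing
# is flagged here, meaning no rivers were dropped by the joins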
## We need to drop the geometry column before export because it interferes with writing a flat .csv
driver_df <- sf::st_drop_geometry(driver_df)
## Lastly, we need to add the umlauts, commas, hyphens, etc. back to the Finnish names
## These needed to be removed to ensure that the data extraction workflow ran properly
driver_df$Stream_Name[driver_df$Stream_Name == "Kiiminkij 13010 4tien s"] <- "Kiiminkij 13010 4-tien s"
driver_df$Stream_Name[driver_df$Stream_Name == "Lestijoki 10800 8tien s"] <- "Lestijoki 10800 8-tien s"
driver_df$Stream_Name[driver_df$Stream_Name == "Mustijoki 42 6010"] <- "Mustijoki 4,2 6010"
driver_df$Stream_Name[driver_df$Stream_Name == "Mustionjoki 49 15500"] <- "Mustionjoki 4,9 15500"
driver_df$Stream_Name[driver_df$Stream_Name == "Porvoonjoki 115 6022"] <- "Porvoonjoki 11,5 6022"
driver_df$Stream_Name[driver_df$Stream_Name == "SIMOJOKI AS 13500"] <- "SIMOJOKI AS. 13500"
driver_df$Stream_Name[driver_df$Stream_Name == "Vantaa 42 6040"] <- "Vantaa 4,2 6040"
# Export this
write.csv(x = driver_df, na = '', row.names = F,
          file = file.path(path, "extracted-data", "all-data_si-extract_2_202411.csv"))
# And upload to GoogleDrive
googledrive::drive_upload(media = file.path(path, "extracted-data", "all-data_si-extract_2_202411.csv"),
                          overwrite = T,
                          path = googledrive::as_id("https://drive.google.com/drive/u/0/folders/1FBq2-FW6JikgIuGVMX5eyFRB6Axe2Hld"))
# End ----