Purpose: Format data for modeling in JAGS

4.1 Base data

Temperature and flow data

Code
dat <- read_csv("data/EcoDrought_FlowTempData_DailyWeekly_clean.csv")
dat
# A tibble: 167,535 × 23
   site_name  basin date       flow_mean tempc_mean tempc_min tempc_max Yield_mm
   <chr>      <chr> <date>         <dbl>      <dbl>     <dbl>     <dbl>    <dbl>
 1 Avery Bro… <NA>  2020-01-01     NA       NA         NA        NA        NA   
 2 Avery Bro… <NA>  2020-01-02     NA       NA         NA        NA        NA   
 3 Avery Bro… <NA>  2020-01-03     NA       NA         NA        NA        NA   
 4 Avery Bro… <NA>  2020-01-04     NA       NA         NA        NA        NA   
 5 Avery Bro… <NA>  2020-01-05     NA       NA         NA        NA        NA   
 6 Avery Bro… <NA>  2020-01-06     NA       NA         NA        NA        NA   
 7 Avery Bro… <NA>  2020-01-07     NA       NA         NA        NA        NA   
 8 Avery Bro… West… 2020-01-08      5.96     0.594      0.111     1.11      1.99
 9 Avery Bro… West… 2020-01-09      4.81     0.0336     0         0.111     1.61
10 Avery Bro… West… 2020-01-10      4.88     0.363      0         0.778     1.63
# ℹ 167,525 more rows
# ℹ 15 more variables: lat <dbl>, long <dbl>, area_sqmi <dbl>, elev_ft <dbl>,
#   yday <dbl>, air_temp_mean <dbl>, precip_mmday <dbl>, swe_kgm2 <dbl>,
#   daylength_sec <dbl>, shortrad_wm2 <dbl>, index <dbl>, year <dbl>,
#   count <dbl>, siteYear <chr>, movingMean <dbl>

Fix basins and trim to focal variables.

Code
mysitebasins <- dat %>% 
  group_by(site_name) %>% 
  summarize(basin = unique(basin),
            lat = unique(lat), 
            long = unique(long), 
            elev_ft = unique(elev_ft),
            area_sqmi = unique(area_sqmi)) %>% 
  filter(!is.na(basin), !is.na(lat), !is.na(long), !is.na(elev_ft), !is.na(area_sqmi)) %>% 
  mutate(basin = recode(basin, "Shields River" = "Yellowstone"))

dat <- dat %>% 
  select(-c(basin, lat, long, elev_ft, area_sqmi)) %>% 
  left_join(mysitebasins) %>% 
  select(site_name, basin, lat, long, elev_ft, area_sqmi, date, yday, year, siteYear, 
         tempc_mean, tempc_min, tempc_max, flow_mean, Yield_mm, air_temp_mean, precip_mmday, swe_kgm2, daylength_sec, shortrad_wm2)

dat
# A tibble: 167,535 × 20
   site_name basin   lat  long elev_ft area_sqmi date        yday  year siteYear
   <chr>     <chr> <dbl> <dbl>   <dbl>     <dbl> <date>     <dbl> <dbl> <chr>   
 1 Avery Br… West…  42.4 -72.7    699.      2.83 2020-01-01     1  2020 Avery B…
 2 Avery Br… West…  42.4 -72.7    699.      2.83 2020-01-02     2  2020 Avery B…
 3 Avery Br… West…  42.4 -72.7    699.      2.83 2020-01-03     3  2020 Avery B…
 4 Avery Br… West…  42.4 -72.7    699.      2.83 2020-01-04     4  2020 Avery B…
 5 Avery Br… West…  42.4 -72.7    699.      2.83 2020-01-05     5  2020 Avery B…
 6 Avery Br… West…  42.4 -72.7    699.      2.83 2020-01-06     6  2020 Avery B…
 7 Avery Br… West…  42.4 -72.7    699.      2.83 2020-01-07     7  2020 Avery B…
 8 Avery Br… West…  42.4 -72.7    699.      2.83 2020-01-08     8  2020 Avery B…
 9 Avery Br… West…  42.4 -72.7    699.      2.83 2020-01-09     9  2020 Avery B…
10 Avery Br… West…  42.4 -72.7    699.      2.83 2020-01-10    10  2020 Avery B…
# ℹ 167,525 more rows
# ℹ 10 more variables: tempc_mean <dbl>, tempc_min <dbl>, tempc_max <dbl>,
#   flow_mean <dbl>, Yield_mm <dbl>, air_temp_mean <dbl>, precip_mmday <dbl>,
#   swe_kgm2 <dbl>, daylength_sec <dbl>, shortrad_wm2 <dbl>

Calculate lagged air temp variables, logged flow variables, and daily change in SWE.

Code
# ensure proper ordering
dat <- dat[order(dat$site_name, dat$year, dat$yday),]

# calculate lagged air temp
dat <- dat %>%
  mutate(Yield_mm_log = log(Yield_mm + 0.00001),
         flow_mean_log = log(flow_mean + 0.00001)) %>%
  group_by(site_name) %>%
  mutate(air_temp_mean_lag1 = lag(air_temp_mean, 1),
         air_temp_mean_lag2 = lag(air_temp_mean, 2),
         delta_swe = swe_kgm2 - lag(swe_kgm2, 1)) %>%
  ungroup()

4.1.1 Trim to synchronized period

Load breakpoints

Code
springFallBPs <- read_csv("data/breakpoints.csv") %>% 
  mutate(basin = recode(basin, "Shields River" = "Yellowstone"))
springFallBPs
# A tibble: 459 × 9
   basin   site_name  year finalSpringBP sourceSpringBP finalFallBP sourceFallBP
   <chr>   <chr>     <dbl>         <dbl> <chr>                <dbl> <chr>       
 1 West B… Avery Br…  2020          66   directly calc…        300  directly ca…
 2 West B… Avery Br…  2021          93   directly calc…        304  directly ca…
 3 West B… Avery Br…  2022          94   directly calc…        313  directly ca…
 4 West B… Avery Br…  2023          79   directly calc…        300. site mean   
 5 West B… Avery Br…  2024          97   directly calc…        282  directly ca…
 6 West B… Sanderso…  2019          83.2 site mean             300  site mean   
 7 West B… Sanderso…  2020          66   directly calc…        301  directly ca…
 8 West B… Sanderso…  2021          80   directly calc…        303  directly ca…
 9 West B… Sanderso…  2022          94   directly calc…        313  directly ca…
10 West B… Sanderso…  2023          78   directly calc…        302  directly ca…
# ℹ 449 more rows
# ℹ 2 more variables: quantileLo <dbl>, quantileHi <dbl>

Join temp/flow data with breakpoints and filter to days within synchronized period.

Code
dat_bp <- dat %>% 
  left_join(springFallBPs) %>%
  filter(yday >= finalSpringBP & yday <= finalFallBP)
dat_bp
# A tibble: 94,956 × 34
   site_name basin   lat  long elev_ft area_sqmi date        yday  year siteYear
   <chr>     <chr> <dbl> <dbl>   <dbl>     <dbl> <date>     <dbl> <dbl> <chr>   
 1 Avery Br… West…  42.4 -72.7    699.      2.83 2020-03-06    66  2020 Avery B…
 2 Avery Br… West…  42.4 -72.7    699.      2.83 2020-03-07    67  2020 Avery B…
 3 Avery Br… West…  42.4 -72.7    699.      2.83 2020-03-08    68  2020 Avery B…
 4 Avery Br… West…  42.4 -72.7    699.      2.83 2020-03-09    69  2020 Avery B…
 5 Avery Br… West…  42.4 -72.7    699.      2.83 2020-03-10    70  2020 Avery B…
 6 Avery Br… West…  42.4 -72.7    699.      2.83 2020-03-11    71  2020 Avery B…
 7 Avery Br… West…  42.4 -72.7    699.      2.83 2020-03-12    72  2020 Avery B…
 8 Avery Br… West…  42.4 -72.7    699.      2.83 2020-03-13    73  2020 Avery B…
 9 Avery Br… West…  42.4 -72.7    699.      2.83 2020-03-14    74  2020 Avery B…
10 Avery Br… West…  42.4 -72.7    699.      2.83 2020-03-15    75  2020 Avery B…
# ℹ 94,946 more rows
# ℹ 24 more variables: tempc_mean <dbl>, tempc_min <dbl>, tempc_max <dbl>,
#   flow_mean <dbl>, Yield_mm <dbl>, air_temp_mean <dbl>, precip_mmday <dbl>,
#   swe_kgm2 <dbl>, daylength_sec <dbl>, shortrad_wm2 <dbl>,
#   Yield_mm_log <dbl>, flow_mean_log <dbl>, z_Yield_mm_log <dbl>,
#   z_air_temp_mean <dbl>, air_temp_mean_lag1 <dbl>, air_temp_mean_lag2 <dbl>,
#   z_air_temp_mean_lag1 <dbl>, z_air_temp_mean_lag2 <dbl>, …

4.1.2 Drop missing data

Show the distribution of the number of stream temp/flow observations (non-NA) per site-year…many years with poor data coverage, especially with respect to flow

Code
obspersy <- dat_bp %>% 
  group_by(siteYear) %>% 
  summarize(streamtemp_numobs = sum(!is.na(tempc_mean)),
            flow_numobs = sum(!is.na(Yield_mm))) %>% 
  ungroup()

paste(dim(obspersy)[1], " total site-years", sep = "")
[1] "459 total site-years"
Code
ggpubr::ggarrange(obspersy %>% ggplot() + geom_histogram(aes(x = streamtemp_numobs)) + theme_bw() + xlab("Number of stream temp. observations"),
                  obspersy %>% ggplot() + geom_histogram(aes(x = flow_numobs)) + theme_bw() + xlab("Number of flow observations"), 
                  nrow = 1)

Drop siteYears with less than 50 days of stream temp./flow observations

Code
drops <- obspersy %>% filter(streamtemp_numobs < 50 | flow_numobs < 50)

paste(dim(obspersy)[1]-dim(drops)[1], " filtered site-years", sep = "")
[1] "328 filtered site-years"
Code
dat_bp <- dat_bp %>% filter(!siteYear %in% drops$siteYear)

Now drop days with missing stream temperature data

Code
dat_bp <- dat_bp %>% filter(!is.na(tempc_mean))

4.1.3 Show data

Plot temp data with LOESS to show seasonal hysteresis

Code
dat_bp %>% 
  ggplot(aes(x = yday, y = tempc_mean)) + 
  geom_point(size = 0.1) +
  facet_wrap(~factor(basin, levels = c("West Brook", "Staunton River", "Paine Run", "Flathead", "Yellowstone", "Snake River", "Donner Blitzen"))) +
  geom_smooth(color = "red", se = FALSE) +
  theme_bw() #+ theme(panel.grid = element_blank())

4.1.4 Z-scores and dummy vars.

Z-score air temp, flow, and day of year. Note: variables are standardized both by site (“z_…”) and by basin (“zbas_…”).

Code
# calculate lagged air temp
dat_bp <- dat_bp %>%
  group_by(site_name) %>%
  mutate(z_Yield_mm_log = c(scale(Yield_mm_log, center = TRUE, scale = TRUE)),
         z_air_temp_mean = c(scale(air_temp_mean, center = TRUE, scale = TRUE)),
         z_air_temp_mean_lag1 = c(scale(air_temp_mean_lag1, center = TRUE, scale = TRUE)), #lag(z_air_temp_mean, n = 1),
         z_air_temp_mean_lag2 = c(scale(air_temp_mean_lag2, center = TRUE, scale = TRUE)) #lag(z_air_temp_mean, n = 2),
         #z_yday = c(scale(yday, center = TRUE, scale = TRUE))
         ) %>%
  ungroup() %>%
  group_by(basin) %>%
  mutate(zbas_Yield_mm_log = c(scale(Yield_mm_log, center = TRUE, scale = TRUE)),
         zbas_air_temp_mean = c(scale(air_temp_mean, center = TRUE, scale = TRUE)),
         zbas_air_temp_mean_lag1 = c(scale(air_temp_mean_lag1, center = TRUE, scale = TRUE)), #lag(zbas_air_temp_mean, n = 1),
         zbas_air_temp_mean_lag2 = c(scale(air_temp_mean_lag2, center = TRUE, scale = TRUE)) #lag(zbas_air_temp_mean, n = 2),
         #zbas_yday = c(scale(yday, center = TRUE, scale = TRUE))
         ) %>%
  ungroup()

Create dummy site and basin variables (numeric for JAGS), and define “rowNum” variable to allow for identifying first rows and evaluation rows

Code
# # Drop sites without any flow or water temp observations
# dropsites <- unlist(dat_bp %>% 
#   group_by(site_name) %>% 
#   summarize(numt = sum(!is.na(tempc_mean)), 
#             numf = sum(!is.na(z_Yield_mm_log))) %>% 
#   ungroup() %>%
#   filter(numt == 0 | numf == 0) %>%
#   select(site_name))
# dat_bp <- dat_bp %>% filter(!site_name %in% dropsites)

# arrange
dat_bp <- dat_bp %>% arrange(basin, site_name, year, yday)

# create tibbles of site and basin numeric codes
sitecodes <- tibble(site_name = unique(dat_bp$site_name), site_code = 1:length(unique(dat_bp$site_name)))
basincodes <- tibble(basin = unique(dat_bp$basin), basin_code = 1:length(unique(dat_bp$basin)))

# join to data and define row number
dat_bp <- dat_bp %>%
  left_join(sitecodes) %>%
  left_join(basincodes) %>%
  mutate(rowNum = 1:nrow(.))

4.2 Landscape covariates

To do – derive additional landscape covariates presumed to affect stream temperature, or rather, mediate the relationship between stream and air temperature/flow (groundwater influence/pasta?, basin slope, lake area, percent forest cover, etc.)

4.3 Check correlations

4.4 Write to file

Write formatted data to file

Code
write_csv(dat_bp, "data/EcoDrought_FlowTempData_formatted.csv")