library(weatheRcues)
library(tidyverse)
#> ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
#> ✔ dplyr 1.1.4 ✔ readr 2.1.5
#> ✔ forcats 1.0.0 ✔ stringr 1.5.1
#> ✔ ggplot2 3.5.2 ✔ tibble 3.3.0
#> ✔ lubridate 1.9.4 ✔ tidyr 1.3.1
#> ✔ purrr 1.1.0
#> ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
#> ✖ dplyr::filter() masks stats::filter()
#> ✖ dplyr::lag() masks stats::lag()
#> ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(mgcv)
#> Loading required package: nlme
#>
#> Attaching package: 'nlme'
#>
#> The following object is masked from 'package:dplyr':
#>
#> collapse
#>
#> This is mgcv 1.9-3. For overview type 'help("mgcv-package")'.
library(lubridate)
The weatheRcues
package helps detect biologically
relevant weather cue windows—specific periods where climate variables
(e.g., temperature, rainfall) best explain biological responses such as
seed production. Here we demonstrate how to apply four major
methods:
Sliding Time Window (climwin)
P-spline Regression
Climate Sensitivity Profile
Peak Signal Detection
We will first simulate climate and seed data (because we all love seed production and want to know more about plant mast seeding) to illustrate each method.
We generate one weather cue window from June 150–160 (DOY) to influence seed production.
library(tidyverse)
set.seed(123)
startyear = 1940
years <- startyear:2020
days_per_year <- 365
#ok so now just generate climate for a long time.
#I will add yday, but also, the way I coded the function (for now) you need to have LONGITUDE AND LATITUDE columns
climate_data_simulated <- data.frame(
date = seq.Date(from = as.Date(paste0(startyear,"-01-01")), by = "day", length.out = length(years) * days_per_year),
temp = runif(length(years) * days_per_year, min = 10, max = 30)
) %>%
mutate(
year = as.numeric(format(date, "%Y")),
yday = yday(date),
temp = scale(temp)[,1],
LONGITUDE = NA,
LATITUDE = NA
)
# Select ~ one week in June
june_week <- climate_data_simulated %>%
filter(yday >= 150 & yday <= 160)
#now you can simulate you seed production (here I used parameters I observed). For seed production data, you also need to have additional column, Date2, with this d-m-Y format and two other column, plotname.lon.lat and sitenewname. I will try to fix this later, I used this to be sure my time series are matching climate location. I also selected one year less (to have sure that climate would be enough years)
seed_production_simulated <- june_week %>%
group_by(year) %>%
summarise(mean_temp = mean(temp)) %>%
mutate(
log.seed = 1 + 5 * scale(mean_temp)[,1] + rnorm(n(),mean =0, sd=1),
Date2 = as.POSIXct(paste0(year, "-06-15")),
sitenewname = "test",
plotname.lon.lat = "test") %>%
filter(year > startyear)
For the first method, I was using climwin R package. Here I customize a bit the function, but you can specify more option (for aggregate weather window for example). Please refer to Bailey and van de Pol, 2016, PlosOne, climwin: An R Toolbox for Climate Window Analysis.
head(seed_production_simulated)
#> # A tibble: 6 × 6
#> year mean_temp log.seed Date2 sitenewname plotname.lon.lat
#> <dbl> <dbl> <dbl> <dttm> <chr> <chr>
#> 1 1941 -0.312 -5.33 1941-06-15 00:00:00 test test
#> 2 1942 -0.0541 -0.648 1942-06-15 00:00:00 test test
#> 3 1943 -0.237 -3.59 1943-06-15 00:00:00 test test
#> 4 1944 0.0860 1.47 1944-06-15 00:00:00 test test
#> 5 1945 -0.343 -4.35 1945-06-15 00:00:00 test test
#> 6 1946 0.0669 2.59 1946-06-15 00:00:00 test test
head(climate_data_simulated)
#> date temp year yday LONGITUDE LATITUDE
#> 1 1940-01-01 -0.7300034 1940 1 NA NA
#> 2 1940-01-02 1.0069975 1940 2 NA NA
#> 3 1940-01-03 -0.3088745 1940 3 NA NA
#> 4 1940-01-04 1.3355500 1940 4 NA NA
#> 5 1940-01-05 1.5348410 1940 5 NA NA
#> 6 1940-01-06 -1.5695631 1940 6 NA NA
climwin.output = invisible(capture.output(runing_climwin(
climate_data = climate_data_simulated,
bio_data = seed_production_simulated,
site.name = 'test',
range = c(200, 0), #double check if climate is long enough (to match bio data)
cinterval = 'day',
refday = c(01, 11),
optionwindows = 'absolute',
climate_var = 'temp'
)))
#> Initialising, please wait...
#> Registered S3 methods overwritten by 'broom':
#> method from
#> nobs.fitdistr MuMIn
#> nobs.multinom MuMIn
climwin.output
#> [1] "[1] \"Response variable: log.seed\""
#> [2] "[1] \"range: 200\" \"range: 0\" "
#> [3] "[1] \"cinterval: day\""
#> [4] "[1] \"ref day starting: 1 with month: 11\""
#> [5] "[1] \"stat aggregate: mean\""
#> [6] "[1] \"windows option: absolute\""
#> [7] "[1] \"variable of interest: temp\""
#> [8] "\r | \r | | 0%# A tibble: 1 × 26"
#> [9] " sitenewname climate.file response climate type stat func DeltaAICc"
#> [10] "* <chr> <chr> <chr> <fct> <fct> <fct> <fct> <dbl>"
#> [11] "1 test test log.seed temperature.deg… abso… mean lin -204."
#> [12] "# ℹ 18 more variables: window.open <int>, window.close <dbl>, AIC <dbl>,"
#> [13] "# AICc <dbl>, BIC <dbl>, r2 <dbl>, R2_adjusted <dbl>, RMSE <dbl>,"
#> [14] "# Sigma <dbl>, slope.estimate <dbl>, slope.std.error <dbl>,"
#> [15] "# slope.statistic <dbl>, slope.p.value <dbl>, intercept.estimate <dbl>,"
#> [16] "# intercept.std.error <dbl>, intercept.statistic <dbl>,"
#> [17] "# intercept.p.value <dbl>, sigma <dbl>"
#in case you can change the option to give.clean = FALSE, it will give you all models outputs from `climwin::slidingwin`
P-splines model day-specific weather effects and apply a penalty to smooth consecutive coefficients. Here we got some issues, if data is too noisy (which is often the case with seed production in trees) or too short time series. But it is consider as one of the robust method if you are using phenological datasets (see another application in Simmonds et al, Journal of Animal Ecology, 2019).
head(seed_production_simulated)
#> # A tibble: 6 × 6
#> year mean_temp log.seed Date2 sitenewname plotname.lon.lat
#> <dbl> <dbl> <dbl> <dttm> <chr> <chr>
#> 1 1941 -0.312 -5.33 1941-06-15 00:00:00 test test
#> 2 1942 -0.0541 -0.648 1942-06-15 00:00:00 test test
#> 3 1943 -0.237 -3.59 1943-06-15 00:00:00 test test
#> 4 1944 0.0860 1.47 1944-06-15 00:00:00 test test
#> 5 1945 -0.343 -4.35 1945-06-15 00:00:00 test test
#> 6 1946 0.0669 2.59 1946-06-15 00:00:00 test test
head(climate_data_simulated)
#> date temp year yday LONGITUDE LATITUDE
#> 1 1940-01-01 -0.7300034 1940 1 NA NA
#> 2 1940-01-02 1.0069975 1940 2 NA NA
#> 3 1940-01-03 -0.3088745 1940 3 NA NA
#> 4 1940-01-04 1.3355500 1940 4 NA NA
#> 5 1940-01-05 1.5348410 1940 5 NA NA
#> 6 1940-01-06 -1.5695631 1940 6 NA NA
pspline.output = runing_psr(
bio_data = seed_production_simulated,
site = 'test',
climate_data = climate_data_simulated,
lastdays = 170,
refday = 305,
rollwin = 1,
formula_model = formula('log.seed ~ temp'),
matrice = c(3, 1),
knots = NULL,
tolerancedays = 7,
yearneed = 1
)
#> Joining with `by = join_by(year)`
pspline.output
#> sitenewname plotname.lon.lat reference.day windows.size window.open
#> 1 test test 305 1 155
#> window.close intercept.estimate intercept.std.error slope.estimate
#> 1 147 1.010298 0.2752799 13.10497
#> slope.std.error pvalue r2 AIC sigma nobs nsequence.id
#> 1 0.7893469 2.541362e-27 0.7794343 375.1715 2.462168 80 1
This method performs a daily regression, extracting \(R^2\) and slope over time, and fits two GAMs to smooth the results. Cue windows are defined by where both metrics exceed thresholds. First step here is to run a daily relationship.
head(seed_production_simulated)
#> # A tibble: 6 × 6
#> year mean_temp log.seed Date2 sitenewname plotname.lon.lat
#> <dbl> <dbl> <dbl> <dttm> <chr> <chr>
#> 1 1941 -0.312 -5.33 1941-06-15 00:00:00 test test
#> 2 1942 -0.0541 -0.648 1942-06-15 00:00:00 test test
#> 3 1943 -0.237 -3.59 1943-06-15 00:00:00 test test
#> 4 1944 0.0860 1.47 1944-06-15 00:00:00 test test
#> 5 1945 -0.343 -4.35 1945-06-15 00:00:00 test test
#> 6 1946 0.0669 2.59 1946-06-15 00:00:00 test test
head(climate_data_simulated)
#> date temp year yday LONGITUDE LATITUDE
#> 1 1940-01-01 -0.7300034 1940 1 NA NA
#> 2 1940-01-02 1.0069975 1940 2 NA NA
#> 3 1940-01-03 -0.3088745 1940 3 NA NA
#> 4 1940-01-04 1.3355500 1940 4 NA NA
#> 5 1940-01-05 1.5348410 1940 5 NA NA
#> 6 1940-01-06 -1.5695631 1940 6 NA NA
daily.relation.res = runing_daily_relationship(
bio_data = seed_production_simulated,
climate_data = climate_data_simulated,
lastdays = 170,
refday = 305,
rollwin = 1,
formula_model = formula('log.seed ~ temp'),
yearneed = 1
)
#> Joining with `by = join_by(year)`
#> Joining with `by = join_by(days.reversed)`
#> Joining with `by = join_by(sitenewname, days.reversed)`
#here you have a function to make a calendar matching DOY and days reversed.
calendar = generate_reverse_day_calendar(refday = 305, lastdays = 180, yearback = 1)
#daily.relation.res %>% left_join(calendar)
#You can also have a small fancy plot
plot_daily_relation(
daily.relation.res,
calendar,
x_axis = "reversed.day", #can change to doy or reversed.day
whattoplot = "estimate",
highlight_window = c(160, 150),#here it is close, but remember it is a simulation...
)
And now let’s try the climate sensitivity profile.
#here it is CSP method
csp.out = runing_csp(Results_days =daily.relation.res,
bio_data = seed_production_simulated,
climate_data= climate_data_simulated,
siteneame.forsub = "test",
refday = 305,
lastdays = 170,
rollwin = 1, yearneed = 1,
k.provided = -1, #by default please keep na, here is just a test to make it fast
formula_model = formula('log.seed ~ temp'), optim.k = F, option.check.name = F)
csp.out
#> sitenewname plotname.lon.lat reference.day windows.size window.open
#> 1 test test 1 1 169
#> 2 test test 1 1 159
#> window.close intercept.estimate intercept.std.error slope.estimate
#> 1 164 1.0251390 0.5906223 0.6019482
#> 2 143 0.9289593 0.3380087 18.3943038
#> slope.std.error pvalue r2 AIC sigma nobs nsequence.id
#> 1 1.639869 7.145602e-01 0.00172447 495.9582 5.238101 80 1
#> 2 1.469821 2.439479e-20 0.66754269 407.9968 3.022850 80 2
Peak signal detection identifies days with highest \(R^2 \times\) slope signal strength. It is quick method, but might be sensitive if you have multiple peaks (i.e multiple windows can be detected). It will work like a moving window after runing correlation to each days. Here you can adjust parameters lag, that will basically search cues after a number of lag day (here we specified 50). Then by increasing the threshold, you will basically select only the highest peak of correlation.
peak.out = runing_peak_detection(Results_days = daily.relation.res,
bio_data = seed_production_simulated,
climate_data = climate_data_simulated,
refday = 305,
lastdays = 170,
rollwin = 1, yearneed = 1,formula_model = formula('log.seed ~ temp'),
lag = 50,
threshold = 4,
tolerancedays = 7)
peak.out
#> sitenewname plotname.lon.lat reference.day windows.size window.open
#> 1 test test 1 1 51
#> 2 test test 1 1 73
#> 3 test test 1 1 91
#> 4 test test 1 1 105
#> 5 test test 1 1 125
#> 6 test test 1 1 156
#> 7 test test 1 1 169
#> window.close intercept.estimate intercept.std.error slope.estimate
#> 1 51 0.7837830 0.5663751 -1.5158707
#> 2 73 0.6724786 0.5817803 -1.2563666
#> 3 91 0.9399305 0.5657726 1.3444472
#> 4 105 0.8785059 0.5688615 1.2959679
#> 5 125 1.0400171 0.5667397 1.2974821
#> 6 146 1.0516443 0.1616624 15.6839376
#> 7 167 0.9506381 0.5897548 -0.6348765
#> slope.std.error pvalue r2 AIC sigma nobs nsequence.id
#> 1 0.5643704 8.833531e-03 0.084660975 489.0194 5.015794 80 1
#> 2 0.5250706 1.912750e-02 0.068381823 490.4297 5.060199 80 2
#> 3 0.5552178 1.777984e-02 0.069917768 490.2977 5.056026 80 3
#> 4 0.5539512 2.186933e-02 0.065568940 490.6709 5.067833 80 4
#> 5 0.5522725 2.133722e-02 0.066085823 490.6266 5.066431 80 5
#> 6 0.5095246 2.169725e-45 0.923939664 289.9980 1.445865 80 6
#> 7 1.0473750 5.461688e-01 0.004688548 495.7203 5.230319 80 7
Again, if you arrived here, these methods are available thanks to all previous research done on this topic (i.e detecting environmental cues), and we tried to make some of these methods more accessible to a broader audience. But we have still some improvement to make and would be happy to hear other advice ;)