Peter Ralph
14 January 2020 – Advanced Biological Statistics
A time series is a sequence of observations \[\begin{aligned} (y_1, y_2, \ldots, y_N) , \end{aligned}\] that were taken at some set of times \[\begin{aligned} t_1 < t_2 < \cdots < t_N . \end{aligned}\]
Goals might be to:
Many statistical methods assume independence of observations.
Nearby observations in a time series are definately not.
Solutions: model the correlation; subtract it off.
Suppose we have regular, noisy observations from a discrete, noisy oscillator.
The system itself does \[\begin{aligned} x_{t+1} - x_t &= \alpha y_t + \Normal(0, \sigma_{xy}) \\ y_{t+1} - y_t &= - \beta x_t + \Normal(0, \sigma_{xy}) \end{aligned}\] but we only get to observe \[\begin{aligned} X_t &= x_t + \Normal(0, \sigma_\epsilon) \\ Y_t &= y_t + \Normal(0, \sigma_\epsilon) . \end{aligned}\]
Here’s what this looks like.
true_osc <- list(alpha=.1,
beta=.05,
sigma_xy=.01,
sigma_eps=1)
N <- 500
xy <- matrix(nrow=N, ncol=2)
xy[1,] <- c(3,0)
for (k in 1:(N-1)) {
xy[k+1,] <- (xy[k,]
+ c(true_osc$alpha * xy[k,2],
(-1) * true_osc$beta * xy[k,1])
+ rnorm(2, 0, true_osc$sigma_xy))
}
XY <- xy + rnorm(N*2, 0, true_osc$sigma_eps)
osc_block <- "
data {
int N;
vector[N] X;
vector[N] Y;
}
parameters {
real alpha;
real beta;
real<lower=0> sigma_xy;
real<lower=0> sigma_eps;
vector[N] x;
vector[N] y;
}
model {
x[2:N] ~ normal(x[1:(N-1)] + alpha * y[1:(N-1)], sigma_xy);
y[2:N] ~ normal(y[1:(N-1)] - beta * x[1:(N-1)], sigma_xy);
X ~ normal(x, sigma_eps);
Y ~ normal(y, sigma_eps);
alpha ~ normal(0, 1);
beta ~ normal(0, 1);
sigma_xy ~ normal(0, 1);
sigma_eps ~ normal(0, 1);
}
"
osc_model <- stan_model(model_code=osc_block)
osc_fit <- sampling(osc_model,
data=list(N=N,
X=XY[,1],
Y=XY[,2]),
iter=1000, chains=3,
control=list(max_treedepth=12))
## Warning: There were 3 chains where the estimated Bayesian Fraction of Missing Information was low. See
## http://mc-stan.org/misc/warnings.html#bfmi-low
## Warning: Examine the pairs() plot to diagnose sampling problems
## Warning: The largest R-hat is 1.45, indicating chains have not mixed.
## Running the chains for more iterations may help. See
## http://mc-stan.org/misc/warnings.html#r-hat
## Warning: Bulk Effective Samples Size (ESS) is too low, indicating posterior means and medians may be unreliable.
## Running the chains for more iterations may help. See
## http://mc-stan.org/misc/warnings.html#bulk-ess
## Warning: Tail Effective Samples Size (ESS) is too low, indicating posterior variances and tail quantiles may be unreliable.
## Running the chains for more iterations may help. See
## http://mc-stan.org/misc/warnings.html#tail-ess
cbind(truth=c(true_osc$alpha, true_osc$beta, true_osc$sigma_xy, true_osc$sigma_eps),
rstan::summary(osc_fit, pars=c("alpha", "beta", "sigma_xy", "sigma_eps"))$summary)
## truth mean se_mean sd 2.5% 25% 50% 75% 97.5% n_eff Rhat
## alpha 0.10 0.09976285 6.021567e-05 0.0017514028 0.09629297 0.09855202 0.09972681 0.10097621 0.10309224 845.966230 1.0006770
## beta 0.05 0.04989406 3.039775e-05 0.0008868576 0.04824137 0.04926927 0.04986840 0.05049346 0.05172836 851.186636 1.0009753
## sigma_xy 0.01 0.01605386 1.171786e-03 0.0036154651 0.01104737 0.01332916 0.01554787 0.01780032 0.02528952 9.519889 1.4247695
## sigma_eps 1.00 0.99157676 5.863422e-04 0.0220641441 0.95013703 0.97525310 0.99101743 1.00719460 1.03560456 1416.028210 0.9987841
Here is a density plot of 100 estimated trajectories (of x
and y
) from the Stan fit.
Let’s try that again, with more noise.
Here’s what this looks like.
true_osc2 <- list(alpha=.1,
beta=.05,
sigma_xy=2.0,
sigma_eps=4.0)
xy2 <- matrix(nrow=N, ncol=2)
xy2[1,] <- c(3,0)
for (k in 1:(N-1)) {
xy2[k+1,] <- (xy2[k,]
+ c(true_osc2$alpha * xy2[k,2],
(-1) * true_osc2$beta * xy2[k,1])
+ rnorm(2, 0, true_osc2$sigma_xy))
}
XY2 <- xy2 + rnorm(N*2, 0, true_osc2$sigma_eps)
osc_fit2 <- sampling(osc_model,
data=list(N=N,
X=XY2[,1],
Y=XY2[,2]),
iter=1000, chains=3,
control=list(max_treedepth=12))
## Warning: Bulk Effective Samples Size (ESS) is too low, indicating posterior means and medians may be unreliable.
## Running the chains for more iterations may help. See
## http://mc-stan.org/misc/warnings.html#bulk-ess
## Warning: Tail Effective Samples Size (ESS) is too low, indicating posterior variances and tail quantiles may be unreliable.
## Running the chains for more iterations may help. See
## http://mc-stan.org/misc/warnings.html#tail-ess
cbind(truth=c(true_osc2$alpha, true_osc2$beta, true_osc2$sigma_xy, true_osc2$sigma_eps),
rstan::summary(osc_fit2, pars=c("alpha", "beta", "sigma_xy", "sigma_eps"))$summary)
## truth mean se_mean sd 2.5% 25% 50% 75% 97.5% n_eff Rhat
## alpha 0.10 0.09951669 3.934835e-05 0.001594103 0.09637761 0.09842452 0.09954156 0.10058203 0.10263996 1641.268 0.9993526
## beta 0.05 0.04898589 2.859515e-05 0.001144167 0.04670660 0.04822168 0.04899289 0.04975163 0.05129391 1601.010 1.0010777
## sigma_xy 2.00 2.04486066 9.993463e-03 0.130059864 1.80366354 1.95971901 2.03881715 2.12445243 2.30083361 169.377 1.0005429
## sigma_eps 4.00 3.75648565 3.566001e-03 0.115847651 3.52383531 3.67657052 3.75798270 3.83640882 3.97456349 1055.386 0.9983640
Here is a density plot of 100 estimated trajectories (of x
and y
) from the Stan fit.
Now what if we actually don’t observe most of the \(Y\) values?
Here’s what this looks like.
true_osc3 <- list(alpha=.1,
beta=.05,
sigma_xy=.05,
sigma_eps=.5)
xy3 <- matrix(nrow=N, ncol=2)
xy3[1,] <- c(3,0)
for (k in 1:(N-1)) {
xy3[k+1,] <- (xy3[k,]
+ c(true_osc3$alpha * xy3[k,2],
(-1) * true_osc3$beta * xy3[k,1])
+ rnorm(2, 0, true_osc3$sigma_xy))
}
XY3 <- xy3 + rnorm(N*2, 0, true_osc3$sigma_eps)
obs_y <- sample.int(N, size=10)
XY3[setdiff(1:N, obs_y), 2] <- NA
osc_block_missing <- "
data {
int N;
vector[N] X;
int k; // number of observed Y
int obs_y[k]; // which Y values are observed
vector[k] Y;
}
parameters {
real alpha;
real beta;
real<lower=0> sigma_xy;
real<lower=0> sigma_eps;
vector[N] x;
vector[N] y;
}
model {
x[2:N] ~ normal(x[1:(N-1)] + alpha * y[1:(N-1)], sigma_xy);
y[2:N] ~ normal(y[1:(N-1)] - beta * x[1:(N-1)], sigma_xy);
X ~ normal(x, sigma_eps);
Y ~ normal(y[obs_y], sigma_eps);
alpha ~ normal(0, 1);
beta ~ normal(0, 1);
sigma_xy ~ normal(0, 1);
sigma_eps ~ normal(0, 1);
}
"
osc_model_missing <- stan_model(model_code=osc_block_missing)
osc_fit3 <- sampling(osc_model_missing,
data=list(N=N,
X=XY3[,1],
k=length(obs_y),
obs_y=obs_y,
Y=XY3[obs_y,2]),
iter=1000, chains=3,
control=list(max_treedepth=12))
## Warning: There were 3 chains where the estimated Bayesian Fraction of Missing Information was low. See
## http://mc-stan.org/misc/warnings.html#bfmi-low
## Warning: Examine the pairs() plot to diagnose sampling problems
## Warning: The largest R-hat is 1.31, indicating chains have not mixed.
## Running the chains for more iterations may help. See
## http://mc-stan.org/misc/warnings.html#r-hat
## Warning: Bulk Effective Samples Size (ESS) is too low, indicating posterior means and medians may be unreliable.
## Running the chains for more iterations may help. See
## http://mc-stan.org/misc/warnings.html#bulk-ess
## Warning: Tail Effective Samples Size (ESS) is too low, indicating posterior variances and tail quantiles may be unreliable.
## Running the chains for more iterations may help. See
## http://mc-stan.org/misc/warnings.html#tail-ess
cbind(truth=c(true_osc3$alpha, true_osc3$beta, true_osc3$sigma_xy, true_osc3$sigma_eps),
rstan::summary(osc_fit3, pars=c("alpha", "beta", "sigma_xy", "sigma_eps"))$summary)
## truth mean se_mean sd 2.5% 25% 50% 75% 97.5% n_eff Rhat
## alpha 0.10 0.09863762 0.0002607767 0.003744194 0.09199764 0.09601437 0.09837469 0.10095379 0.10683077 206.14798 1.011433
## beta 0.05 0.05016057 0.0001329084 0.001892851 0.04639661 0.04898090 0.05023687 0.05146732 0.05376685 202.82802 1.013808
## sigma_xy 0.05 0.04134701 0.0013517795 0.006084007 0.03134862 0.03669677 0.04096572 0.04545947 0.05404657 20.25665 1.307217
## sigma_eps 0.50 0.49036338 0.0005004922 0.017022319 0.45726336 0.47920755 0.48966945 0.50093498 0.52475828 1156.75896 1.001548
Here is a density plot of 100 estimated trajectories (of x
and y
) from the Stan fit.
# The "average" column contains the monthly mean CO2 mole fraction determined
# from daily averages. The mole fraction of CO2, expressed as parts per million
# (ppm) is the number of molecules of CO2 in every one million molecules of dried
# air (water vapor removed). If there are missing days concentrated either early
# or late in the month, the monthly mean is corrected to the middle of the month
# using the average seasonal cycle. Missing months are denoted by -99.99.
# The "interpolated" column includes average values from the preceding column
# and interpolated values where data are missing. Interpolated values are
# computed in two steps. First, we compute for each month the average seasonal
# cycle in a 7-year window around each monthly value. In this way the seasonal
# cycle is allowed to change slowly over time. We then determine the "trend"
# value for each month by removing the seasonal cycle; this result is shown in
# the "trend" column. Trend values are linearly interpolated for missing months.
# The interpolated monthly mean is then the sum of the average seasonal cycle
# value and the trend value for the missing month.
co2 <- read.table("../Datasets/Mauna_Loa_C02/co2_mm_mlo.txt", comment="#")
names(co2) <- c("year", "month", "decimal_date", "average", "interpolated", "trend", "num_days")
co2[co2 == -99.99] <- NA
head(co2)
## year month decimal_date average interpolated trend num_days
## 1 1958 3 1958.208 315.71 315.71 314.62 -1
## 2 1958 4 1958.292 317.45 317.45 315.29 -1
## 3 1958 5 1958.375 317.50 317.50 314.71 -1
## 4 1958 6 1958.458 NA 317.10 314.85 -1
## 5 1958 7 1958.542 315.86 315.86 314.98 -1
## 6 1958 8 1958.625 314.93 314.93 315.94 -1
# 1. get monthly means of nearest 7 years and subtract off to get trend
co2$our_trend <- NA
monthly_deviation <- rep(NA, nrow(co2))
for (j in 1:nrow(co2)) {
nearest_seven <- ( abs(co2$decimal_date - co2$decimal_date[j]) <= 3.5 )
# e.g., if month[j] is 7 then monthly_deviation[j] will be the average difference
# in co2 concentration from the overall mean
# across the seven Julys in this chunk of data from 7 years
k <- co2$month[j]
monthly_deviation[j] <- with(subset(co2, nearest_seven),
mean(average[month == k] - mean(average, na.rm=TRUE), na.rm=TRUE))
co2$our_trend[j] <- co2$average[j] - monthly_deviation[j]
}
# 2. Interpolate missing trend values
for (j in which(is.na(co2$our_trend))) {
before <- max(which(!is.na(co2$our_trend[1:j])))
after <- min(which(!is.na(co2$our_trend[j:nrow(co2)])))
dx <- (after - before)
dy <- co2$our_trend[after] - co2$our_trend[before]
co2$our_trend[j] <- co2$our_trend[before] + (j - before) * dy / dx
}
# sanity check
stopifnot(all(!is.na(co2$our_trend)))
# 3. Get interpolated values by adding trend to average monthly values
co2$our_interp <- co2$our_trend + monthly_deviation
stopifnot(all(!is.na(co2$our_interp)))
plot(our_trend ~ decimal_date, data=co2, type='l')
(a.k.a. “the correlation”)
\[\begin{aligned} r = \cor[x,y] = \frac{1}{n-1} \sum_{i=1}^n \left(\frac{ x_i - \bar x }{\sd[x]}\right) \left(\frac{ y_i - \bar y }{\sd[y]}\right) \end{aligned}\] where \[\begin{aligned} \bar x = \text{mean}[x] &= \frac{1}{n} \sum_{i=1}^n x_i \\ s_x = \sd[x] &= \sqrt{\frac{1}{n-1} \sum_{i=1}^n (x_i - \bar x)^2} \end{aligned}\]
> help(cor)
Correlation, Variance and Covariance (Matrices)
Description:
‘var’, ‘cov’ and ‘cor’ compute the variance of ‘x’ and the
covariance or correlation of ‘x’ and ‘y’ if these are vectors. If
‘x’ and ‘y’ are matrices then the covariances (or correlations)
between the columns of ‘x’ and the columns of ‘y’ are computed.
‘cov2cor’ scales a covariance matrix into the corresponding
correlation matrix _efficiently_.
Usage:
var(x, y = NULL, na.rm = FALSE, use)
cov(x, y = NULL, use = "everything",
method = c("pearson", "kendall", "spearman"))
Spearman’s correlation coefficient (or, “Spearman’s rho”)
is just “the” correlation coefficient between the ranks of the vectors.
It is nonparametric - it makes sense regardless of the data’s distribution.
## [1] -0.1157895
## [1] -0.1157895
The autocorrelation function of a time series \((x(t) : 0 \le t \le L)\) is the correlation between points as a function of lag:
\[\begin{aligned} \rho(\ell) &= \cor[x(T), x(T+\ell)], \end{aligned}\]
where \(T\) is uniform between 0 and \(L-\ell\).
\[\begin{aligned} \rho(\ell) &= \frac{1}{T-\ell-1} \sum_{t=1}^{T-\ell} \left(\frac{ x(t) - \bar x }{ \sd[x] } \right) \left(\frac{ x(t + \ell) - \bar x }{ \sd[x] } \right) . \end{aligned}\]
The acf( )
function requires a time series object:
loess
> help(loess)
Local Polynomial Regression Fitting
Description:
Fit a polynomial surface determined by one or more numerical
predictors, using local fitting.
Usage:
loess(formula, data, weights, subset, na.action, model = FALSE,
span = 0.75, enp.target, degree = 2,
parametric = FALSE, drop.square = FALSE, normalize = TRUE,
family = c("gaussian", "symmetric"),
method = c("loess", "model.frame"),
control = loess.control(...), ...)
span
Ocean temperature is available from NOAA buoys at tidesandcurrents.noaa.gov:
Ten years of data for Newport, OR are in the Datasets/ directory
ocean$numdate <- as.numeric(ocean$date)
month_smooth <- loess(WATER_TEMP_F ~ numdate, data=ocean,
span=sum(ocean$Month < 4 & ocean$Year == 2012)/nrow(ocean))
ocean$month_sm <- predict(month_smooth, newdata=ocean)
plot(WATER_TEMP_F ~ date, data=ocean, pch=20, cex=0.5)
lines(month_sm ~ date, data=ocean, col='red', lwd=2)
12 * (1 + 1/27.2)
hour cycle!acf(water, lag.max=10 * 12, xlab='lag (hours)', na.action=na.pass)
abline(v=12*(1 + 1/27.2) * 1:10, col=c('red', 'blue'), lwd=2)
… why?