Peter Ralph
12 March 2019 – Advanced Biological Statistics
A time series is a sequence of observations \[\begin{aligned} (y_1, y_2, \ldots, y_N) , \end{aligned}\] that were taken at some set of times \[\begin{aligned} t_1 < t_2 < \cdots < t_N . \end{aligned}\]
In general, the goal is to understand how what happens next depends on the previous history and maybe some predictor variables \[\begin{aligned} (x_1, x_2, \ldots, x_N) , \end{aligned}\] taken at the same set of times.
Suppose we have regular, noisy observations from a discrete, noisy oscillator.
The system itself does \[\begin{aligned} x_{t+1} - x_t &= \alpha y_t + \Normal(0, \sigma_{xy}) \\ y_{t+1} - y_t &= - \beta x_t + \Normal(0, \sigma_{xy}) \end{aligned}\] but we only get to observe \[\begin{aligned} X_t &= x_t + \Normal(0, \sigma_\epsilon) \\ Y_t &= y_t + \Normal(0, \sigma_\epsilon) . \end{aligned}\]
Here’s what this looks like.
true_osc <- list(alpha=.1,
beta=.05,
sigma_xy=.01,
sigma_eps=.1)
N <- 500
xy <- matrix(nrow=N, ncol=2)
xy[1,] <- c(3,0)
for (k in 1:(N-1)) {
xy[k+1,] <- (xy[k,]
+ c(true_osc$alpha * xy[k,2],
(-1) * true_osc$beta * xy[k,1])
+ rnorm(2, 0, true_osc$sigma_xy))
}
XY <- xy + rnorm(N*2, 0, true_osc$sigma_eps)
osc_block <- "
data {
int N;
vector[N] X;
vector[N] Y;
}
parameters {
real alpha;
real beta;
real<lower=0> sigma_xy;
real<lower=0> sigma_eps;
vector[N] x;
vector[N] y;
}
model {
x[2:N] ~ normal(x[1:(N-1)] + alpha * y[1:(N-1)], sigma_xy);
y[2:N] ~ normal(y[1:(N-1)] - beta * x[1:(N-1)], sigma_xy);
X ~ normal(x, sigma_eps);
Y ~ normal(y, sigma_eps);
alpha ~ normal(0, 1);
beta ~ normal(0, 1);
sigma_xy ~ normal(0, 1);
sigma_eps ~ normal(0, 1);
}
"
osc_model <- stan_model(model_code=osc_block)
## Error in sink(type = "output"): invalid connection
osc_fit <- sampling(osc_model,
data=list(N,
X=XY[,1],
Y=XY[,2]),
iter=1000, chains=3,
control=list(max_treedepth=12))
## Error in sampling(osc_model, data = list(N, X = XY[, 1], Y = XY[, 2]), : object 'osc_model' not found
cbind(rstan::summary(osc_fit, pars=c("alpha", "beta", "sigma_xy", "sigma_eps"))$summary,
truth=c(true_osc$alpha, true_osc$beta, true_osc$sigma_xy, true_osc$sigma_eps))
## Error in rstan::summary(osc_fit, pars = c("alpha", "beta", "sigma_xy", : object 'osc_fit' not found
Here is a density plot of 100 estimated trajectories (of x
and y
) from the Stan fit.
## Error in extract(osc_fit): object 'osc_fit' not found
## Error in lines(osc_results$x[k, ], osc_results$y[k, ], col = adjustcolor("black", : object 'osc_results' not found
Let’s try that again, with more noise.
Here’s what this looks like.
true_osc2 <- list(alpha=.1,
beta=.05,
sigma_xy=2.0,
sigma_eps=4.0)
xy2 <- matrix(nrow=N, ncol=2)
xy2[1,] <- c(3,0)
for (k in 1:(N-1)) {
xy2[k+1,] <- (xy2[k,]
+ c(true_osc2$alpha * xy2[k,2],
(-1) * true_osc2$beta * xy2[k,1])
+ rnorm(2, 0, true_osc2$sigma_xy))
}
XY2 <- xy2 + rnorm(N*2, 0, true_osc2$sigma_eps)
osc_fit2 <- sampling(osc_model,
data=list(N,
X=XY2[,1],
Y=XY2[,2]),
iter=1000, chains=3,
control=list(max_treedepth=12))
## Error in sampling(osc_model, data = list(N, X = XY2[, 1], Y = XY2[, 2]), : object 'osc_model' not found
cbind(rstan::summary(osc_fit2, pars=c("alpha", "beta", "sigma_xy", "sigma_eps"))$summary,
truth=c(true_osc2$alpha, true_osc2$beta, true_osc2$sigma_xy, true_osc2$sigma_eps))
## Error in rstan::summary(osc_fit2, pars = c("alpha", "beta", "sigma_xy", : object 'osc_fit2' not found
Here is a density plot of 100 estimated trajectories (of x
and y
) from the Stan fit.
## Error in extract(osc_fit2): object 'osc_fit2' not found
## Error in lines(osc_results2$x[k, ], osc_results2$y[k, ], col = adjustcolor("black", : object 'osc_results2' not found
Now what if we actually don’t observe most of the \(Y\) values?
Here’s what this looks like.
true_osc3 <- list(alpha=.1,
beta=.05,
sigma_xy=.05,
sigma_eps=.5)
xy3 <- matrix(nrow=N, ncol=2)
xy3[1,] <- c(3,0)
for (k in 1:(N-1)) {
xy3[k+1,] <- (xy3[k,]
+ c(true_osc3$alpha * xy3[k,2],
(-1) * true_osc3$beta * xy3[k,1])
+ rnorm(2, 0, true_osc3$sigma_xy))
}
XY3 <- xy3 + rnorm(N*2, 0, true_osc3$sigma_eps)
obs_y <- sample.int(N, size=10)
XY3[setdiff(1:N, obs_y), 2] <- NA
osc_block_missing <- "
data {
int N;
vector[N] X;
int k; // number of observed Y
int obs_y[k]; // which Y values are observed
vector[k] Y;
}
parameters {
real alpha;
real beta;
real<lower=0> sigma_xy;
real<lower=0> sigma_eps;
vector[N] x;
vector[N] y;
}
model {
x[2:N] ~ normal(x[1:(N-1)] + alpha * y[1:(N-1)], sigma_xy);
y[2:N] ~ normal(y[1:(N-1)] - beta * x[1:(N-1)], sigma_xy);
X ~ normal(x, sigma_eps);
Y ~ normal(y[obs_y], sigma_eps);
alpha ~ normal(0, 1);
beta ~ normal(0, 1);
sigma_xy ~ normal(0, 1);
sigma_eps ~ normal(0, 1);
}
"
osc_model_missing <- stan_model(model_code=osc_block_missing)
## Error in sink(type = "output"): invalid connection
osc_fit3 <- sampling(osc_model_missing,
data=list(N,
X=XY3[,1],
k=length(obs_y),
obs_y=obs_y,
Y=XY3[obs_y,2]),
iter=1000, chains=3,
control=list(max_treedepth=12))
## Error in sampling(osc_model_missing, data = list(N, X = XY3[, 1], k = length(obs_y), : object 'osc_model_missing' not found
cbind(rstan::summary(osc_fit3, pars=c("alpha", "beta", "sigma_xy", "sigma_eps"))$summary,
truth=c(true_osc3$alpha, true_osc3$beta, true_osc3$sigma_xy, true_osc3$sigma_eps))
## Error in rstan::summary(osc_fit3, pars = c("alpha", "beta", "sigma_xy", : object 'osc_fit3' not found
Here is a density plot of 100 estimated trajectories (of x
and y
) from the Stan fit.
## Error in extract(osc_fit3): object 'osc_fit3' not found
## Error in lines(osc_results3$x[k, ], osc_results3$y[k, ], col = adjustcolor("black", : object 'osc_results3' not found
Suppose we have estimates of abundance of a soil microbe from a number of samples across our study area:
(x,y)
: spatial coords; z
: abundance
## x y z
## 1 0.36570561 0.91601951 4.203546
## 2 0.34674591 0.14804962 9.677145
## 3 0.28120674 0.99160415 4.202536
## 4 0.09618601 0.76485639 4.562420
## 5 0.37855554 0.37953143 7.309844
## 6 0.27307616 0.32704858 7.990845
## 7 0.26440936 0.58937968 5.235354
## 8 0.69809336 0.80556321 4.150339
## 9 0.74084430 0.03327871 9.092326
## 10 0.22440814 0.38113112 7.268042
## 11 0.65007941 0.14698290 9.917480
## 12 0.55097394 0.11196825 10.030375
## 13 0.38075338 0.17814633 8.662223
## 14 0.68119877 0.23249908 8.836484
## 15 0.38755934 0.57082907 5.065999
## 16 0.49790287 0.04134980 9.315853
## 17 0.79792488 0.97102161 2.878486
## 18 0.46672735 0.59853551 5.155774
## 19 0.87866343 0.48030911 6.187081
## 20 0.26769967 0.14591511 8.578937
Goals:
(descriptive) What spatial scale does abundance vary over?
(predictive) What are the likely (range of) abundances at new locations?
Everything is related to everything else, but near things are more related than distant things.
Modeler: Great, covariance is a decreasing function of distance.
A convenient choice: the covariance between two points distance \(d\) apart is \[\begin{aligned} \alpha^2 \exp\left(- \frac{1}{2}\left(\frac{d}{\rho}\right)^2 \right) . \end{aligned}\]
\(\alpha\) controls the overall variance (amount of noise)
\(\rho\) is the spatial scale that covariance decays over
Here’s an R
function that takes a set of locations (xy
), a variance scaling alpha
, and a spatial scale rho
:
cov_exp_quad <- function (xy, alpha, rho) {
# return the 'quadratic exponential' covariance matrix
# for spatial positions xy
dxy <- as.matrix(dist(xy))
return( alpha^2 * exp( - (1/2) * dxy^2 / rho^2 ) )
}
Challenge: simulate spatially autocorrelated random Gaussian values, and plot them, in space. Pick parameters so you can tell they are autocorrelated.
to color points by a continuous value:
colorRampPalette(c('blue', 'red'))(24)[cut(xy$z, breaks=24)]
library(mvtnorm)
N <- 100
# spatial extent of order 1
xy <- data.frame(y=runif(N),
x=rnorm(N))
rho <- .25
alpha <- 1.0
# compute covariance matrix
K <- cov_exp_quad(xy, alpha, rho)
# simulate z's
xy$z <- as.vector(rmvnorm(1, mean=rep(0,N),
sigma=K))
plot(xy$x, xy$y, xlab='x spatial coord',
ylab='y spatial coord', pch=20, asp=1,
cex=as.numeric(cut(xy$z, breaks=10))/2,
col=colorRampPalette(c('blue', 'red'))(24)[cut(xy$z, breaks=24)])
(descriptive) What spatial scale does abundance vary over?
\(\Rightarrow\) What is \(\rho\)?
(predictive) What are the likely (range of) abundances at new locations?
\(\Rightarrow\) Add unobserved abundances as parameters.
sp_block <- "
data {
int N; // number of obs
vector[2] xy[N]; // spatial pos
vector[N] z;
}
parameters {
real<lower=0> alpha;
real<lower=0> rho;
}
model {
matrix[N, N] K;
K = cov_exp_quad(xy, alpha, rho);
z ~ multi_normal(rep_vector(0.0, N), K);
alpha ~ normal(0, 5);
rho ~ normal(0, 5);
}
"
# check this compiles
sp_model <- stan_model(model_code=sp_block)
## Error in sink(type = "output"): invalid connection
Challenge: we would like to estimate the abundance at the k
locations new_xy
. Add this feature to the Stan block.
data {
int N; // number of obs
vector[2] xy[N]; // spatial pos
vector[N] z;
}
parameters {
real<lower=0> alpha;
real<lower=0> rho;
}
model {
matrix[N, N] K;
K = cov_exp_quad(xy, alpha, rho);
z ~ multi_normal(rep_vector(0.0, N), K);
alpha ~ normal(0, 5);
rho ~ normal(0, 5);
}
sp_block <- "
data {
int N; // number of obs
vector[2] old_xy[N]; // spatial pos
vector[N] old_z;
int n;
vector[2] new_xy[n]; // new locs
}
transformed data {
vector[2] xy[N+n];
xy[1:N] = old_xy;
xy[(N+1):(N+n)] = new_xy;
print(dims(old_z));
}
parameters {
real<lower=0> alpha;
real<lower=0> rho;
vector[n] new_z;
real<lower=0> delta;
real mu;
}
model {
matrix[N+n, N+n] K;
vector[N+n] z;
K = cov_exp_quad(xy, alpha, rho);
for (k in 1:(N+n)) {
K[k,k] += delta;
}
z[1:N] = old_z;
z[(N+1):(N+n)] = new_z;
z ~ multi_normal(rep_vector(mu, N+n), K);
alpha ~ normal(0, 5);
rho ~ normal(0, 5);
delta ~ normal(0, 5);
mu ~ normal(0, 5);
}
"
new_sp <- stan_model(model_code=sp_block)
## Error in sink(type = "output"): invalid connection
library(mvtnorm)
N <- 20
xy <- data.frame(x=runif(N), y=runif(N))
dxy <- as.matrix(dist(xy))
ut <- upper.tri(dxy, diag=TRUE)
truth <- list(rho=.6,
delta=.1,
alpha=2.5,
mu=5)
truth$covmat <- (truth$delta * diag(N)
+ truth$alpha^2 * exp(-(dxy/truth$rho)^2))
xy$z <- as.vector(rmvnorm(1, mean=rep(truth$mu,N), sigma=truth$covmat))
new_xy <- cbind(x=runif(5), y=runif(5))
sp_data <- list(N=nrow(xy),
old_xy=cbind(xy$x, xy$y),
old_z=xy$z,
n=nrow(new_xy),
new_xy=as.matrix(new_xy))
(sp_time <- system.time(
sp_fit <- sampling(new_sp,
data=sp_data,
iter=1000,
chains=2,
control=list(adapt_delta=0.99,
max_treedepth=12))))
## Error in sampling(new_sp, data = sp_data, iter = 1000, chains = 2, control = list(adapt_delta = 0.99, : object 'new_sp' not found
## Timing stopped at: 0 0 0
cbind(truth=truth[c("alpha", "rho", "delta", "mu")],
rstan::summary(sp_fit, pars=c("alpha", "rho", "delta", "mu"))$summary)
## Error in rstan::summary(sp_fit, pars = c("alpha", "rho", "delta", "mu")): object 'sp_fit' not found
## Error in extract(sp_fit, pars = "new_z"): object 'sp_fit' not found
## Error in is.data.frame(x): object 'new_z' not found