Temporal and spatial data

Peter Ralph

12 March 2019 – Advanced Biological Statistics

Modeling time series

Time series

A time series is a sequence of observations \[\begin{aligned} (y_1, y_2, \ldots, y_N) , \end{aligned}\] that were taken at some set of times \[\begin{aligned} t_1 < t_2 < \cdots < t_N . \end{aligned}\]

In general, the goal is to understand how what happens next depends on the previous history and maybe some predictor variables \[\begin{aligned} (x_1, x_2, \ldots, x_N) , \end{aligned}\] taken at the same set of times.

An oscillator

A discrete, noisy oscillator

Suppose we have regular, noisy observations from a discrete, noisy oscillator.

The system itself does \[\begin{aligned} x_{t+1} - x_t &= \alpha y_t + \Normal(0, \sigma_{xy}) \\ y_{t+1} - y_t &= - \beta x_t + \Normal(0, \sigma_{xy}) \end{aligned}\] but we only get to observe \[\begin{aligned} X_t &= x_t + \Normal(0, \sigma_\epsilon) \\ Y_t &= y_t + \Normal(0, \sigma_\epsilon) . \end{aligned}\]

Here’s what this looks like.

true_osc <- list(alpha=.1,
                 beta=.05,
                 sigma_xy=.01,
                 sigma_eps=.1)
N <- 500
xy <- matrix(nrow=N, ncol=2)
xy[1,] <- c(3,0)
for (k in 1:(N-1)) {
    xy[k+1,] <- (xy[k,] 
                + c(true_osc$alpha * xy[k,2],
                    (-1) * true_osc$beta * xy[k,1])
                + rnorm(2, 0, true_osc$sigma_xy))
}
XY <- xy + rnorm(N*2, 0, true_osc$sigma_eps)

plot of chunk plot_osc

A Stan block

osc_block <- "
data {
    int N;
    vector[N] X;
    vector[N] Y;
}
parameters {
    real alpha;
    real beta;
    real<lower=0> sigma_xy;
    real<lower=0> sigma_eps;
    vector[N] x;
    vector[N] y;
}
model {
    x[2:N] ~ normal(x[1:(N-1)] + alpha * y[1:(N-1)], sigma_xy);
    y[2:N] ~ normal(y[1:(N-1)] - beta * x[1:(N-1)], sigma_xy);
    X ~ normal(x, sigma_eps);
    Y ~ normal(y, sigma_eps);
    alpha ~ normal(0, 1);
    beta ~ normal(0, 1);
    sigma_xy ~ normal(0, 1);
    sigma_eps ~ normal(0, 1);
}
"
osc_model <- stan_model(model_code=osc_block)

## Error in sink(type = "output"): invalid connection

osc_fit <- sampling(osc_model,
                    data=list(N,
                              X=XY[,1],
                              Y=XY[,2]),
                    iter=1000, chains=3,
                    control=list(max_treedepth=12))

## Error in sampling(osc_model, data = list(N, X = XY[, 1], Y = XY[, 2]), : object 'osc_model' not found

How’d we do?

cbind(rstan::summary(osc_fit, pars=c("alpha", "beta", "sigma_xy", "sigma_eps"))$summary, 
      truth=c(true_osc$alpha, true_osc$beta, true_osc$sigma_xy, true_osc$sigma_eps))

## Error in rstan::summary(osc_fit, pars = c("alpha", "beta", "sigma_xy", : object 'osc_fit' not found

Here is a density plot of 100 estimated trajectories (of x and y) from the Stan fit.

## Error in extract(osc_fit): object 'osc_fit' not found

plot of chunk show_osc_fit

## Error in lines(osc_results$x[k, ], osc_results$y[k, ], col = adjustcolor("black", : object 'osc_results' not found

A noisier oscillator

More realism?

Let’s try that again, with more noise.

Here’s what this looks like.

true_osc2 <- list(alpha=.1,
                 beta=.05,
                 sigma_xy=2.0,
                 sigma_eps=4.0)
xy2 <- matrix(nrow=N, ncol=2)
xy2[1,] <- c(3,0)
for (k in 1:(N-1)) {
    xy2[k+1,] <- (xy2[k,] 
                + c(true_osc2$alpha * xy2[k,2],
                    (-1) * true_osc2$beta * xy2[k,1])
                + rnorm(2, 0, true_osc2$sigma_xy))
}
XY2 <- xy2 + rnorm(N*2, 0, true_osc2$sigma_eps)

plot of chunk plot_osc2

osc_fit2 <- sampling(osc_model,
                    data=list(N,
                              X=XY2[,1],
                              Y=XY2[,2]),
                    iter=1000, chains=3,
                    control=list(max_treedepth=12))

## Error in sampling(osc_model, data = list(N, X = XY2[, 1], Y = XY2[, 2]), : object 'osc_model' not found

How’d we do?

cbind(rstan::summary(osc_fit2, pars=c("alpha", "beta", "sigma_xy", "sigma_eps"))$summary, 
      truth=c(true_osc2$alpha, true_osc2$beta, true_osc2$sigma_xy, true_osc2$sigma_eps))

## Error in rstan::summary(osc_fit2, pars = c("alpha", "beta", "sigma_xy", : object 'osc_fit2' not found

Here is a density plot of 100 estimated trajectories (of x and y) from the Stan fit.

## Error in extract(osc_fit2): object 'osc_fit2' not found

plot of chunk show_osc_fit2

## Error in lines(osc_results2$x[k, ], osc_results2$y[k, ], col = adjustcolor("black", : object 'osc_results2' not found

Missing data

Even more realism?

Now what if we actually don’t observe most of the \(Y\) values?

Here’s what this looks like.

true_osc3 <- list(alpha=.1,
                 beta=.05,
                 sigma_xy=.05,
                 sigma_eps=.5)
xy3 <- matrix(nrow=N, ncol=2)
xy3[1,] <- c(3,0)
for (k in 1:(N-1)) {
    xy3[k+1,] <- (xy3[k,] 
                + c(true_osc3$alpha * xy3[k,2],
                    (-1) * true_osc3$beta * xy3[k,1])
                + rnorm(2, 0, true_osc3$sigma_xy))
}
XY3 <- xy3 + rnorm(N*2, 0, true_osc3$sigma_eps)
obs_y <- sample.int(N, size=10)
XY3[setdiff(1:N, obs_y), 2] <- NA

plot of chunk plot_osc3

A new Stan block

osc_block_missing <- "
data {
    int N;
    vector[N] X;
    int k; // number of observed Y
    int obs_y[k]; // which Y values are observed
    vector[k] Y;
}
parameters {
    real alpha;
    real beta;
    real<lower=0> sigma_xy;
    real<lower=0> sigma_eps;
    vector[N] x;
    vector[N] y;
}
model {
    x[2:N] ~ normal(x[1:(N-1)] + alpha * y[1:(N-1)], sigma_xy);
    y[2:N] ~ normal(y[1:(N-1)] - beta * x[1:(N-1)], sigma_xy);
    X ~ normal(x, sigma_eps);
    Y ~ normal(y[obs_y], sigma_eps);
    alpha ~ normal(0, 1);
    beta ~ normal(0, 1);
    sigma_xy ~ normal(0, 1);
    sigma_eps ~ normal(0, 1);
}
"
osc_model_missing <- stan_model(model_code=osc_block_missing)

## Error in sink(type = "output"): invalid connection

osc_fit3 <- sampling(osc_model_missing,
                    data=list(N,
                              X=XY3[,1],
                              k=length(obs_y),
                              obs_y=obs_y,
                              Y=XY3[obs_y,2]),
                    iter=1000, chains=3,
                    control=list(max_treedepth=12))

## Error in sampling(osc_model_missing, data = list(N, X = XY3[, 1], k = length(obs_y), : object 'osc_model_missing' not found

How’d we do?

cbind(rstan::summary(osc_fit3, pars=c("alpha", "beta", "sigma_xy", "sigma_eps"))$summary, 
      truth=c(true_osc3$alpha, true_osc3$beta, true_osc3$sigma_xy, true_osc3$sigma_eps))

## Error in rstan::summary(osc_fit3, pars = c("alpha", "beta", "sigma_xy", : object 'osc_fit3' not found

Here is a density plot of 100 estimated trajectories (of x and y) from the Stan fit.

## Error in extract(osc_fit3): object 'osc_fit3' not found

plot of chunk show_osc_fit3

## Error in lines(osc_results3$x[k, ], osc_results3$y[k, ], col = adjustcolor("black", : object 'osc_results3' not found

Spatial models

A simple scenario

Suppose we have estimates of abundance of a soil microbe from a number of samples across our study area:

plot of chunk show_study

The data

(x,y) : spatial coords; z : abundance

xy

##             x          y         z
## 1  0.36570561 0.91601951  4.203546
## 2  0.34674591 0.14804962  9.677145
## 3  0.28120674 0.99160415  4.202536
## 4  0.09618601 0.76485639  4.562420
## 5  0.37855554 0.37953143  7.309844
## 6  0.27307616 0.32704858  7.990845
## 7  0.26440936 0.58937968  5.235354
## 8  0.69809336 0.80556321  4.150339
## 9  0.74084430 0.03327871  9.092326
## 10 0.22440814 0.38113112  7.268042
## 11 0.65007941 0.14698290  9.917480
## 12 0.55097394 0.11196825 10.030375
## 13 0.38075338 0.17814633  8.662223
## 14 0.68119877 0.23249908  8.836484
## 15 0.38755934 0.57082907  5.065999
## 16 0.49790287 0.04134980  9.315853
## 17 0.79792488 0.97102161  2.878486
## 18 0.46672735 0.59853551  5.155774
## 19 0.87866343 0.48030911  6.187081
## 20 0.26769967 0.14591511  8.578937

Goals:

(descriptive) What spatial scale does abundance vary over?
(predictive) What are the likely (range of) abundances at new locations?

Spatial covariance

Tobler’s First Law of Geography:

Everything is related to everything else, but near things are more related than distant things.

Modeler: Great, covariance is a decreasing function of distance.

A decreasing function of distance.

A convenient choice: the covariance between two points distance \(d\) apart is \[\begin{aligned} \alpha^2 \exp\left(- \frac{1}{2}\left(\frac{d}{\rho}\right)^2 \right) . \end{aligned}\]

\(\alpha\) controls the overall variance (amount of noise)
\(\rho\) is the spatial scale that covariance decays over

In Stan

Here’s an R function that takes a set of locations (xy), a variance scaling alpha, and a spatial scale rho:

cov_exp_quad <- function (xy, alpha, rho) {
    # return the 'quadratic exponential' covariance matrix
    # for spatial positions xy
    dxy <- as.matrix(dist(xy))
    return( alpha^2 * exp( - (1/2) * dxy^2 / rho^2 ) )
}

Challenge: simulate spatially autocorrelated random Gaussian values, and plot them, in space. Pick parameters so you can tell they are autocorrelated.

to color points by a continuous value:

     colorRampPalette(c('blue', 'red'))(24)[cut(xy$z, breaks=24)]

Simulation

library(mvtnorm)
N <- 100
# spatial extent of order 1
xy <- data.frame(y=runif(N),
                 x=rnorm(N))
rho <- .25
alpha <- 1.0
# compute covariance matrix
K <- cov_exp_quad(xy, alpha, rho)
# simulate z's
xy$z <- as.vector(rmvnorm(1, mean=rep(0,N),
                          sigma=K))
plot(xy$x, xy$y, xlab='x spatial coord', 
     ylab='y spatial coord', pch=20, asp=1,
     cex=as.numeric(cut(xy$z, breaks=10))/2,
     col=colorRampPalette(c('blue', 'red'))(24)[cut(xy$z, breaks=24)])

plot of chunk sim_sp_pts

Simulation number 2

library(mvtnorm)
N <- 1000
# spatial extent of order 1
xy <- data.frame(x=runif(N),
                 y=rnorm(N))
rho <- .25
alpha <- 1.0
# compute covariance matrix
K <- cov_exp_quad(xy, alpha, rho)
# simulate z's
xy$z <- as.vector(rmvnorm(1, mean=rep(0,N),
                          sigma=K))

plot(xy$y, xy$x, xlab='y spatial coord', 
     ylab='x spatial coord', pch=20, asp=1,
     cex=as.numeric(cut(xy$z, breaks=10))/2,
     col=colorRampPalette(c('blue', 'red'))(24)[cut(xy$z, breaks=24)])

plot of chunk plot_pts2

Back to the data

Goals

(descriptive) What spatial scale does abundance vary over?

\(\Rightarrow\) What is \(\rho\)?
(predictive) What are the likely (range of) abundances at new locations?

\(\Rightarrow\) Add unobserved abundances as parameters.

A basic Stan block

sp_block <- "
data {
    int N; // number of obs
    vector[2] xy[N]; // spatial pos
    vector[N] z;
}
parameters {
    real<lower=0> alpha;
    real<lower=0> rho;
}
model {
    matrix[N, N] K;
    K = cov_exp_quad(xy, alpha, rho);

    z ~ multi_normal(rep_vector(0.0, N), K);
    alpha ~ normal(0, 5);
    rho ~ normal(0, 5);
}
"
# check this compiles
sp_model <- stan_model(model_code=sp_block)

## Error in sink(type = "output"): invalid connection

Challenge: we would like to estimate the abundance at the k locations new_xy. Add this feature to the Stan block.


data {
    int N; // number of obs
    vector[2] xy[N]; // spatial pos
    vector[N] z;
}
parameters {
    real<lower=0> alpha;
    real<lower=0> rho;
}
model {
    matrix[N, N] K;
    K = cov_exp_quad(xy, alpha, rho);

    z ~ multi_normal(rep_vector(0.0, N), K);
    alpha ~ normal(0, 5);
    rho ~ normal(0, 5);
}

A solution

sp_block <- "
data {
    int N; // number of obs
    vector[2] old_xy[N]; // spatial pos
    vector[N] old_z;
    int n;
    vector[2] new_xy[n]; // new locs
}
transformed data {
    vector[2] xy[N+n];
    xy[1:N] = old_xy;
    xy[(N+1):(N+n)] = new_xy;
    print(dims(old_z));
}
parameters {
    real<lower=0> alpha;
    real<lower=0> rho;
    vector[n] new_z;
    real<lower=0> delta;
    real mu;
}
model {
    matrix[N+n, N+n] K;
    vector[N+n] z;
    K = cov_exp_quad(xy, alpha, rho);
    for (k in 1:(N+n)) {
        K[k,k] += delta;
    }
    z[1:N] = old_z;
    z[(N+1):(N+n)] = new_z;
    z ~ multi_normal(rep_vector(mu, N+n), K);
    alpha ~ normal(0, 5);
    rho ~ normal(0, 5);
    delta ~ normal(0, 5);
    mu ~ normal(0, 5);
}
"

new_sp <- stan_model(model_code=sp_block)

## Error in sink(type = "output"): invalid connection

Simulate data

library(mvtnorm)
N <- 20
xy <- data.frame(x=runif(N), y=runif(N))
dxy <- as.matrix(dist(xy))
ut <- upper.tri(dxy, diag=TRUE)
truth <- list(rho=.6,
              delta=.1,
              alpha=2.5,
              mu=5)
truth$covmat <- (truth$delta * diag(N) 
                 + truth$alpha^2 * exp(-(dxy/truth$rho)^2))
xy$z <- as.vector(rmvnorm(1, mean=rep(truth$mu,N), sigma=truth$covmat))

It runs.

new_xy <- cbind(x=runif(5), y=runif(5))
sp_data <- list(N=nrow(xy),
                old_xy=cbind(xy$x, xy$y),
                old_z=xy$z,
                n=nrow(new_xy),
                new_xy=as.matrix(new_xy))

(sp_time <- system.time(
    sp_fit <- sampling(new_sp,
                       data=sp_data,
                       iter=1000,
                       chains=2,
                       control=list(adapt_delta=0.99,
                                    max_treedepth=12))))

## Error in sampling(new_sp, data = sp_data, iter = 1000, chains = 2, control = list(adapt_delta = 0.99, : object 'new_sp' not found

## Timing stopped at: 0 0 0

Does it work?

cbind(truth=truth[c("alpha", "rho", "delta", "mu")],
      rstan::summary(sp_fit, pars=c("alpha", "rho", "delta", "mu"))$summary)

## Error in rstan::summary(sp_fit, pars = c("alpha", "rho", "delta", "mu")): object 'sp_fit' not found

## Error in extract(sp_fit, pars = "new_z"): object 'sp_fit' not found

## Error in is.data.frame(x): object 'new_z' not found

plot of chunk plot_pts_interp