Peter Ralph
2 March 2021 – Advanced Biological Statistics
Suppose we have estimates of abundance of a soil microbe from a number of samples across our study area:
(x,y) : spatial coords; z : abundance
##            x         y        z
## 1  0.5766037 0.5863978 3.845720
## 2  0.2230729 0.2747410 3.571873
## 3  0.3318966 0.1476570 4.683332
## 4  0.7107246 0.8014103 3.410312
## 5  0.8194490 0.3864098 3.726504
## 6  0.4237206 0.8204507 3.356803
## 7  0.9635445 0.6849373 4.424564
## 8  0.9781304 0.8833893 5.795779
## 9  0.8405219 0.1119208 3.962872
## 10 0.9966112 0.7788340 5.155496
## 11 0.8659590 0.6210109 3.911921
## 12 0.7014217 0.2741515 4.235119
## 13 0.3904731 0.3014697 3.657133
## 14 0.3147697 0.3490438 3.342371
## 15 0.8459473 0.3291311 3.835436
## 16 0.1392785 0.8095039 4.035925
## 17 0.5181206 0.2744821 4.694328
## 18 0.5935508 0.9390949 3.054045
## 19 0.9424617 0.9022671 5.518488
## 20 0.6280196 0.1770531 4.617342Goals:
(descriptive) What spatial scale does abundance vary over?
(predictive) What are the likely (range of) abundances at new locations?
Let’s brainstorm a few examples, listing
Everything is related to everything else, but near things are more related than distant things.
Modeler: Great, covariance is a decreasing function of distance.
A convenient choice: the covariance between two points distance \(d\) apart is \[\begin{aligned} \alpha^2 \exp\left(- \frac{1}{2}\left(\frac{d}{\rho}\right)^2 \right) . \end{aligned}\]
\(\alpha\) controls the overall variance (amount of noise)
\(\rho\) is the spatial scale that covariance decays over
Here’s an R function that takes a set of locations (xy), a variance scaling alpha, and a spatial scale rho:
cov_exp_quad <- function (xy, alpha, rho) {
    # return the 'quadratic exponential' covariance matrix
    # for spatial positions xy
    dxy <- as.matrix(dist(xy))
    return( alpha^2 * exp( - (1/2) * dxy^2 / rho^2 ) )
}Challenge: simulate spatially autocorrelated random Gaussian values, and plot them, in space. Pick parameters so you can tell they are autocorrelated.
to color points by a continuous value:
     colorRampPalette(c('blue', 'red'))(24)[cut(xy$z, breaks=24)]plot(xy$x, xy$y, xlab='x spatial coord', 
     ylab='y spatial coord', pch=20, asp=1,
     cex=as.numeric(cut(xy$z, breaks=10))/2,
     col=colorRampPalette(c('blue', 'red'))(24)[cut(xy$z, breaks=24)])plot(xy$y, xy$x, xlab='y spatial coord', 
     ylab='x spatial coord', pch=20, asp=1,
     cex=as.numeric(cut(xy$z, breaks=10))/2,
     col=colorRampPalette(c('blue', 'red'))(24)[cut(xy$z, breaks=24)])(descriptive) What spatial scale does abundance vary over?
\(\Rightarrow\) What is \(\rho\)?
(predictive) What are the likely (range of) abundances at new locations?
\(\Rightarrow\) Add unobserved abundances as parameters.
sp_block <- "
data {
    int N; // number of obs
    vector[2] xy[N]; // spatial pos
    vector[N] z;
}
parameters {
    real<lower=0> alpha;
    real<lower=0> rho;
    real mu;
}
model {
    matrix[N, N] K;
    K = cov_exp_quad(xy, alpha, rho);
    z ~ multi_normal(rep_vector(mu, N), K);
    alpha ~ normal(0, 5);
    rho ~ normal(0, 5);
}
"
# check this compiles
sp_model <- stan_model(model_code=sp_block)Challenge: we would like to estimate the abundance at the k locations new_xy. Add this feature to the Stan block.
data {
    int N; // number of obs
    vector[2] xy[N]; // spatial pos
    vector[N] z;
}
parameters {
    real<lower=0> alpha;
    real<lower=0> rho;
    real mu;
}
model {
    matrix[N, N] K;
    K = cov_exp_quad(xy, alpha, rho);
    z ~ multi_normal(rep_vector(mu, N), K);
    alpha ~ normal(0, 5);
    rho ~ normal(0, 5);
}sp_block <- "
data {
    int N; // number of obs
    vector[2] old_xy[N]; // spatial pos
    vector[N] old_z;
    int n;
    vector[2] new_xy[n]; // new locs
}
transformed data {
    vector[2] xy[N+n];
    xy[1:N] = old_xy;
    xy[(N+1):(N+n)] = new_xy;
    print(dims(old_z));
}
parameters {
    real<lower=0> alpha;
    real<lower=0> rho;
    vector[n] new_z;
    real<lower=0> delta;
    real mu;
}
model {
    matrix[N+n, N+n] K;
    vector[N+n] z;
    K = cov_exp_quad(xy, alpha, rho);
    for (k in 1:(N+n)) {
        K[k,k] += delta;
    }
    z[1:N] = old_z;
    z[(N+1):(N+n)] = new_z;
    z ~ multi_normal(rep_vector(mu, N+n), K);
    alpha ~ normal(0, 5);
    rho ~ normal(0, 5);
    delta ~ exponential(50);
    mu ~ normal(0, 5);
}
"
new_sp <- stan_model(model_code=sp_block)library(mvtnorm)
N <- 20
xy <- data.frame(x=runif(N), y=runif(N))
dxy <- as.matrix(dist(xy))
ut <- upper.tri(dxy, diag=TRUE)
truth <- list(rho=.6,
              delta=.1,
              alpha=2.5,
              mu=5)
truth$covmat <- (truth$delta * diag(N) 
                 + truth$alpha^2 * exp(-(dxy/truth$rho)^2))
xy$z <- as.vector(rmvnorm(1, mean=rep(truth$mu,N), sigma=truth$covmat))new_xy <- cbind(x=runif(5), y=runif(5))
sp_data <- list(N=nrow(xy),
                old_xy=cbind(xy$x, xy$y),
                old_z=xy$z,
                n=nrow(new_xy),
                new_xy=as.matrix(new_xy))
(sp_time <- system.time(
    sp_fit <- sampling(new_sp,
                       data=sp_data,
                       iter=1000,
                       chains=2,
                       control=list(adapt_delta=0.99,
                                    max_treedepth=12))))##    user  system elapsed 
##   3.880   0.474   2.514cbind(truth=truth[c("alpha", "rho", "delta", "mu")],
      rstan::summary(sp_fit, pars=c("alpha", "rho", "delta", "mu"))$summary)##       truth mean       se_mean      sd         2.5%       25%        50%        75%        97.5%     n_eff    Rhat     
## alpha 2.5   3.079001   0.06967296   1.322923   1.532859   2.153231   2.712159   3.672658   6.034993  360.5294 0.998619 
## rho   0.6   0.4164355  0.004422966  0.0842706  0.2796243  0.3562324  0.4069715  0.4649138  0.5962107 363.0154 0.9995499
## delta 0.1   0.08402985 0.0009771853 0.02792276 0.04243394 0.06420457 0.07989332 0.09783214 0.1530835 816.5128 1.000783 
## mu    5     5.122798   0.1153539    1.674189   1.315383   4.226367   5.180844   6.094035   8.365483  210.6419 1.009901(descriptive) What spatial scale does abundance vary over?
Values are correlated over distances of order \(\rho=0.4164355\) units of distance.
(predictive) What are the likely (range of) abundances at new locations?
These are
##                  x          y     mean        sd
## new_z[1] 0.5092173 0.81697839 4.767944 0.6757971
## new_z[2] 0.4682500 0.69922907 4.712741 0.5660043
## new_z[3] 0.1264559 0.18936180 6.459838 0.4910199
## new_z[4] 0.4797258 0.25212101 3.281285 0.3592538
## new_z[5] 0.7559952 0.07650894 4.209681 0.3323931grid_xy <- expand.grid(x=seq(0,1,length.out=11), y=seq(0,1,length.out=11))
grid_data <- list(N=nrow(xy),
                old_xy=cbind(xy$x, xy$y),
                old_z=xy$z,
                n=nrow(grid_xy),
                new_xy=as.matrix(grid_xy))
(grid_time <- system.time(
    grid_fit <- sampling(new_sp, data=grid_data,
                         iter=1000, control=list(adapt_delta=0.95))))## Warning: Bulk Effective Samples Size (ESS) is too low, indicating posterior means and medians may be unreliable.
## Running the chains for more iterations may help. See
## http://mc-stan.org/misc/warnings.html#bulk-ess## Warning: Tail Effective Samples Size (ESS) is too low, indicating posterior variances and tail quantiles may be unreliable.
## Running the chains for more iterations may help. See
## http://mc-stan.org/misc/warnings.html#tail-ess##    user  system elapsed 
## 947.231   2.195 332.810