\[
%%
% Add your macros here; they'll be included in pdf and html output.
%%
\newcommand{\R}{\mathbb{R}} % reals
\newcommand{\E}{\mathbb{E}} % expectation
\renewcommand{\P}{\mathbb{P}} % probability
\DeclareMathOperator{\logit}{logit}
\DeclareMathOperator{\logistic}{logistic}
\DeclareMathOperator{\SE}{SE}
\DeclareMathOperator{\sd}{sd}
\DeclareMathOperator{\var}{var}
\DeclareMathOperator{\cov}{cov}
\DeclareMathOperator{\cor}{cor}
\DeclareMathOperator{\Normal}{Normal}
\DeclareMathOperator{\MVN}{MVN}
\DeclareMathOperator{\LogNormal}{logNormal}
\DeclareMathOperator{\Poisson}{Poisson}
\DeclareMathOperator{\Beta}{Beta}
\DeclareMathOperator{\Binom}{Binomial}
\DeclareMathOperator{\Gam}{Gamma}
\DeclareMathOperator{\Exp}{Exponential}
\DeclareMathOperator{\Cauchy}{Cauchy}
\DeclareMathOperator{\Unif}{Unif}
\DeclareMathOperator{\Dirichlet}{Dirichlet}
\DeclareMathOperator{\Wishart}{Wishart}
\DeclareMathOperator{\StudentsT}{StudentsT}
\DeclareMathOperator{\Weibull}{Weibull}
\newcommand{\given}{\;\vert\;}
\]

Spatial density estimation

Peter Ralph

Advanced Biological Statistics

Biketown

layout(t(1:2))
setup_map(main='starts'); points(bt_start, pch=20, cex=0.5, col='red')
setup_map(main='ends'); points(bt_end, pch=20, cex=0.5, col='blue')

plot of chunk r plot_pdx

Spatial density estimation

Kernel density estimation

Suppose we have a bunch of points

\[ (x_1, y_1), \ldots, (x_n, y_n) \]

with density \(f(x,y)\), i.e.,

\[ f(x,y) \approx \frac{\#(\text{of points within $r$ of $(x,y)$})}{\pi r^2} , \qquad \text{for small } r . \]

Using

\[ \rho_\sigma(x,y) = \frac{1}{2\pi\sigma^2} e^{-\frac{x^2+y^2}{2\sigma^2}} \]

we can estimate \(f(x,y)\) by

\[ \hat f(x,y) = \sum_{i=1}^n \rho_\sigma(x_i-x, y_i-y) . \]
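(As written, \(\hat f\) estimates the intensity of points, i.e., the expected number of points per unit area; dividing the sum by \(n\) would instead estimate the probability density that the points were drawn from.)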

The two choices in kernel density estimation

  1. The kernel, \(\rho()\).

    (we used a Gaussian)

  2. The bandwidth, \(\sigma\).

Try it out

  1. Simulate 200 spatial locations with a density having two “bumps”. Plot these points. (rnorm(n, mean=-3), rnorm(n, mean=+3))

  2. Make a \(20 \times 20\) grid of “reference” locations. (expand.grid( ))

  3. Compute the kernel density estimate for each reference location, with \(\sigma\) chosen appropriately, and plot it. (dnorm( ); one possible solution is sketched below)
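A sketch of one possible solution (the grid ranges, bandwidth, and object names here are our own choices, not prescribed by the exercise):

n <- 200
# two "bumps": half the points centered at x = -3, half at x = +3
pts <- cbind(x=rnorm(n, mean=rep(c(-3, 3), each=n/2)), y=rnorm(n))
# a 20 x 20 grid of reference locations covering the points
ref <- expand.grid(x=seq(-6, 6, length.out=20), y=seq(-4, 4, length.out=20))
sigma <- 0.5   # the bandwidth
# Gaussian kernel density estimate at each reference location
fhat <- sapply(1:nrow(ref), function (k)
    sum(dnorm(pts[,"x"], mean=ref$x[k], sd=sigma) *
        dnorm(pts[,"y"], mean=ref$y[k], sd=sigma)))
plot(pts, asp=1, pch=20)
points(ref$x, ref$y, cex=3 * fhat / max(fhat), col='red')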

Bike trip density estimation

bkde2D

library(KernSmooth)
grid_n <- 301
start_smooth <- bkde2D(bt_start@coords, bandwidth=0.004, gridsize=c(grid_n, grid_n))
str(start_smooth)
## List of 3
##  $ x1  : num [1:301] -123 -123 -123 -123 -123 ...
##  $ x2  : num [1:301] 45.5 45.5 45.5 45.5 45.5 ...
##  $ fhat: num [1:301, 1:301] 0.00 0.00 1.08e-14 4.68e-16 1.78e-14 ...
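Note that the coordinates are longitude and latitude, so the bandwidth is in degrees: 0.004 degrees is roughly 300–450 m at Portland’s latitude.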

Plotting surfaces, method 1

image( ) and contour( )

image(x=start_smooth$x1,
      y=start_smooth$x2,
      z=matrix(start_smooth$fhat, nrow=grid_n, ncol=grid_n),
      xlab='eastings',
      ylab='northings'
)
contour(x=start_smooth$x1,
        y=start_smooth$x2,
        z=matrix(start_smooth$fhat, nrow=grid_n, ncol=grid_n),
        add=TRUE
)

plot of chunk r plot_surface1x

Plotting surfaces, method 2

library(raster)
smooth_df <- as.data.frame(expand.grid(x=start_smooth$x1, y=start_smooth$x2))
smooth_df$z <- as.vector(start_smooth$fhat)
raster_smooth <- rasterFromXYZ(smooth_df)

plot(raster_smooth)
setup_map(add=TRUE)
contour(raster_smooth, add=TRUE)

plot of chunk r plot_surface2x

Choosing a bandwidth

The bandwidth is important

plot of chunk r kernsmooth
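A sketch of how a comparison like the one in this figure might be made (the bandwidth values below are illustrative choices of ours, not necessarily the ones used for the figure):

bw_list <- c(0.0005, 0.001, 0.002, 0.004, 0.008, 0.016)   # candidate bandwidths, in degrees
layout(matrix(1:6, nrow=2, byrow=TRUE))
for (bw in bw_list) {
    sm <- bkde2D(bt_start@coords, bandwidth=bw, gridsize=c(grid_n, grid_n))
    image(x=sm$x1, y=sm$x2, z=sm$fhat,
          main=sprintf("bandwidth = %g", bw),
          xlab='eastings', ylab='northings')
}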

How to pick a bandwidth?

There are various automatic methods.

But… cross-validation!

For each bandwidth:

  1. Fit using most of the data,
  2. and predict the remaining data.
  3. Do this a bunch of times and return an estimate of goodness-of-fit.

… but wait, what does it mean to “predict” the remaining data, here?

Revised:

For each bandwidth:

  1. Fit the smooth using most of the data,
  2. predict the density at the locations of the remaining data,
  3. and return the mean log density at “true” points.

Why log?:

If \(f\) and \(g\) are probability distributions, and \(x_1, \ldots, x_n\) are drawn from \(f\), then \[ L(g) = \sum_{i=1}^n \log g(x_i) \lessapprox L(f) , \] i.e., \(L(g)\) is maximized, on average, when \(g=f\). (see: cross entropy)
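A quick numerical check of this (our own illustration, not from the slides): draws from a standard Normal get a higher mean log-density under the true density than under a mis-specified one.

x <- rnorm(1e5)                    # draws from f = Normal(0, 1)
mean(dnorm(x, sd=1, log=TRUE))     # mean log f(x_i): about -1.42
mean(dnorm(x, sd=2, log=TRUE))     # mean log g(x_i) for g = Normal(0, 2): about -1.74, i.e., smaller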

On example data

n <- 400
xy <- cbind(rnorm(n, mean=rep(c(-3,3), times=c(n/2,n/2))), rnorm(n))
xygrid <- expand.grid(x=seq(-5,5,length.out=40),
                      y=seq(-5,5,length.out=40))
xy_fhat <- rep(NA, nrow(xygrid))
sigma <- 0.25
for (k in 1:nrow(xygrid)) {
    xy_fhat[k] <- sum(
        exp(-((xy[,1] - xygrid[k,1])^2 + (xy[,2] - xygrid[k,2])^2) / (2 * sigma^2))
        / (2 * pi * sigma^2)
    )
}
plot(xy, asp=1, pch=20)
points(xygrid[,1], xygrid[,2], cex=xy_fhat/6)

plot of chunk r do_it

Prediction, by linear interpolation

library(fields)
the_xval <- function (bw, points, ntest, grid_n=301) {
    # hold out ntest of the points at random
    use_these <- sample(rep(c(TRUE, FALSE), times=c(nrow(points)-ntest, ntest)))
    # fit the smooth to the remaining points
    smooth <- bkde2D(points[use_these,], bandwidth=bw, gridsize=c(grid_n, grid_n))
    names(smooth) <- c("x", "y", "z")
    # interpolate the fitted surface at the held-out locations
    pred <- interp.surface(smooth, loc=points[!use_these,])
    return(mean(log(pred), na.rm=TRUE))
}

the_scores <- data.frame(bw = rep(seq(0.1, 1.0, length.out=12), each=20),
                          xent = NA)
for (k in 1:nrow(the_scores)) {
    pred <- the_xval(bw=the_scores$bw[k], points=xy, ntest=100)
    the_scores$xent[k] <- pred
}

The results

plot(xent ~ bw, data=the_scores)
lines(lowess(the_scores$bw, the_scores$xent), lwd=2, col='red')

plot of chunk r plot_xvalA

Prediction, on bicycle data

bw <- 0.004
xy <- bt_start@coords
ntest <- 100
use_these <- sample(rep(c(TRUE, FALSE), times=c(nrow(xy)-ntest, ntest)))
smooth <- bkde2D(xy[use_these,], bandwidth=bw, gridsize=c(grid_n, grid_n))
names(smooth) <- c("x", "y", "z")
pred <- fields::interp.surface(smooth, loc=xy[!use_these,])
hist(pred)

plot of chunk r one_xval
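Some of the predicted values may be NA: interp.surface( ) returns NA for held-out locations that fall outside the grid of the fitted smooth, which is why the cross-validation functions average with na.rm=TRUE.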

density_xval <- function (bw, points, ntest, grid_n=301) {
    xy <- points@coords
    use_these <- sample(rep(c(TRUE, FALSE), times=c(nrow(xy)-ntest, ntest)))
    smooth <- bkde2D(xy[use_these,], bandwidth=bw, gridsize=c(grid_n, grid_n))
    names(smooth) <- c("x", "y", "z")
    pred <- interp.surface(smooth, loc=xy[!use_these,])
    return(mean(log(pred), na.rm=TRUE))
}

# bw_list is the vector of candidate bandwidths compared in the bandwidth figure above
xval_scores <- data.frame(bw = rep(bw_list[1:5], each=20),
                          xent = NA)
for (k in 1:nrow(xval_scores)) {
    pred <- density_xval(bw=xval_scores$bw[k], points=bt_start, ntest=100)
    xval_scores$xent[k] <- pred
}

plot(xent ~ bw, data=xval_scores)

plot of chunk r do_xval

Bicycle migration

plot of chunk r plot_surface2y

Start minus end

plot of chunk r plot_diff
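A sketch of how a start-minus-end comparison like this might be computed (our own sketch; the slides’ actual code may differ). The key point is to force both smooths onto a common grid so the two surfaces can be subtracted:

# put both smooths on the same grid so they can be subtracted
common_range <- list(range(bt_start@coords[,1], bt_end@coords[,1]),
                     range(bt_start@coords[,2], bt_end@coords[,2]))
start_sm <- bkde2D(bt_start@coords, bandwidth=0.004,
                   gridsize=c(grid_n, grid_n), range.x=common_range)
end_sm <- bkde2D(bt_end@coords, bandwidth=0.004,
                 gridsize=c(grid_n, grid_n), range.x=common_range)
diff_df <- data.frame(expand.grid(x=start_sm$x1, y=start_sm$x2),
                      z=as.vector(start_sm$fhat - end_sm$fhat))
raster_diff <- rasterFromXYZ(diff_df)
plot(raster_diff)           # positive: more starts than ends; negative: more ends
setup_map(add=TRUE)
contour(raster_diff, add=TRUE)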
