Peter Ralph
10 March 2019 – Advanced Biological Statistics
git clone git@github.com:UO-Biostats/datasets.git
data_dir <- "../../../datasets"
bt <- read.csv(file.path(data_dir, "biketown/2019_08.csv.gz"), header=TRUE, stringsAsFactors=FALSE)
xylims <- c(-122.75, -122.6, 45.46, 45.56)
bt <- subset(bt, !is.na(Start_Latitude) & Start_Latitude > xylims[3] & Start_Latitude < xylims[4]
& !is.na(End_Latitude) & End_Latitude > xylims[3] & End_Latitude < xylims[4]
& !is.na(Start_Longitude) & Start_Longitude > xylims[1] & Start_Longitude < xylims[2]
& !is.na(End_Longitude) & End_Longitude > xylims[1] & End_Longitude < xylims[2])
nrow(bt)
## [1] 38587
layout(t(1:2))
plot(Start_Latitude ~ Start_Longitude, data=bt, pch=20, cex=0.5, col=adjustcolor('black', 0.2), xlab='eastings', ylab='northings')
plot(Start_Latitude ~ Start_Longitude, data=bt, type='n', xlab='eastings', ylab='northings')
with(bt, segments(x0=Start_Longitude, y0=Start_Latitude,
x1=End_Longitude, y1=End_Latitude, col=adjustcolor("black", 0.05), lwd=0.5))
Visualize:
See this tutorial by Manual Gimond.
## OGR data source with driver: ESRI Shapefile
## Source: "/home/peter/teaching/uo-biostats/datasets/biketown/River_Overlay", layer: "River_Overlay_Boundary"
## with 7 features
## It has 6 fields
## OGR data source with driver: ESRI Shapefile
## Source: "/home/peter/teaching/uo-biostats/datasets/biketown/bicycle_network", layer: "Bicycle_Network"
## with 39798 features
## It has 10 fields
## OGR data source with driver: ESRI Shapefile
## Source: "/home/peter/teaching/uo-biostats/datasets/biketown/pdx_streets", layer: "Streets"
## with 108454 features
## It has 39 fields
bikepaths <- subset(bikepaths, Status == "Active")
streets <- subset(streets, LCITY == "Portland" | RCITY == "Portland")
river <- crop(river, crop_extent)
freeways <- crop(subset(streets, TYPE == 1110), crop_extent)
bigstreets <- crop(subset(streets, TYPE %in% c(1300, 1400)), crop_extent)
rm(streets)
bt_start <- SpatialPointsDataFrame(coords=cbind(bt$Start_Longitude, bt$Start_Latitude),
data=bt, proj4string=CRS("+proj=longlat"))
bt_end <- SpatialPointsDataFrame(coords=cbind(bt$End_Longitude, bt$End_Latitude),
data=bt, proj4string=CRS("+proj=longlat"))
bt_start <- crop(
spTransform(bt_start,
proj4string(river)),
crop_extent)
bt_end <- crop(
spTransform(bt_end,
proj4string(river)),
crop_extent)
Suppose we have a bunch of points
\[ (x_1, y_1), \ldots, (x_n, y_n) \]
with density \(f(x,y)\), i.e.,
\[ f(x,y) \approx \frac{\#(\text{of points within $r$ of $(x,y)$})}{\pi r^2} \]
Now using
\[ \rho_\sigma(x,y) = \frac{1}{2\pi\sigma^2} e^{-\frac{x^2+y^2}{2\sigma^2}} \]
we can estimate \(f(x,y)\) by
\[ \hat f(x,y) = \sum_{i=1}^n \rho_\sigma(x_i-x, y_i-y) . \]
The kernel, \(\rho()\).
(we used a Gaussian)
The bandwidth, \(\sigma\).
Simulate 200 spatial locations with a density having two “bumps”. Plot these points. (rnorm(n, mean=-3), rnorm(n, mean=+3)
)
Make a \(20 \times 20\) grid of “reference” locations. (expand.grid( )
)
Compute the kernel density estimate for each location, with \(\sigma\) chosen appropriately, and plot it.
bkde2D
grid_n <- 301
start_smooth <- bkde2D(bt_start@coords, bandwidth=0.004, gridsize=c(grid_n, grid_n))
str(start_smooth)
## List of 3
## $ x1 : num [1:301] -123 -123 -123 -123 -123 ...
## $ x2 : num [1:301] 45.5 45.5 45.5 45.5 45.5 ...
## $ fhat: num [1:301, 1:301] 0 0 0 0 0 ...
image( )
and contour( )
There are various automatic methods.
But… crossvalidation!
For each bandwidth:
… but wait, what’s this mean, here?
For each bandwidth:
n <- 400
xy <- cbind(rnorm(n, mean=rep(c(-3,3), times=c(n/2,n/2))), rnorm(n))
xygrid <- expand.grid(x=seq(-5,5,length.out=40),
y=seq(-5,5,length.out=40))
xy_fhat <- rep(NA, nrow(xygrid))
sigma <- 0.25
for (k in 1:nrow(xygrid)) {
xy_fhat[k] <- sum( exp(-( (xy[,1] - xygrid[k,1])^2 + (xy[,2] - xygrid[k,2])^2 )/(2*sigma^2) ) / (2 * pi * sigma^2 ) )
}
plot(xy, asp=1, pch=20)
points(xygrid[,1], xygrid[,2], cex=xy_fhat/6)
the_xval <- function (bw, points, ntest, grid_n=301) {
background <- cbind(x=runif(ntest, min=-5.5, max=5.5),
y=runif(ntest, min=-2.3, 2.3))
use_these <- sample(rep(c(TRUE, FALSE), times=c(nrow(xy)-ntest, ntest)))
smooth <- bkde2D(xy[use_these,], bandwidth=bw, gridsize=c(grid_n, grid_n))
names(smooth) <- c("x", "y", "z")
pred_yes <- interp.surface(smooth, loc=xy[!use_these,])
pred_no <- interp.surface(smooth, loc=background)
return(c(mean(pred_yes, na.rm=TRUE), mean(pred_no, na.rm=TRUE)))
}
layout(t(1:2))
plot(pred_yes ~ bw, data=the_scores, col='red', ylim=c(0, max(pred_yes, na.rm=TRUE)))
points(pred_no ~ bw, data=the_scores, col='blue')
legend("topright", pch=1, col=c("red", "blue"), legend=c("points", "background"))
plot(pred_yes - pred_no ~ bw, data=the_scores, pch=20,
xlab='bandwidth', ylab='mean density minus background')
lines(lowess(the_scores$bw, the_scores$pred_yes - the_scores$pred_no), lwd=2, col='red')
bw <- 0.004
xy <- bt_start@coords
ntest <- 100
background <- cbind(x=runif(ntest, min=extent(bt_start@coords)@xmin, max=extent(bt_start@coords)@xmax),
y=runif(ntest, min=extent(bt_start@coords)@ymin, max=extent(bt_start@coords)@ymax))
use_these <- sample(rep(c(TRUE, FALSE), times=c(nrow(xy)-ntest, ntest)))
smooth <- bkde2D(xy[use_these,], bandwidth=bw, gridsize=c(grid_n, grid_n))
names(smooth) <- c("x", "y", "z")
pred_yes <- fields::interp.surface(smooth, loc=xy[!use_these,])
pred_no <- fields::interp.surface(smooth, loc=background)
boxplot(list(yes=pred_yes, no=pred_no))
density_xval <- function (bw, points, ntest, grid_n=301) {
xy <- points@coords
background <- cbind(x=runif(ntest, min=extent(points@coords)@xmin, max=extent(points@coords)@xmax),
y=runif(ntest, min=extent(points@coords)@ymin, max=extent(points@coords)@ymax))
use_these <- sample(rep(c(TRUE, FALSE), times=c(nrow(xy)-ntest, ntest)))
smooth <- bkde2D(xy[use_these,], bandwidth=bw, gridsize=c(grid_n, grid_n))
names(smooth) <- c("x", "y", "z")
pred_yes <- interp.surface(smooth, loc=xy[!use_these,])
pred_no <- interp.surface(smooth, loc=background)
return(c(mean(pred_yes), mean(pred_no)))
}
xval_scores <- data.frame(bw = rep(bw_list[1:5], each=20),
pred_yes = NA,
pred_no = NA)
for (k in 1:nrow(xval_scores)) {
pred <- density_xval(bw=xval_scores$bw[k], points=bt_start, ntest=100)
xval_scores$pred_yes[k] <- pred[1]
xval_scores$pred_no[k] <- pred[2]
}
plot(pred_yes ~ bw, data=xval_scores, col='red', ylim=c(0, max(pred_yes)))
points(pred_no ~ bw, data=xval_scores, col='blue')
legend("topright", pch=1, col=c("red", "blue"), legend=c("points", "background"))
lo_dur <- loess(Duration ~ Start_Latitude * Start_Longitude, data=bt, span=0.1)
pred_lats <- seq(min(bt$Start_Latitude), max(bt$Start_Latitude), length.out=101)
pred_lons <- seq(min(bt$Start_Longitude), max(bt$Start_Longitude), length.out=101)
pred_locs <- expand.grid(Start_Latitude=pred_lats,
Start_Longitude=pred_lons)
pred_duration <- predict(lo_dur, newdata=pred_locs)
Why?
Suggest another method.