Peter Ralph
Advanced Biological Statistics
We have transcript counts for a particular gene,
measured in individuals of two genotypes
who differ in age and in past exposure to solar irradiation.
\[\begin{aligned} Z_i &\sim \Poisson(y_i) \\ y_i &= \exp(a_{g_i} + b \times \text{age}_i \\ &\qquad {} + c_{g_i} \times \text{exposure}_i ) \end{aligned}\]
# Fit a plain Poisson GLM: log of the expected count is linear in
# genotype, age, and a genotype-by-exposure interaction.
# NOTE(review): `exposure * genotype` already expands to
# exposure + genotype + exposure:genotype, so the leading `genotype`
# term is redundant; R drops the duplicate and the fit is unaffected.
# The `file=` argument caches the compiled/sampled fit on disk; delete
# the .rds file to force re-fitting.
fit1 <- brm(counts ~ genotype + age + exposure * genotype,
data=count_data, family=poisson(link='log'),
prior=prior("normal(0, 5)", class="b"),
file="cache/overdispersion_fit1.rds")
## Compiling Stan program...
## Start sampling
## Warning: Argument 'nsamples' is deprecated. Please use argument 'ndraws' instead.
Recall that if \(X \sim \Poisson(\lambda)\) then \[ \E[X] = \var[X] = \lambda, \] and so a “\(z\)-score” is \[\begin{aligned} \E\left(\frac{X - \lambda}{\sqrt{\lambda}}\right) = 0, \qquad \qquad \text{SD}\left(\frac{X - \lambda}{\sqrt{\lambda}}\right) = 1. \end{aligned}\]
## Family: poisson
## Links: mu = log
## Formula: counts ~ genotype + age + exposure * genotype
## Data: count_data (Number of observations: 500)
## Draws: 4 chains, each with iter = 2000; warmup = 1000; thin = 1;
## total post-warmup draws = 4000
##
## Population-Level Effects:
## Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
## Intercept 0.31 0.07 0.17 0.46 1.00 3236 2720
## genotypeB 1.91 0.05 1.80 2.02 1.00 1967 2420
## age 0.05 0.01 0.04 0.06 1.00 4058 3116
## exposure 0.09 0.00 0.08 0.10 1.00 2711 3165
## genotypeB:exposure -0.21 0.01 -0.22 -0.19 1.00 2235 2612
##
## Draws were sampled using sampling(NUTS). For each parameter, Bulk_ESS
## and Tail_ESS are effective sample size measures, and Rhat is the potential
## scale reduction factor on split chains (at convergence, Rhat = 1).
\[\begin{aligned} Z_i &\sim \Poisson(\exp(\mu_i)) \\ \mu_i &\sim \Normal(y_i, \sigma) \\ y_i &= a_{g_i} + b \times \text{age}_i + c_{g_i} \times \text{exposure}_i \end{aligned}\]
is equivalent to
\[\begin{aligned} Z_i &\sim \Poisson(\exp(y_i)) \\ y_i &= a_{g_i} + b \times \text{age}_i + c_{g_i} \times \text{exposure}_i + \epsilon_i \\ \epsilon_i &\sim \Normal(0, \sigma) \end{aligned}\]
Simulate 1000 draws from \[ X_i \sim \Normal(\text{mean}=3, \text{sd}=3) .\]
Simulate 1000 draws from \[ Y_i \sim \Poisson(\lambda=\exp(X_i)). \]
Simulate 1000 draws from \[\begin{aligned} Z_i &\sim \Poisson(\lambda=\exp(U_i)) \\ U_i &\sim \Normal(\text{mean}=X_i, \text{sd}=2) \end{aligned}\]
Compare \(Y\) and \(Z\) to \(X\), and compare the distribution of the residuals.
\[\begin{aligned} Z_i &\sim \Poisson(\exp(y_i)) \\ y_i &= a_{g_i} + b \times \text{age}_i + c_{g_i} \times \text{exposure}_i + \epsilon_i \\ \epsilon_i &\sim \Normal(0, \sigma) \end{aligned}\]
# Give every observation its own ID so that (1|id) adds an
# observation-level random effect: a per-observation Normal term on the
# log scale, i.e. lognormal overdispersion on the Poisson mean.
# seq_len() is used instead of 1:nrow(): it correctly yields an empty
# sequence (not c(1, 0)) if the data frame has zero rows.
count_data$id <- seq_len(nrow(count_data))
# Same model as fit1 plus the observation-level random intercept.
# Cached to disk via `file=`; delete the .rds to force re-fitting.
fit2 <- brm(counts ~ genotype + age + exposure * genotype + (1|id),
            data=count_data, family=poisson(link='log'),
            prior=prior("normal(0, 5)", class="b"),
            file="cache/overdispersion_fit2.rds")
## Compiling Stan program...
## Start sampling
## Warning: Argument 'nsamples' is deprecated. Please use argument 'ndraws' instead.
## Family: poisson
## Links: mu = log
## Formula: counts ~ genotype + age + exposure * genotype + (1 | id)
## Data: count_data (Number of observations: 500)
## Draws: 4 chains, each with iter = 2000; warmup = 1000; thin = 1;
## total post-warmup draws = 4000
##
## Group-Level Effects:
## ~id (Number of levels: 500)
## Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
## sd(Intercept) 0.72 0.03 0.65 0.78 1.00 1061 1975
##
## Population-Level Effects:
## Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
## Intercept 0.00 0.15 -0.29 0.30 1.00 1270 2235
## genotypeB 1.91 0.11 1.69 2.13 1.00 971 1886
## age 0.05 0.01 0.03 0.08 1.00 1220 2171
## exposure 0.10 0.01 0.08 0.11 1.00 867 1282
## genotypeB:exposure -0.20 0.02 -0.23 -0.17 1.00 1043 1724
##
## Draws were sampled using sampling(NUTS). For each parameter, Bulk_ESS
## and Tail_ESS are effective sample size measures, and Rhat is the potential
## scale reduction factor on split chains (at convergence, Rhat = 1).
The overdispersed model fits the observed data more closely — but does it actually predict better, or is it just more flexible? We can check with cross-validation.
brms_kfold2 <- function (K, models, xy) {
  # K-fold cross-validation for a named list of fitted brms models.
  #
  # Args:
  #   K: number of folds.
  #   models: a *named* list of brmsfit objects; each is refit on every
  #     training subset via update().
  #   xy: the data frame the models were fit to. NOTE(review): the
  #     response column name `y` is hard-coded below — confirm callers
  #     supply data with that column.
  #
  # Returns: a data.frame with one row per fold and, for each model,
  #   columns <name>_train and <name>_test holding the RMSE of the
  #   posterior-mean prediction on the training and held-out data.
  stopifnot(!is.null(names(models)))
  # Randomly assign each row to one of K folds. rep_len() handles the
  # case where nrow(xy) is not a multiple of K; the original
  # rep(1:K, nrow(xy)/K) truncated the fractional `times` and silently
  # produced a fold vector shorter than nrow(xy).
  Kfold <- sample(rep_len(seq_len(K), nrow(xy)))
  results <- data.frame(rep=seq_len(K))
  for (j in seq_along(models)) {
    train <- test <- rep(NA_real_, K)
    for (k in seq_len(K)) {
      # Refit model j on all folds except the k-th.
      new_fit <- update(models[[j]], newdata=subset(xy, Kfold != k))
      # Training error: RMSE of the posterior-mean residuals.
      train[k] <- sqrt(mean(resid(new_fit)[,"Estimate"]^2))
      # Test error: RMSE on the held-out fold. re_formula=NA drops the
      # group-level effects, so prediction does not fail on grouping
      # levels unseen in the training subset.
      test_y <- xy$y[Kfold == k]
      test[k] <- sqrt(mean(
        (test_y - predict(new_fit, newdata=subset(xy, Kfold==k), re_formula=NA)[,"Estimate"])^2 ))
    }
    results[[paste0(names(models)[j], "_train")]] <- train
    results[[paste0(names(models)[j], "_test")]] <- test
  }
  return(results)
}
## Start sampling
## Start sampling
## Start sampling
## Start sampling
## Start sampling
## Start sampling
## Start sampling
## Start sampling
## Start sampling
## Start sampling
## Start sampling
## Start sampling
## Start sampling
## Start sampling
## Start sampling
## Start sampling
## Start sampling
## Start sampling
brms_kfold4 <- function (K, models, xy) {
  # K-fold cross-validation scoring models by a posterior summary of
  # the error rather than the error of the posterior mean: for each
  # posterior draw we take the median squared error across
  # observations, then average sqrt of that over draws.
  #
  # Args:
  #   K: number of folds.
  #   models: a *named* list of brmsfit objects, refit per fold via
  #     update().
  #   xy: the data frame the models were fit to. NOTE(review): the
  #     response column name `y` is hard-coded below.
  #
  # Returns: a data.frame with one row per fold and, per model,
  #   <name>_train and <name>_test score columns.
  #
  # NOTE(review): rowMedians() is not base R — presumably
  # matrixStats::rowMedians; confirm the package is attached upstream.
  stopifnot(!is.null(names(models)))
  # rep_len() fixes the silent shortfall of rep(1:K, nrow(xy)/K) when
  # nrow(xy) is not a multiple of K (fractional `times` truncates).
  Kfold <- sample(rep_len(seq_len(K), nrow(xy)))
  results <- data.frame(rep=seq_len(K))
  for (j in seq_along(models)) {
    train <- test <- rep(NA_real_, K)
    for (k in seq_len(K)) {
      # Refit on all folds except the k-th.
      new_fit <- update(models[[j]], newdata=subset(xy, Kfold != k))
      # posterior_epred() gives a draws-by-observations matrix of
      # expected values; re_formula=NA excludes group-level effects.
      train_post <- posterior_epred(new_fit, re_formula=NA)
      train_y <- xy$y[Kfold != k]
      # Per draw: median over observations of the squared error; then
      # average the root of that over posterior draws.
      train[k] <- mean(
        sqrt(rowMedians(sweep(train_post, 2, train_y, "-")^2))
      )
      # Same score on the held-out fold.
      test_y <- xy$y[Kfold == k]
      test_post <- posterior_epred(new_fit, newdata=subset(xy, Kfold==k), re_formula=NA)
      test[k] <- mean(
        sqrt(rowMedians(sweep(test_post, 2, test_y, "-")^2))
      )
    }
    results[[paste0(names(models)[j], "_train")]] <- train
    results[[paste0(names(models)[j], "_test")]] <- test
  }
  return(results)
}
## Start sampling
## Start sampling
## Start sampling
## Start sampling
## Start sampling
## Start sampling
## Start sampling
## Start sampling
## Start sampling
## Start sampling
## Start sampling
## Start sampling
## Start sampling
## Start sampling
## Start sampling
## Start sampling
## Start sampling
## Start sampling