Peter Ralph
3 December 2019 – Advanced Biological Statistics
We have counts of transcript numbers,
from some individuals of different ages and past exposures to solar irradiation,
of two genotypes.
\[\begin{aligned} Z_i &\sim \Poisson(y_i) \\ y_i &= \exp(a_{g_i} + b \times \text{age}_i \\ &\qquad {} + c_{g_i} \times \text{exposure}_i ) \end{aligned}\]
Here are posterior distributions of the parameters, with the true values in red.
model {
vector[N] mu;
mu = exp(a[geno]
+ b * age
+ c[geno]
.* expo);
counts ~ poisson(mu);
Recall that if \(X \sim \Poisson(\lambda)\) then \[ \E[X] = \var[X] = \lambda, \] and so a “\(z\)-score” is \[\begin{aligned} \E\left(\frac{X - \lambda}{\sqrt{\lambda}}\right) = 0, \qquad \qquad \text{SD}\left(\frac{X - \lambda}{\sqrt{\lambda}}\right) = 1. \end{aligned}\]
\[\begin{aligned} Z_i &\sim \Poisson(\mu_i) \\ \mu_i &\sim \log\Normal(y_i, \sigma) \\ y_i &= a_{g_i} + b \times \text{age}_i \\ &\qquad {} + c_{g_i} \times \text{exposure}_i \end{aligned}\]
overdispersed_model <- stan_model(model_code="
data {
int N; // number of data points
vector[N] age;
vector[N] exposure;
int counts[N];
int genotype[N];
int ngenotypes;
}
parameters {
vector[ngenotypes] a; // intercepts
real b; // slope for age
vector[ngenotypes] c; // slopes for exposure
real<lower=0> sigma; // SD on lognormal
vector[N] mu; // mean of the poissons
}
model {
vector[N] y; // mean of the lognormals
y = a[genotype] + b * age + c[genotype] .* exposure;
mu ~ lognormal(y, sigma);
counts ~ poisson(mu);
a ~ normal(0, 100);
b ~ normal(0, 10);
c ~ normal(0, 20);
sigma ~ normal(0, 10);
}")
Here are posterior distributions from the full model, with the true values in red.
# 100 datasets:
params2 <- list(a=colMeans(post2$a),
b=mean(post2$b),
c=colMeans(post2$c),
sigma=mean(post2$sigma))
y2 <- with(list2env(scaled_data),
params2$a[geno]
+ params2$b * age
+ params2$c[geno] * expo)
sim2 <- replicate(100, {
mu = exp(rnorm(length(y2),
mean=y2,
sd=params2$sigma))
rpois(length(mu), mu)
})
model {
vector[N] y;
y = a[geno] + b * age + c[geno] .* expo;
mu ~ lognormal(y, sigma);
counts ~ poisson(mu);
}
ord <- order(rowMeans(mu1))
plot(rowMeans(sim2), ylab="counts", ylim=range(sim2), type='n')
segments(x0=seq_len(nrow(data)),
y0=rowMins(sim2)[ord],
y1=rowMaxs(sim2)[ord])
points(data$counts[ord], pch=20, col='red')
legend("topleft", pch=c(20,NA), lty=c(NA,1), legend=c("observed", "simulated range"), col=c('red', 'black'))
Explain the model, in words.
\[\begin{aligned} Z_i &\sim \Poisson(\mu_i) \\ \mu_i &\sim \log\Normal(y_i, \sigma) \\ y_i &= a_{g_i} + b \times \text{age}_i \\ &\qquad {} + c_{g_i} \times \text{exposure}_i \end{aligned}\]
overdispersed_model <- stan_model(model_code="
data {
int N; // number of data points
vector[N] age;
vector[N] exposure;
int counts[N];
int genotype[N];
int ngenotypes;
}
parameters {
vector[ngenotypes] a; // intercepts
real b; // slope for age
vector[ngenotypes] c; // slopes for exposure
real<lower=0> sigma; // SD on lognormal
vector[N] mu; // mean of the poissons
}
model {
vector[N] y; // mean of the lognormals
y = a[genotype] + b * age + c[genotype] .* exposure;
mu ~ lognormal(y, sigma);
counts ~ poisson(mu);
a ~ normal(0, 100);
b ~ normal(0, 10);
c ~ normal(0, 20);
sigma ~ normal(0, 10);
}")
Compare the result of fitting a quasipoisson GLM to the data.
To test predictive ability (and diagnose overfitting!):
If you do this a lot of times, it’s called crossvalidation.
pred_model <- stan_model(model_code="
data {
int N; // number of data points
int Nm; // number of missing counts
vector[N] age;
vector[N] exposure;
int missings[Nm]; // indexes of missings
int not_missings[N - Nm]; // the complement
int counts[N - Nm];
int genotype[N];
int ngenotypes;
}
parameters {
vector[ngenotypes] a; // intercepts
real b; // slope for age
vector[ngenotypes] c; // slopes for exposure
real<lower=0> sigma; // SD on lognormal
vector<lower=0>[N] mu; // mean of the poissons
}
model {
vector[N] y; // mean of the lognormals
y = a[genotype] + b * age + c[genotype] .* exposure;
mu ~ lognormal(y, sigma);
counts ~ poisson(mu[not_missings]);
a ~ normal(0, 100);
b ~ normal(0, 10);
c ~ normal(0, 20);
sigma ~ normal(0, 10);
}
generated quantities {
int missing_counts[Nm];
missing_counts = poisson_rng(mu[missings]);
}
")
predict_these <- sample(length(data$counts), 20)
pred_data <- with(data,
list(N=length(counts),
Nm=length(predict_these),
age=(age - mean(age))/sd(age),
exposure=(exposure - mean(exposure))/sd(exposure),
counts=counts[-predict_these],
missings=predict_these,
not_missings=seq_along(counts)[-predict_these],
genotype=genotype,
ngenotypes=length(unique(genotype))))
fit3 <- sampling(pred_model,
data=pred_data,
control=list(max_treedepth=12),
iter=1000, chains=3)
Tavalire et al 2016, Genotypic variation in host response to infection affects parasite reproductive rate, International Journal for Parasitology, has:
How much do genotype and parasite dose affect parasite production?
Here’s the data
## Snail_ID Strain Inf_Dose Tray Infected Initial_Size_mm Weeks_to_first_shed Total_weeks_shed Total_Parasites Avg_Parasites_per_Shed Week_Died
## 1 60 36 0 k 0 6.67 NA NA NA NA 2
## 2 51 36 0 o 0 5.44 NA NA NA NA 3
## 3 431 36 0 g 0 4.89 NA NA NA NA 3
## 4 432 36 0 e 0 3.97 NA NA NA NA 3
## 5 56 36 0 h 0 7.97 NA NA NA NA 6
## 6 49 36 0 c 0 7.86 NA NA NA NA 10
mod1 <- glm(Total_Parasites ~ Strain * Total_weeks_shed + Strain : dose, data=snails, family=poisson("log"))
summary(mod1)
# mean predicted load at 0 weeks for default strain:
exp(9.6)
# default strain is the first one
levels(snails$Strain)
# predicted load of second strain relative to first one
exp(-.66)
# plot data and predicted means
plot(Total_Parasites ~ Total_weeks_shed, data=snails, col=Strain, pch=20)
newdata <- expand.grid(Strain=levels(snails$Strain), Total_weeks_shed=0:30, dose=TRUE)
for (st in levels(snails$Strain)) {
lines(0:30, exp(preds[newdata$Strain == st]), col=match(st, levels(snails$Strain)))
}
This one fails to fit because of negative values
> mod2 <- glm(Total_Parasites ~ Strain * Total_weeks_shed + Strain : dose, data=snails, family=poisson("identity"))
Error: no valid set of coefficients has been found: please supply starting values
In addition: Warning message:
In log(y/mu) : NaNs produced
Using Stan doesn’t help:
> library(brms)
> mod2 <- brm(Total_Parasites ~ Strain * Total_weeks_shed + Strain : dose, data=snails, family=poisson("identity"))
Compiling the C++ model
mod3 <- glm(Total_Parasites ~ 0 + Strain : Total_weeks_shed + Strain : dose, data=snails, family=poisson("identity"))
mod4 <- glm(Total_Parasites ~ 0 + Strain : Total_weeks_shed + Strain : dose, data=snails, family=poisson("log"))
preds3 <- predict(mod3, newdata=newdata)
preds4 <- predict(mod4, newdata=newdata)
plot(Total_Parasites ~ Total_weeks_shed, data=snails, col=Strain, pch=20)
for (st in levels(snails$Strain)) {
lines(0:30, exp(preds[newdata$Strain == st]), col=match(st, levels(snails$Strain)))
lines(0:30, exp(preds3[newdata$Strain == st]), col=match(st, levels(snails$Strain)), lty=3)
lines(0:30, exp(preds4[newdata$Strain == st]), col=match(st, levels(snails$Strain)), lty=4)
}
Distributions:
Visualization:
Statistical models: