Peter Ralph
28 January 2020 – Advanced Biological Statistics
## top term num_genes upregulated downregulated
## 1 cell death neuron death 2405 122 70
## 2 cell death oxidative stress 886 41 25
## 3 cell death necrotic 32 2 0
## 4 cell death apoptosis 898 468 123
## 5 cell motility actin polymerization 1822 243 117
## 6 cell motility sperm motility 1585 292 131
## 7 cell motility cilia 517 75 25
## 8 cell motility axis elongation 62 9 5
## 9 ion transport anion transport 128 2 5
## 10 ion transport cation transport 1697 91 50
## 11 ion transport transmembrane transport 99 3 2
Are some categories of genes more likely to be differentially regulated than others? or, upregulated? downregulated?
goterms$per_up <- with(goterms, upregulated/sum(upregulated))
goterms$per_down <- with(goterms, downregulated/sum(downregulated))
goterms$per_diff <- with(goterms, (upregulated + downregulated)/sum(upregulated + downregulated))
goterms$per_nodiff <- with(goterms, (num_genes - upregulated + downregulated)/sum(num_genes - upregulated + downregulated))
goterms$diff_enrich <- goterms$per_diff / goterms$per_nodiff
goterms$up_vs_down <- goterms$per_up / goterms$per_down
goterms
## top term num_genes upregulated downregulated per_up per_down per_diff per_nodiff diff_enrich up_vs_down
## 1 cell death neuron death 2405 122 70 0.0905 0.1266 0.1010 0.2520 0.40 0.71
## 2 cell death oxidative stress 886 41 25 0.0304 0.0452 0.0347 0.0932 0.37 0.67
## 3 cell death necrotic 32 2 0 0.0015 0.0000 0.0011 0.0032 0.33 Inf
## 4 cell death apoptosis 898 468 123 0.3472 0.2224 0.3109 0.0592 5.25 1.56
## 5 cell motility actin polymerization 1822 243 117 0.1803 0.2116 0.1894 0.1817 1.04 0.85
## 6 cell motility sperm motility 1585 292 131 0.2166 0.2369 0.2225 0.1525 1.46 0.91
## 7 cell motility cilia 517 75 25 0.0556 0.0452 0.0526 0.0500 1.05 1.23
## 8 cell motility axis elongation 62 9 5 0.0067 0.0090 0.0074 0.0062 1.19 0.74
## 9 ion transport anion transport 128 2 5 0.0015 0.0090 0.0037 0.0140 0.26 0.16
## 10 ion transport cation transport 1697 91 50 0.0675 0.0904 0.0742 0.1774 0.42 0.75
## 11 ion transport transmembrane transport 99 3 2 0.0022 0.0036 0.0026 0.0105 0.25 0.62
Let
\[\begin{aligned} N_i &= (\text{number of no difference genes in category $i$}) \\ &\sim \Poisson(\exp(\lambda_i^N)) \\ U_i &= (\text{number of upregulated genes in category $i$}) \\ &\sim \Poisson(\exp(\lambda_i^U)) \\ D_i &= (\text{number of downregulated genes in category $i$}) \\ &\sim \Poisson(\exp(\lambda_i^D)) . \end{aligned}\]
and
\[\begin{aligned} \lambda_i^N &= \alpha_i \\ \lambda_i^U &= \alpha_i + \delta_U + \beta_1[\text{top}_i] + \beta_2[i] \\ \lambda_i^D &= \alpha_i + \delta_D + \beta_1[\text{top}_i] + \beta_2[i] + \gamma_i. \end{aligned}\]
go_model_code <- "
data {
int nterms;
int ntop; // number of 'top' categories
int N[nterms];
int U[nterms];
int D[nterms];
int top[nterms];
}
parameters {
real alpha[nterms];
real delta[2]; // up, down regulation
real beta1[ntop];
real beta2[nterms];
real gamma[nterms];
}
model {
real lambda_n;
real lambda_u;
real lambda_d;
for (i in 1:nterms) {
lambda_n = alpha[i];
lambda_u = alpha[i] + delta[1] + beta1[top[i]] + beta2[i];
lambda_d = alpha[i] + delta[2] + beta1[top[i]] + beta2[i] + gamma[i];
N[i] ~ poisson(exp(lambda_n));
U[i] ~ poisson(exp(lambda_u));
D[i] ~ poisson(exp(lambda_d));
}
}
"
go_model <- stan_model(model_code=go_model_code)
go_fit <- sampling(go_model, chains = 4, iter = 2000,
data=list(nterms = nrow(goterms),
ntop = nlevels(goterms$top),
N = goterms$num_genes - goterms$upregulated - goterms$downregulated,
U = goterms$upregulated,
D = goterms$downregulated,
top = as.numeric(goterms$top)))
## Warning: There were 231 divergent transitions after warmup. Increasing adapt_delta above 0.8 may help. See
## http://mc-stan.org/misc/warnings.html#divergent-transitions-after-warmup
## Warning: There were 3769 transitions after warmup that exceeded the maximum treedepth. Increase max_treedepth above 10. See
## http://mc-stan.org/misc/warnings.html#maximum-treedepth-exceeded
## Warning: Examine the pairs() plot to diagnose sampling problems
## Warning: The largest R-hat is 3.82, indicating chains have not mixed.
## Running the chains for more iterations may help. See
## http://mc-stan.org/misc/warnings.html#r-hat
## Warning: Bulk Effective Samples Size (ESS) is too low, indicating posterior means and medians may be unreliable.
## Running the chains for more iterations may help. See
## http://mc-stan.org/misc/warnings.html#bulk-ess
## Warning: Tail Effective Samples Size (ESS) is too low, indicating posterior variances and tail quantiles may be unreliable.
## Running the chains for more iterations may help. See
## http://mc-stan.org/misc/warnings.html#tail-ess
## mean se_mean sd 2.5% 25% 50% 75% 98% n_eff Rhat
## alpha[1] 7.7 2.2e-03 0.020 7.7 7.7 7.7 7.7 7.7 81.7 1.0
## alpha[2] 6.7 4.6e-03 0.033 6.6 6.7 6.7 6.7 6.8 50.0 1.1
## alpha[3] 3.4 2.6e-02 0.200 3.0 3.2 3.4 3.5 3.7 57.4 1.1
## alpha[4] 5.7 5.5e-03 0.053 5.6 5.7 5.7 5.8 5.8 90.9 1.0
## alpha[5] 7.3 3.1e-03 0.026 7.2 7.3 7.3 7.3 7.3 69.0 1.1
## alpha[6] 7.0 5.4e-03 0.033 7.0 7.0 7.1 7.1 7.1 37.1 1.1
## alpha[7] 6.0 7.4e-03 0.055 5.9 6.0 6.0 6.1 6.1 55.0 1.1
## alpha[8] 3.9 2.1e-02 0.148 3.6 3.8 3.9 4.0 4.2 48.8 1.1
## alpha[9] 4.8 1.1e-02 0.090 4.6 4.7 4.8 4.9 5.0 67.4 1.0
## alpha[10] 7.4 2.1e-03 0.022 7.3 7.3 7.4 7.4 7.4 108.6 1.0
## alpha[11] 4.5 1.4e-02 0.110 4.3 4.5 4.5 4.6 4.7 62.9 1.1
## delta[1] 124.9 1.4e+02 199.883 -137.9 -50.7 90.8 275.1 464.9 2.1 7.7
## delta[2] 164.0 1.2e+02 173.357 -121.7 2.0 164.6 297.6 448.1 2.1 6.0
## beta1[1] -41.0 1.3e+02 201.879 -377.1 -208.9 -77.8 174.0 277.0 2.4 4.2
## beta1[2] -94.0 7.2e+01 109.531 -250.0 -189.9 -109.0 5.6 108.5 2.3 3.7
## beta1[3] -27.8 1.3e+02 213.203 -397.8 -172.0 -15.9 154.2 305.3 2.7 3.7
## beta2[1] -86.8 1.6e+02 237.618 -473.6 -283.3 -98.6 80.9 304.2 2.1 6.4
## beta2[2] -86.9 1.6e+02 237.619 -474.0 -283.5 -98.8 80.9 303.7 2.1 6.4
## beta2[3] -86.8 1.6e+02 237.658 -474.1 -283.2 -98.5 79.3 304.2 2.1 6.4
## beta2[4] -83.4 1.6e+02 237.627 -470.2 -279.9 -95.3 84.2 307.3 2.1 6.4
## beta2[5] -32.6 9.0e+01 131.971 -291.3 -67.2 -16.9 19.2 194.9 2.1 5.8
## beta2[6] -32.2 9.0e+01 131.973 -290.9 -66.7 -16.5 19.6 195.2 2.1 5.8
## beta2[7] -32.5 9.0e+01 131.954 -291.2 -67.1 -16.9 19.3 194.9 2.1 5.8
## beta2[8] -32.5 9.0e+01 131.987 -291.3 -67.0 -16.9 19.1 194.9 2.1 5.8
## beta2[9] -101.4 3.4e+01 100.484 -239.4 -177.7 -113.3 -56.2 143.9 8.6 1.9
## beta2[10] -99.9 3.4e+01 100.481 -237.8 -176.0 -111.9 -54.5 144.4 8.6 1.9
## beta2[11] -100.7 3.4e+01 100.485 -238.2 -176.6 -112.6 -55.6 143.4 8.6 1.9
## gamma[1] -39.7 3.8e+01 54.962 -160.0 -53.3 -24.1 7.8 16.7 2.1 4.6
## gamma[2] -39.6 3.8e+01 54.963 -160.4 -53.1 -24.2 7.7 16.8 2.1 4.6
## gamma[3] -421.3 9.6e+01 235.064 -818.7 -625.3 -428.2 -221.7 -16.7 6.0 1.4
## gamma[4] -40.5 3.8e+01 54.964 -160.9 -54.2 -25.0 7.1 16.0 2.1 4.6
## gamma[5] -39.9 3.8e+01 54.965 -160.3 -53.6 -24.2 7.6 16.6 2.1 4.6
## gamma[6] -39.9 3.8e+01 54.962 -160.4 -53.7 -24.2 7.6 16.5 2.1 4.6
## gamma[7] -40.3 3.8e+01 54.963 -160.7 -54.2 -24.4 7.1 16.2 2.1 4.6
## gamma[8] -39.8 3.8e+01 54.957 -160.1 -53.4 -24.0 7.6 16.8 2.1 4.6
## gamma[9] -38.1 3.8e+01 54.940 -158.8 -52.3 -22.5 8.8 18.7 2.1 4.5
## gamma[10] -39.8 3.8e+01 54.964 -160.0 -53.3 -24.0 7.7 16.7 2.1 4.6
## gamma[11] -39.6 3.8e+01 54.994 -160.2 -53.5 -23.8 7.3 17.3 2.1 4.6
## lp__ 58087.8 2.0e-01 3.983 58079.1 58085.2 58088.1 58090.6 58094.8 391.9 1.0
## alpha[1] alpha[2] alpha[3] alpha[4] alpha[5] alpha[6] alpha[7] alpha[8] alpha[9] alpha[10] alpha[11] delta[1] delta[2] beta1[1] beta1[2] beta1[3] beta2[1] beta2[2] beta2[3] beta2[4] beta2[5]
## 1.0 1.1 1.1 1.0 1.1 1.1 1.1 1.1 1.0 1.0 1.1 7.7 6.0 4.2 3.7 3.7 6.4 6.4 6.4 6.4 5.8
## beta2[6] beta2[7] beta2[8] beta2[9] beta2[10] beta2[11] gamma[1] gamma[2] gamma[3] gamma[4] gamma[5] gamma[6] gamma[7] gamma[8] gamma[9] gamma[10] gamma[11] lp__
## 5.8 5.8 5.8 1.9 1.9 1.9 4.6 4.6 1.4 4.6 4.6 4.6 4.6 4.6 4.5 4.6 4.6 1.0
pairs()
Number of ‘high’ values:
## [1] 11
Wait, let’s add another level:
## [1] 11
What is x[f[3]]
?
## low
## 1
## Error in templater::render_template("Week_14_Lecture.Rmd", output = "Week_14_Lecture.md", : all(names(xf) == f) is not TRUE
go_model_code2 <- "
data {
int nterms;
int ntop; // number of 'top' categories
int N[nterms];
int U[nterms];
int D[nterms];
int top[nterms];
}
parameters {
real alpha[nterms];
real delta[2]; // up, down regulation
real beta1[ntop];
real beta2[nterms];
real gamma[nterms];
}
model {
real lambda_n;
real lambda_u;
real lambda_d;
for (i in 1:nterms) {
lambda_n = alpha[i];
lambda_u = alpha[i] + delta[1] + beta1[top[i]] + beta2[i];
lambda_d = alpha[i] + delta[2] + beta1[top[i]] + beta2[i] + gamma[i];
N[i] ~ poisson(exp(lambda_n));
U[i] ~ poisson(exp(lambda_u));
D[i] ~ poisson(exp(lambda_d));
}
alpha ~ normal(0, 5); // flat-ish
delta ~ normal(0, 5); // flat-ish
beta1 ~ cauchy(0, 0.1); // very peaked
beta2 ~ cauchy(0, 0.1); // very peaked
gamma ~ cauchy(0, 0.1); // very peaked
}
"
go_model2 <- stan_model(model_code=go_model_code2)
go_fit2 <- sampling(go_model2, chains = 4, iter = 2000,
data=list(nterms = nrow(goterms),
ntop = nlevels(goterms$top),
N = goterms$num_genes - goterms$upregulated - goterms$downregulated,
U = goterms$upregulated,
D = goterms$downregulated,
top = as.numeric(goterms$top)))
## Warning: Tail Effective Samples Size (ESS) is too low, indicating posterior variances and tail quantiles may be unreliable.
## Running the chains for more iterations may help. See
## http://mc-stan.org/misc/warnings.html#tail-ess
## mean se_mean sd 2.5% 25% 50% 75% 98% n_eff Rhat
## alpha[1] 7.7e+00 0.00027 0.021 7.7e+00 7.7e+00 7.7e+00 7.7e+00 7.74 5949 1
## alpha[2] 6.7e+00 0.00047 0.034 6.6e+00 6.7e+00 6.7e+00 6.7e+00 6.77 5339 1
## alpha[3] 3.4e+00 0.00278 0.182 3.0e+00 3.3e+00 3.4e+00 3.5e+00 3.71 4288 1
## alpha[4] 5.7e+00 0.00080 0.057 5.6e+00 5.7e+00 5.7e+00 5.8e+00 5.83 5044 1
## alpha[5] 7.3e+00 0.00037 0.027 7.2e+00 7.3e+00 7.3e+00 7.3e+00 7.34 5268 1
## alpha[6] 7.1e+00 0.00045 0.029 7.0e+00 7.0e+00 7.1e+00 7.1e+00 7.12 4138 1
## alpha[7] 6.0e+00 0.00071 0.048 5.9e+00 6.0e+00 6.0e+00 6.1e+00 6.12 4513 1
## alpha[8] 3.9e+00 0.00208 0.135 3.6e+00 3.8e+00 3.9e+00 4.0e+00 4.13 4219 1
## alpha[9] 4.8e+00 0.00133 0.090 4.6e+00 4.7e+00 4.8e+00 4.8e+00 4.95 4651 1
## alpha[10] 7.4e+00 0.00038 0.026 7.3e+00 7.3e+00 7.4e+00 7.4e+00 7.40 4626 1
## alpha[11] 4.5e+00 0.00150 0.103 4.3e+00 4.4e+00 4.5e+00 4.6e+00 4.71 4777 1
## delta[1] -2.8e+00 0.01876 0.271 -3.2e+00 -3.0e+00 -2.9e+00 -2.8e+00 -1.87 209 1
## delta[2] -3.5e+00 0.01861 0.282 -3.9e+00 -3.7e+00 -3.6e+00 -3.5e+00 -2.59 230 1
## beta1[1] -7.4e-02 0.01829 0.262 -1.0e+00 -1.1e-01 -1.8e-02 4.8e-02 0.25 205 1
## beta1[2] 1.1e+00 0.01900 0.304 9.8e-02 9.8e-01 1.1e+00 1.3e+00 1.55 257 1
## beta1[3] -9.6e-02 0.01894 0.269 -1.0e+00 -1.3e-01 -3.3e-02 3.7e-02 0.21 202 1
## beta2[1] 1.0e-02 0.00245 0.116 -2.3e-01 -5.2e-02 9.3e-03 7.3e-02 0.24 2225 1
## beta2[2] -3.2e-02 0.00238 0.120 -3.0e-01 -9.1e-02 -2.2e-02 3.9e-02 0.19 2533 1
## beta2[3] -3.4e-02 0.00598 0.247 -6.4e-01 -1.0e-01 -1.0e-02 7.1e-02 0.41 1710 1
## beta2[4] 3.3e+00 0.00295 0.137 3.0e+00 3.2e+00 3.3e+00 3.4e+00 3.57 2139 1
## beta2[5] -6.1e-02 0.00301 0.123 -3.5e-01 -1.3e-01 -4.4e-02 1.7e-02 0.15 1662 1
## beta2[6] 3.1e-01 0.00346 0.144 3.1e-03 2.2e-01 3.1e-01 4.0e-01 0.58 1730 1
## beta2[7] -3.5e-02 0.00275 0.121 -3.3e-01 -9.4e-02 -2.4e-02 3.6e-02 0.19 1945 1
## beta2[8] 1.7e-02 0.00325 0.160 -3.1e-01 -5.9e-02 1.3e-02 8.7e-02 0.38 2423 1
## beta2[9] -1.0e-01 0.00536 0.228 -6.9e-01 -1.7e-01 -5.3e-02 2.2e-02 0.21 1809 1
## beta2[10] 6.5e-02 0.00361 0.146 -1.6e-01 -2.0e-02 4.3e-02 1.2e-01 0.44 1631 1
## beta2[11] -9.2e-02 0.00630 0.228 -7.1e-01 -1.5e-01 -4.0e-02 3.3e-02 0.22 1309 1
## gamma[1] 8.5e-02 0.00222 0.125 -1.2e-01 2.5e-03 6.7e-02 1.6e-01 0.37 3199 1
## gamma[2] 5.6e-02 0.00258 0.145 -1.9e-01 -3.0e-02 3.8e-02 1.3e-01 0.40 3138 1
## gamma[3] -2.3e-01 0.04488 0.873 -2.9e+00 -1.7e-01 -3.3e-02 6.1e-02 0.45 378 1
## gamma[4] -5.7e-01 0.00236 0.139 -8.5e-01 -6.7e-01 -5.7e-01 -4.8e-01 -0.30 3466 1
## gamma[5] -9.5e-03 0.00159 0.095 -2.0e-01 -6.2e-02 -7.7e-03 4.6e-02 0.18 3525 1
## gamma[6] -3.1e-02 0.00168 0.098 -2.5e-01 -8.5e-02 -2.6e-02 2.9e-02 0.15 3396 1
## gamma[7] -1.6e-01 0.00404 0.195 -6.5e-01 -2.6e-01 -1.1e-01 -3.0e-02 0.11 2321 1
## gamma[8] 1.7e-02 0.00530 0.208 -3.4e-01 -6.5e-02 8.9e-03 9.3e-02 0.46 1538 1
## gamma[9] 1.0e-01 0.00651 0.262 -2.7e-01 -3.8e-02 4.3e-02 1.8e-01 0.80 1617 1
## gamma[10] 6.6e-02 0.00238 0.128 -1.5e-01 -1.5e-02 4.9e-02 1.3e-01 0.37 2904 1
## gamma[11] -3.0e-02 0.00550 0.251 -6.6e-01 -9.8e-02 -6.9e-03 7.4e-02 0.42 2075 1
## lp__ 5.8e+04 0.20568 5.730 5.8e+04 5.8e+04 5.8e+04 5.8e+04 58054.32 776 1
stan_trace()
, alphastan_trace()
, gammapairs()
In conclusion, we used Stan fit a Poisson model in which the cell means are given by the equation above. (say Bayesian somehow) The priors on each parameter are specified above. Standard methods were used for assessing convergence of the Hamiltonian Monte Carlo algorithm. Most genes are not differentially regulated: in the data, X percent were; our model fit a 95% credible interval of A to B for this effect. However, apoptosis and cell motility genes are strongly differentially regulated, showing relative enrichments of about 25-fold and 3.3-fold, respectively, over “not differentiated”. (TODO: get the actual numbers, as exp(beta)) 95% credible intervals for these two effects ranged from X to Y. (look those up) TODO: up vs down for apoptosis; sperm motility; Remaining effects were not statistically signficant.
It’s often important to consider more than one model, e.g.:
The problem is that then there’s more than one answer.
How do we compare models?
diabetes package:lars R Documentation
Blood and other measurements in diabetics
Description:
The ‘diabetes’ data frame has 442 rows and 3 columns. These are
the data used in the Efron et al "Least Angle Regression" paper.
Format:
This data frame contains the following columns:
x a matrix with 10 columns
y a numeric vector
x2 a matrix with 64 columns
The dataset has
age:ldl
age^2
y
## age sex bmi map tc ldl hdl tch ltg glu y
## age 1.000 0.174 0.185 0.34 0.260 0.22 -0.075 0.20 0.27 0.30 0.188
## sex 0.174 1.000 0.088 0.24 0.035 0.14 -0.379 0.33 0.15 0.21 0.043
## bmi 0.185 0.088 1.000 0.40 0.250 0.26 -0.367 0.41 0.45 0.39 0.586
## map 0.335 0.241 0.395 1.00 0.242 0.19 -0.179 0.26 0.39 0.39 0.441
## tc 0.260 0.035 0.250 0.24 1.000 0.90 0.052 0.54 0.52 0.33 0.212
## ldl 0.219 0.143 0.261 0.19 0.897 1.00 -0.196 0.66 0.32 0.29 0.174
## hdl -0.075 -0.379 -0.367 -0.18 0.052 -0.20 1.000 -0.74 -0.40 -0.27 -0.395
## tch 0.204 0.332 0.414 0.26 0.542 0.66 -0.738 1.00 0.62 0.42 0.430
## ltg 0.271 0.150 0.446 0.39 0.516 0.32 -0.399 0.62 1.00 0.46 0.566
## glu 0.302 0.208 0.389 0.39 0.326 0.29 -0.274 0.42 0.46 1.00 0.382
## y 0.188 0.043 0.586 0.44 0.212 0.17 -0.395 0.43 0.57 0.38 1.000
Put aside 20% of the data for testing.
Refit the model.
Predict the test data; compute \[\begin{aligned} S = \sqrt{\frac{1}{M} \sum_{k=1}^M (\hat y_i - y_i)^2} \end{aligned}\]
Repeat for the other four 20%s.
Compare.
First let’s split the data into testing and training just once:
##
## Call:
## lm(formula = y ~ ., data = training_d)
##
## Residuals:
## Min 1Q Median 3Q Max
## -163.74 -33.70 -1.85 30.24 152.54
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 151.83 2.93 51.78 < 2e-16 ***
## age 19.14 75.84 0.25 0.801
## sex -300.90 74.65 -4.03 7.1e-05 ***
## bmi 467.76 99.04 4.72 3.6e-06 ***
## map 374.50 82.79 4.52 8.8e-06 ***
## tc 1140.35 62451.29 0.02 0.985
## ldl -1159.25 54887.17 -0.02 0.983
## hdl -684.36 23339.30 -0.03 0.977
## tch 107.96 312.77 0.35 0.730
## ltg 250.42 20532.47 0.01 0.990
## glu 81.55 79.01 1.03 0.303
## `age^2` 62.94 79.26 0.79 0.428
## `bmi^2` 91.92 92.32 1.00 0.320
## `map^2` 33.49 80.15 0.42 0.676
## `tc^2` 4185.94 7981.86 0.52 0.600
## `ldl^2` 2116.00 5777.25 0.37 0.714
## `hdl^2` 625.77 2029.38 0.31 0.758
## `tch^2` 269.08 757.96 0.36 0.723
## `ltg^2` 1154.33 1820.83 0.63 0.527
## `glu^2` 114.75 110.58 1.04 0.300
## `age:sex` 183.24 85.76 2.14 0.033 *
## `age:bmi` 24.01 94.21 0.25 0.799
## `age:map` -71.22 90.13 -0.79 0.430
## `age:tc` -2.27 693.38 0.00 0.997
## `age:ldl` -92.78 556.41 -0.17 0.868
## `age:hdl` 126.90 316.47 0.40 0.689
## `age:tch` 96.27 242.67 0.40 0.692
## `age:ltg` 17.73 258.97 0.07 0.945
## `age:glu` 52.22 96.96 0.54 0.591
## `sex:bmi` 99.22 87.49 1.13 0.258
## `sex:map` 121.92 81.71 1.49 0.137
## `sex:tc` -37.72 716.68 -0.05 0.958
## `sex:ldl` 81.11 563.40 0.14 0.886
## `sex:hdl` 19.35 336.57 0.06 0.954
## `sex:tch` -89.86 236.22 -0.38 0.704
## `sex:ltg` -2.25 272.39 -0.01 0.993
## `sex:glu` 43.18 81.30 0.53 0.596
## `bmi:map` 101.69 96.77 1.05 0.294
## `bmi:tc` -238.61 854.28 -0.28 0.780
## `bmi:ldl` 195.92 724.22 0.27 0.787
## `bmi:hdl` 96.67 397.92 0.24 0.808
## `bmi:tch` -52.31 252.72 -0.21 0.836
## `bmi:ltg` -26.00 324.18 -0.08 0.936
## `bmi:glu` 32.65 102.26 0.32 0.750
## `map:tc` 1185.67 840.43 1.41 0.159
## `map:ldl` -967.00 710.39 -1.36 0.174
## `map:hdl` -494.82 374.48 -1.32 0.187
## `map:tch` -34.65 229.94 -0.15 0.880
## `map:ltg` -457.15 332.46 -1.38 0.170
## `map:glu` -113.81 105.27 -1.08 0.281
## `tc:ldl` -5737.11 13018.75 -0.44 0.660
## `tc:hdl` -2006.73 4590.82 -0.44 0.662
## `tc:tch` -895.17 2215.28 -0.40 0.686
## `tc:ltg` -3415.95 13722.65 -0.25 0.804
## `tc:glu` -351.44 693.15 -0.51 0.613
## `ldl:hdl` 1342.35 3709.57 0.36 0.718
## `ldl:tch` 343.92 1792.65 0.19 0.848
## `ldl:ltg` 2686.26 11393.93 0.24 0.814
## `ldl:glu` 139.21 586.20 0.24 0.812
## `hdl:tch` 278.02 1306.12 0.21 0.832
## `hdl:ltg` 1059.65 4844.47 0.22 0.827
## `hdl:glu` 406.32 351.63 1.16 0.249
## `tch:ltg` -68.01 754.07 -0.09 0.928
## `tch:glu` 461.48 270.76 1.70 0.089 .
## `ltg:glu` 148.10 314.57 0.47 0.638
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 54 on 293 degrees of freedom
## Multiple R-squared: 0.61, Adjusted R-squared: 0.524
## F-statistic: 7.15 on 64 and 293 DF, p-value: <2e-16
With ordinary linear regression, we got a root-mean-square-prediction-error of 55.5 (on the test data), compared to a root-mean-square-error of 49.02 for the training data.
This suggests there’s some overfitting going on.
We have a lot of predictors: 64 of them. A good guess is that only a few are really useful. So, we can put a sparsifying prior on the coefficients, i.e., \(\beta\)s in \[\begin{aligned} y = \beta_0 + \beta_1 x_1 + \cdots \beta_n x_n + \epsilon \end{aligned}\]
y ~ a + b x[1] + c x[2]
, and fit a linear model.# write the crossvalidation error function
kfold <- function (K, df) {
N <- nrow(df)
Kfold <- sample(rep(1:K, N/K))
do_xval <- function (k) {
the_lm <- lm(y ~ ., data=df, subset=(Kfold != k))
train_error <- sqrt(mean(resid(the_lm)^2))
test_y <- df$y[Kfold == k]
test_error <- sqrt(mean( (test_y - predict(the_lm, newdata=subset(df, Kfold==k)))^2 ))
return(c('test'=test_error, 'train'=train_error))
}
results <- sapply(1:K, do_xval)
return(results)
}
K <- 10
max_M <- 300
do_m <- floor(seq(from=2, to=max_M-1, length.out=40))
all_results <- matrix(NA, nrow=length(do_m) + 1, ncol=2)
first_results <- rowMeans(kfold(K, df))
all_results[1,] <- first_results
colnames(all_results) <- names(first_results)
noise_df <- matrix(rnorm(N * (N-2)), nrow=N)
colnames(noise_df) <- paste0('z', 1:ncol(noise_df))
new_df <- cbind(df, noise_df)
for (j in seq_along(do_m)) {
m <- do_m[j]
all_results[j,] <- rowMeans(kfold(K, new_df[,1:(m+1)]))
}
matplot(c(2, do_m), all_results, type='l', xlab='number of variables', ylab='root mean square error')
legend("topleft", lty=1, col=1:2, legend=colnames(all_results))
For interpretation, we looked at parameter estimates, and relied on strong priors to deal with model nonidentifiability.
A possibly better alternative is to ask for the posterior distribution of the quantities we actually want to know (not the same as the parameters, in this case).
go_model_code3 <- "
data {
int nterms;
int ntop; // number of 'top' categories
int N[nterms];
int U[nterms];
int D[nterms];
int top[nterms];
}
parameters {
vector[nterms] alpha;
vector[2] delta; // up, down regulation
vector[ntop] beta1;
vector[nterms] beta2;
vector[nterms] gamma;
}
transformed parameters {
vector[nterms] mean_n;
vector[nterms] mean_u;
vector[nterms] mean_d;
mean_n = exp(alpha);
mean_u = exp(alpha + delta[1] + beta1[top] + beta2);
mean_d = exp(alpha + delta[2] + beta1[top] + beta2 + gamma);
}
model {
N ~ poisson(mean_n);
U ~ poisson(mean_u);
D ~ poisson(mean_d);
alpha ~ normal(0, 5); // flat-ish
delta ~ normal(0, 5); // flat-ish
beta1 ~ cauchy(0, 1.0);
beta2 ~ cauchy(0, 1.0);
gamma ~ cauchy(0, 1.0);
}
"
go_model3 <- stan_model(model_code=go_model_code3)
go_fit3 <- sampling(go_model3, chains = 4, iter = 4000,
data=list(nterms = nrow(goterms),
ntop = nlevels(goterms$top),
N = goterms$num_genes - goterms$upregulated - goterms$downregulated,
U = goterms$upregulated,
D = goterms$downregulated,
top = as.numeric(goterms$top)))
## mean se_mean sd 2.5% 25% 50% 75% 98% n_eff Rhat
## alpha[1] 7.7e+00 0.00018 0.021 7.7e+00 7.7e+00 7.7e+00 7.72 7.7e+00 13309 1
## alpha[2] 6.7e+00 0.00031 0.035 6.6e+00 6.7e+00 6.7e+00 6.73 6.8e+00 12903 1
## alpha[3] 3.4e+00 0.00164 0.180 3.0e+00 3.3e+00 3.4e+00 3.50 3.7e+00 12168 1
## alpha[4] 5.7e+00 0.00049 0.056 5.6e+00 5.7e+00 5.7e+00 5.76 5.8e+00 12993 1
## alpha[5] 7.3e+00 0.00023 0.026 7.2e+00 7.3e+00 7.3e+00 7.30 7.3e+00 12927 1
## alpha[6] 7.1e+00 0.00024 0.030 7.0e+00 7.0e+00 7.1e+00 7.08 7.1e+00 15034 1
## alpha[7] 6.0e+00 0.00041 0.049 5.9e+00 6.0e+00 6.0e+00 6.06 6.1e+00 14320 1
## alpha[8] 3.9e+00 0.00131 0.144 3.6e+00 3.8e+00 3.9e+00 3.96 4.1e+00 12104 1
## alpha[9] 4.8e+00 0.00078 0.094 4.6e+00 4.7e+00 4.8e+00 4.85 5.0e+00 14465 1
## alpha[10] 7.3e+00 0.00021 0.025 7.3e+00 7.3e+00 7.3e+00 7.37 7.4e+00 14685 1
## alpha[11] 4.5e+00 0.00089 0.105 4.3e+00 4.5e+00 4.5e+00 4.61 4.7e+00 13918 1
## delta[1] -2.4e+00 0.01434 0.823 -3.9e+00 -2.9e+00 -2.4e+00 -1.88 -7.2e-01 3289 1
## delta[2] -3.1e+00 0.01574 0.874 -4.8e+00 -3.7e+00 -3.2e+00 -2.57 -1.4e+00 3083 1
## beta1[1] -2.7e-01 0.01324 0.868 -2.2e+00 -7.6e-01 -2.3e-01 0.26 1.4e+00 4298 1
## beta1[2] 6.3e-01 0.01444 0.863 -1.0e+00 7.3e-02 6.1e-01 1.15 2.3e+00 3573 1
## beta1[3] -7.5e-01 0.01500 0.930 -2.8e+00 -1.3e+00 -6.6e-01 -0.13 8.9e-01 3847 1
## beta2[1] -2.4e-01 0.01185 0.651 -1.7e+00 -6.1e-01 -1.9e-01 0.17 9.0e-01 3014 1
## beta2[2] -3.4e-01 0.01218 0.656 -1.8e+00 -7.0e-01 -2.9e-01 0.08 8.1e-01 2905 1
## beta2[3] -3.2e-01 0.01284 0.810 -2.2e+00 -7.5e-01 -2.4e-01 0.21 1.1e+00 3984 1
## beta2[4] 3.1e+00 0.01196 0.655 1.6e+00 2.7e+00 3.1e+00 3.49 4.2e+00 2999 1
## beta2[5] -4.0e-02 0.00710 0.453 -9.3e-01 -3.3e-01 -4.6e-02 0.23 8.8e-01 4074 1
## beta2[6] 3.7e-01 0.00718 0.454 -5.2e-01 7.8e-02 3.6e-01 0.65 1.3e+00 4000 1
## beta2[7] 2.7e-02 0.00719 0.460 -8.7e-01 -2.7e-01 2.0e-02 0.31 9.8e-01 4091 1
## beta2[8] 5.4e-02 0.00729 0.510 -9.5e-01 -2.8e-01 4.4e-02 0.38 1.1e+00 4905 1
## beta2[9] -6.0e-01 0.00975 0.761 -2.3e+00 -1.0e+00 -5.5e-01 -0.11 7.8e-01 6103 1
## beta2[10] 2.9e-01 0.00853 0.642 -9.6e-01 -1.1e-01 2.7e-01 0.67 1.6e+00 5670 1
## beta2[11] -3.0e-01 0.00923 0.709 -1.8e+00 -7.1e-01 -2.7e-01 0.15 1.1e+00 5906 1
## gamma[1] 1.7e-01 0.00559 0.326 -4.6e-01 -4.4e-02 1.6e-01 0.38 8.2e-01 3394 1
## gamma[2] 2.1e-01 0.00594 0.366 -5.0e-01 -3.6e-02 2.1e-01 0.45 9.3e-01 3791 1
## gamma[3] -3.4e+00 0.81222 13.167 -2.0e+01 -2.5e+00 -1.0e+00 -0.19 9.1e-01 263 1
## gamma[4] -5.9e-01 0.00574 0.313 -1.2e+00 -8.0e-01 -5.9e-01 -0.39 2.5e-02 2986 1
## gamma[5] 3.3e-03 0.00589 0.315 -6.3e-01 -2.0e-01 5.5e-03 0.21 6.2e-01 2863 1
## gamma[6] -6.6e-02 0.00562 0.315 -6.8e-01 -2.7e-01 -6.2e-02 0.14 5.6e-01 3142 1
## gamma[7] -3.5e-01 0.00608 0.362 -1.1e+00 -5.8e-01 -3.4e-01 -0.11 3.7e-01 3544 1
## gamma[8] 8.2e-02 0.00617 0.523 -9.8e-01 -2.5e-01 8.3e-02 0.42 1.1e+00 7190 1
## gamma[9] 1.0e+00 0.00884 0.746 -3.0e-01 4.9e-01 9.7e-01 1.50 2.6e+00 7108 1
## gamma[10] 1.3e-01 0.00557 0.333 -5.3e-01 -8.5e-02 1.3e-01 0.35 7.9e-01 3564 1
## gamma[11] 8.1e-02 0.00767 0.737 -1.5e+00 -3.5e-01 9.6e-02 0.53 1.5e+00 9232 1
## mean_n[1] 2.2e+03 0.39863 46.158 2.1e+03 2.2e+03 2.2e+03 2243.36 2.3e+03 13408 1
## mean_n[2] 8.2e+02 0.25208 28.586 7.6e+02 8.0e+02 8.2e+02 838.79 8.8e+02 12860 1
## mean_n[3] 3.0e+01 0.04700 5.296 2.0e+01 2.6e+01 2.9e+01 32.99 4.1e+01 12696 1
## mean_n[4] 3.1e+02 0.15054 17.159 2.8e+02 3.0e+02 3.1e+02 318.77 3.4e+02 12992 1
## mean_n[5] 1.5e+03 0.33894 38.544 1.4e+03 1.4e+03 1.5e+03 1487.50 1.5e+03 12932 1
## mean_n[6] 1.2e+03 0.27969 34.292 1.1e+03 1.1e+03 1.2e+03 1184.72 1.2e+03 15033 1
## mean_n[7] 4.2e+02 0.17141 20.503 3.8e+02 4.0e+02 4.2e+02 430.50 4.6e+02 14307 1
## mean_n[8] 4.8e+01 0.06202 6.861 3.5e+01 4.3e+01 4.8e+01 52.29 6.3e+01 12238 1
## mean_n[9] 1.2e+02 0.09305 11.239 9.9e+01 1.1e+02 1.2e+02 127.70 1.4e+02 14586 1
## mean_n[10] 1.6e+03 0.32333 39.268 1.5e+03 1.5e+03 1.6e+03 1581.86 1.6e+03 14749 1
## mean_n[11] 9.4e+01 0.08323 9.813 7.5e+01 8.7e+01 9.3e+01 100.05 1.1e+02 13902 1
## mean_u[1] 1.2e+02 0.12045 10.916 1.0e+02 1.1e+02 1.2e+02 129.73 1.4e+02 8213 1
## mean_u[2] 4.2e+01 0.06421 6.294 3.0e+01 3.7e+01 4.1e+01 45.58 5.5e+01 9609 1
## mean_u[3] 1.8e+00 0.01137 1.140 3.5e-01 1.0e+00 1.6e+00 2.37 4.8e+00 10046 1
## mean_u[4] 4.7e+02 0.24258 21.662 4.2e+02 4.5e+02 4.7e+02 481.20 5.1e+02 7974 1
## mean_u[5] 2.4e+02 0.16540 15.385 2.1e+02 2.3e+02 2.4e+02 253.46 2.7e+02 8652 1
## mean_u[6] 2.9e+02 0.18951 17.202 2.6e+02 2.8e+02 2.9e+02 303.00 3.3e+02 8240 1
## mean_u[7] 7.4e+01 0.09374 8.617 5.8e+01 6.8e+01 7.4e+01 79.91 9.3e+01 8451 1
## mean_u[8] 9.1e+00 0.02659 2.729 4.6e+00 7.1e+00 8.8e+00 10.69 1.5e+01 10533 1
## mean_u[9] 3.3e+00 0.01622 1.638 8.6e-01 2.1e+00 3.0e+00 4.17 7.1e+00 10199 1
## mean_u[10] 9.1e+01 0.09924 9.352 7.3e+01 8.4e+01 9.1e+01 97.10 1.1e+02 8881 1
## mean_u[11] 3.4e+00 0.01521 1.596 1.1e+00 2.2e+00 3.1e+00 4.20 7.2e+00 11012 1
## mean_d[1] 7.0e+01 0.09005 8.221 5.5e+01 6.4e+01 6.9e+01 74.99 8.7e+01 8334 1
## mean_d[2] 2.5e+01 0.04858 4.856 1.6e+01 2.1e+01 2.4e+01 27.76 3.5e+01 9993 1
## mean_d[3] 4.2e-01 0.00674 0.513 1.2e-09 5.8e-02 2.5e-01 0.59 1.8e+00 5798 1
## mean_d[4] 1.2e+02 0.11886 11.129 1.0e+02 1.2e+02 1.2e+02 131.18 1.5e+02 8768 1
## mean_d[5] 1.2e+02 0.11893 10.633 9.7e+01 1.1e+02 1.2e+02 124.10 1.4e+02 7993 1
## mean_d[6] 1.3e+02 0.12655 11.451 1.1e+02 1.2e+02 1.3e+02 138.60 1.5e+02 8188 1
## mean_d[7] 2.6e+01 0.05149 4.985 1.7e+01 2.2e+01 2.5e+01 28.71 3.6e+01 9374 1
## mean_d[8] 4.9e+00 0.02044 2.046 1.8e+00 3.4e+00 4.6e+00 6.05 9.8e+00 10021 1
## mean_d[9] 4.3e+00 0.02097 1.991 1.4e+00 2.8e+00 4.0e+00 5.36 9.1e+00 9009 1
## mean_d[10] 5.0e+01 0.07244 6.974 3.7e+01 4.5e+01 5.0e+01 54.42 6.4e+01 9267 1
## mean_d[11] 1.9e+00 0.01164 1.178 3.6e-01 1.1e+00 1.7e+00 2.49 4.9e+00 10244 1
## lp__ 5.8e+04 0.10390 4.800 5.8e+04 5.8e+04 5.8e+04 58072.58 5.8e+04 2134 1
Let \(\mu^N_i = \exp(\lambda^N_i)\) and \(\mu^N = \sum_i \mu^N_i\).
Is term \(i\) overrepresented among differentially regulated genes? \[\begin{aligned} \frac{ (\mu^U_i + \mu^D_i) / (\mu^U + \mu^D) } { \mu^N_i / \mu^N } \end{aligned}\]
Is term \(i\) more up- than down-regulated? \[\begin{aligned} \frac{ \mu^U_i / \mu^U } { \mu^D_i / \mu^D } \end{aligned}\]
samps <- rstan::extract(go_fit3)
diff_enrichment <- ((samps$mean_d + samps$mean_u) / (rowSums(samps$mean_d) + rowSums(samps$mean_u))) / (samps$mean_n / rowSums(samps$mean_n))
up_enrichment <- (samps$mean_u / rowSums(samps$mean_u)) / (samps$mean_d / rowSums(samps$mean_d))
colnames(diff_enrichment) <- paste0("diff:", goterms$term)
colnames(up_enrichment) <- paste0("up:", goterms$term)
diff_top_enrichment <- (simplify2array(by(t(samps$mean_u + samps$mean_d), goterms$top, colSums)) / rowSums(samps$mean_u)) / (simplify2array(by(t(samps$mean_n), goterms$top, colSums))/ rowSums(samps$mean_n))
up_top_enrichment <- (simplify2array(by(t(samps$mean_u), goterms$top, colSums)) / rowSums(samps$mean_u)) / (simplify2array(by(t(samps$mean_d), goterms$top, colSums))/ rowSums(samps$mean_d))
colnames(diff_top_enrichment) <- paste0("diff:", levels(goterms$top))
colnames(up_top_enrichment) <- paste0("up:", levels(goterms$top))