\[ %% % Add your macros here; they'll be included in pdf and html output. %% \newcommand{\R}{\mathbb{R}} % reals \newcommand{\E}{\mathbb{E}} % expectation \renewcommand{\P}{\mathbb{P}} % probability \DeclareMathOperator{\logit}{logit} \DeclareMathOperator{\logistic}{logistic} \DeclareMathOperator{\SE}{SE} \DeclareMathOperator{\sd}{sd} \DeclareMathOperator{\var}{var} \DeclareMathOperator{\cov}{cov} \DeclareMathOperator{\cor}{cor} \DeclareMathOperator{\Normal}{Normal} \DeclareMathOperator{\LogNormal}{logNormal} \DeclareMathOperator{\Poisson}{Poisson} \DeclareMathOperator{\Beta}{Beta} \DeclareMathOperator{\Binom}{Binomial} \DeclareMathOperator{\Gam}{Gamma} \DeclareMathOperator{\Exp}{Exponential} \DeclareMathOperator{\Cauchy}{Cauchy} \DeclareMathOperator{\Unif}{Unif} \DeclareMathOperator{\Dirichlet}{Dirichlet} \DeclareMathOperator{\Wishart}{Wishart} \DeclareMathOperator{\StudentsT}{StudentsT} \DeclareMathOperator{\Weibull}{Weibull} \newcommand{\given}{\;\vert\;} \]

Baseball data

Peter Ralph

19 November 2020 – Advanced Biological Statistics

Baseball

Baseball

We have a dataset of batting averages of baseball players, having

  1. name
  2. position
  3. number of at bats
  4. number of hits
batting <- read.csv("data/BattingAverage.csv", header=TRUE, stringsAsFactors=TRUE)
head(batting)
##          Player     PriPos Hits AtBats PlayerNumber PriPosNumber
## 1 Fernando Abad    Pitcher    1      7            1            1
## 2   Bobby Abreu Left Field   53    219            2            7
## 3    Tony Abreu   2nd Base   18     70            3            4
## 4 Dustin Ackley   2nd Base  137    607            4            4
## 5    Matt Adams   1st Base   21     86            5            3
## 6 Nathan Adcock    Pitcher    0      1            6            1

The overall batting average of the 948 players is 0.2546597.

Here is the average by position.

batting %>% group_by(PriPos) %>% 
    summarise(num=n(), BatAvg=sum(Hits)/sum(AtBats)) %>% 
    select(PriPos, num, BatAvg)
## # A tibble: 9 x 3
##   PriPos         num BatAvg
##   <fct>        <int>  <dbl>
## 1 1st Base        81  0.259
## 2 2nd Base        72  0.256
## 3 3rd Base        75  0.265
## 4 Catcher        103  0.247
## 5 Center Field    67  0.264
## 6 Left Field     103  0.259
## 7 Pitcher        324  0.129
## 8 Right Field     60  0.264
## 9 Shortstop       63  0.255

Questions?

  1. What’s the overall batting average?

  2. Do some positions tend to be better batters?

  3. How much variation in batting average is there between players of the same position?

Exercise:

Come up with some quick-and-dirty statistics to answer these questions.

Everyone is the same

first_model <- "
data {
    int N;
    int hits[N];
    int at_bats[N];
}
parameters {
    real<lower=0, upper=1> theta;
}
model {
    hits ~ binomial(at_bats, theta);
    theta ~ beta(1, 1);
} "
first_fit <- stan(model_code=first_model, chains=3, iter=1000,
                  data=list(N=nrow(batting),
                            hits=batting$Hits,
                            at_bats=batting$AtBats))
stan_hist(first_fit, bins=20)

plot of chunk r start_res

Every pitcher is the same

pos_model <- "
data {
    int N;
    int hits[N];
    int at_bats[N];
    int npos; // number of positions
    int position[N];
}
parameters {
    real<lower=0, upper=1> theta[npos];
}
model {
    real theta_vec[N];
    for (k in 1:N) {
        theta_vec[k] = theta[position[k]];
    }
    hits ~ binomial(at_bats, theta_vec);
    theta ~ beta(1, 1);
} "
pos_fit <- stan(model_code=pos_model, chains=3, iter=1000,
                  data=list(N=nrow(batting),
                            hits=batting$Hits,
                            at_bats=batting$AtBats,
                            npos=nlevels(batting$PriPos),
                            position=as.numeric(batting$PriPos)))
theta_samples <- extract(pos_fit)$theta
layout(matrix(1:9, nrow=3))
for (k in 1:ncol(theta_samples)) {
    hist(theta_samples[,k], main=levels(batting$PriPos)[k], xlim=c(0.1, 0.3),
         col=adjustcolor('red',0.6), xlab='batting avg', freq=FALSE)
}

plot of chunk r pos_res

Reparameterization

Recall that if \[\begin{aligned} B \sim \Beta(\alpha, \beta) \end{aligned}\] then \[\begin{aligned} \E[B] &= \frac{\alpha}{\alpha + \beta} \\ \text{and} \sd[B] &= \frac{1}{\alpha + \beta} \sqrt{\frac{\alpha \beta}{\alpha + \beta + 1}}. \end{aligned}\]

Gee, it’d be nice to think about it’s location and scale instead.

Well, if \[\begin{aligned} \mu &= \frac{\alpha}{\alpha + \beta} \quad \text{(the mean)}\\ \kappa &= \alpha + \beta \quad \text{(the concentration)}, \end{aligned}\] then \[\begin{aligned} B \sim \Beta(\mu \kappa, (1-\mu) \kappa) \end{aligned}\] has \[\begin{aligned} \E[B] &= \mu \\ \text{and} \qquad \sd[B] &\propto \frac{1}{\kappa} . \end{aligned}\]

More Baseball

Another model: spot the differences!

\[\begin{aligned} Z_i &\sim \Binom(N_i, \theta_i) \\ \theta_i &\sim \Beta(\mu_{p_i} \kappa_{p_i}, (1-\mu_{p_i})\kappa_{p_i}) \\ \mu_p &\sim \Beta(1, 1) \\ \kappa_p &\sim \Gam(0.1, 0.1) . \end{aligned}\]

pos_model <- "
data {
    int N;   // number of players
    int hits[N];
    int at_bats[N];
    int npos; // number of positions
    int position[N];
}
parameters {
    real<lower=0, upper=1> theta[N];
    real<lower=0, upper=1> mu[npos];
    real<lower=0> kappa[npos];
}
model {
    real alpha;
    real beta;
    hits ~ binomial(at_bats, theta);
    for (i in 1:N) {
        alpha = mu[position[i]] * kappa[position[i]];
        beta = (1 - mu[position[i]]) * kappa[position[i]];
        theta[i] ~ beta(alpha, beta);
    }
    mu ~ beta(1,1);
    kappa ~ gamma(0.1,0.1);
}"

pos_model <- stan_model(model_code=pos_model)
system.time(pos_fit <- sampling(pos_model, chains=3, iter=100,
                            data=list(N=nrow(batting),
                                      hits=batting$Hits,
                                      at_bats=batting$AtBats,
                                      npos=nlevels(batting$PriPos),
                                      position=as.numeric(batting$PriPos))))
## Warning: The largest R-hat is 1.36, indicating chains have not mixed.
## Running the chains for more iterations may help. See
## http://mc-stan.org/misc/warnings.html#r-hat
## Warning: Bulk Effective Samples Size (ESS) is too low, indicating posterior means and medians may be unreliable.
## Running the chains for more iterations may help. See
## http://mc-stan.org/misc/warnings.html#bulk-ess
## Warning: Tail Effective Samples Size (ESS) is too low, indicating posterior variances and tail quantiles may be unreliable.
## Running the chains for more iterations may help. See
## http://mc-stan.org/misc/warnings.html#tail-ess
##    user  system elapsed 
##  37.554   1.142  16.018

Diagnostics

print(pos_fit, pars=c("mu", "kappa"))
## Inference for Stan model: db775bca82d7fc39d379745fb39b0819.
## 3 chains, each with iter=100; warmup=50; thin=1; 
## post-warmup draws per chain=50, total post-warmup draws=150.
## 
##            mean se_mean    sd  2.5%    25%    50%    75%  97.5% n_eff Rhat
## mu[1]      0.25    0.00  0.01  0.24   0.25   0.25   0.26   0.26   173 0.99
## mu[2]      0.25    0.00  0.01  0.24   0.24   0.25   0.25   0.26   133 1.00
## mu[3]      0.26    0.00  0.01  0.24   0.25   0.26   0.26   0.27   299 1.01
## mu[4]      0.24    0.00  0.01  0.23   0.23   0.24   0.24   0.25   147 1.00
## mu[5]      0.26    0.00  0.01  0.25   0.25   0.26   0.26   0.27   164 1.01
## mu[6]      0.25    0.00  0.01  0.24   0.25   0.25   0.25   0.26   145 1.02
## mu[7]      0.13    0.00  0.01  0.11   0.12   0.13   0.13   0.14    97 1.02
## mu[8]      0.26    0.00  0.01  0.25   0.25   0.26   0.26   0.27    82 1.02
## mu[9]      0.25    0.00  0.01  0.23   0.24   0.25   0.25   0.26   157 0.99
## kappa[1] 109.06    1.76 21.20 73.52  94.06 107.48 122.71 158.80   146 0.99
## kappa[2] 105.58    2.67 27.41 63.75  86.54 102.19 120.46 176.61   105 1.03
## kappa[3] 101.70    2.14 21.24 66.49  87.53 100.25 113.29 147.78    99 1.01
## kappa[4]  92.06    1.58 18.23 64.34  78.54  88.53 103.45 129.31   134 1.01
## kappa[5]  98.40    1.63 20.92 61.10  81.37  97.82 112.36 141.36   164 0.99
## kappa[6] 106.36    2.57 21.60 72.38  89.27 105.02 120.46 153.19    71 1.01
## kappa[7]  52.49    3.18 12.55 30.07  44.62  51.72  58.58  77.98    16 1.35
## kappa[8] 119.31    2.78 28.18 74.86 102.69 115.98 129.69 195.12   103 1.01
## kappa[9]  96.69    1.62 21.23 60.25  81.78  94.47 108.98 141.61   172 0.99
## 
## Samples were drawn using NUTS(diag_e) at Thu Nov 19 10:14:43 2020.
## For each parameter, n_eff is a crude measure of effective sample size,
## and Rhat is the potential scale reduction factor on split chains (at 
## convergence, Rhat=1).

Is it mixing?

stan_trace(pos_fit, pars="mu")

plot of chunk r plot_trace

Run longer!

system.time(pos_fit <- sampling(pos_model, chains=3, iter=1000,
                            control=list(max_treedepth=15),
                            data=list(N=nrow(batting),
                                      hits=batting$Hits,
                                      at_bats=batting$AtBats,
                                      npos=nlevels(batting$PriPos),
                                      position=as.numeric(batting$PriPos))))
## Warning: Bulk Effective Samples Size (ESS) is too low, indicating posterior means and medians may be unreliable.
## Running the chains for more iterations may help. See
## http://mc-stan.org/misc/warnings.html#bulk-ess
## Warning: Tail Effective Samples Size (ESS) is too low, indicating posterior variances and tail quantiles may be unreliable.
## Running the chains for more iterations may help. See
## http://mc-stan.org/misc/warnings.html#tail-ess
##    user  system elapsed 
##  75.096   1.119  31.448
print(pos_fit, pars=c("mu", "kappa"))
## Inference for Stan model: db775bca82d7fc39d379745fb39b0819.
## 3 chains, each with iter=1000; warmup=500; thin=1; 
## post-warmup draws per chain=500, total post-warmup draws=1500.
## 
##            mean se_mean    sd  2.5%    25%    50%    75%  97.5% n_eff Rhat
## mu[1]      0.25    0.00  0.01  0.24   0.25   0.25   0.26   0.26  1119 1.00
## mu[2]      0.25    0.00  0.01  0.24   0.24   0.25   0.25   0.26   930 1.00
## mu[3]      0.26    0.00  0.01  0.24   0.25   0.26   0.26   0.27  1177 1.00
## mu[4]      0.24    0.00  0.01  0.23   0.24   0.24   0.24   0.25   710 1.00
## mu[5]      0.26    0.00  0.01  0.25   0.25   0.26   0.26   0.27  1121 1.00
## mu[6]      0.25    0.00  0.01  0.24   0.25   0.25   0.25   0.26   988 1.00
## mu[7]      0.13    0.00  0.01  0.12   0.13   0.13   0.13   0.14   232 1.02
## mu[8]      0.26    0.00  0.01  0.25   0.25   0.26   0.26   0.27   975 1.00
## mu[9]      0.25    0.00  0.01  0.23   0.24   0.25   0.25   0.26   969 1.00
## kappa[1] 112.89    0.83 23.30 73.38  96.36 110.84 127.50 161.31   794 1.00
## kappa[2] 104.30    0.82 23.58 64.33  87.44 101.66 118.80 158.57   828 1.00
## kappa[3]  97.85    0.77 22.08 60.78  82.24  96.02 111.15 144.36   819 1.00
## kappa[4]  95.54    0.68 19.25 61.34  81.85  94.62 106.96 137.57   793 1.00
## kappa[5] 101.63    0.82 23.68 62.11  85.09  99.40 116.39 155.29   825 1.00
## kappa[6] 106.83    0.82 22.08 69.51  91.43 105.22 119.58 157.29   733 1.00
## kappa[7]  54.92    1.05 12.74 34.39  45.80  53.59  62.32  83.67   148 1.00
## kappa[8] 119.85    0.92 27.46 72.05 100.67 118.32 136.31 181.80   895 1.00
## kappa[9]  95.60    0.70 22.14 56.48  80.25  93.83 109.42 144.53   996 1.00
## 
## Samples were drawn using NUTS(diag_e) at Thu Nov 19 10:15:21 2020.
## For each parameter, n_eff is a crude measure of effective sample size,
## and Rhat is the potential scale reduction factor on split chains (at 
## convergence, Rhat=1).

Is it mixing?

stan_trace(pos_fit, pars="mu")

plot of chunk r plot_trace_again

stan_trace(pos_fit, pars="kappa")

plot of chunk r plot_kappa_again

pairs(pos_fit, pars=c(sprintf("mu[%d]", 1:3), sprintf("kappa[%d]", 1:3), "lp__"))

plot of chunk r pos_pairs

Let’s look at the results!

stan_hist(pos_fit, pars="mu", bins=30) + xlim(0, 0.4)
## Warning: Removed 18 rows containing missing values (geom_bar).

plot of chunk r first_hist

With labels: position means \(\mu_p\):

plot of chunk r plot_mu

Position “concentrations”, \(\kappa_p\):

plot of chunk r plot_kappa

// reveal.js plugins