\[ %% % Add your macros here; they'll be included in pdf and html output. %% \newcommand{\R}{\mathbb{R}} % reals \newcommand{\E}{\mathbb{E}} % expectation \renewcommand{\P}{\mathbb{P}} % probability \DeclareMathOperator{\logit}{logit} \DeclareMathOperator{\logistic}{logistic} \DeclareMathOperator{\SE}{SE} \DeclareMathOperator{\sd}{sd} \DeclareMathOperator{\var}{var} \DeclareMathOperator{\cov}{cov} \DeclareMathOperator{\cor}{cor} \DeclareMathOperator{\Normal}{Normal} \DeclareMathOperator{\MVN}{MVN} \DeclareMathOperator{\LogNormal}{logNormal} \DeclareMathOperator{\Poisson}{Poisson} \DeclareMathOperator{\Beta}{Beta} \DeclareMathOperator{\Binom}{Binomial} \DeclareMathOperator{\Gam}{Gamma} \DeclareMathOperator{\Exp}{Exponential} \DeclareMathOperator{\Cauchy}{Cauchy} \DeclareMathOperator{\Unif}{Unif} \DeclareMathOperator{\Dirichlet}{Dirichlet} \DeclareMathOperator{\Wishart}{Wishart} \DeclareMathOperator{\StudentsT}{StudentsT} \DeclareMathOperator{\Weibull}{Weibull} \newcommand{\given}{\;\vert\;} \]

Permutation tests

Peter Ralph

Advanced Biological Statistics

Permutation tests

## 
##  Welch Two Sample t-test
## 
## data:  airbnb$price[airbnb$instant_bookable] and airbnb$price[!airbnb$instant_bookable]
## t = 3.6482, df = 5039.8, p-value = 0.0002667
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##   4.475555 14.872518
## sample estimates:
## mean of x mean of y 
##  124.6409  114.9668

But, the \(t\) test relies on Normality (a little). Is the distribution of AirBnB prices too “weird”? How can we be sure?

Methods:

  1. Remove the big values and try again.

  2. Use a nonparametric test.

The permutation test

Observation: If there was no meaningful difference in prices between “instant bookable” and not, then randomly shuffling that label won’t change anything.

Strategy:

  1. Shuffle the instant_bookable column.
  2. Compute the difference in means.
  3. Repeat, many times.
  4. Compare: the \(p\)-value is the proportion of “shuffled” values more extreme than observed.

Rightarrow Why is this a \(p\)-value? For what hypothesis?

Shuffle once

fake_is_instant <- sample(airbnb$instant_bookable)
(mean(airbnb$price[fake_is_instant], na.rm=TRUE) - 
                mean(airbnb$price[!fake_is_instant], na.rm=TRUE))
## [1] 2.837541

Many times

real_diff <- (mean(airbnb$price[airbnb$instant_bookable], na.rm=TRUE)
               - mean(airbnb$price[!airbnb$instant_bookable], na.rm=TRUE))
permuted_diffs <- replicate(10000, {
        fake_is_instant <- sample(airbnb$instant_bookable)
        (mean(airbnb$price[fake_is_instant], na.rm=TRUE)
             - mean(airbnb$price[!fake_is_instant], na.rm=TRUE))
   } )
hist(permuted_diffs, xlab="shuffled differences in mean", xlim=range(c(permuted_diffs, real_diff)))
abline(v=real_diff, col='red', lwd=3)

plot of chunk r many_shuf

How surprising was the real value?

mean(abs(permuted_diffs) > abs(real_diff))
## [1] 3e-04

The difference in price between instant bookable and not instant bookable is highly statistically significant (\(p \approx 0.0003\), permutation test).

Our turn

Here’s a nonparametric method to see if omnivores tend to be bigger than herbivores in the Pantheria data:

  1. Find pairs of species in the same family for which one is a omnivore and the other an herbivore.

  2. Compute within each pair (some statistic) comparing omnivore to herbivore size, and average these (the test statistic).

  3. Randomly reassign “trophic level” within families many times, recomputing this statistic.

  4. The \(p\)-value is the proportion of shuffled statistics greater than the observed statistic.

In class

library(tidyverse) 

source("../Datasets/PanTHERIA/read_pantheria.R")
pantheria <- (read_pantheria("../Datasets/PanTHERIA")
              %>% select(Family, Binomial, AdultBodyMass_g, TrophicLevel)
              %>% filter(
                    !is.na(AdultBodyMass_g)
                    & !is.na(TrophicLevel)))

# find families with both an herbivore and an omnivore
families <- unique(pantheria$Family)
family_counts <- table(pantheria$Family, pantheria$TrophicLevel)
good_families <- rownames(family_counts)[
                             (family_counts[,"herbivore"] > 0
                               & family_counts[,"omnivore"] > 0)
                        ]
sp_pairs <- data.frame(
               family=good_families,
               herbivore=NA,
               omnivore=NA
)
for (j in 1:nrow(sp_pairs)) {
    fam <- sp_pairs$family[j]
    sp_pairs$herbivore[j] <- sample(
                    subset(pantheria,
                             Family==fam & TrophicLevel == "herbivore"
                           )$Binomial,
          1)
    sp_pairs$omnivore[j] <- sample(
                    subset(pantheria,
                             Family==fam & TrophicLevel == "omnivore"
                           )$Binomial,
          1)
}

# consistency check:
for (colname in c("herbivore", "omnivore")) {
    stopifnot(all(
        pantheria$TrophicLevel[
                match(sp_pairs[[colname]], pantheria$Binomial)
           ] == colname
    ))
}

Next, compare the sizes:

sp_pairs$herbivore_size <- pantheria$AdultBodyMass_g[
         match(sp_pairs$herbivore, pantheria$Binomial)
 ]
sp_pairs$omnivore_size <- pantheria$AdultBodyMass_g[
        match(sp_pairs$omnivore, pantheria$Binomial)
]                                                    

plot(omnivore_size ~ herbivore_size, data=sp_pairs, log='xy')

plot of chunk r plots

sp_pairs$diff <- sp_pairs$omnivore_size - sp_pairs$herbivore_size
sp_pairs$ratio <- sp_pairs$omnivore_size / sp_pairs$herbivore_size
# let's use this one
sp_pairs$log_ratio <- log(sp_pairs$omnivore_size / sp_pairs$herbivore_size)

observed <- mean(sp_pairs$log_ratio)

On average, ominovres are a fraction 0.8661923 of their paired herbivore size.

shuffled <- replicate(1000, {
    coins <- (runif(nrow(sp_pairs)) < 0.5)
    fake_herbs <- ifelse(coins, sp_pairs$herbivore_size, sp_pairs$omnivore_size)
    fake_omnis <- ifelse(coins, sp_pairs$omnivore_size, sp_pairs$herbivore_size)
    mean(log(fake_omnis / fake_herbs))
})

hist(shuffled, breaks=30)
abline(v=observed, col='red')

plot of chunk r shuffle

// reveal.js plugins