\[%% % Add your macros here; they'll be included in pdf and html output. %% \newcommand{\R}{\mathbb{R}} % reals \newcommand{\E}{\mathbb{E}} % expectation \renewcommand{\P}{\mathbb{P}} % probability \DeclareMathOperator{\logit}{logit} \DeclareMathOperator{\logistic}{logistic} \DeclareMathOperator{\sd}{sd} \DeclareMathOperator{\var}{var} \DeclareMathOperator{\cov}{cov} \DeclareMathOperator{\cor}{cor} \DeclareMathOperator{\Normal}{Normal} \DeclareMathOperator{\LogNormal}{logNormal} \DeclareMathOperator{\Poisson}{Poisson} \DeclareMathOperator{\Beta}{Beta} \DeclareMathOperator{\Binom}{Binomial} \DeclareMathOperator{\Gam}{Gamma} \DeclareMathOperator{\Exp}{Exponential} \DeclareMathOperator{\Cauchy}{Cauchy} \DeclareMathOperator{\Unif}{Unif} \DeclareMathOperator{\Dirichlet}{Dirichlet} \DeclareMathOperator{\Wishart}{Wishart} \DeclareMathOperator{\StudentsT}{StudentsT} \DeclareMathOperator{\Weibull}{Weibull} \newcommand{\given}{\;\vert\;} \]

Homework 4: Capybaras

Assignment: Your task is to use Rmarkdown to write a short report, readable by a technically literate person. The code you used should not be visible in the final report (unless you have a good reason to show it).

Due: Submit your work via Canvas by the end of the day on Thursday, October 28th. Please submit both the Rmd file and the resulting html or pdf file. You can work with other members of class, but I expect each of you to construct and run all of the scripts yourself.

The problem

Researchers are studying how levels of the hormone IGF-1 (insulin-like growth factor 1) affect early growth, and have measured IGF-1 levels along with length, weight, sex, and an index of maturity in 655 young capybara (although there is a certain amount of missing data due to capybara exuberance). Your goal in this report is to investigate how each of length, weight, and IGF-1 level are each related to sex and maturity, addressing the questions (a) how much does length increase with maturity, and is there a difference in mean length between sexes, at the same maturity level, and (b) the same questions for how length depends on IGF-1 level. Be sure to describe the data, and communicate both estimates of effects in real units and the strength of statistical support for your results.

The dataset is here: igf.tsv, and has variables age (in weeks), sex, igf (IGF-1 levels), maturity (1-5, larger values are more sexually mature), weight (in g), and length (body and tail, in cm).

This time, instead of writing your own code, please base your analysis on the following code. You can write your own in addition - you’ll probably at least need to run more code to understand what this is doing! - but as usual, please don’t actually display the code in the report (figures and tables only).

library(tidyverse)  # for ggplot versions below

# read in the data
igf <- read.table("igf.tsv", header=TRUE, stringsAsFactors=TRUE)

# what are sample sizes?
table(igf$maturity, igf$sex)

# what's the range of values
layout(matrix(1:4, nrow=2))
for (varname in c("igf", "age", "weight", "length")) {
    hist(igf[[varname]], main=varname, xlab=varname)
}

pairs(igf[,c("igf", "age", "weight", "length")],
      pch=as.numeric(igf$sex),
      col=as.numeric(igf$maturity)
)

# visualization for part (a)
boxplot(length ~ maturity * sex, data=igf)
# ggplot version:
ggplot(igf) + geom_boxplot(aes(y=length, x=factor(maturity), fill=sex)) +
    xlab("Maturity Level") +
    ylab("Length (cm)")

# analysis for (a)
maturity_lm <- lm(length ~ maturity * sex, data=igf)
summary(maturity_lm)

# compare predicted means to observed means
mean_lengths <- expand.grid(maturity=1:5, sex=levels(igf$sex))
mean_lengths$predicted <- predict(maturity_lm, newdata=mean_lengths)
mean_lengths$observed <- NA
for (j in 1:nrow(mean_lengths)) {
    mean_lengths$observed[j] <- mean(
        igf$length[igf$sex == mean_lengths$sex[j]
                   & igf$maturity == mean_lengths$maturity[j]],
        na.rm=TRUE
    )
}

plot(mean_lengths$maturity,
     mean_lengths$predicted,
     col=ifelse(mean_lengths$sex == "F", 'red', 'grey'), pch=20, cex=2,
     xlab='maturity', ylab='mean length')
with(subset(mean_lengths, sex=="F"), {
    lines(maturity, predicted, col="red")
})
with(subset(mean_lengths, sex=="M"), {
    lines(maturity, predicted, col="grey")
})
points(mean_lengths$maturity,
       mean_lengths$observed,
       pch=1, col=ifelse(mean_lengths$sex == "F", "red", "grey"), cex=2)
legend("topleft", pch=c(1,1,20,20), col=c("red", "grey"),
       legend=c("female observed", "male observed", "female predicted", "male predicted")
)

# is the addition of sex statistically significant?
anova(
    lm(length ~ maturity, data=igf),
    lm(length ~ maturity * sex, data=igf)
)

# visualization for part (b)
plot(length ~ igf, col=ifelse(sex=="F", "red", "grey"), pch=20, data=igf)
# ggplot version:
ggplot(igf) + geom_point(aes(y=length, x=igf, col=sex)) +
    xlab("IGF-1 levels") +
    ylab("Length (cm)")

# analysis for (a) and (b)
igf_lm <- lm(length ~ igf * sex, data=igf)
summary(igf_lm)

# compare predicted means to observed data
plot(length ~ igf, data=igf,
     col=ifelse(sex == "F", 'red', 'grey'), pch=20, cex=2,
     xlab='IGF-1 levels', ylab='mean length')
abline(coef(igf_lm)[["(Intercept)"]], coef(igf_lm)[["igf"]], col='red', lwd=2)
abline(coef(igf_lm)[["(Intercept)"]] + coef(igf_lm)[["sexM"]],
       coef(igf_lm)[["igf"]] + coef(igf_lm)[["igf:sexM"]], col='grey', lwd=2)

# is the addition of sex statistically significant?
anova(
    lm(length ~ igf, data=igf),
    lm(length ~ igf * sex, data=igf)
)