\[%% % Add your macros here; they'll be included in pdf and html output. %% \newcommand{\R}{\mathbb{R}} % reals \newcommand{\E}{\mathbb{E}} % expectation \renewcommand{\P}{\mathbb{P}} % probability \DeclareMathOperator{\logit}{logit} \DeclareMathOperator{\logistic}{logistic} \DeclareMathOperator{\sd}{sd} \DeclareMathOperator{\var}{var} \DeclareMathOperator{\cov}{cov} \DeclareMathOperator{\cor}{cor} \DeclareMathOperator{\Normal}{Normal} \DeclareMathOperator{\LogNormal}{logNormal} \DeclareMathOperator{\Poisson}{Poisson} \DeclareMathOperator{\Beta}{Beta} \DeclareMathOperator{\Binom}{Binomial} \DeclareMathOperator{\Gam}{Gamma} \DeclareMathOperator{\Exp}{Exponential} \DeclareMathOperator{\Cauchy}{Cauchy} \DeclareMathOperator{\Unif}{Unif} \DeclareMathOperator{\Dirichlet}{Dirichlet} \DeclareMathOperator{\Wishart}{Wishart} \DeclareMathOperator{\StudentsT}{StudentsT} \DeclareMathOperator{\Weibull}{Weibull} \newcommand{\given}{\;\vert\;} \]

Homework 3: Capybaras

Assignment: Your task is to use Rmarkdown to write a short report, readable by a technically literate person. The code you used should not be visible in the final report (unless you have a good reason to show it).

Due: Submit your work via Canvas by the end of the day on Thursday, October 20th. Please submit both the Rmd file and the resulting html or pdf file. You can work with other members of class, but I expect each of you to construct and run all of the scripts yourself.

The problem

Researchers are studying how levels of the hormone IGF-1 (insulin-like growth factor 1) affect early growth, and have measured IGF-1 levels along with length, weight, sex, and an index of maturity in 655 young capybara (although there is a certain amount of missing data due to capybara exuberance). Your goal in this report is to investigate how length and weight are related to sex, age, and IGF-1 level. You should addressg the questions: (a) How much does size increase with age, and is there a difference in mean size between sexes, at the same age? (b) Is IGF-1 level predictive of size, after accounting for age and sex? (Both length and weight will be used to measure size.) Be sure to describe the data, and communicate both estimates of effects in real units and the strength of statistical support for your results.

The dataset is here: igf.tsv, and has variables age (in weeks), sex, igf (IGF-1 levels), maturity (1-5, larger values are more sexually mature), weight (in g), and length (body and tail, in cm). We won’t be using the “maturity” values.

This time, instead of writing your own code, please base your analysis on the following code. You can write your own in addition - you’ll probably at least need to run more code to understand what this is doing! - but as usual, please don’t actually display the code in the report (figures and tables only).

library(tidyverse)  # for ggplot versions below
library(cowplot)    # also

# read in the data
igf <- read.table("igf.tsv", header=TRUE, stringsAsFactors=TRUE)

# what are sample sizes?
table(round(igf$age), igf$sex)

# what's the range of values
layout(matrix(1:4, nrow=2))
for (varname in c("igf", "age", "weight", "length")) {
    hist(igf[[varname]], main=varname, xlab=varname)
}

sex_colors <- c("F" = "grey", "M" = "red")
pairs(igf[,c("igf", "age", "weight", "length")],
      col=sex_colors[igf$sex], pch=20
)

# visualization for part (a)
# base R version:
layout(t(1:2))
plot(length ~ age, col=sex_colors[sex], pch=20, data=igf, xlab='', ylab='length (cm)')
plot(weight ~ age, col=sex_colors[sex], pch=20, data=igf, xlab='', ylab='weight (g)')

# ggplot version:
p1 <- ggplot(igf) + geom_point(aes(y=length, x=age, col=sex)) +
    geom_smooth(aes(y=length, x=age, col=sex)) +
    xlab("Age") +
    ylab("Length (cm)")
p2 <- ggplot(igf) + geom_point(aes(y=weight, x=age, col=sex)) +
    geom_smooth(aes(y=weight, x=age, col=sex)) +
    xlab("Age") +
    ylab("Weight (g)")
plot_grid(p1, p2, labels=c("Length", "Weight"))

# analyses for (a)
length_lm <- lm(length ~ age + sex, data=igf)
summary(length_lm)

weight_lm <- lm(weight ~ age + sex, data=igf)
summary(weight_lm)

anova(
    lm(length ~ age, data=igf),
    lm(length ~ age + sex, data=igf)
)

anova(
    lm(weight ~ age, data=igf),
    lm(weight ~ age + sex, data=igf)
)


# look at predicted values against age
# base R version
layout(t(1:2))
plot(length ~ age, data=igf, col=sex_colors[sex], pch=20, xlab="age", ylab="length (cm)", main="Length")
age_vec <- seq(min(igf$age), max(igf$age), length.out=21)
for (sex in c("F", "M")) {
    pred_data <- data.frame(age=age_vec, sex=sex)
    pred_len <- predict(length_lm, newdata=pred_data)
    lines(age_vec, pred_len, col=sex_colors[sex])
}
plot(weight ~ age, data=igf, col=sex_colors[sex], pch=20, xlab="age", ylab="weight (g)", main="Weight")
for (sex in c("F", "M")) {
    pred_data <- data.frame(age=age_vec, sex=sex)
    pred_weight <- predict(weight_lm, newdata=pred_data)
    lines(age_vec, pred_weight, col=sex_colors[sex])
}
legend("topleft", lty=1, col=sex_colors, legend=names(sex_colors))

# ggplot version
p1 <- ggplot(igf) + geom_point(aes(y=length, x=age, col=sex)) +
    geom_smooth(aes(y=length, x=age, col=sex), method='lm') +
    xlab("Age") + ylab("Length (cm)")
p2 <- ggplot(igf) + geom_point(aes(y=weight, x=age, col=sex)) +
    geom_smooth(aes(y=weight, x=age, col=sex), method='lm') +
    xlab("Age") + ylab("Weight (g)")
plot_grid(p1, p2, labels=c("Length", "Weight"))

# is the addition of sex statistically significant?
anova(
    lm(length ~ age, data=igf),
    lm(length ~ age + sex, data=igf)
)

# Visualization for part (b)

# Are residual sizes from part (a) correlated with IGF-level?
igf$residual_length <- igf$length - predict(length_lm, newdata=igf)
igf$residual_weight <- igf$weight - predict(weight_lm, newdata=igf)

# base R version
layout(t(1:2))
plot(residual_length ~ igf, data=igf, col=sex_colors[sex], pch=20, xlab="IGF-1 level", ylab="Residual length (cm)", main="Length")
abline(coef(lm(residual_length ~ igf, data=igf)))
plot(residual_weight ~ igf, data=igf, col=sex_colors[sex], pch=20, xlab="IGF-1 level", ylab="Residual weight (g)", main="Weight")
abline(coef(lm(residual_weight ~ igf, data=igf)))
legend("topleft", lty=1, col=sex_colors, legend=names(sex_colors))

# ggplot version
p1 <- ggplot(igf, aes(y=residual_length, x=igf, col=sex)) +
    geom_point() + geom_smooth(method='lm') +
    xlab("IGF-1 level") + ylab("Residual length (cm)")
p2 <- ggplot(igf, aes(y=residual_weight, x=igf, col=sex)) +
    geom_point() + geom_smooth(method='lm') +
    xlab("IGF-1 level") + ylab("Residual weight (g)")
plot_grid(p1, p2, labels=c("Length", "Weight"))


# analysis for (b)
length_igf_lm <- lm(length ~ age + igf + sex, data=igf)
summary(length_igf_lm)
weight_igf_lm <- lm(weight ~ age + igf + sex, data=igf)
summary(weight_igf_lm)

# Compare predicted to observed values
igf$pred_igf_length <- predict(length_igf_lm, newdata=igf)
igf$pred_igf_weight <- predict(weight_igf_lm, newdata=igf)
# base R version
layout(t(1:2))
plot(pred_igf_length ~ length, data=igf, col=sex_colors[sex], pch=20, asp=2,
     xlab="observed lengths (cm)", ylab="predicted lengths (cm)", main="Length")
abline(0, 1)
plot(pred_igf_weight ~ weight, data=igf, col=sex_colors[sex], pch=20, asp=2,
     xlab="observed weights (g)", ylab="predicted weights (g)", main="Weight")
abline(0, 1)

# ggplot version
p1 <- ggplot(igf, aes(x=length, y=pred_igf_length, col=sex)) +
    geom_point() + geom_abline(slope=1, intercept=0) +
    xlab("observed length (cm)") + ylab("predicted length (cm)")
p2 <- ggplot(igf, aes(x=weight, y=pred_igf_weight, col=sex)) +
    geom_point() + geom_abline(slope=1, intercept=0) +
    xlab("observed weight (g)") + ylab("predicted weight (g)")
plot_grid(p1, p2, labels=c("Length", "Weight"))


# Is there support for adding IGF?

anova(
    lm(length ~ age + sex, data=na.omit(igf)),
    lm(length ~ age + sex + igf, data=na.omit(igf))
)