1/15/2022

“No scientific theory is worth anything unless it enables us to predict something which is actually going on. Until that is done, theories are a mere game of words, and not such a good game as poetry.” – J. B. S. Haldane
The more predictor variables you add to a model, the better it will seem to be at predicting the response variable in your dataset, even if you are not giving it any additional useful information. When you have as many predictor variables as observations, it is possible to exactly match the data that you initially used to construct your model: you can come up with a model that gives precise predictions for the response values in your initial dataset, but that does not generalize to new data at all.
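To see this concretely, here is a minimal sketch; the setup is invented for illustration (ten observations, nine pure-noise predictors, and a response unrelated to all of them):

set.seed(1)
n <- 10
sim <- as.data.frame(matrix(rnorm(n * (n - 1)), nrow=n))  # nine noise predictors
sim$y <- rnorm(n)  # a response with no relationship to the predictors
perfect_lm <- lm(y ~ ., data=sim)
max(abs(resid(perfect_lm)))  # effectively zero: ten coefficients, ten observations

With the intercept, the model has as many coefficients as observations, so it reproduces y exactly even though the predictors are pure noise.

As a real example, consider counts of new COVID-19 cases by state, recorded on the 1st and 15th of each month: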
head(by_state[,1:17])
##            date AK  AL AR  AZ   CA  CO  CT  DC DE   FL  GA HI IA  ID   IL  IN
## 528  2020-02-01  0   0  0   0    0   0   0   0  0    0   0  0  0   0    0   0
## 406  2020-02-15  0   0  0   0    0   0   0   0  0    0   0  0  0   0    0   0
## 267  2020-03-01  0   0  0   0    0   0   0   0  0    0   0  0  0   0    1   0
## 696  2020-03-15  0  13  4   1   47  32  19  14  3   31  52  3  6   4   61   7
## 1143 2020-04-01 13 112 54 124 1223 385 429  91 37 1031 570 28 52 144  986 406
## 3290 2020-04-15  8 265 85 156 1171 316 766 139 62  900 686 11 96 101 1346 428
Now let’s see how well the case counts in Oregon are predicted by other states, using only the 1st and 15th of each month:
OR_lm <- lm(OR ~ . - date, data=by_state)
plot(OR ~ date, data=by_state, type='l', lwd=2, ylab="Oregon new cases")
lines(by_state$date, predict(OR_lm), col='red', lty=2, lwd=2)
legend("topleft", lty=c(1, 2), col=c("black", "red"), lwd=c(2, 2),
       legend=c("observed", "predicted using other states"))
Let's simulate data from the model

\[\begin{aligned} y \sim a + b x_1 + c x_2 . \end{aligned}\]
N <- 500
df <- data.frame(x1 = rnorm(N), x2 = runif(N))
params <- list(intercept = 2.0, x1 = 7.0, x2 = -8.0, sigma = 1)
pred_y <- params$intercept + params$x1 * df$x1 + params$x2 * df$x2
df$y <- rnorm(N, mean=pred_y, sd=params$sigma)
pairs(df)
set.seed(23)
# Decide how many data points we want in our simulated dataset
N <- 500
N <- 500
# Create a dataframe with values for our two predictor variables
df <- data.frame(x1 = rnorm(N), x2 = runif(N))
# rnorm samples from a normal distribution with mean 0
# runif samples from a uniform distribution between 0 and 1
head(df)
##      x1   x2
## 1  0.19 0.12
## 2 -0.43 0.02
## 3  0.91 0.74
## 4  1.79 0.57
## 5  1.00 0.93
## 6  1.11 0.67
N <- 500
df <- data.frame(x1 = rnorm(N), x2 = runif(N))
# Create a list of parameters that we will use in our model
params <- list(intercept = 2.0, x1 = 7.0, x2 = -8.0, sigma = 1)
# x1 is the coefficient for the x1 variable
# x2 is the coefficient for the x2 variable
# sigma is the standard deviation of the random error in our model
params
## $intercept
## [1] 2
## 
## $x1
## [1] 7
## 
## $x2
## [1] -8
## 
## $sigma
## [1] 1
N <- 500
df <- data.frame(x1 = rnorm(N), x2 = runif(N))
params <- list(intercept = 2.0, x1 = 7.0, x2 = -8.0, sigma = 1)
# Write out our model, which takes info from our df and the params list
pred_y <- params$intercept + params$x1 * df$x1 + params$x2 * df$x2
N <- 500
df <- data.frame(x1 = rnorm(N), x2 = runif(N))
params <- list(intercept = 2.0, x1 = 7.0, x2 = -8.0, sigma = 1)
pred_y <- params$intercept + params$x1 * df$x1 + params$x2 * df$x2
# Simulate the values of our response variable
df$y <- rnorm(N, mean=pred_y, sd=params$sigma)
N <- 500
df <- data.frame(x1 = rnorm(N), x2 = runif(N))
params <- list(intercept = 2.0, x1 = 7.0, x2 = -8.0, sigma = 1)
pred_y <- params$intercept + params$x1 * df$x1 + params$x2 * df$x2
df$y <- rnorm(N, mean=pred_y, sd=params$sigma)
# Look at the relationships between our variables
pairs(df)
We can measure how well a fitted model, the_lm, matches the data used to fit it with the root mean squared residual:

sqrt(mean(resid(the_lm)^2))
Back to our simulated data:

N <- 500
df <- data.frame(x1 = rnorm(N), x2 = runif(N))
params <- list(intercept = 2.0, x1 = 7.0, x2 = -8.0, sigma = 1)
pred_y <- params$intercept + params$x1 * df$x1 + params$x2 * df$x2
df$y <- rnorm(N, mean=pred_y, sd=params$sigma)
# Add columns of pure noise, with no relationship to y, as spurious predictors
max_M <- 300  # max number of spurious variables
noise_df <- matrix(rnorm(nrow(df) * (max_M-2)), nrow=nrow(df))
colnames(noise_df) <- paste0('z', 1:ncol(noise_df))
new_df <- cbind(df, noise_df)
head(new_df[,1:10])
##      x1   x2    y    z1     z2    z3     z4    z5    z6     z7
## 1  0.19 0.12  1.7 -0.28  0.360  0.80  0.287 -0.37 -2.33 -0.231
## 2 -0.43 0.02 -1.0 -0.88  0.476 -1.55  0.069 -0.72 -0.91 -1.352
## 3  0.91 0.74  2.2 -0.64  0.711  1.23 -1.238  1.08 -0.52 -0.745
## 4  1.79 0.57  9.6  0.15 -0.660  0.49  0.143  0.85  1.27 -0.073
## 5  1.00 0.93  2.0  0.55 -0.019  0.36  0.222 -1.72  0.54 -0.277
## 6  1.11 0.67  6.6  0.71 -0.542 -0.68 -1.409 -0.40  0.35  0.554
all_results <- data.frame(m=floor(seq(from=2, to=max_M-1, length.out=40)),
                          error=NA)
for (j in 1:nrow(all_results)) {
    m <- all_results$m[j]
    the_lm <- lm(y ~ ., data=new_df[,1:(m+1)])
    all_results$error[j] <- sqrt(mean(resid(the_lm)^2))
}
# Create a dataframe with an increasing number of values between 2 and 299
# We will fit linear models using these different numbers of variables
# Then store the error for each model in the data frame
# So that we have a calculated error for each number of variables
all_results <- data.frame(m=floor(seq(from=2, to=max_M-1, length.out=40)),
                          error=NA)
head(all_results)
##    m error
## 1  2    NA
## 2  9    NA
## 3 17    NA
## 4 24    NA
## 5 32    NA
## 6 40    NA
# Then, for each number of variables in the m column of all_results,
# we fit a linear model and calculate its error
for (j in 1:nrow(all_results)) {
    m <- all_results$m[j]
    the_lm <- lm(y ~ ., data=new_df[,1:(m+1)])
    all_results$error[j] <- sqrt(mean(resid(the_lm)^2))
}
# Set the number of predictors to include in our model
m <- all_results$m[2]
all_results$m[2]
## [1] 9
# Set the number of predictors to include in our model
m <- all_results$m[2]
# Fit our linear model using that number of predictor variables
the_lm <- lm(y ~ ., data=new_df[,1:(m+1)])
summary(the_lm)$coefficients
##             Estimate Std. Error t value  Pr(>|t|)
## (Intercept)    2.015      0.090   22.38   6.1e-77
## x1             6.998      0.043  161.65   0.0e+00
## x2            -8.054      0.152  -53.02  4.0e-205
## z1            -0.025      0.041   -0.60   5.5e-01
## z2             0.013      0.046    0.28   7.8e-01
## z3            -0.041      0.044   -0.94   3.5e-01
## z4            -0.015      0.043   -0.35   7.3e-01
## z5            -0.069      0.043   -1.60   1.1e-01
## z6            -0.018      0.045   -0.39   6.9e-01
## z7             0.048      0.043    1.11   2.7e-01
# Set the number of predictors to include in our model
m <- all_results$m[2]
# Fit our linear model using that number of predictor variables
the_lm <- lm(y ~ ., data=new_df[,1:(m+1)])
# Calculate the square root of the mean squared residuals
all_results$error[j] <- sqrt(mean(resid(the_lm)^2))
This is the root mean square error over the \(N\) points being predicted:

\[\begin{aligned} S = \sqrt{\frac{1}{N} \sum_{i=1}^N (\hat y_i - y_i)^2} . \end{aligned}\]
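Written in R, and assuming the_lm is the model fit above to these data, this is the same quantity we have been computing, since resid(the_lm) is just the difference between the observed and fitted values:

sqrt(mean((new_df$y - predict(the_lm))^2))
sqrt(mean(resid(the_lm)^2))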
plot(all_results$m, all_results$error, type='l', col=2, lwd=2,
     xlab='number of variables', ylab='root mean square error',
     ylim=range(all_results$error, 0))
To test predictive ability (and diagnose overfitting!), simulate data from

\[\begin{aligned} y \sim a + b x_1 + c x_2 , \end{aligned}\]

and fit a linear model. Then:

1. Put aside 20% of the data for testing.
2. Refit the model.
3. Predict the test data; compute the test error: the square root of the mean squared difference between the actual and predicted values.
4. Repeat for the other four 20%s.
5. Compare.
In more detail:

1. Randomly put aside 20% of your data for testing.
2. Fit a linear model to the remaining 80% of your data.
3. Compute the training (80%) error, sqrt(mean(resid(the_lm)^2)), and the test (20%) error, sqrt(mean(("actual y" - "predicted y")^2)), as sketched below.
4. Perform five-fold crossvalidation: repeat steps 1 to 3 four more times.
5. Compare the prediction error for your test and training data.
6. Repeat this process with different numbers of predictors.
7. Plot how the test and training errors change with the number of predictors.
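Here is a sketch of a single 80/20 split (steps 1 to 3), using the simulated new_df from above; the names test_rows, train_df, test_df, and one_lm are just for illustration:

test_rows <- sample(1:nrow(new_df), size=0.2 * nrow(new_df))  # step 1: hold out 20%
train_df <- new_df[-test_rows, 1:10]  # the other 80%, with nine predictors plus y
test_df <- new_df[test_rows, 1:10]
one_lm <- lm(y ~ ., data=train_df)  # step 2: fit to the training data
sqrt(mean(resid(one_lm)^2))  # step 3: training error
sqrt(mean((test_df$y - predict(one_lm, newdata=test_df))^2))  # step 3: test error

The kfold() function below automates exactly this, rotating the held-out 20% through all five folds.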
Here is the whole procedure wrapped up into a function; let's walk through its pieces:

kfold <- function (K, df) {
    Kfold <- sample(rep(1:K, nrow(df)/K))
    results <- data.frame(test_error=rep(NA, K),
                          train_error=rep(NA, K))
    for (k in 1:K) {
        the_lm <- lm(y ~ ., data=df, subset=(Kfold != k))
        results$train_error[k] <- sqrt(mean(resid(the_lm)^2))
        test_y <- df$y[Kfold == k]
        results$test_error[k] <- sqrt(mean(
            (test_y - predict(the_lm, newdata=subset(df, Kfold==k)))^2 ))
    }
    return(results)
}
# Assign every row of our dataframe a random number from 1 to the number of folds
df <- new_df[,1:(9+1)]
K <- 5
Kfold <- sample(rep(1:K, nrow(df)/K))
Kfold
##   [1] 4 5 5 2 4 4 1 2 2 1 2 2 1 1 5 2 1 2 5 2 5 2 2 4 4 1 4 3 4 3 3 3 3 1 4 3 1
##  [38] 3 3 4 5 5 1 2 1 1 1 3 3 2 2 3 5 3 1 4 3 4 3 5 3 4 5 1 1 1 3 2 3 2 5 2 3 3
##  [75] 3 2 2 5 2 4 3 5 3 5 3 5 4 2 4 3 5 1 5 2 2 5 4 4 1 3 4 2 1 5 1 3 2 4 5 2 1
## [112] 5 4 3 1 1 2 1 2 5 1 2 1 5 2 5 1 3 2 5 2 5 2 4 5 1 4 1 2 2 5 5 5 4 4 2 5 5
## [149] 1 4 3 3 3 2 2 4 1 4 2 1 2 3 4 4 4 3 2 1 5 5 1 3 3 4 1 4 5 4 5 5 2 5 4 4 2
## [186] 3 5 3 2 5 5 4 4 1 2 3 4 5 2 4 1 4 5 3 3 1 3 2 2 2 5 1 5 3 3 4 3 5 2 1 4 2
## [223] 5 3 5 5 3 1 1 2 2 4 4 3 5 5 4 2 2 5 3 4 2 1 5 4 3 2 2 3 5 2 5 5 1 4 3 2 3
## [260] 1 1 3 3 4 4 1 4 4 4 3 5 2 2 5 3 1 5 3 5 5 4 5 4 1 4 1 5 1 2 3 4 1 5 1 4 2
## [297] 2 3 3 1 5 4 5 5 4 2 3 3 4 2 2 4 1 1 5 4 1 4 2 4 1 3 2 3 3 2 1 2 5 5 3 4 5
## [334] 2 1 1 1 4 3 5 1 4 2 3 4 2 1 3 1 5 2 3 3 3 1 2 5 2 1 1 1 4 4 2 5 1 4 5 5 4
## [371] 1 1 1 4 3 5 3 2 3 1 3 1 4 4 5 2 2 1 2 1 4 4 2 2 5 3 1 1 1 3 3 4 2 4 1 5 1
## [408] 3 1 4 3 4 2 1 1 4 3 5 4 2 5 1 3 3 5 3 5 4 4 4 3 5 1 5 3 2 2 3 3 1 2 4 3 5
## [445] 1 4 3 2 5 2 1 5 4 2 3 5 2 3 3 5 5 4 4 1 2 2 5 2 4 1 5 4 1 2 4 3 2 4 3 1 5
## [482] 1 5 2 4 1 3 4 1 1 3 4 5 5 3 3 5 4 1 1
# Make an empty dataframe to store our test and training error
df <- new_df[,1:(9+1)]
K <- 5
Kfold <- sample(rep(1:K, nrow(df)/K))
results <- data.frame(test_error=rep(NA, 5),
                      train_error=rep(NA, 5))
results
##   test_error train_error
## 1         NA          NA
## 2         NA          NA
## 3         NA          NA
## 4         NA          NA
## 5         NA          NA
# Fit a linear model with 80% of the data for each fold
for (k in 1:K) {
    the_lm <- lm(y ~ ., data=df, subset=(Kfold != k))
    results$train_error[k] <- sqrt(mean(resid(the_lm)^2))
    test_y <- df$y[Kfold == k]
    results$test_error[k] <- sqrt(mean(
        (test_y - predict(the_lm, newdata=subset(df, Kfold==k)))^2 ))
}
# By specifying Kfold != k we only use rows with a label that is not equal to k
# Each time we fit a linear model we put aside 20% of the data for testing
df <- new_df[,1:(9+1)]
K <- 5
Kfold <- sample(rep(1:K, nrow(df)/K))
results <- data.frame(test_error=rep(NA, 5),
                      train_error=rep(NA, 5))
# Fitting to all of the data uses all 500 rows (490 residual degrees of freedom)
the_lm <- lm(y ~ ., data=df)
the_lm$df.residual
## [1] 490
# Holding out fold 1 leaves 400 rows to fit with (100 fewer residual df)
the_lm <- lm(y ~ ., data=df, subset=(Kfold != 1))
the_lm$df.residual
## [1] 390
df <- new_df[,1:(9+1)]
K <- 5
Kfold <- sample(rep(1:K, nrow(df)/K))
results <- data.frame(test_error=rep(NA, 5),
                      train_error=rep(NA, 5))
for (k in 1:K) {
    the_lm <- lm(y ~ ., data=df, subset=(Kfold != k))
    # Calculate the training error for the model we fit during each fold
    results$train_error[k] <- sqrt(mean(resid(the_lm)^2))
    # Create a vector with our test response variable values
    test_y <- df$y[Kfold == k]
    # Calculate the test error for the model we fit during each fold
    # Store these values in the results dataframe alongside the training error
    results$test_error[k] <- sqrt(mean(
        (test_y - predict(the_lm, newdata=subset(df, Kfold==k)))^2 ))
}
# Use kfold() to compute the mean test and training error
# for each number of predictor variables
all_results <- data.frame(m=floor(seq(from=2, to=max_M-1, length.out=40)),
                          test_error=NA, train_error=NA)
for (j in 1:nrow(all_results)) {
    m <- all_results$m[j]
    all_results[j,2:3] <- colMeans(kfold(K=5, new_df[,1:(m+1)]))
}
plot(all_results$m, all_results$test_error, type='l', lwd=2,
     xlab='number of variables', ylab='root mean square error',
     ylim=range(all_results[,2:3], 0))
lines(all_results$m, all_results$train_error, col=2, lwd=2)
legend("topleft", lty=1, col=1:2, lwd=2,
       legend=paste(c("test", "train"), "error"))
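If we wanted to pick a model size from this plot, one simple rule of thumb (a sketch, using the all_results dataframe from above) is the number of predictors with the smallest crossvalidated test error:

all_results$m[which.min(all_results$test_error)]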