pdf("F:\\Rmisc\\graphs\\cruise_ship.pdf") # Route Graphics to a pdf file ####### set random number seed for reproducible sampling based methods set.seed(13579) cruise <- read.fwf("http://www.stat.ufl.edu/~winner/data/cruise_ship.dat", width=c(20,20,rep(8,7)), col.names=c("ship", "cline", "age", "tonnage", "passengers", "length", "cabins", "passdens", "crew")) attach(cruise) ####### Fit Full model fit0 <- lm(crew ~ age + tonnage + passengers + length + cabins + passdens) summary(fit0) anova(fit0) ######### Perform Backward Elimination, Forward Selection, and Stepwise Regression ######### Based on Model AIC (not individual regression coefficients) ######### fit1 and fit2 represent "extreme" models library(MASS) fit1 <- lm(crew ~ age + tonnage + passengers + length + cabins + passdens) fit2 <- lm(crew ~ 1) stepAIC(fit1,direction="backward") stepAIC(fit2,direction="forward",scope=list(upper=fit1,lower=fit2)) stepAIC(fit2,direction="both",scope=list(upper=fit1,lower=fit2)) ########## Perform all possible regressions (aka all subset regressions) ########## Prints out best 4 models of each # of predictors install.packages("leaps") library(leaps) allcruise <- regsubsets(crew ~ age + tonnage + passengers + length + cabins + passdens, nbest=4,data=cruise) aprout <- summary(allcruise) n <- length(cruise$crew) p <- apply(aprout$which, 1, sum) aprout$aic <- aprout$bic - log(n) * p + 2 * p with(aprout,round(cbind(which,rsq,adjr2,cp,bic,aic),3)) ## Prints "readable" results plot(allcruise,scale="bic") plot(allcruise,scale="adjr2") ##### Re-fit the "best" model from Stepwise Regression ##### epred3 is the residual when the obs was not used in regression fit ##### epred3 = e3/(1-proj3) where proj3 are diagonal elements of PROJECTION matrix ##### cvpe = sum(epred3^2)/n = PRESS/n fit3 <- lm(crew ~ tonnage + passengers + length + cabins) summary(fit3) anova(fit3) e3 <- residuals(fit3) proj3 <- hatvalues(fit3) epred3 <- e3/(1-proj3) (cvpe <- sum((epred3)^2)/length(e3)) ##### Cross-validation with a hold-out sample ##### Randomly sample 100 ships, fit model, obtain predictions for remaining 58 ships ##### by applying their X-levels to regression coefficients from model ##### Obtain "training" and "validation" sets cruise.cv.samp <- sample(1:length(crew),100,replace=FALSE) cruise.cv.in <- cruise[cruise.cv.samp,] cruise.cv.out <- cruise[-cruise.cv.samp,] ##### Fit model for training set fit.cv.in <- lm(crew ~tonnage + passengers + length + cabins, data=cruise.cv.in) summary(fit.cv.in) anova(fit.cv.in) ##### Obtain Predicted values and prediction errors for validation sample ##### Regression is based on same 4 predictors as fit3 (columns 4:7 of cruise) ##### Compute MSEP, Bias, Percent Bias, and Percent Error of Prediction pred.cv.out <- predict(fit.cv.in,cruise.cv.out[,4:7]) delta.cv.out <- crew[-cruise.cv.samp]-pred.cv.out (msep <- sum((delta.cv.out)^2)/length(crew[-cruise.cv.samp])) (bias <- (mean(delta.cv.out))^2) (pctbias <- 100*bias/msep) (pcterrpred <- 100*sqrt(msep)/mean(crew[-cruise.cv.samp])) ##### Perform k-fold cross-validation (fits 10 regressions, with 10 hold-out samples ##### Obtains avereage R^2 for hold-out samples and compares with full data fit install.packages("bootstrap") crossvalreg <- function(fit, k=10) { require(bootstrap) theta.fit <- function(x,y){lsfit(x,y)} theta.predict <- function(fit,x){cbind(1,x) %*% fit$coef} x <- fit$model[,2:ncol(fit$model)] y <- fit$model[,1] results <- crossval(x, y, theta.fit, theta.predict, ngroup=k) r2 <- cor(y, fit$fitted.values)^2 r2cv <- cor(y, results$cv.fit)^2 cat("Original R-square =", r2, "\n") cat(k, "Fold Cross-Validated R-square =", r2cv, "\n") cat("Change =", r2-r2cv, "\n") } fit2 <- lm(crew ~ tonnage + passengers + length + cabins) crossvalreg(fit2) dev.off()