pdf("F:\\Rmisc\\graphs\\cruise_ship.pdf")   # Route Graphics to a pdf file

####### set random number seed for reproducible sampling based methods
set.seed(13579)                             

cruise <- read.fwf("http://www.stat.ufl.edu/~winner/data/cruise_ship.dat", width=c(20,20,rep(8,7)),
 col.names=c("ship", "cline", "age", "tonnage", "passengers", "length", "cabins", "passdens", "crew"))

attach(cruise)

####### Fit Full model

fit0 <- lm(crew ~ age + tonnage + passengers + length + cabins + passdens)
summary(fit0)
anova(fit0)


######### Perform Backward Elimination, Forward Selection, and Stepwise Regression
######### Based on Model AIC (not individual regression coefficients)
######### fit1 and fit2 represent "extreme" models

library(MASS)
fit1 <- lm(crew ~ age + tonnage + passengers + length + cabins + passdens)
fit2 <- lm(crew ~ 1)
stepAIC(fit1,direction="backward")
stepAIC(fit2,direction="forward",scope=list(upper=fit1,lower=fit2))
stepAIC(fit2,direction="both",scope=list(upper=fit1,lower=fit2))

########## Perform all possible regressions (aka all subset regressions)
########## Prints out best 4 models of each # of predictors

install.packages("leaps")
library(leaps)

allcruise <- regsubsets(crew ~ age + tonnage + passengers + length + cabins + passdens,
 nbest=4,data=cruise)
aprout <- summary(allcruise)
n <- length(cruise$crew)
p <- apply(aprout$which, 1, sum)
aprout$aic <- aprout$bic - log(n) * p + 2 * p
with(aprout,round(cbind(which,rsq,adjr2,cp,bic,aic),3))     ## Prints "readable" results
plot(allcruise,scale="bic")
plot(allcruise,scale="adjr2")

##### Re-fit the "best" model from Stepwise Regression
##### epred3 is the residual when the obs was not used in regression fit
##### epred3 = e3/(1-proj3)   where proj3 are diagonal elements of PROJECTION matrix  
##### cvpe = sum(epred3^2)/n = PRESS/n

fit3 <-  lm(crew ~ tonnage + passengers + length + cabins)
summary(fit3)
anova(fit3)
e3 <- residuals(fit3)
proj3 <- hatvalues(fit3)
epred3 <- e3/(1-proj3)
(cvpe <- sum((epred3)^2)/length(e3))

##### Cross-validation with a hold-out sample
##### Randomly sample 100 ships, fit model, obtain predictions for remaining 58 ships
#####    by applying their X-levels to regression coefficients from model

##### Obtain "training" and "validation" sets
 
cruise.cv.samp <- sample(1:length(crew),100,replace=FALSE)
cruise.cv.in <- cruise[cruise.cv.samp,]
cruise.cv.out <- cruise[-cruise.cv.samp,]

##### Fit model for training set

fit.cv.in <- lm(crew ~tonnage + passengers + length + cabins,
   data=cruise.cv.in)
summary(fit.cv.in)
anova(fit.cv.in)

##### Obtain Predicted values and prediction errors for validation sample
##### Regression is based on same 4 predictors as fit3 (columns 4:7 of cruise)
##### Compute MSEP, Bias, Percent Bias, and Percent Error of Prediction

pred.cv.out <- predict(fit.cv.in,cruise.cv.out[,4:7])
delta.cv.out <- crew[-cruise.cv.samp]-pred.cv.out
(msep <- sum((delta.cv.out)^2)/length(crew[-cruise.cv.samp]))
(bias <- (mean(delta.cv.out))^2)
(pctbias <- 100*bias/msep)
(pcterrpred <- 100*sqrt(msep)/mean(crew[-cruise.cv.samp]))

##### Perform k-fold cross-validation (fits 10 regressions, with 10 hold-out samples
##### Obtains avereage R^2 for hold-out samples and compares with full data fit

install.packages("bootstrap")

crossvalreg <- function(fit, k=10) {
 require(bootstrap)
 
 theta.fit <- function(x,y){lsfit(x,y)}
 theta.predict <-  function(fit,x){cbind(1,x) %*% fit$coef}

 x <- fit$model[,2:ncol(fit$model)]
 y <- fit$model[,1]

 results <- crossval(x, y, theta.fit, theta.predict, ngroup=k)
 r2 <- cor(y, fit$fitted.values)^2
 r2cv <- cor(y, results$cv.fit)^2
 cat("Original R-square =", r2, "\n")
 cat(k, "Fold Cross-Validated R-square =", r2cv, "\n")
 cat("Change =", r2-r2cv, "\n")
}

fit2 <- lm(crew ~  tonnage + passengers + length + cabins)
crossvalreg(fit2)

dev.off()