lpga1 <- read.fwf("http://www.stat.ufl.edu/~winner/data/lpga2008.dat",
                  widths=c(30,8,8,8,8,8,8,8,8),
                  col.names=c("golfer","drive","fairway","green","putts","sandshot",
                              "sandsave","prz","logprz"))

lpga <- na.exclude(lpga1)        ### Remove the golfer with missing data
attach(lpga)

#### Fit Regression model on untransformed Y (prz)
lpga.mod0 <- lm(prz ~ drive+fairway+green+putts+sandshot+sandsave)
summary(lpga.mod0)
e0 <- resid(lpga.mod0)
yhat0 <- predict(lpga.mod0)
par(mfrow=c(1,2))
plot(yhat0,e0)                   ### Plot of Residuals vs Predicted Values
qqnorm(e0); qqline(e0)           ### Normal Probability Plot of Residuals
par(mfrow=c(1,1))
shapiro.test(e0)                 ### Test for Normal errors

#### Run Box-Cox transformation (Goal: obtain lambda that maximizes log-likelihood)
library(MASS)
bc.mod0 <- boxcox(lpga.mod0, plotit=TRUE)     # Runs series of power transforms and plots
print(cbind(bc.mod0$x, bc.mod0$y))            # Print out results (lambda, log-likelihood)
print(bc.mod0$x[which.max(bc.mod0$y)])        # Print out "best" lambda
ci.bc <- max(bc.mod0$y) - 0.5*qchisq(0.95,1)  # Obtain cut-off for 95% CI (in log-likelihood)
print(bc.mod0$x[bc.mod0$y >= ci.bc])          # Print values of lambda in 95% CI

##### Obtain "training" and "validation" sets (uncomment to use; see the sketch below)
#### set.seed(####)
#### lpga.cv.samp <- sample(1:length(logprz),100,replace=FALSE)
#### lpga.cv.in  <- lpga[lpga.cv.samp,]
#### lpga.cv.out <- lpga[-lpga.cv.samp,]

######### Perform Backward Elimination, Forward Selection, and Stepwise Regression
######### Based on Model AIC (not individual regression coefficients)
######### fit1 and fit2 represent "extreme" models
library(MASS)
fit1 <- lm(logprz ~ drive+fairway+green+putts+sandshot+sandsave)
fit2 <- lm(logprz ~ 1)
stepAIC(fit1, direction="backward")
stepAIC(fit2, direction="forward", scope=list(upper=fit1, lower=fit2))
stepAIC(fit2, direction="both", scope=list(upper=fit1, lower=fit2))

############### Fit best model (fill in the predictors selected by stepAIC above)
lpga.mod1 <- lm(logprz ~ )
summary(lpga.mod1)
anova(lpga.mod1)
drop1(lpga.mod1, test="F")
e1 <- resid(lpga.mod1)
yhat1 <- predict(lpga.mod1)
## rstudent(lpga.mod1)            ## Studentized residuals (see the sketch below)
## influence.measures(lpga.mod1)  ## Influence diagnostics (see the sketch below)
par(mfrow=c(1,2))
plot(yhat1, e1)                  ### Plot of Residuals vs Predicted Values
qqnorm(e1); qqline(e1)           ### Normal Probability Plot of Residuals
par(mfrow=c(1,1))
shapiro.test(e1)                 ### Test for Normal errors

#### Breusch-Pagan test for Constant error variance, computed by hand:
#### X2_BP = [SS(Reg*)/2] / (SSE/n)^2 ~ chi-square(p) under H0 (constant variance)
TSS_e2 <- sum((e1^2 - mean(e1^2))^2)       ### Total SS for e1^2
lpga.bp1 <- lm(e1^2 ~ )                    ### Regress e1^2 on the same X's as lpga.mod1
SS_RegBP <- TSS_e2 - deviance(lpga.bp1)    ### SS(Reg*) = TSS(e1^2) - SSE(e1^2)
SSE <- sum(e1^2); n <- length(e1); p <- length(coef(lpga.bp1)) - 1
(X2_BP <- (SS_RegBP/2) / (SSE/n)^2)        ### BP test statistic
(X2_05 <- qchisq(.95, p))                  ### Chi-square critical value
(PV_BP <- 1 - pchisq(X2_BP, p))            ### P-value

#### Breusch-Pagan test via the lmtest package (matches the hand computation above)
install.packages("lmtest")   # You must have already set a CRAN mirror under the Packages tab
library(lmtest)
bptest(lpga.mod1, studentize=FALSE)

#### K-fold Cross-Validation - Comparison of Several Models
##### Performs k-fold cross-validation (fits 10 regressions, each with a hold-out sample)
install.packages("DAAG")
library(DAAG)
set.seed(xxxx)               ### Replace xxxx with an integer seed
lpgacv.mod1 <- lm(logprz ~ drive+fairway+green+putts+sandshot+sandsave, data=lpga)
CVlm(lpga, lpgacv.mod1, m=10)
lpgacv.mod2 <- lm(logprz ~ fairway+green+putts+sandshot+sandsave, data=lpga)
CVlm(lpga, lpgacv.mod2, m=10)
lpgacv.mod3 <- lm(logprz ~ fairway+green+putts+sandshot+sandsave, data=lpga)   ### NOTE: duplicates lpgacv.mod2 as written
CVlm(lpga, lpgacv.mod3, m=10)
lpgacv.mod4 <- lm(logprz ~ green+putts+sandshot+sandsave, data=lpga)
CVlm(lpga, lpgacv.mod4, m=10)
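
######### Sketch (not in the original handout): applying the chosen Box-Cox lambda.
######### Assumes the script above has been run, so bc.mod0 and prz exist. When
######### lambda = 0 falls inside the 95% CI, the log transform is supported (which
######### is why the models above use logprz); otherwise apply the power transform.
######### The names lam.hat and prz.t are illustrative.
lam.hat <- bc.mod0$x[which.max(bc.mod0$y)]       # "best" lambda from the grid
if (abs(lam.hat) < 1e-8) {
  prz.t <- log(prz)                              # lambda = 0 corresponds to log(Y)
} else {
  prz.t <- (prz^lam.hat - 1)/lam.hat             # Box-Cox power transform of Y
}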
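
######### Sketch: completing the commented-out training/validation split above and
######### computing the mean squared prediction error (MSPE) on the hold-out golfers.
######### The seed (1234) and the full six-predictor model are placeholders; substitute
######### your own seed and the model you want to validate.
set.seed(1234)
lpga.cv.samp <- sample(1:nrow(lpga), 100, replace=FALSE)
lpga.cv.in  <- lpga[lpga.cv.samp,]               # training set (100 golfers)
lpga.cv.out <- lpga[-lpga.cv.samp,]              # validation set (the rest)
cv.fit  <- lm(logprz ~ drive+fairway+green+putts+sandshot+sandsave, data=lpga.cv.in)
cv.pred <- predict(cv.fit, newdata=lpga.cv.out)
mean((lpga.cv.out$logprz - cv.pred)^2)           # MSPE on the hold-out set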
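
######### Sketch: using the diagnostics commented out above (rstudent,
######### influence.measures) to flag potentially influential golfers. The |t| > 2
######### cut-off is a common rule of thumb, not part of the original handout;
######### requires lpga.mod1 to have been fit.
infl <- influence.measures(lpga.mod1)
summary(infl)                                    # cases flagged by any influence measure
which(abs(rstudent(lpga.mod1)) > 2)              # large studentized residuals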
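
######### Sketch: comparing the four models numerically by their cross-validation
######### mean squared prediction error. CVlm returns the input data frame with a
######### cvpred column of out-of-fold predictions, so the CV MSE can be computed
######### directly (the names cv1 and cv.mse1 are illustrative).
cv1 <- CVlm(lpga, lpgacv.mod1, m=10)
(cv.mse1 <- mean((cv1$logprz - cv1$cvpred)^2))   # smaller is better; repeat for mod2-mod4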