lpga1 <- read.fwf("http://www.stat.ufl.edu/~winner/data/lpga2008.dat",
                  widths=c(30,8,8,8,8,8,8,8,8),
                  col.names=c("golfer","drive","fairway","green","putts","sandshot",
                              "sandsave","prz","logprz"))

lpga <- na.exclude(lpga1)        ### Remove the golfer with missing data
attach(lpga)

#### Fit Regression model on untransformed Y (prz)
lpga.mod0 <- lm(prz ~ drive+fairway+green+putts+sandshot+sandsave)
summary(lpga.mod0)
e0 <- resid(lpga.mod0)
yhat0 <- predict(lpga.mod0)
par(mfrow=c(1,2))
plot(yhat0,e0)                   ### Plot of Residuals vs Predicted Values
qqnorm(e0); qqline(e0)           ### Normal Probability Plot of Residuals
par(mfrow=c(1,1))
shapiro.test(e0)                 ### Test for Normal errors

#### Run Box-Cox transformation (Goal: obtain lambda that maximizes log-likelihood)
library(MASS)
bc.mod0 <- boxcox(lpga.mod0, plotit=TRUE)     # Runs series of power transforms and plots
print(cbind(bc.mod0$x, bc.mod0$y))            # Print out results (lambda, log-likelihood)
print(bc.mod0$x[which.max(bc.mod0$y)])        # Print out "best" lambda
ci.bc <- max(bc.mod0$y) - 0.5*qchisq(0.95,1)  # Obtain cut-off for 95% CI (in log-likelihood)
print(bc.mod0$x[bc.mod0$y >= ci.bc])          # Print values of lambda in 95% CI

##### Obtain "training" and "validation" sets (uncomment to use; see the sketch below)
#### set.seed(####)
#### lpga.cv.samp <- sample(1:length(logprz),100,replace=FALSE)
#### lpga.cv.in  <- lpga[lpga.cv.samp,]
#### lpga.cv.out <- lpga[-lpga.cv.samp,]

######### Perform Backward Elimination, Forward Selection, and Stepwise Regression
######### Based on Model AIC (not individual regression coefficients)
######### fit1 and fit2 represent "extreme" models
library(MASS)
fit1 <- lm(logprz ~ drive+fairway+green+putts+sandshot+sandsave)
fit2 <- lm(logprz ~ 1)
stepAIC(fit1, direction="backward")
stepAIC(fit2, direction="forward", scope=list(upper=fit1, lower=fit2))
stepAIC(fit2, direction="both", scope=list(upper=fit1, lower=fit2))

############### Fit best model (fill in the predictors selected by stepAIC above)
lpga.mod1 <- lm(logprz ~ )
summary(lpga.mod1)
anova(lpga.mod1)
drop1(lpga.mod1, test="F")
e1 <- resid(lpga.mod1)
yhat1 <- predict(lpga.mod1)
## rstudent(lpga.mod1)            ## Studentized residuals (see the sketch below)
## influence.measures(lpga.mod1)  ## Influence diagnostics (see the sketch below)
par(mfrow=c(1,2))
plot(yhat1, e1)                  ### Plot of Residuals vs Predicted Values
qqnorm(e1); qqline(e1)           ### Normal Probability Plot of Residuals
par(mfrow=c(1,1))
shapiro.test(e1)                 ### Test for Normal errors

#### Breusch-Pagan test for Constant error variance, computed by hand:
#### X2_BP = [SS(Reg*)/2] / (SSE/n)^2 ~ chi-square(p) under H0 (constant variance)
TSS_e2 <- sum((e1^2 - mean(e1^2))^2)       ### Total SS for e1^2
lpga.bp1 <- lm(e1^2 ~ )                    ### Regress e1^2 on the same X's as lpga.mod1
SS_RegBP <- TSS_e2 - deviance(lpga.bp1)    ### SS(Reg*) = TSS(e1^2) - SSE(e1^2)
SSE <- sum(e1^2); n <- length(e1); p <- length(coef(lpga.bp1)) - 1
(X2_BP <- (SS_RegBP/2) / (SSE/n)^2)        ### BP test statistic
(X2_05 <- qchisq(.95, p))                  ### Chi-square critical value
(PV_BP <- 1 - pchisq(X2_BP, p))            ### P-value

#### Breusch-Pagan test via the lmtest package (matches the hand computation above)
install.packages("lmtest")   # You must have already set a CRAN mirror under the Packages tab
library(lmtest)
bptest(lpga.mod1, studentize=FALSE)

#### K-fold Cross-Validation - Comparison of Several Models
##### Performs k-fold cross-validation (fits 10 regressions, each with a hold-out sample)
install.packages("DAAG")
library(DAAG)
set.seed(xxxx)               ### Replace xxxx with an integer seed
lpgacv.mod1 <- lm(logprz ~ drive+fairway+green+putts+sandshot+sandsave, data=lpga)
CVlm(lpga, lpgacv.mod1, m=10)
lpgacv.mod2 <- lm(logprz ~ fairway+green+putts+sandshot+sandsave, data=lpga)
CVlm(lpga, lpgacv.mod2, m=10)
lpgacv.mod3 <- lm(logprz ~ fairway+green+putts+sandshot+sandsave, data=lpga)   ### NOTE: duplicates lpgacv.mod2 as written
CVlm(lpga, lpgacv.mod3, m=10)
lpgacv.mod4 <- lm(logprz ~ green+putts+sandshot+sandsave, data=lpga)
CVlm(lpga, lpgacv.mod4, m=10)
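
######### Sketch (not in the original handout): applying the chosen Box-Cox lambda.
######### Assumes the script above has been run, so bc.mod0 and prz exist. When
######### lambda = 0 falls inside the 95% CI, the log transform is supported (which
######### is why the models above use logprz); otherwise apply the power transform.
######### The names lam.hat and prz.t are illustrative.
lam.hat <- bc.mod0$x[which.max(bc.mod0$y)]       # "best" lambda from the grid
if (abs(lam.hat) < 1e-8) {
  prz.t <- log(prz)                              # lambda = 0 corresponds to log(Y)
} else {
  prz.t <- (prz^lam.hat - 1)/lam.hat             # Box-Cox power transform of Y
}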
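
######### Sketch: completing the commented-out training/validation split above and
######### computing the mean squared prediction error (MSPE) on the hold-out golfers.
######### The seed (1234) and the full six-predictor model are placeholders; substitute
######### your own seed and the model you want to validate.
set.seed(1234)
lpga.cv.samp <- sample(1:nrow(lpga), 100, replace=FALSE)
lpga.cv.in  <- lpga[lpga.cv.samp,]               # training set (100 golfers)
lpga.cv.out <- lpga[-lpga.cv.samp,]              # validation set (the rest)
cv.fit  <- lm(logprz ~ drive+fairway+green+putts+sandshot+sandsave, data=lpga.cv.in)
cv.pred <- predict(cv.fit, newdata=lpga.cv.out)
mean((lpga.cv.out$logprz - cv.pred)^2)           # MSPE on the hold-out set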
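
######### Sketch: using the diagnostics commented out above (rstudent,
######### influence.measures) to flag potentially influential golfers. The |t| > 2
######### cut-off is a common rule of thumb, not part of the original handout;
######### requires lpga.mod1 to have been fit.
infl <- influence.measures(lpga.mod1)
summary(infl)                                    # cases flagged by any influence measure
which(abs(rstudent(lpga.mod1)) > 2)              # large studentized residuals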
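
######### Sketch: comparing the four models numerically by their cross-validation
######### mean squared prediction error. CVlm returns the input data frame with a
######### cvpred column of out-of-fold predictions, so the CV MSE can be computed
######### directly (the names cv1 and cv.mse1 are illustrative).
cv1 <- CVlm(lpga, lpgacv.mod1, m=10)
(cv.mse1 <- mean((cv1$logprz - cv1$cvpred)^2))   # smaller is better; repeat for mod2-mod4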