> install.packages("rpart") Installing package(s) into ‘C:/Users/Larry/Documents/R/win-library/2.15’ (as ‘lib’ is unspecified) Warning: package ‘rpart’ is in use and will not be installed --- Please select a CRAN mirror for use in this session --- Error in m[, 1L] : incorrect number of dimensions > library(rpart) > > > set.seed(12345) > est.set <- sample(1:705, 352) > val.set <- (1:705)[-est.set] > admin1 <- admin[est.set,] > > > (admintree1 <- rpart(GPA ~ HSCR + ACT,admin1,method="anova",cp=.0001)) n= 352 node), split, n, deviance, yval * denotes terminal node 1) root 352 139.7161000 2.978494 2) HSCR< 84.5 194 71.5004100 2.744923 4) HSCR< 53.5 37 14.4745100 2.445000 8) HSCR>=45.5 15 6.2843680 2.273533 * 9) HSCR< 45.5 22 7.4484420 2.561909 18) HSCR< 37.5 12 3.6992590 2.299333 * 19) HSCR>=37.5 10 1.9290080 2.877000 * 5) HSCR>=53.5 157 52.9132400 2.815605 10) ACT< 21.5 38 17.1034700 2.596105 20) ACT>=19.5 15 8.1863340 2.379200 * 21) ACT< 19.5 23 7.7511680 2.737565 42) HSCR>=72 11 2.3658340 2.491182 * 43) HSCR< 72 12 4.1054750 2.963417 * 11) ACT>=21.5 119 33.3942800 2.885697 22) ACT< 29.5 111 30.2102200 2.852982 44) ACT>=27.5 16 6.4021970 2.701688 * 45) ACT< 27.5 95 23.3801000 2.878463 90) ACT< 25.5 72 16.1282100 2.831583 180) HSCR>=80.5 10 3.1863320 2.575000 * 181) HSCR< 80.5 62 12.1773400 2.872968 362) HSCR>=61.5 55 10.3188900 2.853709 724) HSCR< 76.5 39 7.5683790 2.816615 1448) HSCR>=73.5 8 1.0567860 2.608000 * 1449) HSCR< 73.5 31 6.0735820 2.870452 2898) HSCR< 71.5 24 4.1528240 2.808917 5796) ACT< 23.5 14 3.3570440 2.764500 * 5797) ACT>=23.5 10 0.7294929 2.871100 * 2899) HSCR>=71.5 7 1.5183020 3.081429 * 725) HSCR>=76.5 16 2.5660520 2.944125 * 363) HSCR< 61.5 7 1.6777710 3.024286 * 91) ACT>=25.5 23 6.5983040 3.025217 182) HSCR< 74.5 12 2.6252230 2.830583 * 183) HSCR>=74.5 11 3.0225770 3.237545 * 23) ACT>=29.5 8 1.4168560 3.339625 * 3) HSCR>=84.5 158 44.6365600 3.265285 6) HSCR< 95.5 106 29.2098300 3.124594 12) ACT< 21.5 12 5.9873930 2.779083 * 13) ACT>=21.5 94 
21.6070200 3.168702 26) HSCR< 92.5 66 13.8198800 3.102939 52) HSCR>=90.5 20 5.4045850 2.945200 104) ACT< 25.5 13 2.3458920 2.835769 * 105) ACT>=25.5 7 2.6139040 3.148429 * 53) HSCR< 90.5 46 7.7013010 3.171522 106) HSCR< 87.5 20 3.3422430 3.035550 212) ACT>=28 7 1.5293200 2.886000 * 213) ACT< 28 13 1.5720670 3.116077 * 107) HSCR>=87.5 26 3.7048570 3.276115 214) ACT< 26.5 15 1.8983410 3.169933 * 215) ACT>=26.5 11 1.4067790 3.420909 * 27) HSCR>=92.5 28 6.8289000 3.323714 54) HSCR>=94.5 9 3.9468860 3.056444 * 55) HSCR< 94.5 19 1.9345840 3.450316 * 7) HSCR>=95.5 52 9.0516020 3.552077 14) HSCR< 98.5 37 5.7897360 3.526946 28) ACT>=29.5 11 2.2270050 3.382364 * 29) ACT< 29.5 26 3.2355030 3.588115 58) HSCR< 96.5 8 0.8029115 3.535250 * 59) HSCR>=96.5 18 2.4002960 3.611611 * 15) HSCR>=98.5 15 3.1808570 3.614067 * > > par(mfrow=c(1,2)) > > plot(admintree1,margin=.10) > text(admintree1) > > plot(admintree1,compress=T,uniform=T,branch=0.4,margin=.10) > text(admintree1) > > printcp(admintree1) Regression tree: rpart(formula = GPA ~ HSCR + ACT, data = admin1, method = "anova", cp = 1e-04) Variables actually used in tree construction: [1] ACT HSCR Root node error: 139.72/352 = 0.39692 n= 352 CP nsplit rel error xerror xstd 1 0.16876451 0 1.00000 1.00530 0.079912 2 0.04562917 1 0.83124 0.87919 0.073701 3 0.02943581 2 0.78561 0.86736 0.078109 4 0.01728854 3 0.75617 0.87801 0.076388 5 0.01264855 4 0.73888 0.87555 0.075735 6 0.01156210 5 0.72623 0.88425 0.076586 7 0.00916816 6 0.71467 0.88118 0.076433 8 0.00875285 8 0.69634 0.87740 0.077297 9 0.00685847 10 0.67883 0.89939 0.078453 10 0.00678110 11 0.67197 0.89633 0.077728 11 0.00511034 12 0.66519 0.89016 0.076285 12 0.00484795 13 0.66008 0.89943 0.077493 13 0.00468237 17 0.64006 0.89781 0.077517 14 0.00318352 18 0.63538 0.90203 0.076466 15 0.00286107 19 0.63220 0.91328 0.076841 16 0.00191616 20 0.62934 0.91869 0.077716 17 0.00172390 24 0.62071 0.92624 0.077704 18 0.00146095 25 0.61898 0.92705 0.077583 19 0.00047444 27 0.61606 0.93164 
0.077093 20 0.00023115 28 0.61559 0.93594 0.077292 21 0.00010000 29 0.61536 0.93603 0.077294 > > adminols1 <- lm(GPA ~ HSCR+ACT+I(HSCR^2)+I(ACT^2)+I(HSCR*ACT),admin1) > summary(adminols1) Call: lm(formula = GPA ~ HSCR + ACT + I(HSCR^2) + I(ACT^2) + I(HSCR * ACT), data = admin1) Residuals: Min 1Q Median 3Q Max -1.86075 -0.29346 0.07365 0.40073 1.17043 Coefficients: Estimate Std. Error t value Pr(>|t|) (Intercept) 3.385e+00 9.204e-01 3.677 0.000273 *** HSCR -2.552e-02 1.143e-02 -2.232 0.026235 * ACT -4.071e-02 7.039e-02 -0.578 0.563359 I(HSCR^2) 1.732e-04 8.392e-05 2.064 0.039749 * I(ACT^2) 4.353e-04 1.577e-03 0.276 0.782671 I(HSCR * ACT) 6.242e-04 5.283e-04 1.181 0.238246 --- Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1 Residual standard error: 0.5538 on 346 degrees of freedom Multiple R-squared: 0.2404, Adjusted R-squared: 0.2294 F-statistic: 21.9 on 5 and 346 DF, p-value: < 2.2e-16 > > #### See below for how 7 region model was selected > > (admintree7 <- rpart(GPA ~ HSCR + ACT,admin1,method="anova",cp=.0091)) n= 352 node), split, n, deviance, yval * denotes terminal node 1) root 352 139.716100 2.978494 2) HSCR< 84.5 194 71.500410 2.744923 4) HSCR< 53.5 37 14.474510 2.445000 8) HSCR>=45.5 15 6.284368 2.273533 * 9) HSCR< 45.5 22 7.448442 2.561909 18) HSCR< 37.5 12 3.699259 2.299333 * 19) HSCR>=37.5 10 1.929008 2.877000 * 5) HSCR>=53.5 157 52.913240 2.815605 10) ACT< 21.5 38 17.103470 2.596105 * 11) ACT>=21.5 119 33.394280 2.885697 22) ACT< 29.5 111 30.210220 2.852982 * 23) ACT>=29.5 8 1.416856 3.339625 * 3) HSCR>=84.5 158 44.636560 3.265285 6) HSCR< 95.5 106 29.209830 3.124594 12) ACT< 21.5 12 5.987393 2.779083 * 13) ACT>=21.5 94 21.607020 3.168702 * 7) HSCR>=95.5 52 9.051602 3.552077 * > > par(mfrow=c(1,2)) > > plot(admintree7,margin=.10) > text(admintree7) > > plot(admintree7,compress=T,uniform=T,branch=0.4,margin=.10) > text(admintree7) > > > (mse.tree.est <- sum((GPA[est.set]-predict(admintree7,admin[est.set,]))^2)/ + length(est.set)) [1] 
0.2763898 > > (mspr.tree.val <- sum((GPA[val.set]-predict(admintree7,admin[val.set,]))^2)/ + length(val.set)) [1] 0.317886 > > (mse.ols.est <- sum((GPA[est.set]-predict(adminols1,admin[est.set,]))^2)/ + length(est.set)) [1] 0.3015151 > > (mspr.ols.val <- sum((GPA[val.set]-predict(adminols1,admin[val.set,]))^2)/ + length(val.set)) [1] 0.3202886 > > #### This uses the 2-9 Region models to determine the model with > #### minimum MSPR for the validation sample, which is 7, not the 5 > #### in the textbook, which used different estimation and validation samples > > admintree2 <- rpart(GPA ~ HSCR + ACT,admin1,method="anova",cp=.0456) > admintree3 <- rpart(GPA ~ HSCR + ACT,admin1,method="anova",cp=.0294) > admintree4 <- rpart(GPA ~ HSCR + ACT,admin1,method="anova",cp=.0172) > admintree5 <- rpart(GPA ~ HSCR + ACT,admin1,method="anova",cp=.0126) > admintree6 <- rpart(GPA ~ HSCR + ACT,admin1,method="anova",cp=.0115) > admintree7 <- rpart(GPA ~ HSCR + ACT,admin1,method="anova",cp=.0091) > admintree8 <- rpart(GPA ~ HSCR + ACT,admin1,method="anova",cp=.0087) > admintree9 <- rpart(GPA ~ HSCR + ACT,admin1,method="anova",cp=.0068) > > (mse.tree.est <- sum((GPA[est.set]-predict(admintree2,admin[est.set,]))^2)/ + length(est.set)) [1] 0.3118234 > > (mspr.tree.val <- sum((GPA[val.set]-predict(admintree2,admin[val.set,]))^2)/ + length(val.set)) [1] 0.3345405 > (mse.tree.est <- sum((GPA[est.set]-predict(admintree3,admin[est.set,]))^2)/ + length(est.set)) [1] 0.3001397 > > (mspr.tree.val <- sum((GPA[val.set]-predict(admintree3,admin[val.set,]))^2)/ + length(val.set)) [1] 0.3321208 > (mse.tree.est <- sum((GPA[est.set]-predict(admintree4,admin[est.set,]))^2)/ + length(est.set)) [1] 0.2932775 > > (mspr.tree.val <- sum((GPA[val.set]-predict(admintree4,admin[val.set,]))^2)/ + length(val.set)) [1] 0.328989 > (mse.tree.est <- sum((GPA[est.set]-predict(admintree5,admin[est.set,]))^2)/ + length(est.set)) [1] 0.2882571 > > (mspr.tree.val <- sum((GPA[val.set]-predict(admintree5,admin[val.set,]))^2)/ + 
length(val.set)) [1] 0.3283926 > (mse.tree.est <- sum((GPA[est.set]-predict(admintree6,admin[est.set,]))^2)/ + length(est.set)) [1] 0.2836678 > > (mspr.tree.val <- sum((GPA[val.set]-predict(admintree6,admin[val.set,]))^2)/ + length(val.set)) [1] 0.3249454 > (mse.tree.est <- sum((GPA[est.set]-predict(admintree7,admin[est.set,]))^2)/ + length(est.set)) [1] 0.2763898 > > (mspr.tree.val <- sum((GPA[val.set]-predict(admintree7,admin[val.set,]))^2)/ + length(val.set)) [1] 0.317886 > (mse.tree.est <- sum((GPA[est.set]-predict(admintree8,admin[est.set,]))^2)/ + length(est.set)) [1] 0.2694414 > > (mspr.tree.val <- sum((GPA[val.set]-predict(admintree8,admin[val.set,]))^2)/ + length(val.set)) [1] 0.3246911 > (mse.tree.est <- sum((GPA[est.set]-predict(admintree9,admin[est.set,]))^2)/ + length(est.set)) [1] 0.2667191 > > (mspr.tree.val <- sum((GPA[val.set]-predict(admintree9,admin[val.set,]))^2)/ + length(val.set)) [1] 0.3262891 > > > dev.off() null device 1 >