## One-Way Random Effects Model -- NBA 2016/17 player game logs
## Response: efficiency per 36 minutes (eff36); players are the random factor

nba2016 <- read.csv("http://www.stat.ufl.edu/~winner/data/nba_player_201617a1.csv")
attach(nba2016); names(nba2016)

pts36 <- 36*PTS/Minutes
# pts36 <- log(36*PTS/Minutes+1)

(n.play <- max(Player_ID1))          # Number of players in the population
(n.play.game <- length(PTS))         # Total number of player-games
game.play <- as.vector(tapply(PTS, Player_ID1, length))    # Games per player
## mean.play <- as.vector(tapply(pts36, Player_ID1, mean))
eff36 <- 36*efficiency/Minutes
mean.play <- as.vector(tapply(eff36, Player_ID1, mean))    # Player means
(mu <- mean(mean.play))              # Overall population mean

## Player effects and within-player errors
alpha.play <- mean.play - mu
var.e <- rep(0, n.play)
eps.play.game <- rep(0, n.play.game)
count <- 0
for (i1 in 1:n.play) {
  count1 <- count + 1
  count2 <- count + game.play[i1]
  eps.play.game[count1:count2] <- eff36[Player_ID1 == i1] - mean.play[i1]
  count <- count2
  var.e[i1] <- var(eff36[Player_ID1 == i1] - mean.play[i1])
}
sum(eps.play.game[1:game.play[1]])   # Errors sum to 0 within a player

par(mfrow=c(1,2))
hist(alpha.play, breaks=30, prob=T, main="Distribution of Player Effects")
lines(density(alpha.play))
hist(eps.play.game, breaks=100, prob=T, main="Distribution of Within Player Errors")
lines(density(eps.play.game))
par(mfrow=c(1,1))

## Is the within-player variance related to the player effect?
plot(var.e ~ alpha.play)
abline(lm(var.e ~ alpha.play))
summary(lm(var.e ~ alpha.play))

## Population variance components and intraclass correlation
(sigma2a <- var(alpha.play))
(sigma2e <- mean(var.e))
(rho_I <- sigma2a/(sigma2a+sigma2e))

## Set up and Take Random Samples
n.sample <- 10000          # Number of simulated samples
n.play.s <- 15             # Players sampled per sample
n.game.play.s <- 6         # Games sampled per player
set.seed(2468)
mean.out <- matrix(rep(0, n.sample*n.play.s), ncol=n.play.s)
var.out <- matrix(rep(0, n.sample*n.play.s), ncol=n.play.s)
MS_TRT.out <- rep(0, n.sample)
MS_ERR.out <- rep(0, n.sample)

for (i in 1:n.sample) {
  play.s <- sample(1:n.play, n.play.s)                            # Sample players
  for (i1 in 1:n.play.s) {
    play.s1 <- eff36[Player_ID1 == play.s[i1]]
    play.g.s <- sample(1:game.play[play.s[i1]], n.game.play.s)    # Sample games within player
    mean.out[i,i1] <- mean(play.s1[play.g.s])
    var.out[i,i1] <- var(play.s1[play.g.s])
  }
  MS_TRT.out[i] <- n.game.play.s * sum((mean.out[i,] - mean(mean.out[i,]))^2)/(n.play.s-1)
  MS_ERR.out[i] <- mean(var.out[i,])
}

## Inference for Mean
(sigma2_muhat <- sigma2a/n.play.s + sigma2e/(n.play.s*n.game.play.s))
muhat <- rowMeans(mean.out)
summary(muhat)
var(muhat)

t.025 <- qt(.975, n.play.s-1)
mu_L <- muhat - t.025 * sqrt(MS_TRT.out/(n.play.s*n.game.play.s))
mu_U <- muhat + t.025 * sqrt(MS_TRT.out/(n.play.s*n.game.play.s))
sum((mu_L <= mu) & (mu_U >= mu)) / n.sample     # Empirical coverage rate

# Histogram of mu-hat with Normal super-imposed
# Frequency histogram, so the normal density is multiplied by
# n.sample * binwidth = 10000 * 0.05
hist(muhat, breaks=seq(10,25,.05), xlab="muhat")
lines(seq(10,25,.05), 0.05*n.sample*dnorm(seq(10,25,.05), mu, sqrt(sigma2_muhat)),
      lwd=4, col="green")
abline(v=mu, lwd=4, col="brown")

## Inference for Error Variance
s2e <- rowSums(var.out)/n.play.s     # Same as MS_ERR.out
df.e <- n.play.s*(n.game.play.s-1)
summary(s2e)
var(s2e)

X2.025 <- qchisq(.025, df.e)
X2.975 <- qchisq(.975, df.e)
sigma2e_L <- df.e * s2e / X2.975
sigma2e_U <- df.e * s2e / X2.025
sum((sigma2e_L <= sigma2e) & (sigma2e_U >= sigma2e)) / n.sample   # Empirical coverage rate

s2e_scaled <- df.e * s2e / sigma2e

### Histogram of scaled variance
### Using s2e_scaled[s2e_scaled <= 130] leaves out very
### large values that "stretch the plot out"
yxlim <- 1.1*n.sample*max(dchisq(0:130, df.e))
hist(s2e_scaled[s2e_scaled <= 130], breaks=30:130, ylim=c(0,yxlim),
     xlab="Scaled Error Variance",
     main=expression(paste("Sampling Distribution of Scaled Error Variance - ", X[N-g]^2)))
lines(30:130, 1*n.sample*dchisq(30:130, df.e), lwd=4, col="purple")   # binwidth = 1
abline(v=df.e, lwd=4, col="red")
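### Optional check (a sketch, not part of the original analysis): if
### df.e*s2e/sigma2e is approximately chi-square with df.e degrees of freedom,
### then E(s2e) = sigma2e and Var(s2e) = 2*sigma2e^2/df.e. Comparing these
### normal-theory moments with the simulation moments above gives a quick sense
### of how well the chi-square approximation holds for this non-normal population.
c(theoretical.mean = sigma2e,           empirical.mean = mean(s2e))
c(theoretical.var  = 2*sigma2e^2/df.e,  empirical.var  = var(s2e))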
## Inference for Between Player Variance
s2a <- (MS_TRT.out - MS_ERR.out)/n.game.play.s
df.trt <- n.play.s - 1
summary(s2a)
var(s2a)

## Satterthwaite approximation to the degrees of freedom for s2a
df.a1 <- s2a^2
df.a2 <- ((MS_TRT.out/n.game.play.s)^2/df.trt) + ((MS_ERR.out/n.game.play.s)^2/df.e)
df.a <- df.a1 / df.a2
sigma2a_L <- df.a * s2a / qchisq(.975, df.a)
sigma2a_U <- df.a * s2a / qchisq(.025, df.a)
sum((sigma2a_L <= sigma2a) & (sigma2a_U >= sigma2a)) / n.sample   # Empirical coverage rate

hist(s2a, breaks=80)
abline(v=sigma2a, lwd=4, col="aquamarine4")

## Inference for the Intraclass Correlation
rho_I_hat <- s2a / (s2a + s2e)
summary(rho_I_hat)
var(rho_I_hat)
hist(rho_I_hat, breaks=80)
abline(v=rho_I, lwd=4, col="blue")

F_L <- qf(.025, df.trt, df.e)
F_U <- qf(.975, df.trt, df.e)
F_obs <- MS_TRT.out / MS_ERR.out
rho_L <- 1/(n.game.play.s*(1/(F_obs/F_U - 1)) + 1)
rho_U <- 1/(n.game.play.s*(1/(F_obs/F_L - 1)) + 1)
sum((rho_L <= rho_I) & (rho_U >= rho_I)) / n.sample   # Empirical coverage rate

## F-test of H0: sigma2a = 0 at alpha = 0.05 (F_obs computed above)
F.05 <- qf(.95, df.trt, df.e)
sum(F_obs >= F.05) / n.sample        # Empirical rejection rate

## Dividing each mean square by its expectation gives a statistic with an
## (approximate) F(df.trt, df.e) sampling distribution
F.theor <- (MS_TRT.out / (n.game.play.s*sigma2a + sigma2e)) / (MS_ERR.out / sigma2e)

### Histogram of F.theor
### Multiplies the F density by binwidth*n.sample = 0.05*10000
hist(F.theor, breaks=seq(0,6,0.05), main="Sampling Distribution of F")
lines(seq(0,6,0.05), 0.05*n.sample*df(seq(0,6,0.05), df.trt, df.e), lwd=4, col="red")
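### Optional check (a sketch, not part of the original analysis): under the
### normal-theory random effects model, F_obs = MS_TRT/MS_ERR is distributed as
### ((n.game.play.s*sigma2a + sigma2e)/sigma2e) times an F(df.trt, df.e) random
### variable -- the same scaling used to construct F.theor above. The implied
### rejection rate of H0: sigma2a = 0 at alpha = 0.05 is then
### P( F(df.trt, df.e) >= F.05 * sigma2e/(n.game.play.s*sigma2a + sigma2e) ),
### which can be compared with the empirical rate sum(F_obs >= F.05)/n.sample.
### Because the sampling here is from a finite, non-normal population, this
### theoretical rate is only an approximation.
1 - pf(F.05 * sigma2e / (n.game.play.s*sigma2a + sigma2e), df.trt, df.e)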