Part 1 - Selecting form of model
bolly1 <- read.csv(
"http://www.stat.ufl.edu/~winner/data/bollywood_boxoffice2.csv")
attach(bolly1); names(bolly1)
## [1] "Movie" "Gross" "Budget"
head(bolly1)
## Movie Gross Budget
## 1 1920London 13.47 12.0
## 2 2 States 101.61 36.0
## 3 24(Tamil,Telugu) 55.00 75.0
## 4 Aashiqui 2 78.42 10.5
## 5 AeDilHainMushkil 107.25 92.0
## 6 Agentleman 17.15 60.0
tail(bolly1)
## Movie Gross Budget
## 183 Wazir 35.18 45.0
## 184 WelcomeBack 96.88 100.0
## 185 Yaariyan 31.04 19.0
## 186 Yeh Hain Bakrapur 0.97 4.5
## 187 Yeh Jawani Hain Deewani 185.83 50.0
## 188 Youngistaan 6.76 27.0
## Experimenting with transformations of Y and X
Y1 <- Gross/100; Y2 <- log(Gross)
X1 <- Budget/100; X2 <- log(Budget)
par(mfrow=c(2,2))
plot(Y1 ~ X1, main="Gross vs Budget")
abline(lm(Y1 ~ X1))
plot(Y2 ~ X1, main="ln(Gross) vs Budget")
abline(lm(Y2 ~ X1))
plot(Y1 ~ X2, main="Gross vs ln(Budget)")
abline(lm(Y1 ~ X2))
plot(Y2 ~ X2, main="ln(Gross) vs ln(Budget)")
abline(lm(Y2 ~ X2))

par(mfrow=c(1,1))
Part 2 - Ordinary Least Squares (log-log model)
CI for mean and PI for individual movie with Budget = 100
bolly.mod1 <- lm(Y1 ~ X1)
summary(bolly.mod1)
##
## Call:
## lm(formula = Y1 ~ X1)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.4429 -0.2170 -0.0610 0.0677 3.9708
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.09572 0.06066 -1.578 0.116
## X1 1.41548 0.10777 13.134 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.5019 on 186 degrees of freedom
## Multiple R-squared: 0.4812, Adjusted R-squared: 0.4784
## F-statistic: 172.5 on 1 and 186 DF, p-value: < 2.2e-16
confint(bolly.mod1)
## 2.5 % 97.5 %
## (Intercept) -0.2153944 0.02394531
## X1 1.2028710 1.62808449
(ci.mean <- predict(bolly.mod1, list(X1=100), int="c"))
## fit lwr upr
## 1 141.4521 120.2867 162.6174
(pi.indiv <- predict(bolly.mod1, list(X1=100), int="p"))
## fit lwr upr
## 1 141.4521 120.2635 162.6406
Part 4 - Robust (to heteroskedasticity) Standard Errors (Chapter 11, Slide 8)
library(lmtest)
## Warning: package 'lmtest' was built under R version 3.6.3
## Loading required package: zoo
##
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
bptest(bolly.mod1, studentize=FALSE)
##
## Breusch-Pagan test
##
## data: bolly.mod1
## BP = 114.37, df = 1, p-value < 2.2e-16
E2 <- diag(resid(bolly.mod1)^2)
V.betahat.W <- solve(XpX) %*% t(X.mat) %*% E2 %*% X.mat %*% solve(XpX)
SE.betahat.W <- sqrt(diag(V.betahat.W))
t.betahat.W <- betahat/SE.betahat.W
p.betahat.W <- 2*(1-pt(abs(t.betahat.W),n-2))
beta.out.W <- cbind(betahat, SE.betahat.W, t.betahat.W, p.betahat.W)
colnames(beta.out.W) <- c("Estimate", "Robust Std Err", "t", "P(>|t|)")
rownames(beta.out.W) <- c("Intercept", "Budget")
round(beta.out.W, 5)
## Estimate Robust Std Err t P(>|t|)
## Intercept -0.09572 0.04757 -2.01221 0.04564
## Budget 1.41548 0.15439 9.16799 0.00000
# install.packages("sandwich")
library(sandwich)
library(lmtest)
coeftest(bolly.mod1, vcov=vcovHC(bolly.mod1, type = "HC0"))
##
## t test of coefficients:
##
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.095725 0.047572 -2.0122 0.04564 *
## X1 1.415478 0.154394 9.1680 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## type=:
## const => E2 = diag{s^2} (OLS)
## HC0 => E2 = diag{e_i^2}
## HC1 => E2 = (n/(n-p))diag{e_i^2}
## HC2 => E2 = diag(e_i^2/(1-h_ii))
## HC3 => E2 = diag(e_i^2/(1-h_ii)^2)
## HC4 => E2 = diag(e_i^2/(1-h_ii)^delta_i) with delta_i = min(4, h_ii/hbar) with hbar = p/n
Part 5 - Estimated weighted least squares (Chapter 11 - Slide 7)
e.ols <- resid(bolly.mod1)
yhat.ols <- predict(bolly.mod1)
par(mfrow=c(1,3))
plot(sqrt(abs(e.ols)) ~ yhat.ols, pch=16, cex=0.65)
abline(lm(sqrt(abs(e.ols)) ~ yhat.ols), col="red4", lwd=2)
plot(abs(e.ols)~yhat.ols, pch=16, cex=0.65)
abline(lm(abs(e.ols)~yhat.ols), col="green4", lwd=2)
plot(e.ols^2 ~ yhat.ols, pch=16, cex=0.65)
abline(lm(e.ols^2 ~ yhat.ols), col="blue4", cex=0.65)

par(mfrow=c(1,1))
cor(sqrt(abs(e.ols)) , yhat.ols)
## [1] 0.5338977
cor(abs(e.ols) , yhat.ols)
## [1] 0.4341081
cor(e.ols^2 , yhat.ols)
## [1] 0.2217087
# Fit regression of sqrt(|e|) on Y-hat
X <- X.mat; Y <- Y1
n <- length(Y); p <- ncol(X)
e.reg.ols <- lm(sqrt(abs(e.ols)) ~ yhat.ols)
summary(e.reg.ols)
##
## Call:
## lm(formula = sqrt(abs(e.ols)) ~ yhat.ols)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.74810 -0.12421 -0.02744 0.08482 1.38750
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.28504 0.02592 10.997 < 2e-16 ***
## yhat.ols 0.30884 0.03586 8.611 3.03e-15 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.2364 on 186 degrees of freedom
## Multiple R-squared: 0.285, Adjusted R-squared: 0.2812
## F-statistic: 74.16 on 1 and 186 DF, p-value: 3.029e-15
s.ols <- predict(e.reg.ols) # Predicted sqrt(|e|)
w.ols <- 1/s.ols^4 # WLS weights= 1/s^2 = 1/sqrt(|e|)^4
b.ols <- solve(t(X) %*% X) %*% t(X) %*% Y # OLS estimator
## Begin iterations to obtain WLS estimator b_w
b.old <- b.ols # Start iterations with OLS estimator
wm.old <- as.matrix(diag(w.ols)) # W = diagonal matrix with w_i=1/s_i^2
b.diff <- 100 # Set high starting difference
num.iter <- 0 # Counter for number of iterations
# Keep iterating until (b_new-b_old)'(b_new-b_old) < .00001
while (b.diff > 0.00001) {
num.iter <- num.iter+1 # Increment number of iterations
# b_new = (X'WX)^(-1)X'WY
b.new <- solve(t(X) %*% wm.old %*% X) %*% t(X) %*% wm.old %*% Y
yhat.new <- X %*% b.new # Yhat_new = Xb_new
abs.e.new <- abs(Y - yhat.new) # new |e| = Y-Yhat_new
# Create new weight matrix from regression of sqrt(|e_new|) on yhat_new
wm.new <- as.matrix(diag(1/predict(lm(sqrt(abs.e.new)~yhat.new))^4))
b.diff <- sum((b.new-b.old)^2) # sum of squared differences of b_new,b_old
b.old <- b.new # b.old is assigned b.new
wm.old <- wm.new # Old weight matrix is assigned new weight matrix
}
# End of loop
num.iter # Number of iterations needed
## [1] 3
# Apply wm.new to get b.new (probably not necessary)
b.new <- solve(t(X) %*% wm.new %*% X) %*% t(X) %*% wm.new %*% Y
b.wls <- b.new # Obtain b.wls as result from iterative process
wm.wls <- wm.new # Obtain WLS matrix from iterative process
## MSE_wls = (Y-Xb-w)'W(Y-Xb_w)
mse.w <- (t(Y-X%*%b.wls) %*% wm.wls %*% (Y-X%*%b.wls))/(n-p)
# s2{b_w} = MSE*(X'WX)^(-1)
s2.b.wls <- mse.w[1,1]*solve(t(X) %*% wm.wls %*% X)
s.b.wls <- sqrt(diag(s2.b.wls)) # s{b_w} = sqrt(diag(s2{b_w}))
t.b.wls <- b.wls/s.b.wls # t = b_w/s{b}
p.b.wls <- 2*(1-pt(abs(t.b.wls), n-p)) # P-value
wls.out <- cbind(b.wls, s.b.wls, t.b.wls, p.b.wls)
colnames(wls.out) <- c("b_wls", "Std. Error", "t*", "P(>|t*|)")
rownames(wls.out) <- c("Intercept", "Budget")
round(ols.out, 4)
## Estimate Std Err t P(>|t|)
## Intercept -0.0957 0.0607 -1.5781 0.1163
## Budget 1.4155 0.1078 13.1344 0.0000
round(wls.out, 4)
## b_wls Std. Error t* P(>|t*|)
## Intercept -0.0407 0.0323 -1.2617 0.2086
## Budget 1.2465 0.1162 10.7261 0.0000
Part 6 - Estimaes based on gls function in nlme package
library(nlme)
mod1.gls <- gls(Y1 ~ X1, weights=varPower(form = ~fitted(.)), method="ML")
# glsControl(maxIter=200)
summary(mod1.gls)
## Generalized least squares fit by maximum likelihood
## Model: Y1 ~ X1
## Data: NULL
## AIC BIC logLik
## 195.3274 208.2731 -93.66368
##
## Variance function:
## Structure: Power of variance covariate
## Formula: ~fitted(.)
## Parameter estimates:
## power
## 0.4309212
##
## Coefficients:
## Value Std.Error t-value p-value
## (Intercept) -0.021250 0.03506528 -0.606014 0.5452
## X1 1.258731 0.10526443 11.957797 0.0000
##
## Correlation:
## (Intr)
## X1 -0.71
##
## Standardized residuals:
## Min Q1 Med Q3 Max
## -1.8161497 -0.5490651 -0.2820182 0.1484272 6.7217329
##
## Residual standard error: 0.6020388
## Degrees of freedom: 188 total; 186 residual