Part 1 - Selecting form of model

bolly1 <- read.csv(
    "http://www.stat.ufl.edu/~winner/data/bollywood_boxoffice2.csv")
attach(bolly1); names(bolly1)
## [1] "Movie"  "Gross"  "Budget"
head(bolly1)
##              Movie  Gross Budget
## 1       1920London  13.47   12.0
## 2         2 States 101.61   36.0
## 3 24(Tamil,Telugu)  55.00   75.0
## 4       Aashiqui 2  78.42   10.5
## 5 AeDilHainMushkil 107.25   92.0
## 6       Agentleman  17.15   60.0
tail(bolly1)
##                       Movie  Gross Budget
## 183                   Wazir  35.18   45.0
## 184             WelcomeBack  96.88  100.0
## 185                Yaariyan  31.04   19.0
## 186       Yeh Hain Bakrapur   0.97    4.5
## 187 Yeh Jawani Hain Deewani 185.83   50.0
## 188             Youngistaan   6.76   27.0
## Experimenting with transformations of Y and X


Y1 <- Gross/100; Y2 <- log(Gross)
X1 <- Budget/100; X2 <- log(Budget)

par(mfrow=c(2,2))
plot(Y1 ~ X1, main="Gross vs Budget")
abline(lm(Y1 ~ X1))
plot(Y2 ~ X1, main="ln(Gross) vs Budget")
abline(lm(Y2 ~ X1))
plot(Y1 ~ X2, main="Gross vs ln(Budget)")
abline(lm(Y1 ~ X2))
plot(Y2 ~ X2, main="ln(Gross) vs ln(Budget)")
abline(lm(Y2 ~ X2))

par(mfrow=c(1,1))

Part 2 - Ordinary Least Squares (log-log model)

CI for mean and PI for individual movie with Budget = 100

bolly.mod1 <- lm(Y1 ~ X1)
summary(bolly.mod1)
## 
## Call:
## lm(formula = Y1 ~ X1)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -1.4429 -0.2170 -0.0610  0.0677  3.9708 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -0.09572    0.06066  -1.578    0.116    
## X1           1.41548    0.10777  13.134   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.5019 on 186 degrees of freedom
## Multiple R-squared:  0.4812, Adjusted R-squared:  0.4784 
## F-statistic: 172.5 on 1 and 186 DF,  p-value: < 2.2e-16
confint(bolly.mod1)
##                  2.5 %     97.5 %
## (Intercept) -0.2153944 0.02394531
## X1           1.2028710 1.62808449
(ci.mean <- predict(bolly.mod1, list(X1=100), int="c"))
##        fit      lwr      upr
## 1 141.4521 120.2867 162.6174
(pi.indiv <- predict(bolly.mod1, list(X1=100), int="p"))
##        fit      lwr      upr
## 1 141.4521 120.2635 162.6406

Part 3 - Matrix form

n <- length(Y2)
X.mat <- cbind(rep(1, n), X1)
XpX <- t(X.mat) %*% X.mat
XpY <- t(X.mat) %*% Y1
betahat <- solve(XpX) %*% XpY

J_n <- matrix(rep(1/n,n^2), ncol=n)
I <- diag(n)
P <- X.mat %*% solve(XpX) %*% t(X.mat)

SSE <- t(Y1) %*% (I-P) %*% Y1
s2 <- SSE/(n-2)

V.betahat <- s2[1,1] * solve(XpX)
SE.betahat <- sqrt(diag(V.betahat))
t.betahat <- betahat/SE.betahat
p.betahat <- 2*(1-pt(abs(t.betahat),n-2))

ols.out <- cbind(betahat, SE.betahat, t.betahat, p.betahat)
colnames(ols.out) <- c("Estimate", "Std Err", "t", "P(>|t|)")
rownames(ols.out) <- c("Intercept", "Budget")
round(ols.out, 5)
##           Estimate Std Err        t P(>|t|)
## Intercept -0.09572 0.06066 -1.57805 0.11625
## Budget     1.41548 0.10777 13.13437 0.00000

Part 4 - Robust (to heteroskedasticity) Standard Errors (Chapter 11, Slide 8)

library(lmtest)
## Warning: package 'lmtest' was built under R version 3.6.3
## Loading required package: zoo
## 
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
bptest(bolly.mod1, studentize=FALSE)
## 
##  Breusch-Pagan test
## 
## data:  bolly.mod1
## BP = 114.37, df = 1, p-value < 2.2e-16
E2 <- diag(resid(bolly.mod1)^2)

V.betahat.W <- solve(XpX) %*% t(X.mat) %*% E2 %*% X.mat %*% solve(XpX)
SE.betahat.W <- sqrt(diag(V.betahat.W))
t.betahat.W <- betahat/SE.betahat.W
p.betahat.W <- 2*(1-pt(abs(t.betahat.W),n-2))

beta.out.W <- cbind(betahat, SE.betahat.W, t.betahat.W, p.betahat.W)
colnames(beta.out.W) <- c("Estimate", "Robust Std Err", "t", "P(>|t|)")
rownames(beta.out.W) <- c("Intercept", "Budget")
round(beta.out.W, 5)
##           Estimate Robust Std Err        t P(>|t|)
## Intercept -0.09572        0.04757 -2.01221 0.04564
## Budget     1.41548        0.15439  9.16799 0.00000
# install.packages("sandwich")
library(sandwich)
library(lmtest)

coeftest(bolly.mod1, vcov=vcovHC(bolly.mod1, type = "HC0"))
## 
## t test of coefficients:
## 
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -0.095725   0.047572 -2.0122  0.04564 *  
## X1           1.415478   0.154394  9.1680  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## type=:
## const => E2 = diag{s^2}  (OLS)
## HC0 => E2 = diag{e_i^2}
## HC1 => E2 = (n/(n-p))diag{e_i^2}
## HC2 => E2 = diag(e_i^2/(1-h_ii))
## HC3 => E2 = diag(e_i^2/(1-h_ii)^2)
## HC4 => E2 = diag(e_i^2/(1-h_ii)^delta_i)   with delta_i = min(4, h_ii/hbar)   with hbar = p/n

Part 5 - Estimated weighted least squares (Chapter 11 - Slide 7)

e.ols <- resid(bolly.mod1)
yhat.ols <- predict(bolly.mod1)

par(mfrow=c(1,3))
plot(sqrt(abs(e.ols)) ~ yhat.ols, pch=16, cex=0.65)
abline(lm(sqrt(abs(e.ols)) ~ yhat.ols), col="red4", lwd=2)
plot(abs(e.ols)~yhat.ols, pch=16, cex=0.65)
abline(lm(abs(e.ols)~yhat.ols), col="green4", lwd=2)
plot(e.ols^2 ~ yhat.ols, pch=16, cex=0.65)
abline(lm(e.ols^2 ~ yhat.ols), col="blue4", cex=0.65)

par(mfrow=c(1,1))

cor(sqrt(abs(e.ols)) , yhat.ols)
## [1] 0.5338977
cor(abs(e.ols) , yhat.ols)
## [1] 0.4341081
cor(e.ols^2 , yhat.ols)
## [1] 0.2217087
# Fit regression of sqrt(|e|) on Y-hat
X <- X.mat; Y <- Y1
n <- length(Y); p <- ncol(X)
e.reg.ols <- lm(sqrt(abs(e.ols)) ~ yhat.ols)
summary(e.reg.ols)
## 
## Call:
## lm(formula = sqrt(abs(e.ols)) ~ yhat.ols)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.74810 -0.12421 -0.02744  0.08482  1.38750 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  0.28504    0.02592  10.997  < 2e-16 ***
## yhat.ols     0.30884    0.03586   8.611 3.03e-15 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.2364 on 186 degrees of freedom
## Multiple R-squared:  0.285,  Adjusted R-squared:  0.2812 
## F-statistic: 74.16 on 1 and 186 DF,  p-value: 3.029e-15
s.ols <- predict(e.reg.ols)          # Predicted sqrt(|e|)
w.ols <- 1/s.ols^4                   # WLS weights= 1/s^2 = 1/sqrt(|e|)^4
b.ols <- solve(t(X) %*% X) %*% t(X) %*% Y   # OLS estimator

## Begin iterations to obtain WLS estimator b_w
b.old <- b.ols                       # Start iterations with OLS estimator
wm.old <- as.matrix(diag(w.ols))     # W = diagonal matrix with w_i=1/s_i^2
b.diff <- 100                        # Set high starting difference 
num.iter <- 0                        # Counter for number of iterations

# Keep iterating until (b_new-b_old)'(b_new-b_old) < .00001
while (b.diff > 0.00001) {
   num.iter <- num.iter+1           # Increment number of iterations
   # b_new = (X'WX)^(-1)X'WY
   b.new <- solve(t(X) %*% wm.old %*% X) %*% t(X) %*% wm.old %*% Y
   yhat.new <- X %*% b.new        # Yhat_new = Xb_new
   abs.e.new <- abs(Y - yhat.new) # new |e| = Y-Yhat_new
   # Create new weight matrix from regression of sqrt(|e_new|) on yhat_new
   wm.new <- as.matrix(diag(1/predict(lm(sqrt(abs.e.new)~yhat.new))^4))
   b.diff <- sum((b.new-b.old)^2) # sum of squared differences of b_new,b_old
   b.old <- b.new                 # b.old is assigned b.new
   wm.old <- wm.new        # Old weight matrix is assigned new weight matrix
}

# End of loop
num.iter       # Number of iterations needed
## [1] 3
# Apply wm.new to get b.new (probably not necessary)
b.new <- solve(t(X) %*% wm.new %*% X) %*% t(X) %*% wm.new %*% Y
b.wls <- b.new          # Obtain b.wls as result from iterative process
wm.wls <- wm.new        # Obtain WLS matrix from iterative process

## MSE_wls = (Y-Xb-w)'W(Y-Xb_w)
mse.w <- (t(Y-X%*%b.wls) %*% wm.wls %*% (Y-X%*%b.wls))/(n-p)

# s2{b_w} = MSE*(X'WX)^(-1)
s2.b.wls <- mse.w[1,1]*solve(t(X) %*% wm.wls %*% X)
s.b.wls <- sqrt(diag(s2.b.wls))          # s{b_w} = sqrt(diag(s2{b_w}))
t.b.wls <- b.wls/s.b.wls                 # t = b_w/s{b}
p.b.wls <- 2*(1-pt(abs(t.b.wls), n-p))   # P-value

wls.out <- cbind(b.wls, s.b.wls, t.b.wls, p.b.wls)
colnames(wls.out) <- c("b_wls", "Std. Error", "t*", "P(>|t*|)")
rownames(wls.out) <- c("Intercept", "Budget")

round(ols.out, 4)
##           Estimate Std Err       t P(>|t|)
## Intercept  -0.0957  0.0607 -1.5781  0.1163
## Budget      1.4155  0.1078 13.1344  0.0000
round(wls.out, 4)
##             b_wls Std. Error      t* P(>|t*|)
## Intercept -0.0407     0.0323 -1.2617   0.2086
## Budget     1.2465     0.1162 10.7261   0.0000

Part 6 - Estimaes based on gls function in nlme package

library(nlme)
mod1.gls <- gls(Y1 ~ X1, weights=varPower(form = ~fitted(.)), method="ML")
# glsControl(maxIter=200)
summary(mod1.gls)
## Generalized least squares fit by maximum likelihood
##   Model: Y1 ~ X1 
##   Data: NULL 
##        AIC      BIC    logLik
##   195.3274 208.2731 -93.66368
## 
## Variance function:
##  Structure: Power of variance covariate
##  Formula: ~fitted(.) 
##  Parameter estimates:
##     power 
## 0.4309212 
## 
## Coefficients:
##                 Value  Std.Error   t-value p-value
## (Intercept) -0.021250 0.03506528 -0.606014  0.5452
## X1           1.258731 0.10526443 11.957797  0.0000
## 
##  Correlation: 
##    (Intr)
## X1 -0.71 
## 
## Standardized residuals:
##        Min         Q1        Med         Q3        Max 
## -1.8161497 -0.5490651 -0.2820182  0.1484272  6.7217329 
## 
## Residual standard error: 0.6020388 
## Degrees of freedom: 188 total; 186 residual