### -------------------------------------------------------------------------###
### NMSA407 - Linear Regression ###
### Winter Term 2021/2022 ###
### ###
### EXERCISE #9 Checking model assumptions ###
### -------------------------------------------------------------------------###
### -------------------------------------------------------------------------###
### Block 1: TEACHING MATERIAL ###
### -------------------------------------------------------------------------###
library(mffSM)
### Housing values in the suburbs of Boston.
###
### Data contain information on 506 municipalities in the metropolitan area of
### Boston and Massachusetts. Data were collected in 1978.
###
### 'zn' proportion of residential land zones for lots over 25,000 sq.ft.
### 'indus' proportion of non-retail business acres per municipality.
### 'chas' Charles River dummy variable
### (= 1 if tract bounds river; 0 otherwise).
### 'nox' nitrogen oxides concentration (parts per 10 million).
### 'rm' average number of rooms per dwelling.
### 'age' proportion of owner-occupied units built prior to 1940.
### 'rad' index of accessibility to radial highways.
### 'tax' full-value property-tax rate per $10,000.
### 'ptratio' pupil-teacher ratio by town.
### 'black' the proportion of blacks by town.
### 'lstat' percentage of population with low income
### 'medv' median value of owner-occupied homes (in thousands of $).
### 'crime' categorized number of crimes per 1000 thousand citizens
### categorized as follows:
### 1 .... almost no crime (<= 0.1)
### 2 .... some crime (0.1-1)
### 3 .... high crime rate (> 1)
### 'ldis' logarithm (to the power of 2) of the weighted mean of distances
### to five Boston employment centres.
Data <- read.csv("http://www.karlin.mff.cuni.cz/~maciak/NMSA407/Boston2.csv",
header = TRUE, sep = ",", na.strings = "")
head(Data)
### The main task is to find a model that evaluates influence the of
### nitrogen oxides concentration [nox] on the median value of owner-occupied
### home [medv] (or its transformation), while possibly taking into account
### also some other variables, specifically the following:
###
### * crime rate [crime]
### * logarithm (base 2) of the distance from the Boston centre [ldis]
### * pupil-teacher ratio [ptratio]
### * proportion of non-retail business acres per municipality [indus]
### * proportion of owner-occupied units built prior to 1940 [age]
### * the proportion of blacks by town [black]
Data <- transform(Data, fcrime = factor(crime, levels = 1:3,
labels = c("None", "Some", "High")))
Data <- subset(Data, select = c("medv", "nox", "fcrime", "ldis", "ptratio",
"indus", "age", "black"))
head(Data)
summary(Data)
with(Data, summary(medv))
hist(Data[, "medv"], prob = TRUE, col = "olivedrab",
xlab = "Median value of the owners-occupied houses [1000 USD]")
subset(Data, medv == 50)
table(Data$medv)
### Is a value of 50 a real value or only the lower bound of the true value
### ("right-censoring")? Who knows...
### Scatterplot matrix
scatterplotMatrix(~medv + nox + ldis + indus + age + ptratio + black,
regLine = FALSE, smooth = FALSE,
diagonal = TRUE, data = Data, pch = 16, cex=0.5)
### Dependence of medv on nox while taking crime into account
BGC3 <- rainbow_hcl(3)
COL3 <- c("red3", "darkgreen", "blue3")
PCH3 <- c(21, 22, 23)
names(BGC3) <- names(COL3) <- names(PCH3) <- levels(Data[, "fcrime"])
plot(medv ~ nox, pch = PCH3[fcrime], col = COL3[fcrime], bg = BGC3[fcrime],
data = Data, xlab = "Nitrogen oxides concentration (parts per 10 million)",
ylab = "Median value of the owners-occupied houses [1000 USD]")
legend(0.75, 50, title = "Crime level", legend = levels(Data[, "fcrime"]),
pch = PCH3, pt.bg = BGC3, col = COL3)
### As a starting model (denote it m1), we consider a linear model where we
### include all regressors (nox, crime, ldis, ptratio, indus, age, black),
### all two-way interactions between nox and the remaining covariates
### (that is, only interactions nox:z, where z are covariates other than nox)
### and additionally a quadratic term of nox (to account for possible
### non-linear effect of nox on medv).
m1 <- lm(medv ~ nox*(fcrime + ldis + ptratio + indus + age + black) +
I(nox^2), data = Data)
summary(m1)
plotLM(m1)
### Shapiro-Wilk test of normality
shapiro.test(residuals(m1))
### or using standardized residuals (function stdres from library MASS)
shapiro.test(MASS::stdres(m1))
### Breusch-Pagan test - assuming normality of errors
### See Section 9.3.2 of the lecture notes
library("lmtest") # for function bptest
ncvTest(m1) # ?ncvTest from library car
ncvTest(m1,~fitted(m1))
bptest(m1, ~fitted(m1), studentize = FALSE); # ?bptest from library lmtest
### ncvTest :
### var.formula: "a one-sided formula for the error variance;
### if omitted, the error variance depends on the fitted values."
### bptest :
### var.formula: "by default the same explanatory variables are taken as in
### the main regression model."
### Koenker's studentized variant of the Breusch-Pagan test -
### normality of errors not assumed
bptest(m1, ~fitted(m1))
### Up to some minor (and asymptotically negligible) differences
### this test can be also performed by considering the overall test
### of the following 'linear model'
anova(lm(I(residuals(m1)^2)~fitted(m1)))
### One can also try to consider all the covariates
bptest(m1, studentize=FALSE)
### Studentized version
bptest(m1)
### Again, up to some minor (and asymptotically negligible) differences
### this test can be also performed by considering the overall test
### of the following 'linear model
anova(lm(I(residuals(m1)^2)~model.matrix(m1)))
###
### Box-Cox transformation
###
library(MASS)
boxcox(m1)
boxcox(m1, lambda = seq(-0.1, 0.5, by = 0.05))
### Fit the same model as above, however, with the transformed response
### as suggested by the Box-Cox transformation
### (denote the transformed response by lmedv).
Data <- transform(Data, lmedv = log(medv));
lm1 <- lm(lmedv ~ nox*(fcrime + ldis + ptratio + indus + age + black)
+ I(nox^2), data = Data)
summary(lm1)
plotLM(lm1)
# compare with the previous model
summary(m1)$r.
plotLM(m1)
### Shapiro-Wilk test of normality
shapiro.test(residuals(lm1))
### Koenker's studentized variance of Breusch-Pagan test
### - normality of errors not assumed
bptest(lm1,~fitted(lm1))
### compare with
bptest(m1,~fitted(m1))
### The model still does not seem to be homoscedastic, and it is not clear how
### to make it "more" homoscedastic. The inference should ideally take this into
### consideration.
###
### During the following lectures the so called sandwich estimator of the
### variance matrix of the regression coefficient will be explained. That
### estimator is a consistent estimator of the variance matrix even under
### heteroscedasticity.
### In what follows we IGNORE the problem of HETEROSCEDASTICITY.
### QUESTIONS:
### Can we simplify the model?
drop1(lm1, test = "F")
### Leaving out not significant interactions we arrive at model
lm2 <- lm(lmedv ~ nox + I(nox^2) + fcrime + ldis + ptratio + indus + age +
black + nox:(ldis + ptratio + indus), data = Data)
anova(lm2, lm1)
### What about further simplification of a model?
drop1(lm2, test = "F")
### Model that leaves out the quadratic term of nox and fcrime
lm3 <- lm(lmedv ~ nox + ldis + ptratio + indus + age + black +
nox:(ldis + ptratio + indus), data = Data)
anova(lm3, lm2)
### Comparison with the big model
anova(lm3,lm1)
### Thus a candidate on the final model would be lm3
summary(lm3)
###
### MULTICOLLINEARITY
###
### When we are interested in the effect of particular regressors
### (i.e. 'nox' in our situation) then we should also explore the possible
### problem of multicollinearity.
###
### In particular we are interested in how are the other variables related
### with 'nox'. The reason is that while we want to have 'nox', we do not want
### to have in the model variables that are highly correlated with 'nox' but do
### not contribute in terms of the R-squared value, for instance.
### Correlation coefficients among regressors
(R = round(cor(Data[, c("nox", "ldis", "ptratio", "indus", "age", "black")]),2))
### Mutually high correlations: nox, ldis, indus, age
R[c(1,2,4,5),c(1,2,4,5)]
COL <- "red3"
BG <- "orange"
PCH <- 23
par(mfrow = c(2, 3))
plot(nox ~ ldis, data = Data, pch = PCH, col = COL, bg = BG,
main = paste("r =", R["nox", "ldis"]))
plot(nox ~ indus, data = Data, pch = PCH, col = COL, bg = BG,
main = paste("r =", R["nox", "indus"]))
plot(nox ~ age, data = Data, pch = PCH, col = COL, bg = BG,
main = paste("r =", R["nox", "age"]))
plot(ldis ~ indus, data = Data, pch = PCH, col = COL, bg = BG,
main = paste("r =", R["ldis", "indus"]))
plot(ldis ~ age, data = Data, pch = PCH, col = COL, bg = BG,
main = paste("r =", R["ldis", "age"]))
plot(indus ~ age, data = Data, pch = PCH, col = COL, bg = BG,
main = paste("r =", R["indus", "age"]))
par(mfrow = c(1, 1))
### Squared coefficient of multiple correlation between nox and (ldis,indus,age)
### Where is it?
summary(mnox<-lm(nox ~ ldis + indus + age, data = Data))
### Multiple correlation of Y and X_1, ..., X_d:
### Correlation of Y and the best linear prediction of Y using X_1, ..., X_d
sqrt(summary(mnox)$r.squared)
cor(Data$nox, fitted(mnox))
### How much does the multiple correlation decrease if we remove ldis?
### (visually the closest linear dependence with nox)?
mnox2 <- lm(nox ~ indus + age, data = Data)
cor(mnox2[["model"]][, "nox"], fitted(mnox2))
### Relationship of the quantitative regressors with crime
par(mfrow = c(2, 3))
plot(nox ~ fcrime, data = Data, col = BGC3)
plot(ldis ~ fcrime, data = Data, col = BGC3)
plot(ptratio ~ fcrime, data = Data, col = BGC3)
plot(indus ~ fcrime, data = Data, col = BGC3)
plot(age ~ fcrime, data = Data, col = BGC3)
plot(black ~ fcrime, data = Data, col = BGC3)
par(mfrow = c(1, 1))
### --> also mostly quite strong dependence
### --> another potential cause of multicollinearity
### Initial model (to treat multicollinearity first)
m1 <- lm(lmedv ~ nox + ldis + indus + age + ptratio + black + fcrime,
data = Data)
summary(m1)
plotLM(m1)
### (Generalized) variance inflation factors (function from package car)
vif(m1)
### For a numerical covariate GVIF coincides with VIF and it is calculated as
### 1/(1-R_j^2), where R_j is the coefficient of multiple correlation between
### the j-th covariate and the remaining covariates. For instance:
Rj2 <- summary(mj<-lm(nox~ldis + indus + age + ptratio + black + fcrime,
data = Data))[["r.squared"]]
### The variance inflation factor for variable nox in m1 is defined as
1/(1-Rj2)
sqrt(1/(1-Rj2))
###
### For a categorical variable fcrime, GVIF is calculated as
###
R <- cor(model.matrix(m1)[,-1]) # correlation matrix of regressors in the model
# correlation submatrix corresponding to regressors generated by 'fcrime'
R11 <- R[7:8,7:8]
# correlation submatrix corresponding to the remaining regressors
R22 <- R[1:6,1:6]
det(R11)*det(R22)/det(R)
### Again,
vif(m1)
### Not that bad. But still, the worst inflation for the coefficient on nox.
### (also the most interesting one for us)
### How about if we remove each of ldis, indus, age, fcrime
### (one at a time)
m2 <- list()
m2[["ldis"]] = lm(lmedv ~ nox + indus + age + ptratio + black + fcrime, data = Data)
m2[["indus"]] = lm(lmedv ~ nox + ldis + age + ptratio + black + fcrime, data = Data)
m2[["age"]] = lm(lmedv ~ nox + ldis + indus + ptratio + black + fcrime, data = Data)
m2[["fcrime"]]= lm(lmedv ~ nox + ldis + indus + age + ptratio + black, data = Data)
summary(m1)[["r.squared"]]
sapply(lapply(m2, summary), "[[", "r.squared")
vif(m1)
lapply(m2, vif)
### Removal of fcrime practically does not change R^2
### and slightly (by about one unit) decreases VIF for nox.
### How about if we additionally remove each of ldis, indus, age
### or two of them or all of them?
m3 <- list()
m3[["ldis"]] = lm(lmedv ~ nox + indus + age + ptratio + black, data = Data)
m3[["indus"]] = lm(lmedv ~ nox + ldis + age + ptratio + black, data = Data)
m3[["age"]] = lm(lmedv ~ nox + ldis + indus + ptratio + black, data = Data)
m3[["ldis-indus"]] = lm(lmedv ~ nox + age + ptratio + black, data = Data)
m3[["ldis-age"]] = lm(lmedv ~ nox + indus + ptratio + black, data = Data)
m3[["indus-age"]] = lm(lmedv ~ nox + ldis + ptratio + black, data = Data)
m3[["ldis-indus-age"]] = lm(lmedv ~ nox + ptratio + black, data = Data)
summary(m1)[["r.squared"]]
sapply(lapply(m3, summary), "[[", "r.squared")
vif(m1)
lapply(m3, vif)
### Suggestion:
### Do not consider also indus.
### The reason is that after its removal, R^2 still does not drop too much
### (more or less in the same magnitude as after removal of age) but the VIF of
### nox after removal of indus is lower than the VIF of nox after removal of
### age
### Let us consider now a more complex model including only the variables
### nox, ldis, age, ptratio, black
lm4 <- lm(lmedv ~ nox + I(nox^2) + nox*(ldis + age + ptratio + black),
data = Data)
drop1(lm4, test = "F")
### Is it possible to remove all three interactions
### which are not significant at .20
lm5 <- lm(lmedv ~ nox + I(nox^2) + ldis + age + ptratio + black + nox:ldis,
data = Data)
summary(lm5)
anova(lm5, lm4)
### More complex model lm4 is far from being
### significantly better than model lm5.
drop1(lm5, test = "F")
### All terms that could be removed
### are now quite highly significant.
### Final model?
mFin <- lm5 # Model with an initial analysis of multicollinearity
mFin0 <- lm3 # Model without the initial analysis of multicollinearity
summary(mFin)
### How would you describe dependence
### of medv on nox given this model?
plotLM(mFin)
### Heteroscedasticity is likely a problem.
### Inference on beta's (or their linear combinations)
### should probably be corrected.
### --> time for SANDWICH ESTIMATOR of variance (coming soon)
### Comparison of the two final models
summary(mFin) # Model with initial analysis of multicollinearity
summary(mFin0) # Model without initial analysis of multicollinearity
### -------------------------------------------------------------------------###
### Block 2: INDIVIDUAL WORK ###
### -------------------------------------------------------------------------###
### 1. Coefficient of multiple correlation
### Squared coefficient of multiple correlation between nox and (ldis,indus,age)
### Where is it?
summary(m<-lm(nox ~ ldis + indus + age, data = Data))
### Multiple correlation of Y and X_1, ..., X_d:
### Correlation of Y and the best linear prediction of Y using X_1, ..., X_d
### We already saw in session 2: squared correlation of Y and X is the R^2
### in model Y~X
m0<-lm(nox~ldis, data = Data)
cor(Data$ldis, Data$nox)^2
summary(m0)$r.
### Coefficient of multiple correlation between nox and (ldis, indus, age)
### (up to a sign) and including the sign
mnox <- lm(nox ~ ldis + indus + age, data = Data)
sqrt(summary(mnox)[["r.squared"]])
cor(mnox[["model"]][, "nox"], fitted(mnox))
### Multiple correlation as computed from correlation matrices
### https://en.wikipedia.org/wiki/Multiple_correlation
rxy = cor(mnox[["model"]][,-1], Data$nox)
rxx = cor(mnox[["model"]][,-1])
sqrt(t(rxy)%*%solve(rxx)%*%rxy)
### 2) A possible explanation of why nox^2 was (quite highly) non-significant
### in model mFin0 in the first part of the exercise class, but significant
### in model mFin when a preliminary multicollinearity treatment was included
### Some visualization
summary(Data)
lgr <- 200
xindus <- c(5, 10, 18) ## approx quartiles and median of indus
xnox <- seq(0.35, 0.90, length = lgr)
xdata <- data.frame(nox = rep(xnox, 3), indus = rep(xindus, each = lgr),
ptratio = 19, ldis = 1.17, age = 75, black = 0.01)
head(xdata)
pFin <- predict(mFin, newdata = xdata)
pFin0 <- predict(mFin0, newdata = xdata)
Data <- transform(Data, findus = cut(indus, breaks = c(0.1, 6.2, 18.1, 28)))
summary(Data[, "findus"])
COL33 <- diverge_hcl(3); COL33[2] <- "grey60"
BGC33 <- c("lightblue", "grey80", "red")
PCH33 <- c(21, 22, 23)
names(BGC33) <- names(COL33) <- names(PCH33) <- levels(Data[, "findus"])
par(mfrow = c(1, 2))
plot(lmedv ~ nox, pch = PCH33[findus], col = COL33[findus], bg = BGC33[findus],
data = Data, xlab = "NOx (parts per 10 million)",
ylab = "Log median of price [log(1000 USD)]",
main="mFin (multicoll. considered)")
lines(xnox, pFin[1:lgr], lwd = 2)
legend("bottomleft", legend = names(COL33), title = "indus", col = COL33,
pt.bg = BGC33, pch = PCH33)
#
plot(lmedv ~ nox, pch = PCH33[findus], col = COL33[findus], bg = BGC33[findus],
data = Data, xlab = "NOx (parts per 10 million)",
ylab = "Log median of price [log(1000 USD)]",
main="mFin (multicoll. not considered)")
lines(xnox, pFin0[1:lgr], col = COL33[1], lwd = 2)
lines(xnox, pFin0[(lgr+1):(2*lgr)], col = COL33[2], lwd = 2)
lines(xnox, pFin0[(2*lgr+1):(3*lgr)], col = COL33[3], lwd = 2)
par(mfrow = c(1, 1))
layout(matrix(c(1,1,0,4, 2,2,3,3), nrow = 2, byrow = TRUE))
for (i in 1:3){
plot(lmedv ~ nox, pch = PCH33[findus], col = COL33[findus],
bg = BGC33[findus], data = subset(Data, findus == levels(findus)[i]),
xlab = "NOx (parts per 10 million)",
ylab = "Log median of price [log(1000 USD)]")
lines(xnox, pFin[((i-1)*lgr+1):(i*lgr)], col = COL33[i], lwd = 2)
lines(xnox, pFin0[((i-1)*lgr+1):(i*lgr)], col = COL33[i], lwd = 2, lty = 2)
}
plot.new()
legend(0.1, 1.0, legend = names(COL33), title = "indus", col = COL33,
pt.bg = BGC33, pch = PCH33)
legend(0.1, 0.4, legend = c("Fin", "Fin0"), title = "Model", lty = c(1, 2),
lwd = 2, col = "black")
par(mfrow = c(1, 1))
### -------------------------------------------------------------------------###
### Block 3: ADDITIONAL MATERIAL ###
### -------------------------------------------------------------------------###
### The Breusch-Pagan test of homoscedasticity: manual computation of the
### test statistics and p-values
ncvTest(m1,~fitted(m1))
bptest(m1,~fitted(m1), studentize = FALSE); # ?bptest from library lmtest
### Manual calculation
U <- residuals(m1);
U2 <- U^2
Yhat <- fitted(m1);
n <- length(U)
### Test statistic
(BP <- sum(U2 * (Yhat - mean(Yhat)))^2 /
(2 * (mean(U2))^2 * sum((Yhat-mean(Yhat))^2)))
### p-value
1-pchisq(BP, df=1)
bptest(m1, ~fitted(m1), studentize = FALSE)$p.value
### Koenker's studentized variant of the Breusch-Pagan test -
### normality of errors not assumed
bptest(m1, ~fitted(m1))
### Manual calculation
(BP.Koenker <- (sqrt(n)*cor(U2, Yhat))^2)
1-pchisq(BP.Koenker, df=1)
bptest(m1, ~fitted(m1))$p.value
### Up to some minor (and asymptotically negligible) differences
### this test can be also performed by considering the overall test
### of the following 'linear model'
anova(lm(U2~Yhat))
### One can also try to consider all the covariates
bptest(m1, studentize=FALSE)
### Manual calculation
Z <- model.matrix(m1)[,-1]
sigma2 <- mean(U^2)
Suz <- colMeans(U^2/sigma2*(Z - matrix(colMeans(Z), nrow(Z), ncol(Z), byrow=T)))
n <- nrow(Z);
### Test statistic
(stat<-n*t(Suz)%*%solve(2*var(Z)*(n-1)/n)%*%Suz)
pchisq(stat,df=ncol(Z),lower=FALSE)
bptest(m1, studentize=FALSE)$p.val
### Studentized version
bptest(m1)
### Manual calculation
kurt.estim <- mean((U^2-sigma2)^2)/sigma2^2
n*t(Suz)%*%solve(kurt.estim*var(Z)*(n-1)/n)%*%Suz
### Again, up to some minor (and asymptotically negligible) differences
### this test can be also performed by considering the overall test
### of the following 'linear model
anova(lm(I(U^2)~model.matrix(m1)))
###
### Variance Inflation Factor
###
m1 <- lm(lmedv ~ nox + ldis + indus + age + ptratio + black + fcrime,
data = Data)
vif(m1)
### The estimator of the variance of hat(beta_j) corresponding to nox
vcov(m1)[2,2]
# Alternatively, the formula for the estimate of var(hat(beta_j))
# is sometimes written in the following forms:
Y = Data$lmedv
Xj = Data$nox
Rj2 = summary(lm(nox~ldis + indus + age + ptratio + black + fcrime,
data = Data))[["r.squared"]]
#
var(Y)/var(Xj)*(1-summary(m1)$r.squared)/m1$df.residual*1/(1-Rj2)
summary(m1)$sigma^2/((length(Xj)-1)*var(Xj))*1/(1-Rj2)
# Can you explain why these expressions are equivalent?
### For a numerical covariate GVIF coincides with VIF and it is calculated as
### 1/(1-R_j^2), where R_j^2 is the multiple correlation coefficient between the
### j-th covariate and the remaining covariates. For instance:
Rj2 <- summary(mj<-lm(nox~ldis + indus + age + ptratio + black + fcrime,
data = Data))[["r.squared"]]
### The variance inflation factor for variable nox in m1 is defined as
1/(1-Rj2)
sqrt(1/(1-Rj2))
### Recall that VIF can be interpreted as the factor by which the estimated
### variance of hat(beta_j) is inflated as a consequence of linear dependence
### of the regressor X_j, and the remaining regressors.
### Estimator of var(hat(beta_j)) in the original model
vcov(m1)[2,2]
### Orthogonalization of the regressor X_j - We use the Gram-Schmidt
### orthogonalization process to obtain a transformed variables tilde(X_j)
### that is orthogonal to the remaining covariates, yet the variables
### X_1, ..., X_(j-1), tilde(X_j), X_(j+1), ..., X_k still span the same
### regression space
# 1) Project the column X_j (nox) onto the linear space spanned by the remaining
# regressors
m.ort <- lm(nox~ldis + indus + age + ptratio + black + fcrime, data = Data)
# 2) Orthogonalized X_j is now the vector of residuals in m.org
nox2.orig = residuals(m.ort)
# 3) Scale the orthogonalized X_j so that it has the same length as the original
# regressor X_j
nox2 = nox2.orig/sd(nox2.orig)*sd(Data$nox)
# 4) Run a model analogous to m1, with X_j replaced by its orthogonalized
# version tilde(X_j)
m1.o = lm(lmedv ~ nox2 + ldis + indus + age + ptratio + black + fcrime,
data = Data)
# Of course, models m1 and m1.o are still equivalent
all.equal(fitted(m1),fitted(m1.o))
# Which estimates of beta in model m1.o are different from those in model m1?
# 5) VIF as a ratio of estimated variances of the regression coefficients
vif(m1)
vcov(m1)[2,2]/vcov(m1.o)[2,2]
# Equivalently, there are several other expressions for VIF in this situation:
1/(1-var(fitted(m.ort))/var(Data$nox))
var(Data$nox)/var(nox2.orig)
# Yet another way to express VIF:
R = cor(model.matrix(m1)[,-1]) # correlation matrix of regressors in the model
R22 = R[-1,-1]
R12 = R[-1,1]
#
1/(1-R12%*%solve(R22)%*%R12)
det(R22)/det(R)
diag(solve(R))[1]
# Can you explain why all these numbers are equal
# (some of these may be hard to show, original proofs will be appreciated)
### VIF as a ratio of estimated variances in a regression with orthogonal
### predictors
X = model.matrix(m1) # model matrix in model m1
Q.orig = qr.Q(qr(X)) # full Gram-Schmidt orthogonalization of X
max(abs(Q.orig%*%qr.R(qr(X))-X))
round(t(Q.orig)%*%Q.orig,10) # matrix Q contains the orthogonalized
# columns of X, with the linear span of the
# first j columns of X and Q the same
### Scale the columns of Q to have the same length as the columns of X
Q = Q.orig
for(i in 1:ncol(Q)) Q[,i] = Q.orig[,i]*sd(X[,i])/sd(Q.orig[,i])
Q[,1] = 1 # manually input the intercept
all.equal(apply(Q,2,sd),unname(apply(X,2,sd)))
### Linear model with orthogonalized columns equivalent with m1
m1.o = lm(lmedv ~ Q[,-1], data=Data)
all.equal(fitted(m1),fitted(m1.o))
### VIF of the individual coefficients as a ratio of estimated variances
vif(m1)
for(i in 2:nrow(vcov(m1)))
cat(rownames(vcov(m1))[i],"...",vcov(m1)[i,i]/vcov(m1.o)[i,i],"\n")
### Can you explain how is it possible that even if all the regressors are
### orthogonalized, each ratio corresponds to VIF?
###
### For a categorical variable fcrime, GVIF is calculated as
###
vif(m1)
R <- cor(model.matrix(m1)[,-1]) # correlation matrix of regressors in the model
# correlation submatrix corresponding to regressors generated by 'fcrime'
R11 <- R[7:8,7:8]
# correlation submatrix corresponding to the remaining regressors
R22 <- R[1:6,1:6]
det(R11)*det(R22)/det(R)
### This quantity corresponds to the variance inflation in the case when the
### confidence ellipsoids for a whole group of regression coefficient are
### considered, and the increase in their volumes due to the collinearity of
### the corresponding regressors
### That should correspond to an expression like the following one:
### (volumes of ellipsoids are known to be proportional to the determinant of
### the matrix of their quadratic form)
(det(vcov(m1)[7:8,7:8])/det(vcov(m1.o)[7:8,7:8]))
### This does not appear to be the case. Can you explain why?