### --------------------------------------------------------------------------------------###
### Winter Term 2017/2018 ###
### NMSA407 - Linear Regression ###
### ###
### EXERCISE #10 Generalized Least Squares, Breusch-Pagan test, Box-Cox transformation, ###
###              and VIF (Variance inflation factor)                                       ###
### ###
### --------------------------------------------------------------------------------------###
### Working directory
setwd("H:/nmsa407_LinRegr/")
### Load the needed packages
library("mffSM")
library("colorspace")  ## diverge_hcl, heat_hcl, rainbow_hcl
library("car")         ## scatterplotMatrix, ncvTest, vif
### 1. G E N E R A L I Z E D L E A S T S Q U A R E S
### Dataset Hlavy
### * Application of the Generalized Least Squares (GLS)
### * Piecewise polynomial model fitted using a linear model
### with constraints on continuity/smoothness
###
### Data on head sizes of fetuses (from ultrasound monitoring) in different periods of pregnancy.
### Each data value (variable y) is the average over n measurements.
###
### i: serial number
###
### t: week of pregnancy
###
### n: number of measurements used to calculate the average head size
### given in the y variable
###
### y: the average head size obtained from the n measurements
### performed in week t of pregnancy
data(Hlavy, package = "mffSM")
# ?Hlavy
### Exploratory analysis
head(Hlavy)
dim(Hlavy)
summary(Hlavy)
plot(y ~ t, data = Hlavy, pch = 23, col = "red3", bg = "orange")
### Basic scatterplot showing also sample size used to calculate each value of y
with(Hlavy, quantile(n, prob = seq(0, 1, by = 0.2)))
Hlavy <- transform(Hlavy, fn = cut(n, breaks = c(0, 10, 30, 50, 60, 90), labels = c("<=10", "11-30", "31-50", "51-60", ">60")))
with(Hlavy, summary(fn))
PAL <- diverge_hcl(length(levels(Hlavy[, "fn"])))
names(PAL) <- levels(Hlavy[, "fn"])
plot(y ~ t, data = Hlavy, pch = 23, col = "black", bg = PAL[fn])
legend(17, 9, legend = levels(Hlavy[, "fn"]), title = "n", fill = PAL)
plotData <- function(){
PAL <- rev(heat_hcl(length(levels(Hlavy[, "fn"])), c.=c(80, 30), l=c(30, 90), power=c(1/5, 1.3)))
names(PAL) <- levels(Hlavy[, "fn"])
plot(y ~ t, data = Hlavy, pch = 23, col = "black", bg = PAL[fn])
legend(17, 9, legend = levels(Hlavy[, "fn"]), title = "n", fill = PAL)
abline(v = 27, col = "lightblue", lty = 2)
}
plotData()
### MOTIVATION:
### Practitioner says that y ~ t relationship could be described by a piecewise quadratic
### function with t = 27 being a point where the quadratic relationship changes its shape.
###
### From the practical point of view, the fitted curve should be CONTINUOUS at t = 27
### and perhaps also SMOOTH (i.e., with at least a continuous first derivative),
### as it is biologically not defensible to claim that the speed (rate)
### of the growth jumps at t = 27.
Hlavy <- transform(Hlavy, t27 = t - 27)
### Grid to calculate the fitted regression function
tgrid <- seq(16, 42, length = 100)
pdata <- data.frame(t27 = tgrid - 27)
### Piecewise quadratic model which has a regression function
### CONTINUOUS at t = 27
### Generalized least squares (GLS) used to calculate the estimates.
summary(mCont <- lm(y ~ t27 + I(t27^2) + (t27 > 0):t27 + (t27 > 0):I(t27^2), weights = n, data = Hlavy))
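### Writing s = t - 27, the fitted regression function is
###   m(t) = beta0 + beta1*s + beta2*s^2 + beta3*s*1[s > 0] + beta4*s^2*1[s > 0],
### i.e., beta0 + beta1*s + beta2*s^2 for t <= 27 and
### beta0 + (beta1 + beta3)*s + (beta2 + beta4)*s^2 for t > 27.
### Both branches equal beta0 at s = 0, so continuity at t = 27 holds by construction;
### the one-sided first derivatives at t = 27 are beta1 and beta1 + beta3.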
### QUESTIONS:
### Can you identify both quadratic functions in the output?
### Suppose that m(t) denotes the regression function. Can you find a p-value of the test
###
### H0: the first derivative of m(t) is continuous (we will say that m(t) is SMOOTH)
###
### against the alternative
###
### H1: the first derivative of m(t) is not continuous (we will say that m(t) is CONTINUOUS)
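### A sketch of one way to obtain this p-value: under H0 the coefficient
### of the (t27 > 0):t27 term is zero, so its t-test in the summary output
### answers the question (the same hypothesis is tested below by
### anova(mSmooth, mCont) once mSmooth has been fitted).
coef(summary(mCont))   ## look at the row for the (t27 > 0):t27 interaction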
### Some naive calculation
W <- diag(Hlavy$n);
Y <- Hlavy$y;
X <- model.matrix(mCont)
### Estimated regression coefficients: betahat = (X' W X)^{-1} X' W Y
(betahat <- solve(t(X)%*%W%*%X)%*%(t(X)%*%W%*%Y));
coef(mCont)
# compare with OLS estimation
solve(t(X)%*%X)%*%t(X)%*%Y
### Generalized fitted values
(Yg <- X%*%betahat);
fitted(mCont)
all.equal(as.numeric(Yg), as.numeric(fitted(mCont)))
### Generalized residual sum of squares
(RSS <- t(Y - Yg)%*%W%*%(Y - Yg));
### Generalized mean square
RSS/(length(Y)-ncol(X))
(summary(mCont)$sigma)^2
### Be careful: the "Multiple R-squared: 0.9968" reported by summary(mCont) is not equal to
sum((Yg-mean(Yg))^2)/sum((Y-mean(Y))^2)
### Instead, it equals the following quantity that is more difficult to interpret
barYw <- sum(Y*Hlavy$n)/sum(Hlavy$n); ### weighted mean of the original observations
### something as "generalized" regression sum of squares
SSR <- sum((Yg - barYw)^2 * Hlavy$n)
### something as "generalized" total variance
SST <- sum((Y - barYw)^2 * Hlavy$n);
### "generalized" R squared
SSR/SST
summary(mCont)$r.squared
### The regression coefficients can also be estimated by the standard
### OLS method from the data where each row is repeated Hlavy$n times.
### But then one CANNOT USE the standard errors of the regression coefficients
### reported in the output of the lm function.
iii <- rep(1:nrow(Hlavy), Hlavy$n)
Hlavy2 <- Hlavy[iii,]; # new dataset
summary(mCont2 <- lm(y ~ t27 + I(t27^2) + (t27 > 0):t27 + (t27 > 0):I(t27^2), data = Hlavy2))
summary(mCont)
# much smaller residual errors in mCont2
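### Sketch: the point estimates agree exactly, the standard errors do not
all.equal(coef(mCont), coef(mCont2))
cbind(GLS = sqrt(diag(vcov(mCont))), replicated = sqrt(diag(vcov(mCont2))))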
# interpretation of R squared in mCont
barYw
mean(Hlavy2$y)
#
SSR
sum((fitted(mCont2)-mean(Hlavy2$y))^2) # regression sum of squares in mCont2
#
SST
sum((Hlavy2$y-mean(Hlavy2$y))^2) # total sum of squares in mCont2
#
summary(mCont2)$r.squared
summary(mCont)$r.squared
### Figure for the fit
fitCont <- predict(mCont, newdata = pdata)
plotData()
lines(tgrid, fitCont, col = "red2", lwd = 2)
### Just for comparison, GLS fit versus OLS (ordinary least squares)
### fit which does not take the weights into account
summary(mContOLS <- lm(y ~ t27 + I(t27^2) + (t27 > 0):t27 + (t27 > 0):I(t27^2), data = Hlavy))
fitContOLS <- predict(mContOLS, newdata = pdata)
plotData()
lines(tgrid, fitCont, col = "red2", lwd = 2)
lines(tgrid, fitContOLS, col = "blue", lwd = 2)
legend(33, 7, legend = c("GLS", "OLS"), col = c("red2", "blue"), lwd = 2, lty = 1)
### QUESTIONS:
### Do you have an explanation for why the red (GLS) line does not follow
### the points corresponding to low values of t as tightly as the blue (OLS) line?
### Piecewise quadratic model which has a regression function SMOOTH at the point t = 27
### Generalized least squares (GLS) used to calculate the estimates.
summary(mSmooth <- lm(y ~ t27 + I(t27^2) + (t27 > 0):I(t27^2), weights = n, data = Hlavy))
fitSmooth <- predict(mSmooth, newdata = pdata)
plotData()
lines(tgrid, fitCont, col = "red2", lwd = 2)
lines(tgrid, fitSmooth, col = "blue", lwd = 2)
legend(33, 7, legend = c("Continuous", "Smooth"), col = c("red2", "blue"), lwd = 2, lty = 1)
### Strictly speaking, this model is not Hierarchically Well Formulated (HWF).
### Nevertheless, is it sensible in this context?
### Piecewise quadratic model which has a regression function not even continuous at t = 27
### Generalized least squares (GLS) used to calculate the estimates.
summary(mJump <- lm(y ~ t27 + I(t27^2) + (t27 > 0) + (t27 > 0):t27 + (t27 > 0):I(t27^2), weights = n, data = Hlavy))
### QUESTIONS:
### Can you find a p-value of the test
###
### H0: Jump of m(t) at t = 27 is equal to 0
###
### against
###
### H1: Jump of m(t) at t = 27 is not equal to 0?
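### A sketch: the jump of m(t) at t = 27 equals the coefficient of the
### (t27 > 0) indicator, so its t-test gives the requested p-value;
### equivalently, compare mJump with the continuous model mCont:
coef(summary(mJump))   ## the row for the (t27 > 0) indicator
anova(mCont, mJump)    ## F-test of the same hypothesis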
fitJump <- predict(mJump, newdata = pdata)
plotData()
lines(tgrid, fitCont, col = "red2", lwd = 2)
lines(tgrid, fitSmooth, col = "blue", lwd = 2)
lines(tgrid, fitJump, col = "darkgreen", lwd = 2)
legend(33, 7, legend = c("Jump", "Continuous", "Smooth"), col = c("darkgreen", "red2", "blue"), lwd = 2, lty = 1)
anova(mJump,mSmooth)
### 2. B R E U S C H - P A G A N | B O X - C O X | V I F
### Dataset on housing values in suburbs of Boston.
###
### Data contain information on 506 municipalities in the metropolitan area
### of Boston, Massachusetts. Data were collected in 1978.
###
### 'zn' proportion of residential land zoned for lots over 25,000 sq.ft.
###
### 'indus' proportion of non-retail business acres per municipality.
###
### 'chas' Charles River dummy variable (= 1 if tract bounds river; 0 otherwise).
###
### 'nox' nitrogen oxides concentration (parts per 10 million).
###
### 'rm' average number of rooms per dwelling.
###
### 'age' proportion of owner-occupied units built prior to 1940.
###
### 'rad' index of accessibility to radial highways.
###
### 'tax' full-value property-tax rate per $10,000.
###
### 'ptratio' pupil-teacher ratio by town.
###
### 'black' the proportion of blacks by town.
###
### 'lstat' percentage of population with low income
###
### 'medv' median value of owner-occupied homes (in thousands of $).
###
### 'crime' number of crimes per 1000 citizens, categorized as follows:
### 1 .... almost no crime (<= 0.1)
### 2 .... some crime (0.1-1)
### 3 .... high crime rate (> 1)
###
### 'ldis' base-2 logarithm of the weighted mean of distances to five Boston employment centres.
Data <- read.csv("http://www.karlin.mff.cuni.cz/~maciak/NMSA407/Boston2.csv", header = TRUE, sep = ",", na.strings = "")
head(Data)
### The main task is to create a model that evaluates the influence of the nitrogen oxides concentration [nox]
### on the median value of owner-occupied homes [medv] (or its transformation)
### while possibly taking into account also some other variables, specifically the following:
###
### * crime rate [crime]
### * base-2 logarithm of the distance from the Boston centre [ldis]
### * pupil-teacher ratio [ptratio]
### * proportion of non-retail business acres per municipality [indus]
### * proportion of owner-occupied units built prior to 1940 [age]
### * the proportion of blacks by town [black]
Data <- transform(Data, fcrime = factor(crime, levels = 1:3, labels = c("None", "Some", "High")))
Data <- subset(Data, select = c("medv", "nox", "fcrime", "ldis", "ptratio", "indus", "age", "black"))
head(Data)
summary(Data)
with(Data, summary(medv))
hist(Data[, "medv"], prob = TRUE, col = "olivedrab", xlab = "Median value of the owner-occupied houses [1000 USD]")
subset(Data, medv == 50)
### Is a value of 50 a real value or only the lower bound of the true value ("right-censoring")? Who knows...
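### A quick count of these possibly right-censored values:
sum(Data$medv == 50)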
### Scatterplot matrix
palette(c("darkblue", "red3", "olivedrab", rainbow_hcl(5)))
scatterplotMatrix(~medv + nox + ldis + indus + age + ptratio + black, reg.line = FALSE, smooth = FALSE, spread = TRUE, diagonal = "histogram", data = Data, pch = 16, cex=0.5)
### Dependence of medv on nox while taking crime into account
BGC3 <- rainbow_hcl(3)
COL3 <- c("red3", "darkgreen", "blue3")
PCH3 <- c(21, 22, 23)
names(BGC3) <- names(COL3) <- names(PCH3) <- levels(Data[, "fcrime"])
plot(medv ~ nox, pch = PCH3[fcrime], col = COL3[fcrime], bg = BGC3[fcrime], data = Data, xlab = "Nitrogen oxides concentration (parts per 10 million)", ylab = "Median value of the owner-occupied houses [1000 USD]")
legend(0.75, 50, title = "Crime level", legend = levels(Data[, "fcrime"]), pch = PCH3, pt.bg = BGC3, col = COL3)
### As a starting model (denote it m1), we consider a linear model where we include all regressors
### (nox, crime, ldis, ptratio, indus, age, black), all two-way interactions
### between nox and the remaining covariates (that is, only interactions nox:z,
### where z are covariates other than nox) and additionally
### a quadratic term of nox (to account for possible non-linear effect of nox on medv).
m1 <- lm(medv ~ nox*(fcrime + ldis + ptratio + indus + age + black) + I(nox^2), data = Data)
summary(m1)
plotLM(m1)
### Shapiro-Wilk test of normality
shapiro.test(residuals(m1))
### Breusch-Pagan test - assuming normality of errors
# see Section 12.3.1 of the lecture notes
library("lmtest")
ncvTest(m1); # ?ncvTest
ncvTest(m1,~fitted(m1))
bptest(m1, ~fitted(m1), studentize = FALSE);
### ncvTest : var.formula: "a one-sided formula for the error variance; if omitted, the error variance depends on the fitted values."
### bptest : var.formula: "by default the same explanatory variables are taken as in the main regression model."
### 'Manual calculation'
U <- residuals(m1);
U2 <- U^2
Yhat <- fitted(m1);
n <- length(U)
### Test statistic: with sigma2hat = mean(U2),
### BP = [sum U2_i*(Yhat_i - mean(Yhat))]^2 / (2*sigma2hat^2 * sum(Yhat_i - mean(Yhat))^2)
(BP <- sum(U2 * (Yhat - mean(Yhat)))^2 / (2 * (mean(U2))^2 * sum((Yhat-mean(Yhat))^2)))
### p-value
1-pchisq(BP, df=1)
bptest(m1, ~fitted(m1), studentize = FALSE)$p.value
### Koenker's studentized variant of Breusch-Pagan test - normality of errors not assumed
bptest(m1, ~fitted(m1))
### Manual calculation
(BP.Koenker <- (sqrt(n)*cor(U2, Yhat))^2)
1-pchisq(BP.Koenker, df=1)
bptest(m1, ~fitted(m1))$p.value
### Up to some minor (and asymptotically negligible) differences,
### this test can also be performed via the overall F-test
### of the following 'linear model':
anova(lm(I(U2)~Yhat))
### One can also try to consider all the covariates
bptest(m1, studentize=FALSE);
### Manual calculation
Z <- model.matrix(m1)[,-1];
sigma2 <- mean(U^2);
Suz <- colMeans(U^2/sigma2*(Z - matrix(colMeans(Z), nrow(Z), ncol(Z), byrow=T)));
n <- nrow(Z);
### Test statistic
(stat<-n*t(Suz)%*%solve(2*var(Z)*(n-1)/n)%*%Suz)
pchisq(stat, df = ncol(Z), lower.tail = FALSE)
bptest(m1, studentize=FALSE)$p.value
### Studentized version
bptest(m1);
### Manual calculation
kurt.estim <- mean((U^2-sigma2)^2)/sigma2^2
n*t(Suz)%*%solve(kurt.estim*var(Z)*(n-1)/n)%*%Suz
### Again, up to some minor (and asymptotically negligible) differences,
### this test can also be performed via the overall F-test
### of the following 'linear model':
anova(lm(I(U^2)~model.matrix(m1)))
### Box-Cox transformation
library(MASS)
boxcox(m1)
boxcox(m1, lambda = seq(-0.1, 0.5, by = 0.05))
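### A sketch extracting the maximizing lambda; a value close to 0
### supports the log transformation used below.
bc <- boxcox(m1, lambda = seq(-0.1, 0.5, by = 0.01), plotit = FALSE)
(lambda.hat <- bc$x[which.max(bc$y)])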
### Fit the same model as above, however, with the transformed response
### as suggested by the Box-Cox transformation (denote the transformed response as lmedv).
Data <- transform(Data, lmedv = log(medv));
lm1 <- lm(lmedv ~ nox*(fcrime + ldis + ptratio + indus + age + black) + I(nox^2), data = Data)
summary(lm1)
plotLM(lm1)
summary(m1)$r.squared
plotLM(m1)
### Shapiro-Wilk test of normality
shapiro.test(residuals(lm1))
### Koenker's studentized variant of the Breusch-Pagan test - normality of errors not assumed
bptest(lm1, ~fitted(lm1));
bptest(m1,~fitted(m1))
### The model still does not seem to be homoscedastic and it is not clear how to make
### it "more" homoscedastic. The inference should ideally take this into consideration.
###
### During the following lectures, the so-called sandwich estimator of the variance matrix
### of the regression coefficients will be explained. This is a consistent
### estimator of the variance matrix even under heteroscedasticity.
### In what follows we IGNORE the problem of HETEROSCEDASTICITY.
### QUESTIONS:
### Can we simplify the model?
drop1(lm1, test = "F")
### Leaving out the non-significant interactions we arrive at the model
lm2 <- lm(lmedv ~ nox + I(nox^2) + fcrime + ldis + ptratio + indus + age + black + nox:(ldis + ptratio + indus), data = Data)
anova(lm2, lm1);
### What about further simplification of a model?
drop1(lm2, test = "F")
### Model that leaves out the quadratic term of nox and fcrime
lm3 <- lm(lmedv ~ nox + ldis + ptratio + indus + age + black + nox:(ldis + ptratio + indus), data = Data)
anova(lm3, lm2)
anova(lm3,lm1)
### Thus a candidate for the final model would be lm3
summary(lm3);
### M U L T I C O L L I N E A R I T Y
### When we are interested in the effect of a particular regressor ('nox' in our situation),
### we should also explore the possible problem of multicollinearity.
###
### In particular, we are interested in how the other variables are related to 'nox'.
### The reason is that while we want to keep 'nox' in the model, we do not want
### variables that are highly correlated with 'nox' but contribute little,
### e.g., in terms of the R-squared value.
##### Correlation coefficients among regressors
(R <- round(cor(Data[, c("nox", "ldis", "ptratio", "indus", "age", "black")]), 2))
### Mutually high correlations: nox, ldis, indus, age
R[c(1,2,4,5),c(1,2,4,5)]
COL <- "red3"
BG <- "orange"
PCH <- 23
par(mfrow = c(2, 3))
plot(nox ~ ldis, data = Data, pch = PCH, col = COL, bg = BG, main = paste("r =", R["nox", "ldis"]))
plot(nox ~ indus, data = Data, pch = PCH, col = COL, bg = BG, main = paste("r =", R["nox", "indus"]))
plot(nox ~ age, data = Data, pch = PCH, col = COL, bg = BG, main = paste("r =", R["nox", "age"]))
plot(ldis ~ indus, data = Data, pch = PCH, col = COL, bg = BG, main = paste("r =", R["ldis", "indus"]))
plot(ldis ~ age, data = Data, pch = PCH, col = COL, bg = BG, main = paste("r =", R["ldis", "age"]))
plot(indus ~ age, data = Data, pch = PCH, col = COL, bg = BG, main = paste("r =", R["indus", "age"]))
par(mfrow = c(1, 1))
### Squared coefficient of multiple correlation between nox and (ldis, indus, age)
### Where is it?
summary(m<-lm(nox ~ ldis + indus + age, data = Data))
# multiple correlation of Y and X1, ..., Xd:
# correlation of Y and the best linear prediction of Y using X1, ..., Xd
# as we already saw: the squared correlation of Y and X is the R^2 of the model Y ~ X
m0<-lm(nox~ldis, data = Data)
cor(Data$ldis, Data$nox)^2
summary(m0)$r.squared
### Coefficient of multiple correlation between nox and (ldis, indus, age),
### first up to the sign (via R^2), then including the sign
mnox <- lm(nox ~ ldis + indus + age, data = Data)
sqrt(summary(mnox)[["r.squared"]])
cor(mnox[["model"]][, "nox"], fitted(mnox))
# multiple correlation from correlation matrices
# https://en.wikipedia.org/wiki/Multiple_correlation
rxy = cor(mnox[["model"]][,-1], Data$nox)
rxx = cor(mnox[["model"]][,-1])
sqrt(t(rxy)%*%solve(rxx)%*%rxy)
### How much does the multiple correlation decrease if we remove ldis
### (visually the closest linear dependence with nox)?
mnox2 <- lm(nox ~ indus + age, data = Data)
cor(mnox2[["model"]][, "nox"], fitted(mnox2))
### Relationship of quantitative regressors with crime
par(mfrow = c(2, 3))
plot(nox ~ fcrime, data = Data, col = BGC3)
plot(ldis ~ fcrime, data = Data, col = BGC3)
plot(ptratio ~ fcrime, data = Data, col = BGC3)
plot(indus ~ fcrime, data = Data, col = BGC3)
plot(age ~ fcrime, data = Data, col = BGC3)
plot(black ~ fcrime, data = Data, col = BGC3)
par(mfrow = c(1, 1))
### --> also mostly quite strong dependence
### --> again potential cause of multicollinearity
### Initial model (to treat multicollinearity first)
m1 <- lm(lmedv ~ nox + ldis + indus + age + ptratio + black + fcrime, data = Data)
summary(m1)
plotLM(m1)
### (Generalized) variance inflation factors (function from package car)
vif(m1)
### Not that bad. Still, the inflation is worst for the coefficient of nox
### (which is also the most interesting one for us).
### For a numeric covariate, GVIF coincides with VIF and is calculated as 1/(1-R_j^2), where
### R_j^2 is the squared multiple correlation coefficient between the j-th covariate
### and the remaining covariates. For instance:
Rj2 <- summary(lm(nox~ldis + indus + age + ptratio + black + fcrime, data = Data))[["r.squared"]];
1/(1-Rj2)
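### The same value appears in the output of vif(m1); since the model contains
### a factor, car::vif returns a matrix and the VIF of nox sits in its GVIF column:
vif(m1)["nox", "GVIF"]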
### For the categorical variable fcrime, the GVIF is calculated as det(R11)*det(R22)/det(R):
R <- cor(model.matrix(m1)[,-1]); # correlation matrix of regressors in the model
R11 <- R[7:8,7:8] # correlation submatrix corresponding to regressors generated by 'fcrime'
R22 <- R[1:6,1:6] # correlation submatrix corresponding to the remaining regressors
R12 <- R[1:6,7:8]
det(R11)*det(R22)/det(R)
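### Compare with the row for fcrime in the output of vif(m1):
vif(m1)["fcrime", "GVIF"]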
### How about if we remove each of ldis, indus, age, fcrime
### (always only one of them)
m2 <- list()
m2[["ldis"]] <- lm(lmedv ~ nox + indus + age + ptratio + black + fcrime, data = Data)
m2[["indus"]] <- lm(lmedv ~ nox + ldis + age + ptratio + black + fcrime, data = Data)
m2[["age"]] <- lm(lmedv ~ nox + ldis + indus + ptratio + black + fcrime, data = Data)
m2[["fcrime"]] <- lm(lmedv ~ nox + ldis + indus + age + ptratio + black, data = Data)
summary(m1)[["r.squared"]]
sapply(lapply(m2, summary), "[[", "r.squared")
vif(m1)
lapply(m2, vif)
### Removal of fcrime practically does not change R^2
### and slightly (by about one unit) decreases VIF for nox.
### How about if we additionally remove each of ldis, indus, age
### or two of them or all of them?
m3 <- list()
m3[["ldis"]] <- lm(lmedv ~ nox + indus + age + ptratio + black, data = Data)
m3[["indus"]] <- lm(lmedv ~ nox + ldis + age + ptratio + black, data = Data)
m3[["age"]] <- lm(lmedv ~ nox + ldis + indus + ptratio + black, data = Data)
m3[["ldis-indus"]] <- lm(lmedv ~ nox + age + ptratio + black, data = Data)
m3[["ldis-age"]] <- lm(lmedv ~ nox + indus + ptratio + black, data = Data)
m3[["indus-age"]] <- lm(lmedv ~ nox + ldis + ptratio + black, data = Data)
m3[["ldis-indus-age"]] <- lm(lmedv ~ nox + ptratio + black, data = Data)
summary(m1)[["r.squared"]]
sapply(lapply(m3, summary), "[[", "r.squared")
# sapply(lapply(m3, summary), function(x) x$r.)
vif(m1)
lapply(m3, vif)
### Suggestion:
### Drop indus as well.
### The reason is that after its removal, R^2 still does not drop too much
### (roughly by the same amount as after removal of age), but the VIF
### of nox after removal of indus is lower than after removal of age.
### Let us consider now a more complex model including only the variables
### nox, ldis, age, ptratio, black
lm4 <- lm(lmedv ~ nox + I(nox^2) + nox*(ldis + age + ptratio + black), data = Data)
drop1(lm4, test = "F")
### Is it possible to remove all three interactions
### which are not significant at the 20% level?
lm5 <- lm(lmedv ~ nox + I(nox^2) + ldis + age + ptratio + black + nox:ldis, data = Data)
summary(lm5)
anova(lm5, lm4)
### The more complex model lm4 is far from being
### significantly better than model lm5.
drop1(lm5, test = "F")
### All terms that could be removed
### are now quite highly significant.
### Final model?
mFin <- lm5; # Model with an initial analysis of multicollinearity
mFin0 <- lm3; # Model without the initial analysis of multicollinearity
summary(mFin)
### How would you describe dependence
### of medv on nox given this model?
plotLM(mFin)
### Heteroscedasticity is likely a problem.
### Inference on beta's (or their linear combinations)
### should probably be corrected.
### --> time for SANDWICH ESTIMATOR of variance (coming soon)
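### A minimal preview (details in the coming lectures), assuming the
### 'sandwich' package is installed: heteroscedasticity-consistent (HC)
### standard errors for mFin via coeftest() from lmtest (loaded above).
library("sandwich")
coeftest(mFin, vcov = vcovHC(mFin, type = "HC0"))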
### Final model from the first part of the exercise class
summary(mFin); # Model with initial analysis of multicollinearity
summary(mFin0); # Model without initial analysis of multicollinearity
### Possible explanation of why nox^2 was (quite highly) non-significant
### in model mFin0 (the first part of the exercise class)
### Some visualization
summary(Data)
lgr <- 200
xindus <- c(5, 10, 18) ## approx quartiles and median of indus
xnox <- seq(0.35, 0.90, length = lgr)
xdata <- data.frame(nox = rep(xnox, 3), indus = rep(xindus, each = lgr), ptratio = 19, ldis = 1.17, age = 75, black = 0.01)
head(xdata)
pFin <- predict(mFin, newdata = xdata)
pFin0 <- predict(mFin0, newdata = xdata)
Data <- transform(Data, findus = cut(indus, breaks = c(0.1, 6.2, 18.1, 28)))
summary(Data[, "findus"])
COL33 <- diverge_hcl(3); COL33[2] <- "grey60"
BGC33 <- c("lightblue", "grey80", "red")
PCH33 <- c(21, 22, 23)
names(BGC33) <- names(COL33) <- names(PCH33) <- levels(Data[, "findus"])
par(mfrow = c(1, 2))
plot(lmedv ~ nox, pch = PCH33[findus], col = COL33[findus], bg = BGC33[findus], data = Data, xlab = "Nitrogen oxides concentration (parts per 10 million)", ylab = "Log median value of the owner-occupied houses [log(1000 USD)]", main="Final model with multicoll. considered")
lines(xnox, pFin[1:lgr], lwd = 2)
legend(0.4, 2, legend = names(COL33), title = "indus", col = COL33, pt.bg = BGC33, pch = PCH33)
#
plot(lmedv ~ nox, pch = PCH33[findus], col = COL33[findus], bg = BGC33[findus], data = Data, xlab = "Nitrogen oxides concentration (parts per 10 million)", ylab = "Log median value of the owner-occupied houses [log(1000 USD)]", main="Final model without multicoll. considered")
lines(xnox, pFin0[1:lgr], col = COL33[1], lwd = 2)
lines(xnox, pFin0[(lgr+1):(2*lgr)], col = COL33[2], lwd = 2)
lines(xnox, pFin0[(2*lgr+1):(3*lgr)], col = COL33[3], lwd = 2)
par(mfrow = c(1, 1))
layout(matrix(c(1,1,0,4, 2,2,3,3), nrow = 2, byrow = TRUE))
for (i in 1:3){
plot(lmedv ~ nox, pch = PCH33[findus], col = COL33[findus], bg = BGC33[findus], data = subset(Data, findus == levels(findus)[i]), xlab = "Nitrogen oxides concentration (parts per 10 million)", ylab = "Log median value of the owner-occupied houses [log(1000 USD)]")
lines(xnox, pFin[((i-1)*lgr+1):(i*lgr)], col = COL33[i], lwd = 2)
lines(xnox, pFin0[((i-1)*lgr+1):(i*lgr)], col = COL33[i], lwd = 2, lty = 2)
}
plot.new()
legend(0.1, 0.9, legend = names(COL33), title = "indus", col = COL33, pt.bg = BGC33, pch = PCH33)
legend(0.1, 0.4, legend = c("Fin", "Fin0"), title = "Model", lty = c(1, 2), lwd = 2, col = "black")
par(mfrow = c(1, 1))