### --------------------------------------------------------------------------------------###
### Winter Term 2017/2018 ###
### NMSA407 - Linear Regression ###
### ###
### EXERCISE #7 Simple models with interactions ###
### (Log-transformation of the response) ###
### ###
### --------------------------------------------------------------------------------------###
### Working directory
setwd("H:/nmsa407_LinRegr/") # To be changed to something which exists on your PC.
### Load needed packages
library("mffSM")
### The command below assumes that a data file chicago.csv
### has been downloaded into your working directory.
###
### Alternatively, if the computer is connected to the internet,
### the data can be loaded in by using the command:
### chicago <- read.csv("http://www.karlin.mff.cuni.cz/~maciak/NMSA407/chicago.csv", header = T)
###
### Chicago insurance redlining in 1970 (47 districts)
###
### minor: percentage of minority (0 - 100) in a given district
### fire: number of fires per 100 households during the reference period
### theft: number of reported thefts per 1000 inhabitants during the reference period
### old: percentage of residential units built before 1939
### insur: number of policies per 100 households
### income: median income (USD 1000) per household
### side: location of the district within Chicago (0 = North, 1 = South)
chicago <- read.csv("chicago.csv", header = TRUE)
head(chicago)
summary(chicago)
### Create a factor variable fside with values "North" and "South" from the original 0-1 variable side
chicago <- transform(chicago, fside = factor(side, levels = 0:1, labels=c("North", "South")))
summary(chicago)
### Scatterplot matrix
palette(c("darkblue", "red3", "olivedrab", rainbow_hcl(5)))
scatterplotMatrix(~fire + minor + theft + old + insur + income, reg.line = FALSE, smooth = FALSE, spread = TRUE, diagonal = "histogram", data = chicago, pch = 16, cex=0.75)
### We start by looking for a model of dependence of the number of fires [fire]
### on the percentage of minority [minor]. We are also aware of the fact
### that the form of this dependence may differ in north and south parts of Chicago.
### The model should take this knowledge into account.
XLAB <- "Minority (%)"
YLAB <- "Fires (#/100 households)"
COLS <- c(North = "darkgreen", South = "red")
BGS <- c(North = "aquamarine", South = "orange")
PCHS <- c(North = 21, South = 23)
plot(fire ~ minor, col = COLS[fside], bg = BGS[fside], pch = PCHS[fside], xlab = XLAB, ylab = YLAB, data = chicago)
legend(5, 40, legend = c("North", "South"), pch = PCHS, col = COLS, pt.bg = BGS, bty = "n")
lines(with(subset(chicago, fside == "North"), lowess(minor, fire)), col = COLS["North"], lwd = 2)
lines(with(subset(chicago, fside == "South"), lowess(minor, fire)), col = COLS["South"], lwd = 2)
### QUESTIONS
### How would you interpret the plot? Do we need interactions in the model?
### Does the side of Chicago modify the relationship between the number of fires and the percentage of minority?
### Can you interpret each of those models?
m <- list()
m[["Line"]] <- lm(fire ~ minor * fside, data = chicago)
m[["Log2"]] <- lm(fire ~ log2(minor) * fside, data = chicago)
m[["Parab"]] <- lm(fire ~ (minor + I(minor^2)) * fside, data = chicago)
summary(m[["Line"]])
summary(m[["Log2"]])
summary(m[["Parab"]])
### QUESTIONS:
### Based on the above output describe the effect of the percentage
### of minority on the number of fires.
### Based on the above output describe the effect of the side on the number of fires.
### Equivalently - the summary for all three models at once:
lapply(m, summary)
### All R^2's at once
sm <- lapply(m, summary)
sapply(sm, "[[", "r.squared")
sapply(sm, "[[", "adj.r.squared")
### Let us put them in a data.frame
R2s <- data.frame(R2 = sapply(sm, "[[", "r.squared"),
R2adj = sapply(sm, "[[", "adj.r.squared"))
print(R2s)
### QUESTIONS
### Which model seems to be the best with respect to prediction ability?
### Which model would you prefer for the interpretation purposes?
### Fitted regression functions
lpred <- 500
pminor <- seq(1, 100, length = lpred)
pdata <- data.frame(minor = rep(pminor, 2), fside = factor(rep(c("North", "South"), each = lpred)))
### Look at pdata
pdata[1:10,]
pdata[(lpred + 1):(lpred + 10),]
### Calculation of the fitted regression functions,
### again use vectorized calculation using lapply command.
fit <- lapply(m, predict, newdata = pdata)
print(fit[["Line"]][1:10])
print(fit[["Log2"]][1:10])
print(fit[["Parab"]][1:10])
### Display fitted regression curves for all three models
layout(matrix(c(4,1,1,0, 2,2,3,3), nrow = 2, byrow = TRUE))
for (i in 1:length(fit)){
plot(fire ~ minor, col = COLS[fside], bg = BGS[fside], pch = PCHS[fside], xlab = XLAB, ylab = YLAB, data = chicago)
lines(pminor, fit[[i]][1:lpred], col = COLS["North"], lwd = 2)
lines(pminor, fit[[i]][(lpred + 1):(2 * lpred)], col = COLS["South"], lwd = 2)
title(main = names(fit)[i])
}
plot.new()
legend(0.1, 0.9, legend = names(COLS), col = COLS, pch = PCHS, lty = 1, lwd = 3, title = "Side")
par(mfrow = c(1, 1))
### QUESTIONS
### Do we need an interaction term?
### Which of the three suggested models seem to be the best one (for the moment being)?
### Does any of the models suffer from some problems with respect to the assumptions of the normal linear model?
### Graphical diagnostics of the model assumptions
plotLM(m[["Line"]])
plotLM(m[["Log2"]])
plotLM(m[["Parab"]])
### HOMEWORK
### Plot of the standardized residuals against predictor (regressor, e.g., minor) with lowess.
### Why is it useful? Explain, what can be concluded from such plot.
###
### u1 <- rstandard(m[["Line"]])
### plot(u1 ~ minor, data = chicago, pch = 21, col = "blue4", bg = "skyblue")
### lines(lowess(chicago[, "minor"], u1), col = "red3", lwd = 2)
###
### u2 <- rstandard(m[["Log2"]])
### plot(u2 ~ log(minor), data = chicago, pch = 21, col = "blue4", bg = "skyblue")
### lines(lowess(log(chicago[, "minor"]), u2), col = "red3", lwd = 2)
###
### u3 <- rstandard(m[["Parab"]])
### plot(u3 ~ minor, data = chicago, pch = 21, col = "blue4", bg = "skyblue")
### lines(lowess(chicago[, "minor"], u3), col = "red3", lwd = 2)
### When the residual variance increases with the response
### expectation, log-transformation of response often
### ensures a homoscedastic model.
### Scatterplots with log-transformed response equipped with LOWESS
LYLAB <- "Log(Fires)"
par(mfrow = c(1, 1))
plot(log(fire) ~ minor, col = COLS[fside], bg = BGS[fside], pch = PCHS[fside], xlab = XLAB, ylab = LYLAB, data = chicago)
legend(5, 3.5, legend = c("North", "South"), pch = PCHS, col = COLS, pt.bg = BGS, bty = "n")
lines(with(subset(chicago, fside == "North"), lowess(minor, log(fire))), col = COLS["North"], lwd = 2)
lines(with(subset(chicago, fside == "South"), lowess(minor, log(fire))), col = COLS["South"], lwd = 2)
### QUESTIONS
### Are we still modeling the expectation EY or something else?
### What are the consequences for the interpretation of model parameters?
### What is the interpretation of the parameters of the "Log" model with the log-transformed response?
### TASK:
### Discuss a link to a multiplicative model for Y with log-normal errors.
tm <- list()
summary(tm[["Line"]] <- lm(log(fire) ~ minor * fside, data = chicago));
#### QUESTIONS:
### Based on the above output describe the effect of the percentage of minority on the number of fires.
### Based on the above output describe the effect of the side of the river on the number of fires.
summary(tm[["Log"]] <- lm(log(fire) ~ log2(minor) * fside, data = chicago))
### QUESTIONS:
### Based on the above output describe the effect of the percentage of minority on the number of fires.
### Based on the above output describe the effect of the side of the riveron the number of fires.
tm[["Parab"]] <- lm(log(fire) ~ (minor + I(minor^2)) * fside, data = chicago)
### R-squared values again
(stm <- lapply(tm, summary))
sapply(stm, "[[", "r.squared")
sapply(stm, "[[", "adj.r.squared")
### QUESTIONS:
### Does it make sense to compare the following two sets of R^2's?
### If yes, why? If no, why?
sapply(sm, "[[", "r.squared") ## models for Y
sapply(stm, "[[", "r.squared") ## models for log(Y)
### Fitted regression functions
tfit <- lapply(tm, predict, newdata = pdata)
layout(matrix(c(4,1,1,0, 2,2,3,3), nrow = 2, byrow = TRUE))
for (i in 1:length(fit)){
plot(log(fire) ~ minor, col = COLS[fside], bg = BGS[fside], pch = PCHS[fside], xlab = XLAB, ylab = LYLAB, data = chicago)
lines(pminor, tfit[[i]][1:lpred], col = COLS["North"], lwd = 2)
lines(pminor, tfit[[i]][(lpred + 1):(2 * lpred)], col = COLS["South"], lwd = 2)
title(main = names(tfit)[i])
}
plot.new()
legend(0.1, 0.9, legend = names(COLS), col = COLS, pch = PCHS, lty = 1, lwd = 3, title = "Side")
par(mfrow = c(1, 1))
### QUESTIONS:
### Which model seems to be the best one?
### Compare and discuss the basic residual plots below.
### Which model seems to be better, Log or Parab?
plotLM(tm[["Line"]])
plotLM(tm[["Log"]])
plotLM(tm[["Parab"]])
sapply(stm, "[[", "r.squared")
sapply(stm, "[[", "adj.r.squared")
### Take also a look at scatterplots equipped with the fitted regression functions
par(mfrow = c(1, 2))
for (ss in levels(chicago[, "fside"])){
plot(log(fire) ~ minor, col = COLS[fside], bg = BGS[fside], pch = PCHS[fside], xlab = XLAB, ylab = LYLAB, data = subset(chicago, fside == ss), xlim = range(chicago[, "minor"]), ylim = log(range(chicago[, "fire"])))
ind <- if (ss == "North") 1:lpred else (lpred + 1):(2 * lpred)
lines(pminor, tfit[["Log"]][ind], col = COLS[ss], lwd = 2, lty = 1)
lines(pminor, tfit[["Parab"]][ind], col = COLS[ss], lwd = 2, lty = 5)
title(main = ss)
legend(40, 1.5, legend = c("Log", "Parab"), title = "Model", col = COLS[ss], lwd = 2, lty = c(1, 5))
}
par(mfrow = c(1, 1))
### The "Log" model seems to be a good model, so in the following we will work with this model.
mWin <- tm[["Log"]]
summary(mWin)
### QUESTIONS:
### Does side of Chicago significantly modify the relationship of log(minor) and log(fire)?
### What are we estimating here?
### What is the connection with the plots below?
Lsl <- matrix(c(0,1,0,0, 0,1,0,1), nrow = 2, byrow = TRUE)
colnames(Lsl) <- names(coef(mWin))
rownames(Lsl) <- c("North", "South")
print(Lsl)
LSest(mWin, L = Lsl)
LXLAB <- "Log(Minority)"
LYLAB <- "Log(Fires)"
par(mfrow = c(1, 2))
for (ss in levels(chicago[, "fside"])){
plot(log(fire) ~ log(minor), col = COLS[fside], bg = BGS[fside], pch = PCHS[fside],
xlab = LXLAB, ylab = LYLAB, xlim = log(range(chicago[, "minor"])), ylim = log(range(chicago[, "fire"])),
data = subset(chicago, fside == ss))
ind <- if (ss == "North") 1:lpred else (lpred + 1):(2 * lpred)
lines(log(pminor), tfit[["Log"]][ind], col = COLS[ss], lwd = 2, lty = 1)
title(main = ss)
}
par(mfrow = c(1, 1))
### What is the meaning of the numbers below?
LSm <- LSest(mWin, L = Lsl)
someEstim <- data.frame(Estimate = exp(LSm[["Estimate"]]),
Lower = exp(LSm[["Lower"]]),
Upper = exp(LSm[["Upper"]]))
print(someEstim)
### The fitted regression functions including the confidence band
### AROUND the regression function and the prediction band.
cifit <-predict(tm[["Log"]], newdata = pdata, interval = "confidence")
pfit <-predict(tm[["Log"]], newdata = pdata, interval = "prediction")
YLIM <- range(log(chicago[, "fire"]), pfit)
XLIM <- range(chicago[, "minor"])
par(mfrow = c(1, 2))
for (ss in levels(chicago[, "fside"])){
plot(log(fire) ~ minor, col = COLS[fside], bg = BGS[fside], pch = PCHS[fside],
xlab = XLAB, ylab = LYLAB, xlim = XLIM, ylim = YLIM,
data = subset(chicago, fside == ss), main = ss)
ind <- if (ss == "North") 1:lpred else (lpred + 1):(2 * lpred)
lines(pminor, cifit[ind, "fit"], col = COLS[ss], lwd = 3)
lines(pminor, cifit[ind, "lwr"], col = COLS[ss], lwd = 2, lty = 2)
lines(pminor, cifit[ind, "upr"], col = COLS[ss], lwd = 2, lty = 2)
lines(pminor, pfit[ind, "lwr"], col = COLS[ss], lwd = 2, lty = 3)
lines(pminor, pfit[ind, "upr"], col = COLS[ss], lwd = 2, lty = 3)
legend(x=25, y=1, lwd=c(3,2,2), lty=c(1,2,3), col=COLS[ss], legend=c("estim. E[log(Y)|X=x]", "conf. band around E[log(Y)|X=x]", "prediction band for log(Y)"))
}
par(mfrow = c(1, 1))
### Note that based on the model mWin one can make only a prediction band for 'fires'
YLIM <- range(chicago[, "fire"], exp(pfit[, "upr"]))
par(mfrow = c(1, 2))
for (ss in levels(chicago[, "fside"])){
plot(fire ~ minor, col = COLS[fside], bg = BGS[fside], pch = PCHS[fside],
xlab = XLAB, ylab = "Fires", xlim = XLIM, ylim = YLIM,
data = subset(chicago, fside == ss), main = ss)
ind <- if (ss == "North") 1:lpred else (lpred + 1):(2 * lpred)
lines(pminor, exp(pfit[ind, "lwr"]), col = COLS[ss], lwd = 2, lty = 3)
lines(pminor, exp(pfit[ind, "upr"]), col = COLS[ss], lwd = 2, lty = 3)
legend(x=1, y=70, lwd=c(2), lty=c(3), col=COLS[ss], legend=c("prediction band for Y"))
}
par(mfrow = c(1, 1))
### Different parameterization of the winning model.
mWinSum <- lm(log(fire) ~ log(minor) * fside, data = chicago, contrasts = list(fside = contr.sum))
summary(mWinSum)
### QUESTIONS:
### What is the interpretation of the parameters labeled as "Intercept" and "log(minor)"?
### What is the interpretation of numbers in the summary output?
contr.sum(2)
L1sum <- matrix(c(1,0,0,0, 0,0,1,0, 0,0,-1,0), nrow = 3, byrow = TRUE)
colnames(L1sum) <- names(coef(mWinSum))
rownames(L1sum) <- c("Mean Intercept", levels(chicago[, "fside"]))
print(L1sum)
LSest(mWinSum, L = L1sum)
L2sum <- matrix(c(0,1,0,0, 0,0,0,1, 0,0,0,-1), nrow = 3, byrow = TRUE)
colnames(L2sum) <- names(coef(mWinSum))
rownames(L2sum) <- c("Mean Slope", levels(chicago[, "fside"]))
print(L2sum)
LSest(mWinSum, L = L2sum)
### QUESTIONS:
### Can any of the remaining possible covariates improve significantly the model?
### Again, effect of each of the covariates may differ in the north and south.
m0 <- lm(log(fire) ~ log(minor) * fside, data = chicago)
m1 <- list()
m1[["theft"]] <- lm(log(fire) ~ (log(minor) + theft) * fside, data = chicago)
m1[["old"]] <- lm(log(fire) ~ (log(minor) + old) * fside, data = chicago)
m1[["insur"]] <- lm(log(fire) ~ (log(minor) + insur) * fside, data = chicago)
m1[["income"]] <- lm(log(fire) ~ (log(minor) + income) * fside, data = chicago)
### And what can be concluded from here?
anova(m0, m1[["theft"]])
anova(m0, m1[["old"]])
anova(m0, m1[["insur"]])
anova(m0, m1[["income"]])
### The most significant improvement (by a formal test) was adding "insur".
### What are the results?
m1Win <- m1[["insur"]]
summary(m0) ## original model for comparison
summary(m1Win) ## "bigger" model
### QUESTIONS:
### How about the improvement of the prediction ability?
### How about the basic residual plots?
plotLM(m1Win)
### Note that the fitted regression planes describing the dependence of log(fire) on log(minor) and insur
### based on m1Win are the same as if the model is fitted
### separately in North and South.
###
### Hence if we are interested in just a description of those
### planes (and are lazy to extract the appropriate intercepts
### and slopes from m1Win), we can fit separate North/South models
### and take the coefficients from there.
m1WinN <- lm(log(fire) ~ log(minor) + insur, data = chicago, subset = (fside == "North"))
m1WinS <- lm(log(fire) ~ log(minor) + insur, data = chicago, subset = (fside == "South"))
### Compare the regression coefficients
(beN <- coef(m1WinN))
(beS <- coef(m1WinS))
coef(m1Win)
rbind(beS, beN)
### Now compare the standard errors?
summary(m1Win)$coef
summary(m1WinN)$coef
summary(m1WinS)$coef
### Finally, a fancy (but of limited use) plot
minorGrid <- seq(1, 100, length = 50)
insurGrid <- seq(0, 2.5, length = 50)
xyGrid <- expand.grid(minorGrid, insurGrid)
dim(xyGrid)
xyGrid[1:10,]
xyGrid[2491:2500,]
lFireN <- matrix(beN[1] + beN[2] * log(xyGrid[, 1]) + beN[3] * xyGrid[, 2], nrow = length(minorGrid), ncol = length(insurGrid))
lFireS <- matrix(beS[1] + beS[2] * log(xyGrid[, 1]) + beS[3] * xyGrid[, 2], nrow = length(minorGrid), ncol = length(insurGrid))
par(mfrow = c(1, 2))
persp(minorGrid, insurGrid, lFireN, xlab = "Minority", ylab = "Insurance", zlab = "Log(Fire)", col = "antiquewhite2", main="North")
persp(minorGrid, insurGrid, lFireS, xlab = "Minority", ylab = "Insurance", zlab = "Log(Fire)", col = "antiquewhite2", main="South")
par(mfrow = c(1, 1))
### A rotated version of the same fancy plot
par(mfrow = c(1, 2))
persp(minorGrid, insurGrid, lFireN, xlab = "Minority", ylab = "Insurance", zlab = "Log(Fire)", col = "antiquewhite2", theta = 60, phi = 45, main="North")
persp(minorGrid, insurGrid, lFireS, xlab = "Minority", ylab = "Insurance", zlab = "Log(Fire)", col = "antiquewhite2", theta = 60, phi = 45, main="South")
par(mfrow = c(1, 1))
### More sophisticated version with the 'rgl' library
### (with an additional option to rotate the plot using the mouse)
library(rgl)
### Plot for North of Chicago
persp3d(minorGrid, insurGrid, lFireN, xlab = "Minority", ylab = "Insurance", zlab = "Log(Fire)", col = "aquamarine")
with(subset(chicago, fside == "North"), plot3d(minor, insur, log(fire), add = TRUE, size = 7, col = "darkgreen"))
### Plot for South of Chicago
persp3d(minorGrid, insurGrid, lFireS, xlab = "Minority", ylab = "Insurance", zlab = "Log(Fire)", col = "orange")
with(subset(chicago, fside == "South"), plot3d(minor, insur, log(fire), add = TRUE, size = 5, col = "red"))
### Plot for both North and South of Chicago
persp3d(minorGrid, insurGrid, lFireN, xlab = "Minority", ylab = "Insurance", zlab = "Log(Fire)", col = "aquamarine")
persp3d(minorGrid, insurGrid, lFireS, xlab = "Minority", ylab = "Insurance", zlab = "Log(Fire)", col = "orange", add=T)
with(subset(chicago, fside == "North"), plot3d(minor, insur, log(fire), add = TRUE, size = 7, col = "darkgreen"))
with(subset(chicago, fside == "South"), plot3d(minor, insur, log(fire), add = TRUE, size = 7, col = "red"))