### -------------------------------------------------------------------------###
### NMSA407 - Linear Regression ###
### Winter Term 2021/2022 ###
### ###
### EXERCISE #7 Regression models with interactions (log-transformation of Y)###
### -------------------------------------------------------------------------###
### -------------------------------------------------------------------------###
### Block 1: TEACHING MATERIAL ###
### -------------------------------------------------------------------------###
library("mffSM")
### The command below assumes that a data file chicago.csv
### has been downloaded into your working directory.
chicago <- read.csv("chicago.csv", header = TRUE)
### Alternatively, if the computer is connected to the internet,
### the data can be also loaded in by using the following command:
chicago <- read.csv("http://www.karlin.mff.cuni.cz/~maciak/NMSA407/chicago.csv",
header=T, stringsAsFactors = TRUE)
### Brief data description (Chicago insurance redlining in 47 districts in 1970)
### minor: percentage of minority (0 - 100) in a given district
### fire: number of fires per 100 households during the reference period
### theft: number of reported thefts per 1000 inhabitants
### old: percentage of residential units built before 1939
### insur: number of insurance policies per 100 households
### income: median income (USD 1000) per household
### side: location of the district within Chicago (0 = North, 1 = South)
dim(chicago)
head(chicago)
summary(chicago)
### Create a factor variable fside with values "North" and "South" from the
### original 0-1 variable denoted as 'side'
chicago <- transform(chicago, fside = factor(side, levels = 0:1,
labels=c("North", "South")))
summary(chicago)
### Scatterplot matrix
palette(c("darkblue", "red3", "olivedrab", rainbow_hcl(5)))
scatterplotMatrix(~fire + minor + theft + old + insur + income,
smooth = FALSE, data = chicago, pch = 16, cex=0.75)
### We start by looking for a model of dependence of the number of fires [fire]
### on the percentage of minority [minor]. We are also aware of the fact that
### the form of this dependence may differ in north and south parts of Chicago.
XLAB <- "Minority (%)"
YLAB <- "Fires (#/100 households)"
COLS <- c(North = "darkgreen", South = "red")
BGS <- c(North = "aquamarine", South = "orange")
PCHS <- c(North = 21, South = 23)
plot(fire ~ minor, col = COLS[fside], bg = BGS[fside], pch = PCHS[fside],
xlab = XLAB, ylab = YLAB, data = chicago)
legend(5, 40, legend = c("North", "South"), pch = PCHS,
col = COLS, pt.bg = BGS, bty = "n")
lines(with(subset(chicago, fside == "North"), lowess(minor, fire)),
col = COLS["North"], lwd = 2)
lines(with(subset(chicago, fside == "South"), lowess(minor, fire)),
col = COLS["South"], lwd = 2)
### How would you interpret the plot? Do we need interactions in the model?
### Does the side of Chicago modify the relationship between the number of fires
### and the percentage of minority?
m <- list()
m[["Line"]] <- lm(fire ~ minor * fside, data = chicago)
m[["Log2"]] <- lm(fire ~ log2(minor) * fside, data = chicago)
m[["Parab"]] <- lm(fire ~ (minor + I(minor^2)) * fside, data = chicago)
summary(m[["Line"]])
summary(m[["Log2"]])
summary(m[["Parab"]])
### QUESTIONS:
### Based on the above output describe the effect of the percentage of minority
### on the number of fires. Describe also the effect of the side.
### Equivalently - the summary for all three models at once:
lapply(m, summary)
### All R^2's at once
sm <- lapply(m, summary)
sapply(sm, "[[", "r.squared")
sapply(sm, "[[", "adj.r.squared")
### Let us put them in a data.frame
R2s <- data.frame(R2 = sapply(sm, "[[", "r.squared"),
R2adj = sapply(sm, "[[", "adj.r.squared"))
print(R2s)
### QUESTIONS
### Which model seems to be the best with respect to prediction ability?
### Which model would you prefer for the interpretation purposes?
### Fitted regression functions
lpred <- 500
pminor <- seq(1, 100, length = lpred)
pdata <- data.frame(minor = rep(pminor, 2),
fside = factor(rep(c("North", "South"),
each = lpred)))
### pdata
pdata[1:10,]
pdata[(lpred + 1):(lpred + 10),]
### Calculation of the fitted regression functions,
### again use vectorized calculation using lapply command.
fit <- lapply(m, predict, newdata = pdata)
print(fit[["Line"]][1:10])
print(fit[["Log2"]][1:10])
print(fit[["Parab"]][1:10])
### Plotting the fitted regression curves for all three models
layout(matrix(c(4,1,1,0, 2,2,3,3), nrow = 2, byrow = TRUE))
for (i in 1:length(fit)){
plot(fire ~ minor, col = COLS[fside], bg = BGS[fside], pch = PCHS[fside],
xlab = XLAB, ylab = YLAB, data = chicago)
lines(pminor, fit[[i]][1:lpred], col = COLS["North"], lwd = 2)
lines(pminor, fit[[i]][(lpred + 1):(2 * lpred)], col = COLS["South"], lwd = 2)
title(main = names(fit)[i])
}
plot.new()
legend(0.1, 0.9, legend = names(COLS), col = COLS, pch = PCHS, lty = 1, lwd = 3,
title = "Side")
par(mfrow = c(1, 1))
### Do we need interaction terms in the given models?
### Which of the three suggested models seems to be the best one?
### Does any of the models suffer from with some issues related to the
### assumptions of the normal linear model?
### Graphical diagnostics of the model assumptions
plotLM(m[["Line"]])
plotLM(m[["Log2"]])
plotLM(m[["Parab"]])
### When the residual variance increases with the response
### expectation, log-transformation of response often
### ensures a homoscedastic model.
### Scatterplots with the log-transformed response equipped with LOWESS
LYLAB <- "Log(Fires)"
par(mfrow = c(1, 1))
plot(log(fire) ~ minor, col = COLS[fside], bg = BGS[fside], pch = PCHS[fside],
xlab = XLAB, ylab = LYLAB, data = chicago)
legend(5, 3.5, legend = c("North", "South"), pch = PCHS,
col = COLS, pt.bg = BGS, bty = "n")
lines(with(subset(chicago, fside == "North"), lowess(minor, log(fire))),
col = COLS["North"], lwd = 2)
lines(with(subset(chicago, fside == "South"), lowess(minor, log(fire))),
col = COLS["South"], lwd = 2)
### Does the plot looks better in some sense? Let's try the corresponding model.
### (note a connection to a multiplicative model for Y with log-normal errors)
tm <- list()
summary(tm[["Line"]] <- lm(log(fire) ~ minor * fside, data = chicago));
### Are we still modeling the expectation EY or something else?
### What are the consequences for the interpretation of model parameters?
### Interpret the parameters of the model with the log-transformed response.
summary(tm[["Log"]] <- lm(log(fire) ~ log2(minor) * fside, data = chicago))
### Describe the effect of the percentage of minority on the number of fires.
### Describe the effect of the side of the riveron the number of fires.
tm[["Parab"]] <- lm(log(fire) ~ (minor + I(minor^2)) * fside, data = chicago)
### Comparing the models with the log-transformed response
(stm <- lapply(tm, summary))
sapply(stm, "[[", "r.squared")
sapply(stm, "[[", "adj.r.squared")
### Does it make sense to compare the following two sets of R^2's?
### If yes, why? If no, why?
sapply(sm, "[[", "r.squared") ## models for Y
sapply(stm, "[[", "r.squared") ## models for log(Y)
### SST for both models (datasets):
sum((chicago$fire - mean(chicago$fire))^2)
sum((log(chicago$fire) - mean(log(chicago$fire)))^2)
### Fitted regression functions
tfit <- lapply(tm, predict, newdata = pdata)
### Plotting again the fitted regression curves for all three models
layout(matrix(c(4,1,1,0, 2,2,3,3), nrow = 2, byrow = TRUE))
for (i in 1:length(fit)){
plot(log(fire) ~ minor, col = COLS[fside], bg = BGS[fside], pch = PCHS[fside],
xlab = XLAB, ylab = LYLAB, data = chicago)
lines(pminor, tfit[[i]][1:lpred], col = COLS["North"], lwd = 2)
lines(pminor, tfit[[i]][(lpred + 1):(2 * lpred)], col = COLS["South"],lwd = 2)
title(main = names(tfit)[i])
}
plot.new()
legend(0.1, 0.9, legend = names(COLS), col = COLS, pch = PCHS, lty = 1, lwd = 3,
title = "Side")
par(mfrow = c(1, 1))
### Alternative view with a more detailed comparison
par(mfrow = c(1, 2))
for (ss in levels(chicago[, "fside"])){
plot(log(fire) ~ minor, col = COLS[fside], bg = BGS[fside], pch = PCHS[fside],
xlab = XLAB, ylab = LYLAB,
data = subset(chicago, fside == ss),
xlim = range(chicago[, "minor"]),
ylim = log(range(chicago[, "fire"])))
ind <- if (ss == "North") 1:lpred else (lpred + 1):(2 * lpred)
lines(pminor, tfit[["Log"]][ind], col = COLS[ss], lwd = 2, lty = 1)
lines(pminor, tfit[["Parab"]][ind], col = COLS[ss], lwd = 2, lty = 5)
title(main = ss)
legend(40, 1.5, legend = c("Log", "Parab"), title = "Model",
col = COLS[ss], lwd = 2, lty = c(1, 5))
}
par(mfrow = c(1, 1))
### Which model seems to be the best one?
plotLM(tm[["Line"]])
plotLM(tm[["Log"]])
plotLM(tm[["Parab"]])
sapply(stm, "[[", "r.squared")
sapply(stm, "[[", "adj.r.squared")
### Let's take the "Log" model for the following...
mWin <- tm[["Log"]]
summary(mWin)
### Does the side of Chicago significantly modify the relationship between the
### log(minor) and log(fire)? What are we estimating in the following?
Lsl <- matrix(c(0,1,0,0, 0,1,0,1), nrow = 2, byrow = TRUE)
colnames(Lsl) <- names(coef(mWin))
rownames(Lsl) <- c("North", "South")
print(Lsl)
LSest(mWin, L = Lsl)
### What is the connection with the plots below?
LXLAB <- "Log(Minority)"
LYLAB <- "Log(Fires)"
par(mfrow = c(1, 2))
for (ss in levels(chicago[, "fside"])){
plot(log(fire) ~ log(minor), col = COLS[fside], bg = BGS[fside],
pch = PCHS[fside], xlab = LXLAB, ylab = LYLAB,
xlim = log(range(chicago[, "minor"])),
ylim = log(range(chicago[, "fire"])),
data = subset(chicago, fside == ss))
ind <- if (ss == "North") 1:lpred else (lpred + 1):(2 * lpred)
lines(log(pminor), tfit[["Log"]][ind], col = COLS[ss], lwd = 2, lty = 1)
title(main = ss)
}
par(mfrow = c(1, 1))
### What is the meaning of the numbers below?
LSm <- LSest(mWin, L = Lsl)
someEstim <- data.frame(Estimate = exp(LSm[["Estimate"]]),
Lower = exp(LSm[["Lower"]]),
Upper = exp(LSm[["Upper"]]))
print(someEstim)
### The fitted regression functions including the confidence band
### AROUND the regression function and the prediction band.
cifit <-predict(tm[["Log"]], newdata = pdata, interval = "confidence")
pfit <-predict(tm[["Log"]], newdata = pdata, interval = "prediction")
YLIM <- range(log(chicago[, "fire"]), pfit)
XLIM <- range(chicago[, "minor"])
par(mfrow = c(1, 2))
for (ss in levels(chicago[, "fside"])){
plot(log(fire) ~ minor, col = COLS[fside], bg = BGS[fside], pch = PCHS[fside],
xlab = XLAB, ylab = LYLAB, xlim = XLIM, ylim = YLIM,
data = subset(chicago, fside == ss), main = ss)
ind <- if (ss == "North") 1:lpred else (lpred + 1):(2 * lpred)
lines(pminor, cifit[ind, "fit"], col = COLS[ss], lwd = 3)
lines(pminor, cifit[ind, "lwr"], col = COLS[ss], lwd = 2, lty = 2)
lines(pminor, cifit[ind, "upr"], col = COLS[ss], lwd = 2, lty = 2)
lines(pminor, pfit[ind, "lwr"], col = COLS[ss], lwd = 2, lty = 3)
lines(pminor, pfit[ind, "upr"], col = COLS[ss], lwd = 2, lty = 3)
legend(x=25, y=1, lwd=c(3,2,2), lty=c(1,2,3), col=COLS[ss],
legend=c("estim. E[log(Y)|X=x]",
"conf. band around E[log(Y)|X=x]",
"prediction band for log(Y)"))
}
par(mfrow = c(1, 1))
### Note that based on the mWin model one can only make prediction bands
YLIM <- range(chicago[, "fire"], exp(pfit[, "upr"]))
par(mfrow = c(1, 2))
for (ss in levels(chicago[, "fside"])){
plot(fire ~ minor, col = COLS[fside], bg = BGS[fside], pch = PCHS[fside],
xlab = XLAB, ylab = "Fires", xlim = XLIM, ylim = YLIM,
data = subset(chicago, fside == ss), main = ss)
ind <- if (ss == "North") 1:lpred else (lpred + 1):(2 * lpred)
lines(pminor, exp(pfit[ind, "lwr"]), col = COLS[ss], lwd = 2, lty = 3)
lines(pminor, exp(pfit[ind, "upr"]), col = COLS[ss], lwd = 2, lty = 3)
legend(x=1, y=70, lwd=c(2), lty=c(3), col=COLS[ss],
legend=c("prediction band for Y"))
}
par(mfrow = c(1, 1))
### Different parameterization of the winning model.
mWinSum <- lm(log(fire) ~ log(minor) * fside, data = chicago,
contrasts = list(fside = contr.sum))
summary(mWinSum)
### What is the interpretation of the estimated parameters nd the numbers
### providsed in the summary output?
contr.sum(2)
L1sum <- matrix(c(1,0,0,0, 0,0,1,0, 0,0,-1,0), nrow = 3, byrow = TRUE)
colnames(L1sum) <- names(coef(mWinSum))
rownames(L1sum) <- c("Mean Intercept", levels(chicago[, "fside"]))
print(L1sum)
LSest(mWinSum, L = L1sum)
L2sum <- matrix(c(0,1,0,0, 0,0,0,1, 0,0,0,-1), nrow = 3, byrow = TRUE)
colnames(L2sum) <- names(coef(mWinSum))
rownames(L2sum) <- c("Mean Slope", levels(chicago[, "fside"]))
print(L2sum)
LSest(mWinSum, L = L2sum)
### We now improve the model by adding the information about insurance policies.
m1Win <- lm(log(fire) ~ (log2(minor) + insur) * fside, data = chicago)
summary(mWin) ## original model
summary(m1Win) ## "bigger" model
plotLM(m1Win)
### Note that the fitted regression planes describing the dependence
### of log(fire) on log(minor) and insur based on the m1Win model are the same
### as if the model is fitted separately in the North and South side of Chicago.
### Hence if we are interested in just a description of those
### planes (and are lazy to extract the appropriate intercepts
### and slopes from m1Win), we can fit separate North/South models
### and take the coefficients from there.
m1WinN <- lm(log(fire) ~ log2(minor) + insur, data = chicago,
subset = (fside == "North"))
m1WinS <- lm(log(fire) ~ log2(minor) + insur, data = chicago,
subset = (fside == "South"))
### Compare the regression coefficients
(beN <- coef(m1WinN))
(beS <- coef(m1WinS))
coef(m1Win)
rbind(beS, beN)
### Now compare the standard errors?
summary(m1Win)$coef
summary(m1WinN)$coef
summary(m1WinS)$coef
### Can you explain the differences which you observe in the outputs?
### -------------------------------------------------------------------------###
### Block 2: INDIVIDUAL WORK ###
### -------------------------------------------------------------------------###
### Standard model diagnostics in some more details:
### Plots for the standardized residuals against the predictor using lowess.
### Why is it useful? Explain, what can be concluded from such plot.
u1 <- rstandard(m[["Line"]])
plot(u1 ~ minor, data = chicago, pch = 21, col = "blue4", bg = "blue")
lines(lowess(chicago[, "minor"], u1), col = "red3", lwd = 2)
u2 <- rstandard(m[["Log2"]])
plot(u2 ~ log(minor), data = chicago, pch = 21, col = "blue4", bg = "blue")
lines(lowess(log(chicago[, "minor"]), u2), col = "red3", lwd = 2)
u3 <- rstandard(m[["Parab"]])
plot(u3 ~ minor, data = chicago, pch = 21, col = "blue4", bg = "blue")
lines(lowess(chicago[, "minor"], u3), col = "red3", lwd = 2)
### Improving the model 'mWin' (or 'mWinSum' respectively) by adding covariates.
### Which of the existing covariates contributes most significantly?
m0 <- lm(log(fire) ~ log(minor) * fside, data = chicago)
m1 <- list()
m1[["theft"]] <- lm(log(fire) ~ (log(minor) + theft) * fside, data = chicago)
m1[["old"]] <- lm(log(fire) ~ (log(minor) + old) * fside, data = chicago)
m1[["insur"]] <- lm(log(fire) ~ (log(minor) + insur) * fside, data = chicago)
m1[["income"]] <- lm(log(fire) ~ (log(minor) + income) * fside, data = chicago)
### What can be concluded from here?
anova(m0, m1[["theft"]])
anova(m0, m1[["old"]])
anova(m0, m1[["insur"]])
anova(m0, m1[["income"]])
### Which model would you prefer and what are the corresponding qualities of the
### final model?
### -------------------------------------------------------------------------###
### Block 3: ADDITIONAL MATERIAL ###
### -------------------------------------------------------------------------###
### The naked human eye is mostly used to read two dimensional plots but only a
### limited data structure can be described by two dimensional plots. More
### complex plot can be obtained by three dimensional plots which may be
### considered to be more fancy (but still of limited use)...
minorGrid <- seq(1, 100, length = 50)
insurGrid <- seq(0, 2.5, length = 50)
xyGrid <- expand.grid(minorGrid, insurGrid)
dim(xyGrid)
xyGrid[1:10,]
xyGrid[2491:2500,]
lFireN <- matrix(beN[1] + beN[2] * log(xyGrid[, 1]) + beN[3] * xyGrid[, 2],
nrow = length(minorGrid), ncol = length(insurGrid))
lFireS <- matrix(beS[1] + beS[2] * log(xyGrid[, 1]) + beS[3] * xyGrid[, 2],
nrow = length(minorGrid), ncol = length(insurGrid))
par(mfrow = c(1, 2))
persp(minorGrid, insurGrid, lFireN, xlab = "Minority", ylab = "Insurance",
zlab = "Log(Fire)", col = "antiquewhite2", main="North")
persp(minorGrid, insurGrid, lFireS, xlab = "Minority", ylab = "Insurance",
zlab = "Log(Fire)", col = "antiquewhite2", main="South")
par(mfrow = c(1, 1))
### A rotated version of the same plot
par(mfrow = c(1, 2))
persp(minorGrid, insurGrid, lFireN, xlab = "Minority", ylab = "Insurance",
zlab = "Log(Fire)", col = "antiquewhite2", theta = 60,
phi = 45, main="North")
persp(minorGrid, insurGrid, lFireS, xlab = "Minority", ylab = "Insurance",
zlab = "Log(Fire)", col = "antiquewhite2", theta = 60,
phi = 45, main="South")
par(mfrow = c(1, 1))
### Some improvement with the 'rgl' library
### (with an additional option to rotate the plot)
library(rgl)
### Plot for North of Chicago
persp3d(minorGrid, insurGrid, lFireN, xlab = "Minority", ylab = "Insurance",
zlab = "Log(Fire)", col = "aquamarine")
with(subset(chicago, fside == "North"), plot3d(minor, insur, log(fire),
add = TRUE, size = 7, col = "darkgreen"))
### Plot for South of Chicago
persp3d(minorGrid, insurGrid, lFireS, xlab = "Minority", ylab = "Insurance",
zlab = "Log(Fire)", col = "orange")
with(subset(chicago, fside == "South"), plot3d(minor, insur, log(fire),
add = TRUE, size = 5, col = "red"))
### Plot for both North and South of Chicago
persp3d(minorGrid, insurGrid, lFireN, xlab = "Minority", ylab = "Insurance",
zlab = "Log(Fire)", col = "aquamarine")
persp3d(minorGrid, insurGrid, lFireS, xlab = "Minority", ylab = "Insurance",
zlab = "Log(Fire)", col = "orange", add=T)
with(subset(chicago, fside == "North"), plot3d(minor, insur, log(fire),
add = TRUE, size = 7, col = "darkgreen"))
with(subset(chicago, fside == "South"), plot3d(minor, insur, log(fire),
add = TRUE, size = 7, col = "red"))