### -------------------------------------------------------------------------### ### NMSA407 - Linear Regression ### ### Winter Term 2020/2021 | Online ### ### ### ### EXERCISE #7 Regression models with interactions (log-transformation of Y)### ### -------------------------------------------------------------------------### ### -------------------------------------------------------------------------### ### Block 1: TEACHING MATERIAL ### ### -------------------------------------------------------------------------### library("mffSM") ### The command below assumes that a data file chicago.csv ### has been downloaded into your working directory. chicago <- read.csv("chicago.csv", header = TRUE) ### Alternatively, if the computer is connected to the internet, ### the data can be also loaded in by using the following command: chicago <- read.csv("http://www.karlin.mff.cuni.cz/~maciak/NMSA407/chicago.csv", header=T, stringsAsFactors = TRUE) ### Brief data description (Chicago insurance redlining in 47 districts in 1970) ### minor: percentage of minority (0 - 100) in a given district ### fire: number of fires per 100 households during the reference period ### theft: number of reported thefts per 1000 inhabitants ### old: percentage of residential units built before 1939 ### insur: number of policies per 100 households ### income: median income (USD 1000) per household ### side: location of the district within Chicago (0 = North, 1 = South) dim(chicago) head(chicago) summary(chicago) ### Create a factor variable fside with values "North" and "South" from the ## original 0-1 variable denoted as 'side' chicago <- transform(chicago, fside = factor(side, levels = 0:1, labels=c("North", "South"))) summary(chicago) ### Scatterplot matrix palette(c("darkblue", "red3", "olivedrab", rainbow_hcl(5))) scatterplotMatrix(~fire + minor + theft + old + insur + income, smooth = FALSE, data = chicago, pch = 16, cex=0.75) ### We start by looking for a model of dependence of the number of fires [fire] ### on the percentage of minority [minor]. We are also aware of the fact that ### the form of this dependence may differ in north and south parts of Chicago. XLAB <- "Minority (%)" YLAB <- "Fires (#/100 households)" COLS <- c(North = "darkgreen", South = "red") BGS <- c(North = "aquamarine", South = "orange") PCHS <- c(North = 21, South = 23) plot(fire ~ minor, col = COLS[fside], bg = BGS[fside], pch = PCHS[fside], xlab = XLAB, ylab = YLAB, data = chicago) legend(5, 40, legend = c("North", "South"), pch = PCHS, col = COLS, pt.bg = BGS, bty = "n") lines(with(subset(chicago, fside == "North"), lowess(minor, fire)), col = COLS["North"], lwd = 2) lines(with(subset(chicago, fside == "South"), lowess(minor, fire)), col = COLS["South"], lwd = 2) ### How would you interpret the plot? Do we need interactions in the model? ### Does the side of Chicago modify the relationship between the number of fires ### and the percentage of minority? m <- list() m[["Line"]] <- lm(fire ~ minor * fside, data = chicago) m[["Log2"]] <- lm(fire ~ log2(minor) * fside, data = chicago) m[["Parab"]] <- lm(fire ~ (minor + I(minor^2)) * fside, data = chicago) summary(m[["Line"]]) summary(m[["Log2"]]) summary(m[["Parab"]]) ### QUESTIONS: ### Based on the above output describe the effect of the percentage of minority ### on the number of fires. Describe also the effect of the side. ### Equivalently - the summary for all three models at once: lapply(m, summary) ### All R^2's at once sm <- lapply(m, summary) sapply(sm, "[[", "r.squared") sapply(sm, "[[", "adj.r.squared") ### Let us put them in a data.frame R2s <- data.frame(R2 = sapply(sm, "[[", "r.squared"), R2adj = sapply(sm, "[[", "adj.r.squared")) print(R2s) ### QUESTIONS ### Which model seems to be the best with respect to prediction ability? ### Which model would you prefer for the interpretation purposes? ### Fitted regression functions lpred <- 500 pminor <- seq(1, 100, length = lpred) pdata <- data.frame(minor = rep(pminor, 2), fside = factor(rep(c("North", "South"), each = lpred))) ### pdata pdata[1:10,] pdata[(lpred + 1):(lpred + 10),] ### Calculation of the fitted regression functions, ### again use vectorized calculation using lapply command. fit <- lapply(m, predict, newdata = pdata) print(fit[["Line"]][1:10]) print(fit[["Log2"]][1:10]) print(fit[["Parab"]][1:10]) ### Plotting the fitted regression curves for all three models layout(matrix(c(4,1,1,0, 2,2,3,3), nrow = 2, byrow = TRUE)) for (i in 1:length(fit)){ plot(fire ~ minor, col = COLS[fside], bg = BGS[fside], pch = PCHS[fside], xlab = XLAB, ylab = YLAB, data = chicago) lines(pminor, fit[[i]][1:lpred], col = COLS["North"], lwd = 2) lines(pminor, fit[[i]][(lpred + 1):(2 * lpred)], col = COLS["South"], lwd = 2) title(main = names(fit)[i]) } plot.new() legend(0.1, 0.9, legend = names(COLS), col = COLS, pch = PCHS, lty = 1, lwd = 3, title = "Side") par(mfrow = c(1, 1)) ### Do we need interaction terms in the given models? ### Which of the three suggested models seems to be the best one? ### Does any of the models suffer from with some issues related to the ### assumptions of the normal linear model? ### Graphical diagnostics of the model assumptions plotLM(m[["Line"]]) plotLM(m[["Log2"]]) plotLM(m[["Parab"]]) ################################################################################ ### When the residual variance increases with the response ### expectation, log-transformation of response often ### ensures a homoscedastic model. ### Scatterplots with the log-transformed response equipped with LOWESS LYLAB <- "Log(Fires)" par(mfrow = c(1, 1)) plot(log(fire) ~ minor, col = COLS[fside], bg = BGS[fside], pch = PCHS[fside], xlab = XLAB, ylab = LYLAB, data = chicago) legend(5, 3.5, legend = c("North", "South"), pch = PCHS, col = COLS, pt.bg = BGS, bty = "n") lines(with(subset(chicago, fside == "North"), lowess(minor, log(fire))), col = COLS["North"], lwd = 2) lines(with(subset(chicago, fside == "South"), lowess(minor, log(fire))), col = COLS["South"], lwd = 2) ### Does the plot looks better in some sense? Let's try the corresponding model. ### (note a connection to a multiplicative model for Y with log-normal errors) tm <- list() summary(tm[["Line"]] <- lm(log(fire) ~ minor * fside, data = chicago)); ### Are we still modeling the expectation EY or something else? ### What are the consequences for the interpretation of model parameters? ### Interpret the parameters of the model with the log-transformed response. summary(tm[["Log"]] <- lm(log(fire) ~ log2(minor) * fside, data = chicago)) ### Describe the effect of the percentage of minority on the number of fires. ### Describe the effect of the side of the riveron the number of fires. summary(tm[["Parab"]] <- lm(log(fire) ~ (minor + I(minor^2)) * fside, data = chicago)) ### Comparing the models with the log-transformed response (stm <- lapply(tm, summary)) sapply(stm, "[[", "r.squared") sapply(stm, "[[", "adj.r.squared") ### Does it make sense to compare the following two sets of R^2's? ### If yes, why? If no, why? sapply(sm, "[[", "r.squared") ## models for Y sapply(stm, "[[", "r.squared") ## models for log(Y) ### SST for both models (datasets): sum((chicago$fire - mean(chicago$fire))^2) sum((log(chicago$fire) - mean(log(chicago$fire)))^2) ### Fitted regression functions tfit <- lapply(tm, predict, newdata = pdata) ### Plotting again the fitted regression curves for all three models layout(matrix(c(4,1,1,0, 2,2,3,3), nrow = 2, byrow = TRUE)) for (i in 1:length(fit)){ plot(log(fire) ~ minor, col = COLS[fside], bg = BGS[fside], pch = PCHS[fside], xlab = XLAB, ylab = LYLAB, data = chicago) lines(pminor, tfit[[i]][1:lpred], col = COLS["North"], lwd = 2) lines(pminor, tfit[[i]][(lpred + 1):(2 * lpred)], col = COLS["South"],lwd = 2) title(main = names(tfit)[i]) } plot.new() legend(0.1, 0.9, legend = names(COLS), col = COLS, pch = PCHS, lty = 1, lwd = 3, title = "Side") par(mfrow = c(1, 1)) ### Alternative view with a more detailed comparison par(mfrow = c(1, 2)) for (ss in levels(chicago[, "fside"])){ plot(log(fire) ~ minor, col = COLS[fside], bg = BGS[fside], pch = PCHS[fside], xlab = XLAB, ylab = LYLAB, data = subset(chicago, fside == ss), xlim = range(chicago[, "minor"]), ylim = log(range(chicago[, "fire"]))) ind <- if (ss == "North") 1:lpred else (lpred + 1):(2 * lpred) lines(pminor, tfit[["Log"]][ind], col = COLS[ss], lwd = 2, lty = 1) lines(pminor, tfit[["Parab"]][ind], col = COLS[ss], lwd = 2, lty = 5) title(main = ss) legend(40, 1.5, legend = c("Log", "Parab"), title = "Model", col = COLS[ss], lwd = 2, lty = c(1, 5)) } par(mfrow = c(1, 1)) ### Which model seems to be the best one? plotLM(tm[["Line"]]) plotLM(tm[["Log"]]) plotLM(tm[["Parab"]]) sapply(stm, "[[", "r.squared") sapply(stm, "[[", "adj.r.squared") ### Let's take the "Log" model for the following... mWin <- tm[["Log"]] summary(mWin) ### Does the side of Chicago significantly modify the relationship between the ### log(minor) and log(fire)? What are we estimating in the following? Lsl <- matrix(c(0,1,0,0, 0,1,0,1), nrow = 2, byrow = TRUE) colnames(Lsl) <- names(coef(mWin)) rownames(Lsl) <- c("North", "South") print(Lsl) LSest(mWin, L = Lsl) ### What is the connection with the plots below? LXLAB <- "Log(Minority)" LYLAB <- "Log(Fires)" par(mfrow = c(1, 2)) for (ss in levels(chicago[, "fside"])){ plot(log(fire) ~ log(minor), col = COLS[fside], bg = BGS[fside], pch = PCHS[fside], xlab = LXLAB, ylab = LYLAB, xlim = log(range(chicago[, "minor"])), ylim = log(range(chicago[, "fire"])), data = subset(chicago, fside == ss)) ind <- if (ss == "North") 1:lpred else (lpred + 1):(2 * lpred) lines(log(pminor), tfit[["Log"]][ind], col = COLS[ss], lwd = 2, lty = 1) title(main = ss) } par(mfrow = c(1, 1)) ### What is the meaning of the numbers below? LSm <- LSest(mWin, L = Lsl) someEstim <- data.frame(Estimate = exp(LSm[["Estimate"]]), Lower = exp(LSm[["Lower"]]), Upper = exp(LSm[["Upper"]])) print(someEstim) ### The fitted regression functions including the confidence band ### AROUND the regression function and the prediction band. cifit <-predict(tm[["Log"]], newdata = pdata, interval = "confidence") pfit <-predict(tm[["Log"]], newdata = pdata, interval = "prediction") YLIM <- range(log(chicago[, "fire"]), pfit) XLIM <- range(chicago[, "minor"]) par(mfrow = c(1, 2)) for (ss in levels(chicago[, "fside"])){ plot(log(fire) ~ minor, col = COLS[fside], bg = BGS[fside], pch = PCHS[fside], xlab = XLAB, ylab = LYLAB, xlim = XLIM, ylim = YLIM, data = subset(chicago, fside == ss), main = ss) ind <- if (ss == "North") 1:lpred else (lpred + 1):(2 * lpred) lines(pminor, cifit[ind, "fit"], col = COLS[ss], lwd = 3) lines(pminor, cifit[ind, "lwr"], col = COLS[ss], lwd = 2, lty = 2) lines(pminor, cifit[ind, "upr"], col = COLS[ss], lwd = 2, lty = 2) lines(pminor, pfit[ind, "lwr"], col = COLS[ss], lwd = 2, lty = 3) lines(pminor, pfit[ind, "upr"], col = COLS[ss], lwd = 2, lty = 3) legend(x=25, y=1, lwd=c(3,2,2), lty=c(1,2,3), col=COLS[ss], legend=c("estim. E[log(Y)|X=x]", "conf. band around E[log(Y)|X=x]", "prediction band for log(Y)")) } par(mfrow = c(1, 1)) ### Note that based on the mWin model one can only make prediction bands YLIM <- range(chicago[, "fire"], exp(pfit[, "upr"])) par(mfrow = c(1, 2)) for (ss in levels(chicago[, "fside"])){ plot(fire ~ minor, col = COLS[fside], bg = BGS[fside], pch = PCHS[fside], xlab = XLAB, ylab = "Fires", xlim = XLIM, ylim = YLIM, data = subset(chicago, fside == ss), main = ss) ind <- if (ss == "North") 1:lpred else (lpred + 1):(2 * lpred) lines(pminor, exp(pfit[ind, "lwr"]), col = COLS[ss], lwd = 2, lty = 3) lines(pminor, exp(pfit[ind, "upr"]), col = COLS[ss], lwd = 2, lty = 3) legend(x=1, y=70, lwd=c(2), lty=c(3), col=COLS[ss], legend=c("prediction band for Y")) } par(mfrow = c(1, 1)) ### Different parameterization of the winning model. mWinSum <- lm(log(fire) ~ log(minor) * fside, data = chicago, contrasts = list(fside = contr.sum)) summary(mWinSum) ### What is the interpretation of the estimated parameters and the numbers ### providsed in the summary output? contr.sum(2) L1sum <- matrix(c(1,0,0,0, 0,0,1,0, 0,0,-1,0), nrow = 3, byrow = TRUE) colnames(L1sum) <- names(coef(mWinSum)) rownames(L1sum) <- c("Mean Intercept", levels(chicago[, "fside"])) print(L1sum) LSest(mWinSum, L = L1sum) L2sum <- matrix(c(0,1,0,0, 0,0,0,1, 0,0,0,-1), nrow = 3, byrow = TRUE) colnames(L2sum) <- names(coef(mWinSum)) rownames(L2sum) <- c("Mean Slope", levels(chicago[, "fside"])) print(L2sum) LSest(mWinSum, L = L2sum) ### We now improve the model by adding the information about insurance policies. m1Win <- lm(log(fire) ~ (log(minor) + insur) * fside, data = chicago) summary(m0) ## original model summary(m1Win) ## "bigger" model plotLM(m1Win) ### Note that the fitted regression planes describing the dependence ### of log(fire) on log(minor) and insur based on the m1Win model are the same ### as if the model is fitted separately in the North and South side of Chicago. ### Hence if we are interested in just a description of those ### planes (and are lazy to extract the appropriate intercepts ### and slopes from m1Win), we can fit separate North/South models ### and take the coefficients from there. m1WinN <- lm(log(fire) ~ log(minor) + insur, data = chicago, subset = (fside == "North")) m1WinS <- lm(log(fire) ~ log(minor) + insur, data = chicago, subset = (fside == "South")) ### Compare the regression coefficients (beN <- coef(m1WinN)) (beS <- coef(m1WinS)) coef(m1Win) rbind(beS, beN) ### Now compare the standard errors? summary(m1Win)$coef summary(m1WinN)$coef summary(m1WinS)$coef ### Can you explain the differences which you observe in the outputs? ### -------------------------------------------------------------------------### ### Block 2: INDIVIDUAL WORK ### ### -------------------------------------------------------------------------### ### Standard model diagnostics in some more details: ### Plots for the standardized residuals against the predictor using lowess. ### Why is it useful? Explain, what can be concluded from such plot. u1 <- rstandard(m[["Line"]]) plot(u1 ~ minor, data = chicago, pch = 21, col = "blue4", bg = "blue") lines(lowess(chicago[, "minor"], u1), col = "red3", lwd = 2) u2 <- rstandard(m[["Log2"]]) plot(u2 ~ log(minor), data = chicago, pch = 21, col = "blue4", bg = "blue") lines(lowess(log(chicago[, "minor"]), u2), col = "red3", lwd = 2) u3 <- rstandard(m[["Parab"]]) plot(u3 ~ minor, data = chicago, pch = 21, col = "blue4", bg = "blue") lines(lowess(chicago[, "minor"], u3), col = "red3", lwd = 2) ### Improving the model 'mWin' (or 'mWinSum' respectively) by adding covariates. ### Which of the existing covariates contributes most significantly? m0 <- lm(log(fire) ~ log(minor) * fside, data = chicago) m1 <- list() m1[["theft"]] <- lm(log(fire) ~ (log(minor) + theft) * fside, data = chicago) m1[["old"]] <- lm(log(fire) ~ (log(minor) + old) * fside, data = chicago) m1[["insur"]] <- lm(log(fire) ~ (log(minor) + insur) * fside, data = chicago) m1[["income"]] <- lm(log(fire) ~ (log(minor) + income) * fside, data = chicago) ### What can be concluded from here? anova(m0, m1[["theft"]]) anova(m0, m1[["old"]]) anova(m0, m1[["insur"]]) anova(m0, m1[["income"]]) ### Which model would you prefer and what are the corresponding qualities of the ### final model? ### -------------------------------------------------------------------------### ### Block 3: ADDITIONAL MATERIAL ### ### -------------------------------------------------------------------------### ### The naked human eye is mostly used to read two dimensional plots but only a ### limited data structure can be described by two dimensional plots. More ### complex plot can be obtained by three dimensional plots which may be ### considered to be more fancy (but still of limited use)... minorGrid <- seq(1, 100, length = 50) insurGrid <- seq(0, 2.5, length = 50) xyGrid <- expand.grid(minorGrid, insurGrid) dim(xyGrid) xyGrid[1:10,] xyGrid[2491:2500,] lFireN <- matrix(beN[1] + beN[2] * log(xyGrid[, 1]) + beN[3] * xyGrid[, 2], nrow = length(minorGrid), ncol = length(insurGrid)) lFireS <- matrix(beS[1] + beS[2] * log(xyGrid[, 1]) + beS[3] * xyGrid[, 2], nrow = length(minorGrid), ncol = length(insurGrid)) par(mfrow = c(1, 2)) persp(minorGrid, insurGrid, lFireN, xlab = "Minority", ylab = "Insurance", zlab = "Log(Fire)", col = "antiquewhite2", main="North") persp(minorGrid, insurGrid, lFireS, xlab = "Minority", ylab = "Insurance", zlab = "Log(Fire)", col = "antiquewhite2", main="South") par(mfrow = c(1, 1)) ### A rotated version of the same plot par(mfrow = c(1, 2)) persp(minorGrid, insurGrid, lFireN, xlab = "Minority", ylab = "Insurance", zlab = "Log(Fire)", col = "antiquewhite2", theta = 60, phi = 45, main="North") persp(minorGrid, insurGrid, lFireS, xlab = "Minority", ylab = "Insurance", zlab = "Log(Fire)", col = "antiquewhite2", theta = 60, phi = 45, main="South") par(mfrow = c(1, 1)) ### Some improvement with the 'rgl' library ### (with an additional option to rotate the plot) library(rgl) ### Plot for North of Chicago persp3d(minorGrid, insurGrid, lFireN, xlab = "Minority", ylab = "Insurance", zlab = "Log(Fire)", col = "aquamarine") with(subset(chicago, fside == "North"), plot3d(minor, insur, log(fire), add = TRUE, size = 7, col = "darkgreen")) ### Plot for South of Chicago persp3d(minorGrid, insurGrid, lFireS, xlab = "Minority", ylab = "Insurance", zlab = "Log(Fire)", col = "orange") with(subset(chicago, fside == "South"), plot3d(minor, insur, log(fire), add = TRUE, size = 5, col = "red")) ### Plot for both North and South of Chicago persp3d(minorGrid, insurGrid, lFireN, xlab = "Minority", ylab = "Insurance", zlab = "Log(Fire)", col = "aquamarine") persp3d(minorGrid, insurGrid, lFireS, xlab = "Minority", ylab = "Insurance", zlab = "Log(Fire)", col = "orange", add=T) with(subset(chicago, fside == "North"), plot3d(minor, insur, log(fire), add = TRUE, size = 7, col = "darkgreen")) with(subset(chicago, fside == "South"), plot3d(minor, insur, log(fire), add = TRUE, size = 7, col = "red"))