### -------------------------------------------------------------------------###
### NMSA407 - Linear Regression                                              ###
### Winter Term 2020/2021 | Online                                           ###
###                                                                          ###
### EXERCISE #7 Regression models with interactions (log-transformation of Y)###
### -------------------------------------------------------------------------###

### -------------------------------------------------------------------------###
###  Block 1: TEACHING MATERIAL                                              ###
### -------------------------------------------------------------------------###  


library("mffSM")

### The command below assumes that a data file chicago.csv
### has been downloaded into your working directory.

chicago <- read.csv("chicago.csv", header = TRUE)

### Alternatively, if the computer is connected to the internet,
### the data can be also loaded in by using the following command: 

chicago <- read.csv("http://www.karlin.mff.cuni.cz/~maciak/NMSA407/chicago.csv", 
                    header=T, stringsAsFactors = TRUE)

### Brief data description (Chicago insurance redlining in 47 districts in 1970)
### minor:    percentage of minority (0 - 100) in a given district
### fire:     number of fires per 100 households during the reference period
### theft:    number of reported thefts per 1000 inhabitants
### old:      percentage of residential units built before 1939
### insur:    number of policies per 100 households
### income:   median income (USD 1000) per household
### side:     location of the district within Chicago (0 = North, 1 = South)


dim(chicago)
head(chicago)
summary(chicago)

### Create a factor variable fside with values "North" and "South" from the 
## original 0-1 variable denoted as 'side'
chicago <- transform(chicago, fside = factor(side, levels = 0:1, 
                                      labels=c("North", "South")))
summary(chicago)


### Scatterplot matrix
palette(c("darkblue", "red3", "olivedrab", rainbow_hcl(5)))
scatterplotMatrix(~fire + minor + theft + old + insur + income,
                    smooth = FALSE, data = chicago, pch = 16, cex=0.75)


### We start by looking for a model of dependence of the number of fires [fire]
### on the percentage of minority [minor]. We are also aware of the fact that 
### the form of this dependence may differ in north and south parts of Chicago.

XLAB <- "Minority (%)"
YLAB <- "Fires (#/100 households)"
COLS <- c(North = "darkgreen", South = "red")
BGS <- c(North = "aquamarine", South = "orange")
PCHS <- c(North = 21, South = 23)
plot(fire ~ minor, col = COLS[fside], bg = BGS[fside], pch = PCHS[fside], 
                   xlab = XLAB, ylab = YLAB, data = chicago)
legend(5, 40, legend = c("North", "South"), pch = PCHS, 
              col = COLS, pt.bg = BGS, bty = "n")
lines(with(subset(chicago, fside == "North"), lowess(minor, fire)), 
                                              col = COLS["North"], lwd = 2)
lines(with(subset(chicago, fside == "South"), lowess(minor, fire)), 
                                              col = COLS["South"], lwd = 2)


### How would you interpret the plot? Do we need interactions in the model?
### Does the side of Chicago modify the relationship between the number of fires
### and the percentage of minority?

m <- list()
m[["Line"]]  <- lm(fire ~ minor * fside, data = chicago)
m[["Log2"]]   <- lm(fire ~ log2(minor) * fside, data = chicago)
m[["Parab"]] <- lm(fire ~ (minor + I(minor^2)) * fside, data = chicago)

summary(m[["Line"]])
summary(m[["Log2"]])
summary(m[["Parab"]])

### QUESTIONS:
### Based on the above output describe the effect of the percentage of minority 
### on the number of fires. Describe also the effect of the side.  

### Equivalently - the summary for all three models at once:
lapply(m, summary)

### All R^2's at once
sm <- lapply(m, summary)
sapply(sm, "[[", "r.squared")
sapply(sm, "[[", "adj.r.squared")


### Let us put them in a data.frame
R2s <- data.frame(R2    = sapply(sm, "[[", "r.squared"),
                  R2adj = sapply(sm, "[[", "adj.r.squared"))
print(R2s)
    
### QUESTIONS
### Which model seems to be the best with respect to prediction ability?
### Which model would you prefer for the interpretation purposes?


### Fitted regression functions
lpred <- 500
pminor <- seq(1, 100, length = lpred)
pdata <- data.frame(minor = rep(pminor, 2), 
                    fside = factor(rep(c("North", "South"), 
                    each = lpred)))

### pdata
pdata[1:10,]
pdata[(lpred + 1):(lpred + 10),]

### Calculation of the fitted regression functions,
### again use vectorized calculation using lapply command.
fit <- lapply(m, predict, newdata = pdata)

print(fit[["Line"]][1:10])
print(fit[["Log2"]][1:10])
print(fit[["Parab"]][1:10])

### Plotting the fitted regression curves for all three models
layout(matrix(c(4,1,1,0, 2,2,3,3), nrow = 2, byrow = TRUE))
for (i in 1:length(fit)){
  plot(fire ~ minor, col = COLS[fside], bg = BGS[fside], pch = PCHS[fside], 
                     xlab = XLAB, ylab = YLAB, data = chicago)
  lines(pminor, fit[[i]][1:lpred], col = COLS["North"], lwd = 2)
  lines(pminor, fit[[i]][(lpred + 1):(2 * lpred)], col = COLS["South"], lwd = 2)
  title(main = names(fit)[i])
}

plot.new()
legend(0.1, 0.9, legend = names(COLS), col = COLS, pch = PCHS, lty = 1, lwd = 3, 
                 title = "Side")
par(mfrow = c(1, 1))


### Do we need interaction terms in the given models?
### Which of the three suggested models seems to be the best one?
### Does any of the models suffer from with some issues related to the 
### assumptions of the normal linear model?
  
### Graphical diagnostics of the model assumptions
plotLM(m[["Line"]])
plotLM(m[["Log2"]])
plotLM(m[["Parab"]])


################################################################################
### When the residual variance increases with the response
### expectation, log-transformation of response often
### ensures a homoscedastic model.

### Scatterplots with the log-transformed response equipped with LOWESS 
LYLAB <- "Log(Fires)"
par(mfrow = c(1, 1))
plot(log(fire) ~ minor, col = COLS[fside], bg = BGS[fside], pch = PCHS[fside], 
                        xlab = XLAB, ylab = LYLAB, data = chicago)
legend(5, 3.5, legend = c("North", "South"), pch = PCHS, 
               col = COLS, pt.bg = BGS, bty = "n")
lines(with(subset(chicago, fside == "North"), lowess(minor, log(fire))), 
                                              col = COLS["North"], lwd = 2)
lines(with(subset(chicago, fside == "South"), lowess(minor, log(fire))), 
                                              col = COLS["South"], lwd = 2)

### Does the plot looks better in some sense? Let's try the corresponding model.
### (note a connection to a multiplicative model for Y with log-normal errors)

tm <- list() 
summary(tm[["Line"]]  <- lm(log(fire) ~ minor * fside, data = chicago));

### Are we still modeling the expectation EY or something else?
### What are the consequences for the interpretation of model parameters?
### Interpret the parameters of the model with the log-transformed response.

summary(tm[["Log"]]   <- lm(log(fire) ~ log2(minor) * fside, data = chicago))

### Describe the effect of the percentage of minority on the number of fires.  
### Describe the effect of the side of the riveron the number of fires.  

summary(tm[["Parab"]] <- lm(log(fire) ~ (minor + I(minor^2)) * fside, data = chicago))


### Comparing the models with the log-transformed response
(stm <- lapply(tm, summary))
sapply(stm, "[[", "r.squared")
sapply(stm, "[[", "adj.r.squared")


### Does it make sense to compare the following two sets of R^2's?
### If yes, why? If no, why?
sapply(sm, "[[", "r.squared")      ## models for Y
sapply(stm, "[[", "r.squared")     ## models for log(Y)

### SST for both models (datasets): 
sum((chicago$fire - mean(chicago$fire))^2)
sum((log(chicago$fire) - mean(log(chicago$fire)))^2)


### Fitted regression functions
tfit <- lapply(tm, predict, newdata = pdata)

### Plotting again the fitted regression curves for all three models
layout(matrix(c(4,1,1,0, 2,2,3,3), nrow = 2, byrow = TRUE))
for (i in 1:length(fit)){
  plot(log(fire) ~ minor, col = COLS[fside], bg = BGS[fside], pch = PCHS[fside],
                          xlab = XLAB, ylab = LYLAB, data = chicago)
  lines(pminor, tfit[[i]][1:lpred], col = COLS["North"], lwd = 2)
  lines(pminor, tfit[[i]][(lpred + 1):(2 * lpred)], col = COLS["South"],lwd = 2)
  title(main = names(tfit)[i])  
}  
plot.new()
legend(0.1, 0.9, legend = names(COLS), col = COLS, pch = PCHS, lty = 1, lwd = 3, 
                 title = "Side")
par(mfrow = c(1, 1))


### Alternative view with a more detailed comparison
par(mfrow = c(1, 2))
for (ss in levels(chicago[, "fside"])){
  plot(log(fire) ~ minor, col = COLS[fside], bg = BGS[fside], pch = PCHS[fside], 
                          xlab = XLAB, ylab = LYLAB, 
                          data = subset(chicago, fside == ss), 
                          xlim = range(chicago[, "minor"]), 
                          ylim = log(range(chicago[, "fire"])))
  ind <- if (ss == "North") 1:lpred else (lpred + 1):(2 * lpred)
  lines(pminor, tfit[["Log"]][ind], col = COLS[ss], lwd = 2, lty = 1)
  lines(pminor, tfit[["Parab"]][ind], col = COLS[ss], lwd = 2, lty = 5)
  title(main = ss)
  legend(40, 1.5, legend = c("Log", "Parab"), title = "Model", 
                  col = COLS[ss], lwd = 2, lty = c(1, 5))
}  
par(mfrow = c(1, 1))


### Which model seems to be the best one?

plotLM(tm[["Line"]])
plotLM(tm[["Log"]])
plotLM(tm[["Parab"]])

sapply(stm, "[[", "r.squared")
sapply(stm, "[[", "adj.r.squared")


### Let's take the  "Log" model for the following... 
mWin <- tm[["Log"]]
summary(mWin)

 
### Does the side of Chicago significantly modify  the relationship between the 
### log(minor) and log(fire)? What are we estimating in the following?

Lsl <- matrix(c(0,1,0,0, 0,1,0,1), nrow = 2, byrow = TRUE)
colnames(Lsl) <- names(coef(mWin))
rownames(Lsl) <- c("North", "South")
print(Lsl)

LSest(mWin, L = Lsl)

### What is the connection with the plots below?

LXLAB <- "Log(Minority)"
LYLAB <- "Log(Fires)"
par(mfrow = c(1, 2))
for (ss in levels(chicago[, "fside"])){
  plot(log(fire) ~ log(minor), col = COLS[fside], bg = BGS[fside], 
                               pch = PCHS[fside], xlab = LXLAB, ylab = LYLAB, 
                               xlim = log(range(chicago[, "minor"])), 
                               ylim = log(range(chicago[, "fire"])),
                               data = subset(chicago, fside == ss))
  ind <- if (ss == "North") 1:lpred else (lpred + 1):(2 * lpred)
  lines(log(pminor), tfit[["Log"]][ind], col = COLS[ss], lwd = 2, lty = 1)
  title(main = ss)
}  
par(mfrow = c(1, 1))


### What is the meaning of the numbers below?
LSm <- LSest(mWin, L = Lsl)
someEstim <- data.frame(Estimate = exp(LSm[["Estimate"]]),                        
                        Lower    = exp(LSm[["Lower"]]),
                        Upper    = exp(LSm[["Upper"]]))
print(someEstim)


### The fitted regression functions including the confidence band
### AROUND the regression function and the prediction band.
cifit <-predict(tm[["Log"]], newdata = pdata, interval = "confidence")
pfit <-predict(tm[["Log"]], newdata = pdata, interval = "prediction")

YLIM <- range(log(chicago[, "fire"]), pfit)
XLIM <- range(chicago[, "minor"])

par(mfrow = c(1, 2))
for (ss in levels(chicago[, "fside"])){
  plot(log(fire) ~ minor, col = COLS[fside], bg = BGS[fside], pch = PCHS[fside],
       xlab = XLAB, ylab = LYLAB, xlim = XLIM, ylim = YLIM,
       data = subset(chicago, fside == ss), main = ss)
  ind <- if (ss == "North") 1:lpred else (lpred + 1):(2 * lpred)
  lines(pminor, cifit[ind, "fit"], col = COLS[ss], lwd = 3)
  lines(pminor, cifit[ind, "lwr"], col = COLS[ss], lwd = 2, lty = 2)
  lines(pminor, cifit[ind, "upr"], col = COLS[ss], lwd = 2, lty = 2)
  lines(pminor, pfit[ind, "lwr"], col = COLS[ss], lwd = 2, lty = 3)
  lines(pminor, pfit[ind, "upr"], col = COLS[ss], lwd = 2, lty = 3)
	legend(x=25, y=1, lwd=c(3,2,2), lty=c(1,2,3), col=COLS[ss], 
	             legend=c("estim. E[log(Y)|X=x]",
	                      "conf. band around E[log(Y)|X=x]", 
	                      "prediction band for log(Y)"))
}  
par(mfrow = c(1, 1))


### Note that based on the mWin model one can only make prediction bands 
YLIM <- range(chicago[, "fire"], exp(pfit[, "upr"]))
par(mfrow = c(1, 2))
for (ss in levels(chicago[, "fside"])){
  plot(fire ~ minor, col = COLS[fside], bg = BGS[fside], pch = PCHS[fside],
       xlab = XLAB, ylab = "Fires", xlim = XLIM, ylim = YLIM,
       data = subset(chicago, fside == ss), main = ss)
  ind <- if (ss == "North") 1:lpred else (lpred + 1):(2 * lpred)
  lines(pminor, exp(pfit[ind, "lwr"]), col = COLS[ss], lwd = 2, lty = 3)
  lines(pminor, exp(pfit[ind, "upr"]), col = COLS[ss], lwd = 2, lty = 3)
	legend(x=1, y=70, lwd=c(2), lty=c(3), col=COLS[ss], 
	                  legend=c("prediction band for Y"))
}  
par(mfrow = c(1, 1))


### Different parameterization of the winning model.
mWinSum <- lm(log(fire) ~ log(minor) * fside, data = chicago, 
                          contrasts = list(fside = contr.sum))
summary(mWinSum)


### What is  the interpretation of the estimated parameters and the  numbers 
### providsed in the summary output?

contr.sum(2)

L1sum <- matrix(c(1,0,0,0, 0,0,1,0, 0,0,-1,0), nrow = 3, byrow = TRUE)
colnames(L1sum) <- names(coef(mWinSum))
rownames(L1sum) <- c("Mean Intercept", levels(chicago[, "fside"]))
print(L1sum)
LSest(mWinSum, L = L1sum)

L2sum <- matrix(c(0,1,0,0, 0,0,0,1, 0,0,0,-1), nrow = 3, byrow = TRUE)
colnames(L2sum) <- names(coef(mWinSum))
rownames(L2sum) <- c("Mean Slope", levels(chicago[, "fside"]))
print(L2sum)
LSest(mWinSum, L = L2sum)


### We now improve the model by adding the information about insurance policies.
m1Win <- lm(log(fire) ~ (log(minor) + insur) * fside, data = chicago)

summary(m0)          ## original model 
summary(m1Win)       ## "bigger" model

plotLM(m1Win)


### Note that the fitted regression planes describing the dependence 
### of log(fire) on log(minor) and insur based on the m1Win model are the same 
### as if the model is fitted separately in the North and South side of Chicago.

### Hence if we are interested in just a description of those
### planes (and are lazy to extract the appropriate intercepts
### and slopes from m1Win), we can fit separate North/South models
### and take the coefficients from there.

m1WinN <- lm(log(fire) ~ log(minor) + insur, data = chicago, 
                                             subset = (fside == "North"))
m1WinS <- lm(log(fire) ~ log(minor) + insur, data = chicago, 
                                             subset = (fside == "South")) 

### Compare the regression coefficients 
(beN <- coef(m1WinN))
(beS <- coef(m1WinS))

coef(m1Win)
rbind(beS, beN)

### Now compare the standard errors?  
summary(m1Win)$coef

summary(m1WinN)$coef 
summary(m1WinS)$coef

### Can you explain the differences which you observe in the outputs? 


### -------------------------------------------------------------------------###
###  Block 2: INDIVIDUAL WORK                                                ###
### -------------------------------------------------------------------------###

### Standard model diagnostics in some more details:
### Plots for the standardized residuals against the predictor using lowess. 
### Why is it useful? Explain, what can be concluded from such plot.

u1 <- rstandard(m[["Line"]])
plot(u1 ~ minor, data = chicago, pch = 21, col = "blue4", bg = "blue")
lines(lowess(chicago[, "minor"], u1), col = "red3", lwd = 2)
 
u2 <- rstandard(m[["Log2"]])
plot(u2 ~ log(minor), data = chicago, pch = 21, col = "blue4", bg = "blue")
lines(lowess(log(chicago[, "minor"]), u2), col = "red3", lwd = 2)

u3 <- rstandard(m[["Parab"]])
plot(u3 ~ minor, data = chicago, pch = 21, col = "blue4", bg = "blue")
lines(lowess(chicago[, "minor"], u3), col = "red3", lwd = 2)


### Improving the model 'mWin' (or 'mWinSum' respectively) by adding covariates.
### Which of the existing covariates contributes most significantly? 
m0 <- lm(log(fire) ~ log(minor) * fside, data = chicago)

m1 <- list()
m1[["theft"]] <- lm(log(fire) ~ (log(minor) + theft) * fside, data = chicago)
m1[["old"]] <- lm(log(fire) ~ (log(minor) + old) * fside, data = chicago)
m1[["insur"]] <- lm(log(fire) ~ (log(minor) + insur) * fside, data = chicago)
m1[["income"]] <- lm(log(fire) ~ (log(minor) + income) * fside, data = chicago)


### What can be concluded from here?
anova(m0, m1[["theft"]])
anova(m0, m1[["old"]])
anova(m0, m1[["insur"]])
anova(m0, m1[["income"]])

### Which model would you prefer and what are the corresponding qualities of the 
### final model? 


### -------------------------------------------------------------------------###
###  Block 3: ADDITIONAL MATERIAL                                            ###
### -------------------------------------------------------------------------###

### The naked human eye is mostly used to read two dimensional plots but only a
### limited data structure can be described by two dimensional plots. More 
### complex plot can be obtained by three dimensional plots which may be 
### considered to be more  fancy (but still of limited use)...

minorGrid <- seq(1, 100, length = 50)
insurGrid <- seq(0, 2.5, length = 50)
xyGrid <- expand.grid(minorGrid, insurGrid)
dim(xyGrid)
xyGrid[1:10,]
xyGrid[2491:2500,]

lFireN <- matrix(beN[1] + beN[2] * log(xyGrid[, 1]) + beN[3] * xyGrid[, 2],
                          nrow = length(minorGrid), ncol = length(insurGrid))
lFireS <- matrix(beS[1] + beS[2] * log(xyGrid[, 1]) + beS[3] * xyGrid[, 2], 
                          nrow = length(minorGrid), ncol = length(insurGrid))

par(mfrow = c(1, 2))
persp(minorGrid, insurGrid, lFireN, xlab = "Minority", ylab = "Insurance", 
                 zlab = "Log(Fire)", col = "antiquewhite2", main="North")
persp(minorGrid, insurGrid, lFireS, xlab = "Minority", ylab = "Insurance",
                 zlab = "Log(Fire)", col = "antiquewhite2", main="South")
par(mfrow = c(1, 1))


### A rotated version of the same plot 
par(mfrow = c(1, 2))
persp(minorGrid, insurGrid, lFireN, xlab = "Minority", ylab = "Insurance", 
                 zlab = "Log(Fire)", col = "antiquewhite2", theta = 60, 
                 phi = 45, main="North")
persp(minorGrid, insurGrid, lFireS, xlab = "Minority", ylab = "Insurance", 
                 zlab = "Log(Fire)", col = "antiquewhite2", theta = 60, 
                 phi = 45, main="South")
par(mfrow = c(1, 1))


### Some improvement with the 'rgl' library 
### (with an additional option to rotate the plot)

library(rgl)
### Plot for North of Chicago
persp3d(minorGrid, insurGrid, lFireN, xlab = "Minority", ylab = "Insurance", 
                                      zlab = "Log(Fire)", col = "aquamarine")
with(subset(chicago, fside == "North"), plot3d(minor, insur, log(fire), 
                                       add = TRUE, size = 7, col = "darkgreen"))


### Plot for South of Chicago
persp3d(minorGrid, insurGrid, lFireS, xlab = "Minority", ylab = "Insurance", 
                                      zlab = "Log(Fire)", col = "orange")
with(subset(chicago, fside == "South"), plot3d(minor, insur, log(fire), 
                                        add = TRUE, size = 5, col = "red"))

### Plot for both North and South of Chicago
persp3d(minorGrid, insurGrid, lFireN, xlab = "Minority", ylab = "Insurance", 
                                      zlab = "Log(Fire)", col = "aquamarine")
persp3d(minorGrid, insurGrid, lFireS, xlab = "Minority", ylab = "Insurance", 
                                      zlab = "Log(Fire)", col = "orange", add=T) 
with(subset(chicago, fside == "North"), plot3d(minor, insur, log(fire), 
                                      add = TRUE, size = 7, col = "darkgreen"))
with(subset(chicago, fside == "South"), plot3d(minor, insur, log(fire), 
                                        add = TRUE, size = 7, col = "red"))