NMSA407 Linear Regression: Tutorial

Numeric covariate: simple transformation, polynomial regression, regression splines

Data Houses1987




Introduction

Load used data and calculate basic summaries

data(Houses1987, package = "mffSM")
head(Houses1987)
##   price ground bed bath floor garage airco gas fbed fbath ffloor fgarage fairco fgas
## 1 42000    544   3    1     2      1     0   0    3     1      2       1     No   No
## 2 38500    372   2    1     1      0     0   0  <=2     1      1       0     No   No
## 3 49500    285   3    1     1      0     0   0    3     1      1       0     No   No
## 4 60500    619   3    1     2      0     0   0    3     1      2       0     No   No
## 5 61000    592   2    1     1      0     0   0  <=2     1      1       0     No   No
## 6 66000    387   3    1     1      0     1   0    3     1      1       0    Yes   No
dim(Houses1987)
## [1] 546  14
summary(Houses1987)
##      price            ground            bed             bath           floor           garage      
##  Min.   : 25000   Min.   : 153.0   Min.   :1.000   Min.   :1.000   Min.   :1.000   Min.   :0.0000  
##  1st Qu.: 49125   1st Qu.: 335.0   1st Qu.:2.000   1st Qu.:1.000   1st Qu.:1.000   1st Qu.:0.0000  
##  Median : 62000   Median : 428.0   Median :3.000   Median :1.000   Median :2.000   Median :0.0000  
##  Mean   : 68122   Mean   : 479.1   Mean   :2.965   Mean   :1.286   Mean   :1.808   Mean   :0.6923  
##  3rd Qu.: 82000   3rd Qu.: 592.0   3rd Qu.:3.000   3rd Qu.:2.000   3rd Qu.:2.000   3rd Qu.:1.0000  
##  Max.   :190000   Max.   :1507.0   Max.   :6.000   Max.   :4.000   Max.   :4.000   Max.   :3.0000  
##      airco             gas           fbed     fbath     ffloor    fgarage   fairco     fgas    
##  Min.   :0.0000   Min.   :0.00000   <=2:138   1  :402   1  :227   0  :300   No :373   No :521  
##  1st Qu.:0.0000   1st Qu.:0.00000   3  :301   2  :133   2  :238   1  :126   Yes:173   Yes: 25  
##  Median :0.0000   Median :0.00000   4  : 95   >=3: 11   >=3: 81   >=2:120                      
##  Mean   :0.3168   Mean   :0.04579   >=5: 12                                                    
##  3rd Qu.:1.0000   3rd Qu.:0.00000                                                              
##  Max.   :1.0000   Max.   :1.00000




Dependence of price on ground

Basis scatterplot

par(mfrow = c(1, 1), bty = BTY, mar = c(4, 4, 1, 1) + 0.1)
plot(price ~ ground, data = Houses1987, pch = 21, bg = "palegoldenrod", col = "orange",
     xlab = expression(paste("Ground size [", m^2, "]")), ylab = "Price [CAD]")

plot of chunk fig-ParamCovar-01-01

Scatterplot equiped with the LOWESS smoother

lws <- lowess(Houses1987[, "ground"], Houses1987[, "price"])
par(mfrow = c(1, 1), bty = BTY, mar = c(4, 4, 1, 1) + 0.1)
plot(price ~ ground, data = Houses1987, pch = 21, bg = "palegoldenrod", col = "orange",
     xlab = expression(paste("Ground size [", m^2, "]")), ylab = "Price [CAD]")
lines(lws, col = "blue", lwd = 2)

plot of chunk fig-ParamCovar-01-02




Model with log-transformed covariate

Fit

m1 <- lm(price ~ log(ground), data = Houses1987)
summary(m1)
## 
## Call:
## lm(formula = price ~ log(ground), data = Houses1987)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -55841 -14946  -2805  10965 105124 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  -161279      14538  -11.09   <2e-16 ***
## log(ground)    37658       2381   15.81   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 22120 on 544 degrees of freedom
## Multiple R-squared:  0.3149, Adjusted R-squared:  0.3137 
## F-statistic: 250.1 on 1 and 544 DF,  p-value: < 2.2e-16

Fitted regression function (original \(z\)-scale)

grpred <- seq(150, 1510, length = 500)
pm1 <- predict(m1, newdata = data.frame(ground = grpred))
par(mfrow = c(1, 1), bty = BTY, mar = c(4, 4, 1, 1) + 0.1)
plot(price ~ ground, data = Houses1987, pch = 21, bg = "palegoldenrod", col = "orange",
     xlab = expression(paste("Ground size [", m^2, "]")), ylab = "Price [CAD]")
#lines(lws, col = "blue", lwd = 2)
lines(grpred, pm1, col = "red3", lwd = 2)

plot of chunk fig-ParamCovar-01-03

#legend(160, 190000, legend = c("Y ~ log(x)", "LOWESS"), lty = 1, lwd = 2, col = c("red3", "blue"))

Fitted regression function (transformed \(\log(z)\)-scale)

par(mfrow = c(1, 1), bty = BTY, mar = c(4, 4, 1, 1) + 0.1)
plot(price ~ log(ground), data = Houses1987, pch = 21, bg = "palegoldenrod", col = "orange",
     xlab = expression(paste("log(Ground size) [log(", m^2, ")]")), ylab = "Price [CAD]")
#lines(lowess(log(Houses1987[, "ground"]), Houses1987[, "price"]), col = "blue", lwd = 2)
abline(m1, col = "red3", lwd = 2)

plot of chunk fig-ParamCovar-01-04

#legend(5, 190000, legend = c("Y ~ log(x)", "LOWESS"), lty = 1, lwd = 2, col = c("red3", "blue"))

Basic residual plots

library("mffSM")
plotLM(m1)

plot of chunk fig-ParamCovar-01-05




Additional log transformation of response

We additionally consider a log-transformation of the response which improves some other aspects of the model.

Fit

m2 <- lm(log(price) ~ log(ground), data = Houses1987)
summary(m2)
## 
## Call:
## lm(formula = log(price) ~ log(ground), data = Houses1987)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -0.8571 -0.1988  0.0046  0.1929  0.8969 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  7.75625    0.19933   38.91   <2e-16 ***
## log(ground)  0.54216    0.03265   16.61   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.3033 on 544 degrees of freedom
## Multiple R-squared:  0.3364, Adjusted R-squared:  0.3351 
## F-statistic: 275.7 on 1 and 544 DF,  p-value: < 2.2e-16

Fitted regression function (original \(z\)-scale, transformed \(\log(y)\)-scale)

pm2 <- predict(m2, newdata = data.frame(ground = grpred))
par(mfrow = c(1, 1), bty = BTY, mar = c(4, 4, 1, 1) + 0.1)
plot(log(price) ~ ground, data = Houses1987, pch = 21, bg = "palegoldenrod", col = "orange",
     xlab = expression(paste("Z = Ground size [", m^2, "]")), ylab = "Y = log(Price)")
#lines(lowess(Houses1987[, "ground"], log(Houses1987[, "price"])), col = "blue", lwd = 2)
lines(grpred, pm2, col = "red3", lwd = 2)

plot of chunk fig-ParamCovar-01-06

#legend(160, log(190000), legend = c("log(Y) ~ log(x)", "LOWESS"), lty = 1, lwd = 2, col = c("red3", "blue"))

Fitted regression function (transformed \(\log(z)\)-scale and \(\log(y)\)-scale)

par(mfrow = c(1, 1), bty = BTY, mar = c(4, 4, 1, 1) + 0.1)
plot(log(price) ~ log(ground), data = Houses1987, pch = 21, bg = "palegoldenrod", col = "orange",
     xlab = expression(paste("X = log(Z) = log(Ground size) [log(", m^2, ")]")), ylab = "Y = log(Price)")
#lines(lowess(log(Houses1987[, "ground"]), log(Houses1987[, "price"])), col = "blue", lwd = 2)
abline(m2, col = "red3", lwd = 2)

plot of chunk fig-ParamCovar-01-07

#legend(5, log(190000), legend = c("log(Y) ~ log(x)", "LOWESS"), lty = 1, lwd = 2, col = c("red3", "blue"))

Prediction line (both \(x\) and \(y\) axes use original units)

pm2 <- predict(m2, newdata = data.frame(ground = grpred))
par(mfrow = c(1, 1), bty = BTY, mar = c(4, 4, 1, 1) + 0.1)
plot(price ~ ground, data = Houses1987, pch = 21, bg = "palegoldenrod", col = "orange",
     xlab = expression(paste("Ground size [", m^2, "]")), ylab = "Price [CAD]")
#lines(lws, col = "blue", lwd = 2)
lines(grpred, exp(pm2), col = "red3", lwd = 2)

plot of chunk fig-ParamCovar-01-08

#legend(160, 190000, legend = c("log(Y) ~ log(x)", "LOWESS"), lty = 1, lwd = 2, col = c("red3", "blue"))

Basic residual plots

plotLM(m2)

plot of chunk fig-ParamCovar-01-09




Polynomial models with log(price) as response

Fits

p <- list()
p[["0"]] <- lm(log(price) ~ 1, data = Houses1987)
p[["1"]] <- lm(log(price) ~ ground, data = Houses1987)
p[["2"]] <- lm(log(price) ~ ground + I(ground^2), data = Houses1987)
p[["3"]] <- lm(log(price) ~ ground + I(ground^2) + I(ground^3), data = Houses1987)
p[["4"]] <- lm(log(price) ~ ground + I(ground^2) + I(ground^3) + I(ground^4), data = Houses1987)
p[["5"]] <- lm(log(price) ~ ground + I(ground^2) + I(ground^3) + I(ground^4) + I(ground^5), data = Houses1987)

sapply(p, summary)
## $`0`
## 
## Call:
## lm(formula = log(price) ~ 1, data = Houses1987)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.93233 -0.25685 -0.02407  0.25551  1.09582 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 11.05896    0.01592   694.7   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.372 on 545 degrees of freedom
## 
## 
## $`1`
## 
## Call:
## lm(formula = log(price) ~ ground, data = Houses1987)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.96501 -0.20546  0.00402  0.19730  0.88491 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 1.058e+01  3.452e-02  306.50   <2e-16 ***
## ground      1.001e-03  6.641e-05   15.08   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.3127 on 544 degrees of freedom
## Multiple R-squared:  0.2947, Adjusted R-squared:  0.2934 
## F-statistic: 227.3 on 1 and 544 DF,  p-value: < 2.2e-16
## 
## 
## $`2`
## 
## Call:
## lm(formula = log(price) ~ ground + I(ground^2), data = Houses1987)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.89001 -0.19452  0.00597  0.19397  0.91316 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  1.020e+01  6.698e-02  152.28  < 2e-16 ***
## ground       2.470e-03  2.339e-04   10.56  < 2e-16 ***
## I(ground^2) -1.200e-06  1.838e-07   -6.53 1.52e-10 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.3014 on 543 degrees of freedom
## Multiple R-squared:  0.3461, Adjusted R-squared:  0.3437 
## F-statistic: 143.7 on 2 and 543 DF,  p-value: < 2.2e-16
## 
## 
## $`3`
## 
## Call:
## lm(formula = log(price) ~ ground + I(ground^2) + I(ground^3), 
##     data = Houses1987)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.87279 -0.19903  0.00212  0.19780  0.90934 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  9.965e+00  1.371e-01  72.682  < 2e-16 ***
## ground       3.784e-03  7.109e-04   5.323 1.49e-07 ***
## I(ground^2) -3.306e-06  1.092e-06  -3.028  0.00258 ** 
## I(ground^3)  9.700e-10  4.958e-10   1.957  0.05091 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.3006 on 542 degrees of freedom
## Multiple R-squared:  0.3507, Adjusted R-squared:  0.3471 
## F-statistic: 97.57 on 3 and 542 DF,  p-value: < 2.2e-16
## 
## 
## $`4`
## 
## Call:
## lm(formula = log(price) ~ ground + I(ground^2) + I(ground^3) + 
##     I(ground^4), data = Houses1987)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.91381 -0.19784  0.00291  0.20086  0.93320 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  1.071e+01  2.634e-01  40.653  < 2e-16 ***
## ground      -1.788e-03  1.834e-03  -0.975  0.33013    
## I(ground^2)  1.052e-05  4.339e-06   2.424  0.01569 *  
## I(ground^3) -1.256e-08  4.143e-09  -3.032  0.00254 ** 
## I(ground^4)  4.450e-12  1.353e-12   3.290  0.00107 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.2979 on 541 degrees of freedom
## Multiple R-squared:  0.3634, Adjusted R-squared:  0.3587 
## F-statistic: 77.21 on 4 and 541 DF,  p-value: < 2.2e-16
## 
## 
## $`5`
## 
## Call:
## lm(formula = log(price) ~ ground + I(ground^2) + I(ground^3) + 
##     I(ground^4) + I(ground^5), data = Houses1987)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.90198 -0.19046  0.00679  0.19546  0.94797 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  1.145e+01  4.996e-01  22.923   <2e-16 ***
## ground      -9.058e-03  4.532e-03  -1.998   0.0462 *  
## I(ground^2)  3.589e-05  1.511e-05   2.376   0.0179 *  
## I(ground^3) -5.238e-08  2.308e-08  -2.269   0.0236 *  
## I(ground^4)  3.277e-11  1.621e-11   2.022   0.0437 *  
## I(ground^5) -7.379e-15  4.209e-15  -1.753   0.0801 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.2973 on 540 degrees of freedom
## Multiple R-squared:  0.367,  Adjusted R-squared:  0.3612 
## F-statistic: 62.62 on 5 and 540 DF,  p-value: < 2.2e-16

Fitted regression functions

pp <- lapply(p, predict, newdata = data.frame(ground = grpred))

Plot of data together with the fitted regression functions (original \(z\)-scale)

COLS <- c("black", "brown", "red", "orange", "darkgreen", "blue")
names(COLS) <- names(p)
par(mfrow = c(1, 1), bty = BTY, mar = c(4, 4, 1, 1) + 0.1)
plot(log(price) ~ ground, data = Houses1987, pch = 21, bg = "palegoldenrod", col = "orange",
     xlab = expression(paste("Z = Ground size [", m^2, "]")), ylab = "Y = Log(price)")
for (i in 1:length(pp)){
  lines(grpred, pp[[i]], col = COLS[i], lwd = 2)
}  
lines(grpred, exp(pm2), col = "red3", lwd = 2)
legend(1300, 10.90, legend = 0:5, lty = 1, lwd = 2, col = COLS)
text(1300, 10.95, labels = "Degree", pos = 4, font = 2)

plot of chunk fig-ParamCovar-01-10

Residuals versus fitted values

par(mfrow = c(2, 3))
for (i in 1:length(p)){
  plot(p[[i]], which = 1, pch = 21, col = "blue4", bg = "skyblue", caption = paste("Degree ", i - 1, sep = ""))
}  

plot of chunk fig-ParamCovar-01-11

par(mfrow = c(1, 1))

Fitted regression function for a cubic polynomial

par(mfrow = c(1, 1), bty = BTY, mar = c(4, 4, 1, 1) + 0.1)
plot(log(price) ~ ground, data = Houses1987, pch = 21, bg = "palegoldenrod", col = "orange",
     xlab = expression(paste("Z = Ground size [", m^2, "]")), ylab = "Y = log(Price)")
lines(grpred, pp[["3"]], col = "red3", lwd = 2)

plot of chunk fig-ParamCovar-01-12

Basic residual plots for a cubic polynomial

plotLM(p[["3"]])

plot of chunk fig-ParamCovar-01-13




Comparison of two (up to now) reasonable models

Fits (again)

m3 <- list()
m3[["log"]]   <- lm(log(price) ~ log(ground), data = Houses1987)
m3[["poly3"]] <- lm(log(price) ~ ground + I(ground^2) + I(ground^3), data = Houses1987)

Estimates and basic inference

lapply(m3, summary)
## $log
## 
## Call:
## lm(formula = log(price) ~ log(ground), data = Houses1987)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -0.8571 -0.1988  0.0046  0.1929  0.8969 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  7.75625    0.19933   38.91   <2e-16 ***
## log(ground)  0.54216    0.03265   16.61   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.3033 on 544 degrees of freedom
## Multiple R-squared:  0.3364, Adjusted R-squared:  0.3351 
## F-statistic: 275.7 on 1 and 544 DF,  p-value: < 2.2e-16
## 
## 
## $poly3
## 
## Call:
## lm(formula = log(price) ~ ground + I(ground^2) + I(ground^3), 
##     data = Houses1987)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.87279 -0.19903  0.00212  0.19780  0.90934 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  9.965e+00  1.371e-01  72.682  < 2e-16 ***
## ground       3.784e-03  7.109e-04   5.323 1.49e-07 ***
## I(ground^2) -3.306e-06  1.092e-06  -3.028  0.00258 ** 
## I(ground^3)  9.700e-10  4.958e-10   1.957  0.05091 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.3006 on 542 degrees of freedom
## Multiple R-squared:  0.3507, Adjusted R-squared:  0.3471 
## F-statistic: 97.57 on 3 and 542 DF,  p-value: < 2.2e-16

Coefficients of determination

sapply(lapply(m3, summary), "[[", "r.squared")
##       log     poly3 
## 0.3363543 0.3506750
sapply(lapply(m3, summary), "[[", "adj.r.squared")
##       log     poly3 
## 0.3351344 0.3470810

Fitted regression functions including the prediction intervals

pm3 <- lapply(m3, predict, newdata = data.frame(ground = grpred), interval = "predict")

Plot

COLS <- c("red3", "darkblue")
names(COLS) <- names(m3)
par(mfrow = c(1, 1), bty = BTY, mar = c(4, 4, 1, 1) + 0.1)
plot(log(price) ~ ground, data = Houses1987, pch = 21, bg = "palegoldenrod", col = "orange",
     xlab = expression(paste("Ground size [", m^2, "]")), ylab = "Y = Log(price)")
for (i in 1:length(pm3)){
  lines(grpred, pm3[[i]][, "fit"], col = COLS[i], lwd = 2)
  lines(grpred, pm3[[i]][, "lwr"], col = COLS[i], lwd = 2, lty = 2)
  lines(grpred, pm3[[i]][, "upr"], col = COLS[i], lwd = 2, lty = 2)  
}  
legend(1000, 10.50, legend = c("Log(Ground)", "Poly(Ground, 3)"), lty = 1, lwd = 2, col = COLS)
text(1000, 10.55, labels = "Log(Price) ~", pos = 4, font = 2)

plot of chunk fig-ParamCovar-01-14

Residuals versus fitted values

CAPTION <- paste("Log(Price) ~ ", c("Log(Ground)", "Poly(Ground, 3)"), sep = "")
par(mfrow = c(1, 2), bty = BTY, mar = c(4, 4, 2, 1) + 0.1)
for (i in 1:length(m3)){
  plot(m3[[i]], which = 1, pch = 21, col = "blue4", bg = "skyblue", caption = CAPTION[i])
}  

plot of chunk fig-ParamCovar-01-15

par(mfrow = c(1, 1))




Cubic polynomial parameterized using the orthonormal polynomials

Fit

m4 <- lm(log(price) ~ poly(ground, degree = 3), data = Houses1987)
summary(m4)
## 
## Call:
## lm(formula = log(price) ~ poly(ground, degree = 3), data = Houses1987)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.87279 -0.19903  0.00212  0.19780  0.90934 
## 
## Coefficients:
##                           Estimate Std. Error t value Pr(>|t|)    
## (Intercept)               11.05896    0.01286 859.717  < 2e-16 ***
## poly(ground, degree = 3)1  4.71459    0.30058  15.685  < 2e-16 ***
## poly(ground, degree = 3)2 -1.96780    0.30058  -6.547 1.37e-10 ***
## poly(ground, degree = 3)3  0.58811    0.30058   1.957   0.0509 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.3006 on 542 degrees of freedom
## Multiple R-squared:  0.3507, Adjusted R-squared:  0.3471 
## F-statistic: 97.57 on 3 and 542 DF,  p-value: < 2.2e-16

Raw polynomials for comparison

m4b <- lm(log(price) ~ ground + I(ground^2) + I(ground^3), data = Houses1987)
summary(m4b)
## 
## Call:
## lm(formula = log(price) ~ ground + I(ground^2) + I(ground^3), 
##     data = Houses1987)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.87279 -0.19903  0.00212  0.19780  0.90934 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  9.965e+00  1.371e-01  72.682  < 2e-16 ***
## ground       3.784e-03  7.109e-04   5.323 1.49e-07 ***
## I(ground^2) -3.306e-06  1.092e-06  -3.028  0.00258 ** 
## I(ground^3)  9.700e-10  4.958e-10   1.957  0.05091 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.3006 on 542 degrees of freedom
## Multiple R-squared:  0.3507, Adjusted R-squared:  0.3471 
## F-statistic: 97.57 on 3 and 542 DF,  p-value: < 2.2e-16

Basis functions (model matrices)

x <- Houses1987[, "ground"]
x <- x[order(x)]
Z4 <- poly(x, degree = 3)
Z5 <- cbind(x, x^2, x^3)

Plot of basis functions

COL3 <- c("red3", "darkgreen", "blue")
par(mfrow = c(1, 2), bty = BTY, mar = c(4, 4, 3, 1) + 0.1)
plot(x, Z4[, 1], type = "l", col = COL3[1], lwd = 2, xlab = "z", ylab = "P(z)", ylim = range(Z4), main = "Orthonormal polynomials")
for (j in 2:ncol(Z4)) lines(x, Z4[, j], col = COL3[j], lwd = 2)
plot(x, Z5[, 1], type = "l", col = COL3[1], lwd = 2, xlab = "z", ylab = "P(z)", ylim = range(Z5), main = "Raw plynomials")
for (j in 2:ncol(Z5)) lines(x, Z5[, j], col = COL3[j], lwd = 2)

plot of chunk fig-ParamCovar-01-16

par(mfrow = c(1, 1))

Cubic versus linear, raw polynomials parameterization

rp3 <- lm(log(price) ~ ground + I(ground^2) + I(ground^3), data = Houses1987)
rp1 <- lm(log(price) ~ ground, data = Houses1987)
anova(rp1, rp3)
## Analysis of Variance Table
## 
## Model 1: log(price) ~ ground
## Model 2: log(price) ~ ground + I(ground^2) + I(ground^3)
##   Res.Df    RSS Df Sum of Sq      F    Pr(>F)    
## 1    544 53.186                                  
## 2    542 48.968  2    4.2181 23.344 1.883e-10 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Cubic versus linear, orthonormal polynomials parameterization

op3 <- lm(log(price) ~ poly(ground, degree = 3), data = Houses1987)
op1 <- lm(log(price) ~ poly(ground, degree = 1), data = Houses1987)
anova(op1, op3)
## Analysis of Variance Table
## 
## Model 1: log(price) ~ poly(ground, degree = 1)
## Model 2: log(price) ~ poly(ground, degree = 3)
##   Res.Df    RSS Df Sum of Sq      F    Pr(>F)    
## 1    544 53.186                                  
## 2    542 48.968  2    4.2181 23.344 1.883e-10 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Global influence of each point on the fit (cubic regression function)

#REM1 <- rownames(Houses1987) == "363"
#print(Houses1987[REM1, c("ground", "price")])
#Data1 <- subset(Houses1987, !REM1)
#
REM1 <- Houses1987[, "ground"] >1400 & log(Houses1987[, "price"]) > 11.5
print(Houses1987[REM1, c("ground", "price")])
##     ground  price
## 369   1507 145000
Data1 <- subset(Houses1987, !REM1)

REM2 <- Houses1987[, "ground"] >1400 & log(Houses1987[, "price"]) < 11.5
print(Houses1987[REM2, c("ground", "price")])
##     ground price
## 365   1451 84900
Data2 <- subset(Houses1987, !REM2)

gl0 <- lm(log(price) ~ ground + I(ground^2) + I(ground^3), data = Houses1987)
gl1 <- lm(log(price) ~ ground + I(ground^2) + I(ground^3), data = Data1)
gl2 <- lm(log(price) ~ ground + I(ground^2) + I(ground^3), data = Data2)

grpred <- seq(150, 1510, length = 500)
pgl0 <- predict(gl0, newdata = data.frame(ground = grpred))
pgl1 <- predict(gl1, newdata = data.frame(ground = grpred))
pgl2 <- predict(gl2, newdata = data.frame(ground = grpred))
COL <- c("red3", "darkblue", "darkgreen")

par(mfrow = c(1, 1), bty = BTY, mar = c(4, 4, 1, 1) + 0.1)
plot(log(price) ~ ground, data = Houses1987, pch = 21, bg = "palegoldenrod", col = "orange",
     xlab = expression(paste("Ground size [", m^2, "]")), ylab = "Y = log(Price)")
lines(grpred, pgl0, lwd = 2, col = COL[1])
lines(grpred, pgl1, lwd = 2, col = COL[2])
#lines(grpred, pgl2, lwd = 2, col = COL[3])
#
points(Houses1987[REM1, "ground"], log(Houses1987[REM1, "price"]), col = "red3", bg = COL[2], pch = 23, cex = 2)

plot of chunk fig-ParamCovar-01-23

#points(Houses1987[REM2, "ground"], log(Houses1987[REM2, "price"]), col = "red3", bg = COL[3], pch = 23, cex = 2)

Global influence of each point on the fit (polynomial of degree 4)

REM1 <- rownames(Houses1987) == "383"
print(Houses1987[REM1, c("ground", "price")])
##     ground  price
## 383   1228 140000
Data1 <- subset(Houses1987, !REM1)

gl40 <- lm(log(price) ~ ground + I(ground^2) + I(ground^3) + I(ground^4), data = Houses1987)
gl41 <- lm(log(price) ~ ground + I(ground^2) + I(ground^3) + I(ground^4), data = Data1)

pgl40 <- predict(gl40, newdata = data.frame(ground = grpred))
pgl41 <- predict(gl41, newdata = data.frame(ground = grpred))
par(mfrow = c(1, 1), bty = BTY, mar = c(4, 4, 1, 1) + 0.1)
plot(log(price) ~ ground, data = Houses1987, pch = 21, bg = "palegoldenrod", col = "orange",
     xlab = expression(paste("Z = Ground size [", m^2, "]")), ylab = "Y = log(Price)")
lines(grpred, pgl40, lwd = 2, col = COL[1])
lines(grpred, pgl41, lwd = 2, col = COL[2])
#
points(Houses1987[REM1, "ground"], log(Houses1987[REM1, "price"]), col = "red3", bg = COL[2], pch = 23, cex = 2)

plot of chunk fig-ParamCovar-01-24




Regression splines (cubic B-splines)

Basic scatterplot for log(price) versus ground

par(mfrow = c(1, 1), bty = BTY, mar = c(4, 4, 1, 1) + 0.1)
plot(log(price) ~ ground, data = Houses1987, pch = 21, bg = "palegoldenrod", col = "orange",
     xlab = expression(paste("Ground size [", m^2, "]")), ylab = "Y = log(Price)")

plot of chunk fig-ParamCovar-01-17

Knots for the spline

range(Houses1987[, "ground"])
## [1]  153 1507
knots <- c(seq(150, 900, length = 4), 1510)
inner <- knots[-c(1, length(knots))]
bound <- knots[c(1, length(knots))]
print(knots)
## [1]  150  400  650  900 1510

B-spline basis used here

Grid to calculate and plot the basis

xgrid <- seq(knots[1], knots[length(knots)], length = 500)

B-spline basis evaluated in xgrid

library("splines")
Bgrid <- bs(xgrid, knots = inner, Boundary.knots = bound, degree = 3, intercept = TRUE)
Bgrid[1:10,]
##               1          2            3            4 5 6 7
##  [1,] 1.0000000 0.00000000 0.0000000000 0.000000e+00 0 0 0
##  [2,] 0.9676498 0.03217286 0.0001770863 2.159453e-07 0 0 0
##  [3,] 0.9360050 0.06328967 0.0007035943 1.727563e-06 0 0 0
##  [4,] 0.9050577 0.09336406 0.0015723980 5.830524e-06 0 0 0
##  [5,] 0.8748002 0.12240961 0.0027763710 1.382050e-05 0 0 0
##  [6,] 0.8452247 0.15043993 0.0043083872 2.699317e-05 0 0 0
##  [7,] 0.8163234 0.17746864 0.0061613203 4.664419e-05 0 0 0
##  [8,] 0.7880886 0.20350933 0.0083280443 7.406925e-05 0 0 0
##  [9,] 0.7605124 0.22857560 0.0108014329 1.105640e-04 0 0 0
## [10,] 0.7335871 0.25268107 0.0135743598 1.574241e-04 0 0 0

Plot of the basis

COL <- rainbow_hcl(ncol(Bgrid), c = 70)
par(mfrow = c(1, 1), bty = BTY, mar = c(4, 4, 1, 1) + 0.1)
plot(range(xgrid), rep(0, 2), type = "n", xlab = "z", ylab = "B(z)", ylim = range(Bgrid))  
for (j in 1:ncol(Bgrid)){
  activex <- Bgrid[, j] > 0
  lines(xgrid[activex], Bgrid[activex, j], lwd = 2, col = COL[j])    
}
points(knots, rep(0, length(knots)), pch = 21, bg = "blue", col = "skyblue", cex = 1.5)

plot of chunk fig-ParamCovar-01-18

Model matrix

Bx <- bs(Houses1987[, "ground"], knots = inner, Boundary.knots = bound, degree = 3, intercept = TRUE)
#
rBx <- round(Bx, 3)
showBx <- data.frame(ground = Houses1987[, "ground"], B1 = rBx[,1], B2 = rBx[,2], B3 = rBx[,3], B4 = rBx[,4], B5 = rBx[,5], B6 = rBx[,6], B7 = rBx[,7])
print(showBx[1:15, ])
##    ground    B1    B2    B3    B4    B5 B6 B7
## 1     544 0.000 0.019 0.424 0.535 0.022  0  0
## 2     372 0.001 0.341 0.541 0.117 0.000  0  0
## 3     285 0.097 0.583 0.293 0.026 0.000  0  0
## 4     619 0.000 0.000 0.235 0.689 0.076  0  0
## 5     592 0.000 0.003 0.302 0.644 0.051  0  0
## 6     387 0.000 0.291 0.567 0.142 0.000  0  0
## 7     361 0.004 0.379 0.517 0.100 0.000  0  0
## 8     387 0.000 0.291 0.567 0.142 0.000  0  0
## 9     447 0.000 0.134 0.590 0.275 0.001  0  0
## 10    512 0.000 0.042 0.497 0.451 0.010  0  0
## 11    670 0.000 0.000 0.130 0.729 0.142  0  0
## 12    279 0.113 0.590 0.273 0.023 0.000  0  0
## 13    158 0.907 0.091 0.002 0.000 0.000  0  0
## 14    268 0.147 0.597 0.238 0.018 0.000  0  0
## 15    335 0.018 0.465 0.450 0.068 0.000  0  0

Fit

mB <- lm(log(price) ~ Bx - 1, data = Houses1987)
summary(mB)
## 
## Call:
## lm(formula = log(price) ~ Bx - 1, data = Houses1987)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.90457 -0.19497  0.00698  0.19693  0.94698 
## 
## Coefficients:
##     Estimate Std. Error t value Pr(>|t|)    
## Bx1 10.71312    0.12078   88.70   <2e-16 ***
## Bx2 10.66519    0.07956  134.06   <2e-16 ***
## Bx3 10.97388    0.07464  147.03   <2e-16 ***
## Bx4 11.46283    0.06699  171.11   <2e-16 ***
## Bx5 11.17900    0.16773   66.65   <2e-16 ***
## Bx6 11.41145    0.31448   36.29   <2e-16 ***
## Bx7 11.69708    0.25076   46.65   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.2974 on 539 degrees of freedom
## Multiple R-squared:  0.9993, Adjusted R-squared:  0.9993 
## F-statistic: 1.079e+05 on 7 and 539 DF,  p-value: < 2.2e-16

Correct \(R^2\) (comparable to other models with explicit intercept)

m0 <- lm(log(price) ~ 1, data = Houses1987)
(SSe <- deviance(mB))
## [1] 47.66313
(SST <- deviance(m0))
## [1] 75.41317
R2 <- 1 - SSe/SST
R2adj <- 1 - (SSe/mB$df.residual) / (SST/m0$df.residual)
R2B <- c(R2 = R2, R2adj = R2adj)
print(R2B)
##        R2     R2adj 
## 0.3679734 0.3609379

Does the response expectation depend on ground?

anova(m0, mB)
## Analysis of Variance Table
## 
## Model 1: log(price) ~ 1
## Model 2: log(price) ~ Bx - 1
##   Res.Df    RSS Df Sum of Sq      F    Pr(>F)    
## 1    545 75.413                                  
## 2    539 47.663  6     27.75 52.302 < 2.2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Is the spline significantly better than the (global) cubic polynomial?

mpoly3 <- lm(log(price) ~ ground + I(ground^2) + I(ground^3), data = Houses1987)
anova(mpoly3, mB) 
## Analysis of Variance Table
## 
## Model 1: log(price) ~ ground + I(ground^2) + I(ground^3)
## Model 2: log(price) ~ Bx - 1
##   Res.Df    RSS Df Sum of Sq      F   Pr(>F)   
## 1    542 48.968                                
## 2    539 47.663  3    1.3045 4.9174 0.002226 **
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Fitted regression function (including the PREDICTION band)

pmB <- as.numeric(Bgrid %*% coef(mB))
se.pmB <- summary(mB)$sigma * sqrt(1 + diag(Bgrid %*% vcov(mB) %*% t(Bgrid)))
qq <- qt(0.975, df = mB$df.residual)
pmB <- data.frame(fit = pmB, lwr = pmB - se.pmB * qq, upr = pmB + se.pmB * qq)
pmB[1:10,]
##         fit      lwr      upr
## 1  10.71312 10.12473 11.30151
## 2  10.71162 10.12360 11.29965
## 3  10.71027 10.12259 11.29795
## 4  10.70906 10.12169 11.29643
## 5  10.70799 10.12091 11.29506
## 6  10.70705 10.12024 11.29386
## 7  10.70626 10.11969 11.29282
## 8  10.70559 10.11925 11.29194
## 9  10.70506 10.11892 11.29120
## 10 10.70467 10.11871 11.29062

Plot of data with the fitted regression function and the prediction band

par(mfrow = c(1, 1), bty = BTY, mar = c(4, 4, 1, 1) + 0.1)
plot(log(price) ~ ground, data = Houses1987, pch = 21, bg = "palegoldenrod", col = "orange",
     xlab = expression(paste("Z = Ground size [", m^2, "]")), ylab = "Y = log(Price)")
lines(xgrid, pmB[, "fit"], col = "darkgreen", lwd = 2)
lines(xgrid, pmB[, "lwr"], col = "darkgreen", lwd = 2, lty = 2)
lines(xgrid, pmB[, "upr"], col = "darkgreen", lwd = 2, lty = 2)
points(knots, rep(min(log(Houses1987[, "price"])), length(knots)), pch = 21, bg = "blue", col = "skyblue", cex = 1.5)

plot of chunk fig-ParamCovar-01-19

Important residual plots

plotLM(mB)

plot of chunk fig-ParamCovar-01-20

Residuals versus the covariate

lwresid <- lowess(Houses1987[, "ground"], residuals(mB))
par(mfrow = c(1, 1), bty = BTY, mar = c(4, 4, 1, 1) + 0.1)
plot(residuals(mB) ~ Houses1987[, "ground"], pch = 21, col = "blue4", bg = "skyblue",
     xlab = "Z = Ground size", ylab = "Residuals")
lines(lwresid[["x"]], lwresid[["y"]], col = "red", lwd = 2)

plot of chunk fig-ParamCovar-01-21

Comparison to two previous models

m3 <- list()
m3[["log"]]   <- lm(log(price) ~ log(ground), data = Houses1987)
m3[["poly3"]] <- lm(log(price) ~ poly(ground, 3), data = Houses1987)
pm3 <- lapply(m3, predict, newdata = data.frame(ground = xgrid), interval = "predict")

print(R2B)
##        R2     R2adj 
## 0.3679734 0.3609379
sapply(lapply(m3, summary), "[[", "r.squared")
##       log     poly3 
## 0.3363543 0.3506750
R2s <- round(c(sapply(lapply(m3, summary), "[[", "r.squared"), R2B["R2"]), 3)

Plot

COLS <- c("red3", "darkblue", "darkgreen")
par(mfrow = c(1, 1), bty = BTY, mar = c(4, 4, 1, 1) + 0.1)
plot(log(price) ~ ground, data = Houses1987, pch = 21, bg = "palegoldenrod", col = "orange",
     xlab = expression(paste("Z = Ground size [", m^2, "]")), ylab = "Y = Log(price)")
for (i in 1:length(pm3)){
  lines(xgrid, pm3[[i]][, "fit"], col = COLS[i], lwd = 2)
  lines(xgrid, pm3[[i]][, "lwr"], col = COLS[i], lwd = 2, lty = 2)
  lines(xgrid, pm3[[i]][, "upr"], col = COLS[i], lwd = 2, lty = 2)  
}
lines(xgrid, pmB[, "fit"], col = COLS[3], lwd = 2)
lines(xgrid, pmB[, "lwr"], col = COLS[3], lwd = 2, lty = 2)
lines(xgrid, pmB[, "upr"], col = COLS[3], lwd = 2, lty = 2)
legend(800, 10.50, legend = c("Log(Ground)", "Poly(Ground, 3)", "Cubic B-spline"), lty = 1, lwd = 2, col = COLS)
text(800, 10.55, labels = "Log(Price) ~", pos = 4, font = 2)
text(1210, 10.37, labels = eval(substitute(expression(paste(R^2, " = ", R2s, sep = "")), list(R2s = R2s[1]))), pos = 4, font = 2)
text(1210, 10.27, labels = eval(substitute(expression(paste(R^2, " = ", R2s, sep = "")), list(R2s = R2s[2]))), pos = 4, font = 2)
text(1210, 10.17, labels = eval(substitute(expression(paste(R^2, " = ", R2s, sep = "")), list(R2s = R2s[3]))), pos = 4, font = 2)

plot of chunk fig-ParamCovar-01-22