Numeric covariate: simple transformation, polynomial regression, regression splines
Data Houses1987
data(Houses1987, package = "mffSM")
head(Houses1987)
## price ground bed bath floor garage airco gas fbed fbath ffloor fgarage fairco fgas
## 1 42000 544 3 1 2 1 0 0 3 1 2 1 No No
## 2 38500 372 2 1 1 0 0 0 <=2 1 1 0 No No
## 3 49500 285 3 1 1 0 0 0 3 1 1 0 No No
## 4 60500 619 3 1 2 0 0 0 3 1 2 0 No No
## 5 61000 592 2 1 1 0 0 0 <=2 1 1 0 No No
## 6 66000 387 3 1 1 0 1 0 3 1 1 0 Yes No
dim(Houses1987)
## [1] 546 14
summary(Houses1987)
## price ground bed bath floor garage
## Min. : 25000 Min. : 153.0 Min. :1.000 Min. :1.000 Min. :1.000 Min. :0.0000
## 1st Qu.: 49125 1st Qu.: 335.0 1st Qu.:2.000 1st Qu.:1.000 1st Qu.:1.000 1st Qu.:0.0000
## Median : 62000 Median : 428.0 Median :3.000 Median :1.000 Median :2.000 Median :0.0000
## Mean : 68122 Mean : 479.1 Mean :2.965 Mean :1.286 Mean :1.808 Mean :0.6923
## 3rd Qu.: 82000 3rd Qu.: 592.0 3rd Qu.:3.000 3rd Qu.:2.000 3rd Qu.:2.000 3rd Qu.:1.0000
## Max. :190000 Max. :1507.0 Max. :6.000 Max. :4.000 Max. :4.000 Max. :3.0000
## airco gas fbed fbath ffloor fgarage fairco fgas
## Min. :0.0000 Min. :0.00000 <=2:138 1 :402 1 :227 0 :300 No :373 No :521
## 1st Qu.:0.0000 1st Qu.:0.00000 3 :301 2 :133 2 :238 1 :126 Yes:173 Yes: 25
## Median :0.0000 Median :0.00000 4 : 95 >=3: 11 >=3: 81 >=2:120
## Mean :0.3168 Mean :0.04579 >=5: 12
## 3rd Qu.:1.0000 3rd Qu.:0.00000
## Max. :1.0000 Max. :1.00000
price
on ground
par(mfrow = c(1, 1), bty = BTY, mar = c(4, 4, 1, 1) + 0.1)
plot(price ~ ground, data = Houses1987, pch = 21, bg = "palegoldenrod", col = "orange",
xlab = expression(paste("Ground size [", m^2, "]")), ylab = "Price [CAD]")
lws <- lowess(Houses1987[, "ground"], Houses1987[, "price"])
par(mfrow = c(1, 1), bty = BTY, mar = c(4, 4, 1, 1) + 0.1)
plot(price ~ ground, data = Houses1987, pch = 21, bg = "palegoldenrod", col = "orange",
xlab = expression(paste("Ground size [", m^2, "]")), ylab = "Price [CAD]")
lines(lws, col = "blue", lwd = 2)
m1 <- lm(price ~ log(ground), data = Houses1987)
summary(m1)
##
## Call:
## lm(formula = price ~ log(ground), data = Houses1987)
##
## Residuals:
## Min 1Q Median 3Q Max
## -55841 -14946 -2805 10965 105124
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -161279 14538 -11.09 <2e-16 ***
## log(ground) 37658 2381 15.81 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 22120 on 544 degrees of freedom
## Multiple R-squared: 0.3149, Adjusted R-squared: 0.3137
## F-statistic: 250.1 on 1 and 544 DF, p-value: < 2.2e-16
grpred <- seq(150, 1510, length = 500)
pm1 <- predict(m1, newdata = data.frame(ground = grpred))
par(mfrow = c(1, 1), bty = BTY, mar = c(4, 4, 1, 1) + 0.1)
plot(price ~ ground, data = Houses1987, pch = 21, bg = "palegoldenrod", col = "orange",
xlab = expression(paste("Ground size [", m^2, "]")), ylab = "Price [CAD]")
#lines(lws, col = "blue", lwd = 2)
lines(grpred, pm1, col = "red3", lwd = 2)
#legend(160, 190000, legend = c("Y ~ log(x)", "LOWESS"), lty = 1, lwd = 2, col = c("red3", "blue"))
par(mfrow = c(1, 1), bty = BTY, mar = c(4, 4, 1, 1) + 0.1)
plot(price ~ log(ground), data = Houses1987, pch = 21, bg = "palegoldenrod", col = "orange",
xlab = expression(paste("log(Ground size) [log(", m^2, ")]")), ylab = "Price [CAD]")
#lines(lowess(log(Houses1987[, "ground"]), Houses1987[, "price"]), col = "blue", lwd = 2)
abline(m1, col = "red3", lwd = 2)
#legend(5, 190000, legend = c("Y ~ log(x)", "LOWESS"), lty = 1, lwd = 2, col = c("red3", "blue"))
plotLM
comes from package mffSM
.library("mffSM")
plotLM(m1)
We additionally consider a log-transformation of the response which improves some other aspects of the model.
m2 <- lm(log(price) ~ log(ground), data = Houses1987)
summary(m2)
##
## Call:
## lm(formula = log(price) ~ log(ground), data = Houses1987)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.8571 -0.1988 0.0046 0.1929 0.8969
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 7.75625 0.19933 38.91 <2e-16 ***
## log(ground) 0.54216 0.03265 16.61 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.3033 on 544 degrees of freedom
## Multiple R-squared: 0.3364, Adjusted R-squared: 0.3351
## F-statistic: 275.7 on 1 and 544 DF, p-value: < 2.2e-16
pm2 <- predict(m2, newdata = data.frame(ground = grpred))
par(mfrow = c(1, 1), bty = BTY, mar = c(4, 4, 1, 1) + 0.1)
plot(log(price) ~ ground, data = Houses1987, pch = 21, bg = "palegoldenrod", col = "orange",
xlab = expression(paste("Z = Ground size [", m^2, "]")), ylab = "Y = log(Price)")
#lines(lowess(Houses1987[, "ground"], log(Houses1987[, "price"])), col = "blue", lwd = 2)
lines(grpred, pm2, col = "red3", lwd = 2)
#legend(160, log(190000), legend = c("log(Y) ~ log(x)", "LOWESS"), lty = 1, lwd = 2, col = c("red3", "blue"))
par(mfrow = c(1, 1), bty = BTY, mar = c(4, 4, 1, 1) + 0.1)
plot(log(price) ~ log(ground), data = Houses1987, pch = 21, bg = "palegoldenrod", col = "orange",
xlab = expression(paste("X = log(Z) = log(Ground size) [log(", m^2, ")]")), ylab = "Y = log(Price)")
#lines(lowess(log(Houses1987[, "ground"]), log(Houses1987[, "price"])), col = "blue", lwd = 2)
abline(m2, col = "red3", lwd = 2)
#legend(5, log(190000), legend = c("log(Y) ~ log(x)", "LOWESS"), lty = 1, lwd = 2, col = c("red3", "blue"))
pm2 <- predict(m2, newdata = data.frame(ground = grpred))
par(mfrow = c(1, 1), bty = BTY, mar = c(4, 4, 1, 1) + 0.1)
plot(price ~ ground, data = Houses1987, pch = 21, bg = "palegoldenrod", col = "orange",
xlab = expression(paste("Ground size [", m^2, "]")), ylab = "Price [CAD]")
#lines(lws, col = "blue", lwd = 2)
lines(grpred, exp(pm2), col = "red3", lwd = 2)
#legend(160, 190000, legend = c("log(Y) ~ log(x)", "LOWESS"), lty = 1, lwd = 2, col = c("red3", "blue"))
plotLM(m2)
price
) as responsep <- list()
p[["0"]] <- lm(log(price) ~ 1, data = Houses1987)
p[["1"]] <- lm(log(price) ~ ground, data = Houses1987)
p[["2"]] <- lm(log(price) ~ ground + I(ground^2), data = Houses1987)
p[["3"]] <- lm(log(price) ~ ground + I(ground^2) + I(ground^3), data = Houses1987)
p[["4"]] <- lm(log(price) ~ ground + I(ground^2) + I(ground^3) + I(ground^4), data = Houses1987)
p[["5"]] <- lm(log(price) ~ ground + I(ground^2) + I(ground^3) + I(ground^4) + I(ground^5), data = Houses1987)
sapply(p, summary)
## $`0`
##
## Call:
## lm(formula = log(price) ~ 1, data = Houses1987)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.93233 -0.25685 -0.02407 0.25551 1.09582
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 11.05896 0.01592 694.7 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.372 on 545 degrees of freedom
##
##
## $`1`
##
## Call:
## lm(formula = log(price) ~ ground, data = Houses1987)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.96501 -0.20546 0.00402 0.19730 0.88491
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.058e+01 3.452e-02 306.50 <2e-16 ***
## ground 1.001e-03 6.641e-05 15.08 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.3127 on 544 degrees of freedom
## Multiple R-squared: 0.2947, Adjusted R-squared: 0.2934
## F-statistic: 227.3 on 1 and 544 DF, p-value: < 2.2e-16
##
##
## $`2`
##
## Call:
## lm(formula = log(price) ~ ground + I(ground^2), data = Houses1987)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.89001 -0.19452 0.00597 0.19397 0.91316
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.020e+01 6.698e-02 152.28 < 2e-16 ***
## ground 2.470e-03 2.339e-04 10.56 < 2e-16 ***
## I(ground^2) -1.200e-06 1.838e-07 -6.53 1.52e-10 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.3014 on 543 degrees of freedom
## Multiple R-squared: 0.3461, Adjusted R-squared: 0.3437
## F-statistic: 143.7 on 2 and 543 DF, p-value: < 2.2e-16
##
##
## $`3`
##
## Call:
## lm(formula = log(price) ~ ground + I(ground^2) + I(ground^3),
## data = Houses1987)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.87279 -0.19903 0.00212 0.19780 0.90934
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 9.965e+00 1.371e-01 72.682 < 2e-16 ***
## ground 3.784e-03 7.109e-04 5.323 1.49e-07 ***
## I(ground^2) -3.306e-06 1.092e-06 -3.028 0.00258 **
## I(ground^3) 9.700e-10 4.958e-10 1.957 0.05091 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.3006 on 542 degrees of freedom
## Multiple R-squared: 0.3507, Adjusted R-squared: 0.3471
## F-statistic: 97.57 on 3 and 542 DF, p-value: < 2.2e-16
##
##
## $`4`
##
## Call:
## lm(formula = log(price) ~ ground + I(ground^2) + I(ground^3) +
## I(ground^4), data = Houses1987)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.91381 -0.19784 0.00291 0.20086 0.93320
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.071e+01 2.634e-01 40.653 < 2e-16 ***
## ground -1.788e-03 1.834e-03 -0.975 0.33013
## I(ground^2) 1.052e-05 4.339e-06 2.424 0.01569 *
## I(ground^3) -1.256e-08 4.143e-09 -3.032 0.00254 **
## I(ground^4) 4.450e-12 1.353e-12 3.290 0.00107 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.2979 on 541 degrees of freedom
## Multiple R-squared: 0.3634, Adjusted R-squared: 0.3587
## F-statistic: 77.21 on 4 and 541 DF, p-value: < 2.2e-16
##
##
## $`5`
##
## Call:
## lm(formula = log(price) ~ ground + I(ground^2) + I(ground^3) +
## I(ground^4) + I(ground^5), data = Houses1987)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.90198 -0.19046 0.00679 0.19546 0.94797
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.145e+01 4.996e-01 22.923 <2e-16 ***
## ground -9.058e-03 4.532e-03 -1.998 0.0462 *
## I(ground^2) 3.589e-05 1.511e-05 2.376 0.0179 *
## I(ground^3) -5.238e-08 2.308e-08 -2.269 0.0236 *
## I(ground^4) 3.277e-11 1.621e-11 2.022 0.0437 *
## I(ground^5) -7.379e-15 4.209e-15 -1.753 0.0801 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.2973 on 540 degrees of freedom
## Multiple R-squared: 0.367, Adjusted R-squared: 0.3612
## F-statistic: 62.62 on 5 and 540 DF, p-value: < 2.2e-16
pp <- lapply(p, predict, newdata = data.frame(ground = grpred))
COLS <- c("black", "brown", "red", "orange", "darkgreen", "blue")
names(COLS) <- names(p)
par(mfrow = c(1, 1), bty = BTY, mar = c(4, 4, 1, 1) + 0.1)
plot(log(price) ~ ground, data = Houses1987, pch = 21, bg = "palegoldenrod", col = "orange",
xlab = expression(paste("Z = Ground size [", m^2, "]")), ylab = "Y = Log(price)")
for (i in 1:length(pp)){
lines(grpred, pp[[i]], col = COLS[i], lwd = 2)
}
lines(grpred, exp(pm2), col = "red3", lwd = 2)
legend(1300, 10.90, legend = 0:5, lty = 1, lwd = 2, col = COLS)
text(1300, 10.95, labels = "Degree", pos = 4, font = 2)
par(mfrow = c(2, 3))
for (i in 1:length(p)){
plot(p[[i]], which = 1, pch = 21, col = "blue4", bg = "skyblue", caption = paste("Degree ", i - 1, sep = ""))
}
par(mfrow = c(1, 1))
par(mfrow = c(1, 1), bty = BTY, mar = c(4, 4, 1, 1) + 0.1)
plot(log(price) ~ ground, data = Houses1987, pch = 21, bg = "palegoldenrod", col = "orange",
xlab = expression(paste("Z = Ground size [", m^2, "]")), ylab = "Y = log(Price)")
lines(grpred, pp[["3"]], col = "red3", lwd = 2)
plotLM(p[["3"]])
m3 <- list()
m3[["log"]] <- lm(log(price) ~ log(ground), data = Houses1987)
m3[["poly3"]] <- lm(log(price) ~ ground + I(ground^2) + I(ground^3), data = Houses1987)
lapply(m3, summary)
## $log
##
## Call:
## lm(formula = log(price) ~ log(ground), data = Houses1987)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.8571 -0.1988 0.0046 0.1929 0.8969
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 7.75625 0.19933 38.91 <2e-16 ***
## log(ground) 0.54216 0.03265 16.61 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.3033 on 544 degrees of freedom
## Multiple R-squared: 0.3364, Adjusted R-squared: 0.3351
## F-statistic: 275.7 on 1 and 544 DF, p-value: < 2.2e-16
##
##
## $poly3
##
## Call:
## lm(formula = log(price) ~ ground + I(ground^2) + I(ground^3),
## data = Houses1987)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.87279 -0.19903 0.00212 0.19780 0.90934
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 9.965e+00 1.371e-01 72.682 < 2e-16 ***
## ground 3.784e-03 7.109e-04 5.323 1.49e-07 ***
## I(ground^2) -3.306e-06 1.092e-06 -3.028 0.00258 **
## I(ground^3) 9.700e-10 4.958e-10 1.957 0.05091 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.3006 on 542 degrees of freedom
## Multiple R-squared: 0.3507, Adjusted R-squared: 0.3471
## F-statistic: 97.57 on 3 and 542 DF, p-value: < 2.2e-16
sapply(lapply(m3, summary), "[[", "r.squared")
## log poly3
## 0.3363543 0.3506750
sapply(lapply(m3, summary), "[[", "adj.r.squared")
## log poly3
## 0.3351344 0.3470810
pm3 <- lapply(m3, predict, newdata = data.frame(ground = grpred), interval = "predict")
COLS <- c("red3", "darkblue")
names(COLS) <- names(m3)
par(mfrow = c(1, 1), bty = BTY, mar = c(4, 4, 1, 1) + 0.1)
plot(log(price) ~ ground, data = Houses1987, pch = 21, bg = "palegoldenrod", col = "orange",
xlab = expression(paste("Ground size [", m^2, "]")), ylab = "Y = Log(price)")
for (i in 1:length(pm3)){
lines(grpred, pm3[[i]][, "fit"], col = COLS[i], lwd = 2)
lines(grpred, pm3[[i]][, "lwr"], col = COLS[i], lwd = 2, lty = 2)
lines(grpred, pm3[[i]][, "upr"], col = COLS[i], lwd = 2, lty = 2)
}
legend(1000, 10.50, legend = c("Log(Ground)", "Poly(Ground, 3)"), lty = 1, lwd = 2, col = COLS)
text(1000, 10.55, labels = "Log(Price) ~", pos = 4, font = 2)
CAPTION <- paste("Log(Price) ~ ", c("Log(Ground)", "Poly(Ground, 3)"), sep = "")
par(mfrow = c(1, 2), bty = BTY, mar = c(4, 4, 2, 1) + 0.1)
for (i in 1:length(m3)){
plot(m3[[i]], which = 1, pch = 21, col = "blue4", bg = "skyblue", caption = CAPTION[i])
}
par(mfrow = c(1, 1))
m4 <- lm(log(price) ~ poly(ground, degree = 3), data = Houses1987)
summary(m4)
##
## Call:
## lm(formula = log(price) ~ poly(ground, degree = 3), data = Houses1987)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.87279 -0.19903 0.00212 0.19780 0.90934
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 11.05896 0.01286 859.717 < 2e-16 ***
## poly(ground, degree = 3)1 4.71459 0.30058 15.685 < 2e-16 ***
## poly(ground, degree = 3)2 -1.96780 0.30058 -6.547 1.37e-10 ***
## poly(ground, degree = 3)3 0.58811 0.30058 1.957 0.0509 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.3006 on 542 degrees of freedom
## Multiple R-squared: 0.3507, Adjusted R-squared: 0.3471
## F-statistic: 97.57 on 3 and 542 DF, p-value: < 2.2e-16
m4b <- lm(log(price) ~ ground + I(ground^2) + I(ground^3), data = Houses1987)
summary(m4b)
##
## Call:
## lm(formula = log(price) ~ ground + I(ground^2) + I(ground^3),
## data = Houses1987)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.87279 -0.19903 0.00212 0.19780 0.90934
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 9.965e+00 1.371e-01 72.682 < 2e-16 ***
## ground 3.784e-03 7.109e-04 5.323 1.49e-07 ***
## I(ground^2) -3.306e-06 1.092e-06 -3.028 0.00258 **
## I(ground^3) 9.700e-10 4.958e-10 1.957 0.05091 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.3006 on 542 degrees of freedom
## Multiple R-squared: 0.3507, Adjusted R-squared: 0.3471
## F-statistic: 97.57 on 3 and 542 DF, p-value: < 2.2e-16
x <- Houses1987[, "ground"]
x <- x[order(x)]
Z4 <- poly(x, degree = 3)
Z5 <- cbind(x, x^2, x^3)
COL3 <- c("red3", "darkgreen", "blue")
par(mfrow = c(1, 2), bty = BTY, mar = c(4, 4, 3, 1) + 0.1)
plot(x, Z4[, 1], type = "l", col = COL3[1], lwd = 2, xlab = "z", ylab = "P(z)", ylim = range(Z4), main = "Orthonormal polynomials")
for (j in 2:ncol(Z4)) lines(x, Z4[, j], col = COL3[j], lwd = 2)
plot(x, Z5[, 1], type = "l", col = COL3[1], lwd = 2, xlab = "z", ylab = "P(z)", ylim = range(Z5), main = "Raw plynomials")
for (j in 2:ncol(Z5)) lines(x, Z5[, j], col = COL3[j], lwd = 2)
par(mfrow = c(1, 1))
rp3 <- lm(log(price) ~ ground + I(ground^2) + I(ground^3), data = Houses1987)
rp1 <- lm(log(price) ~ ground, data = Houses1987)
anova(rp1, rp3)
## Analysis of Variance Table
##
## Model 1: log(price) ~ ground
## Model 2: log(price) ~ ground + I(ground^2) + I(ground^3)
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 544 53.186
## 2 542 48.968 2 4.2181 23.344 1.883e-10 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
op3 <- lm(log(price) ~ poly(ground, degree = 3), data = Houses1987)
op1 <- lm(log(price) ~ poly(ground, degree = 1), data = Houses1987)
anova(op1, op3)
## Analysis of Variance Table
##
## Model 1: log(price) ~ poly(ground, degree = 1)
## Model 2: log(price) ~ poly(ground, degree = 3)
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 544 53.186
## 2 542 48.968 2 4.2181 23.344 1.883e-10 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#REM1 <- rownames(Houses1987) == "363"
#print(Houses1987[REM1, c("ground", "price")])
#Data1 <- subset(Houses1987, !REM1)
#
REM1 <- Houses1987[, "ground"] >1400 & log(Houses1987[, "price"]) > 11.5
print(Houses1987[REM1, c("ground", "price")])
## ground price
## 369 1507 145000
Data1 <- subset(Houses1987, !REM1)
REM2 <- Houses1987[, "ground"] >1400 & log(Houses1987[, "price"]) < 11.5
print(Houses1987[REM2, c("ground", "price")])
## ground price
## 365 1451 84900
Data2 <- subset(Houses1987, !REM2)
gl0 <- lm(log(price) ~ ground + I(ground^2) + I(ground^3), data = Houses1987)
gl1 <- lm(log(price) ~ ground + I(ground^2) + I(ground^3), data = Data1)
gl2 <- lm(log(price) ~ ground + I(ground^2) + I(ground^3), data = Data2)
grpred <- seq(150, 1510, length = 500)
pgl0 <- predict(gl0, newdata = data.frame(ground = grpred))
pgl1 <- predict(gl1, newdata = data.frame(ground = grpred))
pgl2 <- predict(gl2, newdata = data.frame(ground = grpred))
COL <- c("red3", "darkblue", "darkgreen")
par(mfrow = c(1, 1), bty = BTY, mar = c(4, 4, 1, 1) + 0.1)
plot(log(price) ~ ground, data = Houses1987, pch = 21, bg = "palegoldenrod", col = "orange",
xlab = expression(paste("Ground size [", m^2, "]")), ylab = "Y = log(Price)")
lines(grpred, pgl0, lwd = 2, col = COL[1])
lines(grpred, pgl1, lwd = 2, col = COL[2])
#lines(grpred, pgl2, lwd = 2, col = COL[3])
#
points(Houses1987[REM1, "ground"], log(Houses1987[REM1, "price"]), col = "red3", bg = COL[2], pch = 23, cex = 2)
#points(Houses1987[REM2, "ground"], log(Houses1987[REM2, "price"]), col = "red3", bg = COL[3], pch = 23, cex = 2)
REM1 <- rownames(Houses1987) == "383"
print(Houses1987[REM1, c("ground", "price")])
## ground price
## 383 1228 140000
Data1 <- subset(Houses1987, !REM1)
gl40 <- lm(log(price) ~ ground + I(ground^2) + I(ground^3) + I(ground^4), data = Houses1987)
gl41 <- lm(log(price) ~ ground + I(ground^2) + I(ground^3) + I(ground^4), data = Data1)
pgl40 <- predict(gl40, newdata = data.frame(ground = grpred))
pgl41 <- predict(gl41, newdata = data.frame(ground = grpred))
par(mfrow = c(1, 1), bty = BTY, mar = c(4, 4, 1, 1) + 0.1)
plot(log(price) ~ ground, data = Houses1987, pch = 21, bg = "palegoldenrod", col = "orange",
xlab = expression(paste("Z = Ground size [", m^2, "]")), ylab = "Y = log(Price)")
lines(grpred, pgl40, lwd = 2, col = COL[1])
lines(grpred, pgl41, lwd = 2, col = COL[2])
#
points(Houses1987[REM1, "ground"], log(Houses1987[REM1, "price"]), col = "red3", bg = COL[2], pch = 23, cex = 2)
price
) versus ground
par(mfrow = c(1, 1), bty = BTY, mar = c(4, 4, 1, 1) + 0.1)
plot(log(price) ~ ground, data = Houses1987, pch = 21, bg = "palegoldenrod", col = "orange",
xlab = expression(paste("Ground size [", m^2, "]")), ylab = "Y = log(Price)")
range(Houses1987[, "ground"])
## [1] 153 1507
knots <- c(seq(150, 900, length = 4), 1510)
inner <- knots[-c(1, length(knots))]
bound <- knots[c(1, length(knots))]
print(knots)
## [1] 150 400 650 900 1510
xgrid <- seq(knots[1], knots[length(knots)], length = 500)
xgrid
bs
comes from package splines
.library("splines")
Bgrid <- bs(xgrid, knots = inner, Boundary.knots = bound, degree = 3, intercept = TRUE)
Bgrid[1:10,]
## 1 2 3 4 5 6 7
## [1,] 1.0000000 0.00000000 0.0000000000 0.000000e+00 0 0 0
## [2,] 0.9676498 0.03217286 0.0001770863 2.159453e-07 0 0 0
## [3,] 0.9360050 0.06328967 0.0007035943 1.727563e-06 0 0 0
## [4,] 0.9050577 0.09336406 0.0015723980 5.830524e-06 0 0 0
## [5,] 0.8748002 0.12240961 0.0027763710 1.382050e-05 0 0 0
## [6,] 0.8452247 0.15043993 0.0043083872 2.699317e-05 0 0 0
## [7,] 0.8163234 0.17746864 0.0061613203 4.664419e-05 0 0 0
## [8,] 0.7880886 0.20350933 0.0083280443 7.406925e-05 0 0 0
## [9,] 0.7605124 0.22857560 0.0108014329 1.105640e-04 0 0 0
## [10,] 0.7335871 0.25268107 0.0135743598 1.574241e-04 0 0 0
COL <- rainbow_hcl(ncol(Bgrid), c = 70)
par(mfrow = c(1, 1), bty = BTY, mar = c(4, 4, 1, 1) + 0.1)
plot(range(xgrid), rep(0, 2), type = "n", xlab = "z", ylab = "B(z)", ylim = range(Bgrid))
for (j in 1:ncol(Bgrid)){
activex <- Bgrid[, j] > 0
lines(xgrid[activex], Bgrid[activex, j], lwd = 2, col = COL[j])
}
points(knots, rep(0, length(knots)), pch = 21, bg = "blue", col = "skyblue", cex = 1.5)
Bx <- bs(Houses1987[, "ground"], knots = inner, Boundary.knots = bound, degree = 3, intercept = TRUE)
#
rBx <- round(Bx, 3)
showBx <- data.frame(ground = Houses1987[, "ground"], B1 = rBx[,1], B2 = rBx[,2], B3 = rBx[,3], B4 = rBx[,4], B5 = rBx[,5], B6 = rBx[,6], B7 = rBx[,7])
print(showBx[1:15, ])
## ground B1 B2 B3 B4 B5 B6 B7
## 1 544 0.000 0.019 0.424 0.535 0.022 0 0
## 2 372 0.001 0.341 0.541 0.117 0.000 0 0
## 3 285 0.097 0.583 0.293 0.026 0.000 0 0
## 4 619 0.000 0.000 0.235 0.689 0.076 0 0
## 5 592 0.000 0.003 0.302 0.644 0.051 0 0
## 6 387 0.000 0.291 0.567 0.142 0.000 0 0
## 7 361 0.004 0.379 0.517 0.100 0.000 0 0
## 8 387 0.000 0.291 0.567 0.142 0.000 0 0
## 9 447 0.000 0.134 0.590 0.275 0.001 0 0
## 10 512 0.000 0.042 0.497 0.451 0.010 0 0
## 11 670 0.000 0.000 0.130 0.729 0.142 0 0
## 12 279 0.113 0.590 0.273 0.023 0.000 0 0
## 13 158 0.907 0.091 0.002 0.000 0.000 0 0
## 14 268 0.147 0.597 0.238 0.018 0.000 0 0
## 15 335 0.018 0.465 0.450 0.068 0.000 0 0
mB <- lm(log(price) ~ Bx - 1, data = Houses1987)
summary(mB)
##
## Call:
## lm(formula = log(price) ~ Bx - 1, data = Houses1987)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.90457 -0.19497 0.00698 0.19693 0.94698
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## Bx1 10.71312 0.12078 88.70 <2e-16 ***
## Bx2 10.66519 0.07956 134.06 <2e-16 ***
## Bx3 10.97388 0.07464 147.03 <2e-16 ***
## Bx4 11.46283 0.06699 171.11 <2e-16 ***
## Bx5 11.17900 0.16773 66.65 <2e-16 ***
## Bx6 11.41145 0.31448 36.29 <2e-16 ***
## Bx7 11.69708 0.25076 46.65 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.2974 on 539 degrees of freedom
## Multiple R-squared: 0.9993, Adjusted R-squared: 0.9993
## F-statistic: 1.079e+05 on 7 and 539 DF, p-value: < 2.2e-16
m0 <- lm(log(price) ~ 1, data = Houses1987)
(SSe <- deviance(mB))
## [1] 47.66313
(SST <- deviance(m0))
## [1] 75.41317
R2 <- 1 - SSe/SST
R2adj <- 1 - (SSe/mB$df.residual) / (SST/m0$df.residual)
R2B <- c(R2 = R2, R2adj = R2adj)
print(R2B)
## R2 R2adj
## 0.3679734 0.3609379
ground
?anova(m0, mB)
## Analysis of Variance Table
##
## Model 1: log(price) ~ 1
## Model 2: log(price) ~ Bx - 1
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 545 75.413
## 2 539 47.663 6 27.75 52.302 < 2.2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
mpoly3 <- lm(log(price) ~ ground + I(ground^2) + I(ground^3), data = Houses1987)
anova(mpoly3, mB)
## Analysis of Variance Table
##
## Model 1: log(price) ~ ground + I(ground^2) + I(ground^3)
## Model 2: log(price) ~ Bx - 1
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 542 48.968
## 2 539 47.663 3 1.3045 4.9174 0.002226 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
pmB <- as.numeric(Bgrid %*% coef(mB))
se.pmB <- summary(mB)$sigma * sqrt(1 + diag(Bgrid %*% vcov(mB) %*% t(Bgrid)))
qq <- qt(0.975, df = mB$df.residual)
pmB <- data.frame(fit = pmB, lwr = pmB - se.pmB * qq, upr = pmB + se.pmB * qq)
pmB[1:10,]
## fit lwr upr
## 1 10.71312 10.12473 11.30151
## 2 10.71162 10.12360 11.29965
## 3 10.71027 10.12259 11.29795
## 4 10.70906 10.12169 11.29643
## 5 10.70799 10.12091 11.29506
## 6 10.70705 10.12024 11.29386
## 7 10.70626 10.11969 11.29282
## 8 10.70559 10.11925 11.29194
## 9 10.70506 10.11892 11.29120
## 10 10.70467 10.11871 11.29062
par(mfrow = c(1, 1), bty = BTY, mar = c(4, 4, 1, 1) + 0.1)
plot(log(price) ~ ground, data = Houses1987, pch = 21, bg = "palegoldenrod", col = "orange",
xlab = expression(paste("Z = Ground size [", m^2, "]")), ylab = "Y = log(Price)")
lines(xgrid, pmB[, "fit"], col = "darkgreen", lwd = 2)
lines(xgrid, pmB[, "lwr"], col = "darkgreen", lwd = 2, lty = 2)
lines(xgrid, pmB[, "upr"], col = "darkgreen", lwd = 2, lty = 2)
points(knots, rep(min(log(Houses1987[, "price"])), length(knots)), pch = 21, bg = "blue", col = "skyblue", cex = 1.5)
plotLM(mB)
lwresid <- lowess(Houses1987[, "ground"], residuals(mB))
par(mfrow = c(1, 1), bty = BTY, mar = c(4, 4, 1, 1) + 0.1)
plot(residuals(mB) ~ Houses1987[, "ground"], pch = 21, col = "blue4", bg = "skyblue",
xlab = "Z = Ground size", ylab = "Residuals")
lines(lwresid[["x"]], lwresid[["y"]], col = "red", lwd = 2)
m3 <- list()
m3[["log"]] <- lm(log(price) ~ log(ground), data = Houses1987)
m3[["poly3"]] <- lm(log(price) ~ poly(ground, 3), data = Houses1987)
pm3 <- lapply(m3, predict, newdata = data.frame(ground = xgrid), interval = "predict")
print(R2B)
## R2 R2adj
## 0.3679734 0.3609379
sapply(lapply(m3, summary), "[[", "r.squared")
## log poly3
## 0.3363543 0.3506750
R2s <- round(c(sapply(lapply(m3, summary), "[[", "r.squared"), R2B["R2"]), 3)
COLS <- c("red3", "darkblue", "darkgreen")
par(mfrow = c(1, 1), bty = BTY, mar = c(4, 4, 1, 1) + 0.1)
plot(log(price) ~ ground, data = Houses1987, pch = 21, bg = "palegoldenrod", col = "orange",
xlab = expression(paste("Z = Ground size [", m^2, "]")), ylab = "Y = Log(price)")
for (i in 1:length(pm3)){
lines(xgrid, pm3[[i]][, "fit"], col = COLS[i], lwd = 2)
lines(xgrid, pm3[[i]][, "lwr"], col = COLS[i], lwd = 2, lty = 2)
lines(xgrid, pm3[[i]][, "upr"], col = COLS[i], lwd = 2, lty = 2)
}
lines(xgrid, pmB[, "fit"], col = COLS[3], lwd = 2)
lines(xgrid, pmB[, "lwr"], col = COLS[3], lwd = 2, lty = 2)
lines(xgrid, pmB[, "upr"], col = COLS[3], lwd = 2, lty = 2)
legend(800, 10.50, legend = c("Log(Ground)", "Poly(Ground, 3)", "Cubic B-spline"), lty = 1, lwd = 2, col = COLS)
text(800, 10.55, labels = "Log(Price) ~", pos = 4, font = 2)
text(1210, 10.37, labels = eval(substitute(expression(paste(R^2, " = ", R2s, sep = "")), list(R2s = R2s[1]))), pos = 4, font = 2)
text(1210, 10.27, labels = eval(substitute(expression(paste(R^2, " = ", R2s, sep = "")), list(R2s = R2s[2]))), pos = 4, font = 2)
text(1210, 10.17, labels = eval(substitute(expression(paste(R^2, " = ", R2s, sep = "")), list(R2s = R2s[3]))), pos = 4, font = 2)