Residual plots and tests on assumptions
Data Cars2004nh
data(Cars2004nh, package = "mffSM")
head(Cars2004nh)
## vname type drive price.retail price.dealer price cons.city cons.highway
## 1 Chevrolet.Aveo.4dr 1 1 11690 10965 11327.5 8.4 6.9
## 2 Chevrolet.Aveo.LS.4dr.hatch 1 1 12585 11802 12193.5 8.4 6.9
## 3 Chevrolet.Cavalier.2dr 1 1 14610 13697 14153.5 9.0 6.4
## 4 Chevrolet.Cavalier.4dr 1 1 14810 13884 14347.0 9.0 6.4
## 5 Chevrolet.Cavalier.LS.2dr 1 1 16385 15357 15871.0 9.0 6.4
## 6 Dodge.Neon.SE.4dr 1 1 13670 12849 13259.5 8.1 6.5
## consumption engine.size ncylinder horsepower weight iweight lweight wheel.base length width
## 1 7.65 1.6 4 103 1075 0.0009302326 6.980076 249 424 168
## 2 7.65 1.6 4 103 1065 0.0009389671 6.970730 249 389 168
## 3 7.70 2.2 4 140 1187 0.0008424600 7.079184 264 465 175
## 4 7.70 2.2 4 140 1214 0.0008237232 7.101676 264 465 173
## 5 7.70 2.2 4 140 1187 0.0008424600 7.079184 264 465 175
## 6 7.30 2.0 4 132 1171 0.0008539710 7.065613 267 442 170
## ftype fdrive
## 1 personal front
## 2 personal front
## 3 personal front
## 4 personal front
## 5 personal front
## 6 personal front
dim(Cars2004nh)
## [1] 425 20
summary(Cars2004nh)
## vname type drive price.retail price.dealer
## Length:425 Min. :1.000 Min. :1.000 Min. : 10280 Min. : 9875
## Class :character 1st Qu.:1.000 1st Qu.:1.000 1st Qu.: 20370 1st Qu.: 18973
## Mode :character Median :1.000 Median :1.000 Median : 27905 Median : 25672
## Mean :2.219 Mean :1.692 Mean : 32866 Mean : 30096
## 3rd Qu.:3.000 3rd Qu.:2.000 3rd Qu.: 39235 3rd Qu.: 35777
## Max. :6.000 Max. :3.000 Max. :192465 Max. :173560
##
## price cons.city cons.highway consumption engine.size ncylinder
## Min. : 10078 Min. : 6.20 Min. : 5.100 Min. : 5.65 Min. :1.300 Min. :-1.000
## 1st Qu.: 19600 1st Qu.:11.20 1st Qu.: 8.100 1st Qu.: 9.65 1st Qu.:2.400 1st Qu.: 4.000
## Median : 26656 Median :12.40 Median : 9.000 Median :10.70 Median :3.000 Median : 6.000
## Mean : 31481 Mean :12.36 Mean : 9.142 Mean :10.75 Mean :3.208 Mean : 5.791
## 3rd Qu.: 37514 3rd Qu.:13.80 3rd Qu.: 9.800 3rd Qu.:11.65 3rd Qu.:3.900 3rd Qu.: 6.000
## Max. :183012 Max. :23.50 Max. :19.600 Max. :21.55 Max. :8.300 Max. :12.000
## NA's :14 NA's :14 NA's :14
## horsepower weight iweight lweight wheel.base length
## Min. :100.0 Min. : 923 Min. :0.0003067 Min. :6.828 Min. :226.0 Min. :363.0
## 1st Qu.:165.0 1st Qu.:1412 1st Qu.:0.0005542 1st Qu.:7.253 1st Qu.:262.0 1st Qu.:450.0
## Median :210.0 Median :1577 Median :0.0006341 Median :7.363 Median :272.0 Median :472.0
## Mean :216.8 Mean :1626 Mean :0.0006412 Mean :7.373 Mean :274.9 Mean :470.6
## 3rd Qu.:255.0 3rd Qu.:1804 3rd Qu.:0.0007082 3rd Qu.:7.498 3rd Qu.:284.0 3rd Qu.:490.0
## Max. :500.0 Max. :3261 Max. :0.0010834 Max. :8.090 Max. :366.0 Max. :577.0
## NA's :2 NA's :2 NA's :2 NA's :2 NA's :26
## width ftype fdrive
## Min. :163.0 personal:242 front:223
## 1st Qu.:175.0 wagon : 30 rear :110
## Median :180.0 SUV : 60 4x4 : 92
## Mean :181.1 pickup : 24
## 3rd Qu.:185.0 sport : 49
## Max. :206.0 minivan : 20
## NA's :28
To be able to compare a model fitted here with other models where also other covariates will be included, we restrict ourselves to a subset of the dataset where all variables consumption
, lweight
and engine.size
are known.
isComplete <- complete.cases(Cars2004nh[, c("consumption", "lweight", "engine.size")])
sum(!isComplete)
## [1] 16
CarsNow <- subset(Cars2004nh, isComplete, select = c("consumption", "drive", "fdrive", "weight", "lweight", "engine.size", "horsepower"))
dim(CarsNow)
## [1] 409 7
summary(CarsNow)
## consumption drive fdrive weight lweight engine.size
## Min. : 5.65 Min. :1.000 front:212 Min. : 923 Min. :6.828 Min. :1.300
## 1st Qu.: 9.65 1st Qu.:1.000 rear :108 1st Qu.:1415 1st Qu.:7.255 1st Qu.:2.400
## Median :10.70 Median :1.000 4x4 : 89 Median :1577 Median :7.363 Median :3.000
## Mean :10.75 Mean :1.699 Mean :1622 Mean :7.371 Mean :3.178
## 3rd Qu.:11.65 3rd Qu.:2.000 3rd Qu.:1804 3rd Qu.:7.498 3rd Qu.:3.800
## Max. :21.55 Max. :3.000 Max. :2903 Max. :7.973 Max. :6.000
## horsepower
## Min. :100.0
## 1st Qu.:165.0
## Median :210.0
## Mean :215.8
## 3rd Qu.:250.0
## Max. :493.0
consumption
on lweight
m <- lm(consumption ~ lweight, data = CarsNow)
summary(m)
##
## Call:
## lm(formula = consumption ~ lweight, data = CarsNow)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3.6544 -0.7442 -0.1526 0.5160 5.1616
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -58.2480 1.8941 -30.75 <2e-16 ***
## lweight 9.3606 0.2569 36.44 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.035 on 407 degrees of freedom
## Multiple R-squared: 0.7654, Adjusted R-squared: 0.7648
## F-statistic: 1328 on 1 and 407 DF, p-value: < 2.2e-16
par(mfrow = c(1, 1), bty = BTY, mar = c(4, 4, 1, 1) + 0.1)
plot(consumption ~ lweight, data = CarsNow, pch = PCH, col = COL2, bg = BGC2,
xlab = "Log(weight) [log(kg)]", ylab = "Consumption [l/100 km]")
abline(m, col = "red2", lwd = 2)
#lines(lowess(CarsNow[, "lweight"], CarsNow[, "consumption"]), col = "blue", lwd = 2)
par(mar = c(4, 4, 2, 1) + 0.1)
plot(m, which = 1, pch = 21, col = "blue4", bg = "skyblue")
plot(m, which = 2, pch = 21, col = "blue4", bg = "skyblue")
plot(m, which = 3, pch = 21, col = "blue4", bg = "skyblue")
plotLM
from package mffSM
.library("mffSM")
plotLM(m)
m <- lm(consumption ~ lweight, data = CarsNow, x = TRUE)
par(mfrow = c(1, 3), bty = BTY, mar = c(5, 4, 3, 1) + 0.1)
### Scatterplot + fitted line
plot(consumption ~ lweight, data = CarsNow, pch = PCH, col = COL2, bg = BGC2,
xlab = "Log(weight) [log(kg)]", ylab = "Consumption [l/100 km]")
abline(m, col = "red2", lwd = 2)
### Residuals versus fitted values (+ lowess)
plot(fitted(m), residuals(m), pch = 21, col = "blue4", bg = "skyblue",
xlab = "Fitted values", ylab = "Residuals", main = "Residuals vs Fitted")
lines(lowess(fitted(m), residuals(m)), col = "red")
### Residuals versus the covariate (+ lowess)
plot(m[["x"]][, "lweight"], residuals(m), pch = 21, col = "blue4", bg = "skyblue",
xlab = "Log(weight) [log(kg)]", ylab = "Residuals", main = "Residuals vs Covariate")
lines(lowess(m[["x"]][, "lweight"], residuals(m)), col = "red")
par(mfrow = c(1, 2), bty = BTY, mar = c(5, 4, 3, 1) + 0.1)
### Residuals versus engine.size (+ lowess)
plot(CarsNow[, "engine.size"], residuals(m), pch = 21, col = COL2, bg = BGC2,
xlab = "Engine size [l]", ylab = "Residuals", main = "Residuals vs Omitted covariate 1")
lines(lowess(CarsNow[, "engine.size"], residuals(m)), col = "red")
### Residuals versus horsepower (+ lowess)
plot(CarsNow[, "horsepower"], residuals(m), pch = 21, col = COL2, bg = BGC2,
xlab = "Horsepower", ylab = "Residuals", main = "Residuals vs Omitted covariate 2")
lines(lowess(CarsNow[, "horsepower"], residuals(m)), col = "red")
lweight
and engine.size
as covariatesm2a <- lm(consumption ~ lweight + engine.size, data = CarsNow, x = TRUE)
summary(m2a)
##
## Call:
## lm(formula = consumption ~ lweight + engine.size, data = CarsNow,
## x = TRUE)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3.3243 -0.6741 -0.1286 0.5270 5.0459
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -42.65641 2.99243 -14.255 < 2e-16 ***
## lweight 7.01155 0.43501 16.118 < 2e-16 ***
## engine.size 0.54231 0.08304 6.531 1.96e-10 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.9854 on 406 degrees of freedom
## Multiple R-squared: 0.7877, Adjusted R-squared: 0.7867
## F-statistic: 753.3 on 2 and 406 DF, p-value: < 2.2e-16
par(mfrow = c(2, 2), bty = BTY, mar = c(5, 4, 3, 1) + 0.1)
### Residuals versus fitted values (+ lowess)
plot(fitted(m2a), residuals(m2a), pch = 21, col = "blue4", bg = "skyblue",
xlab = "Fitted values", ylab = "Residuals", main = "Residuals vs Fitted")
lines(lowess(fitted(m2a), residuals(m2a)), col = "red")
### Residuals versus lweight (included covariate)
plot(m2a[["x"]][, "lweight"], residuals(m2a), pch = 21, col = "blue4", bg = "skyblue",
xlab = "Log(weight) [log(kg)]", ylab = "Residuals", main = "Residuals vs Covariate 1")
lines(lowess(m2a[["x"]][, "lweight"], residuals(m2a)), col = "red")
### Residuals versus engine.size (included covariate)
plot(m2a[["x"]][, "engine.size"], residuals(m2a), pch = 21, col = "blue4", bg = "skyblue",
xlab = "Engine size [l]", ylab = "Residuals", main = "Residuals vs Covariate 2")
lines(lowess(m2a[["x"]][, "engine.size"], residuals(m2a)), col = "red")
### Residuals versus horsepower (potentially omitted covariate)
plot(CarsNow[, "horsepower"], residuals(m2a), pch = 21, col = COL2, bg = BGC2,
xlab = "Horsepower", ylab = "Residuals", main = "Residuals vs Omitted covariate")
lines(lowess(CarsNow[, "horsepower"], residuals(m2a)), col = "red")
lweight
and horsepower
as covariatesm2b <- lm(consumption ~ lweight + horsepower, data = CarsNow, x = TRUE)
summary(m2b)
##
## Call:
## lm(formula = consumption ~ lweight + horsepower, data = CarsNow,
## x = TRUE)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3.1612 -0.7139 -0.1163 0.4864 5.3775
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -49.442751 2.249979 -21.975 < 2e-16 ***
## lweight 7.987645 0.322199 24.791 < 2e-16 ***
## horsepower 0.006094 0.000931 6.546 1.79e-10 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.9852 on 406 degrees of freedom
## Multiple R-squared: 0.7878, Adjusted R-squared: 0.7868
## F-statistic: 753.7 on 2 and 406 DF, p-value: < 2.2e-16
par(mfrow = c(2, 2), bty = BTY, mar = c(5, 4, 3, 1) + 0.1)
### Residuals versus fitted values (+ lowess)
plot(fitted(m2b), residuals(m2b), pch = 21, col = "blue4", bg = "skyblue",
xlab = "Fitted values", ylab = "Residuals", main = "Residuals vs Fitted")
lines(lowess(fitted(m2b), residuals(m2b)), col = "red")
### Residuals versus lweight (included covariate)
plot(m2b[["x"]][, "lweight"], residuals(m2b), pch = 21, col = "blue4", bg = "skyblue",
xlab = "Log(weight) [log(kg)]", ylab = "Residuals", main = "Residuals vs Covariate 1")
lines(lowess(m2b[["x"]][, "lweight"], residuals(m2b)), col = "red")
### Residuals versus horsepower (included covariate)
plot(m2b[["x"]][, "horsepower"], residuals(m2b), pch = 21, col = "blue4", bg = "skyblue",
xlab = "Horsepower", ylab = "Residuals", main = "Residuals vs Covariate 2")
lines(lowess(m2b[["x"]][, "horsepower"], residuals(m2b)), col = "red")
### Residuals versus engine.size (potentially omitted covariate)
plot(CarsNow[, "engine.size"], residuals(m2b), pch = 21, col = COL2, bg = BGC2,
xlab = "Engine size [l]", ylab = "Residuals", main = "Residuals vs Omitted covariate")
lines(lowess(CarsNow[, "engine.size"], residuals(m2b)), col = "red")
lweight
engine.size
and horsepower
as covariatesm3 <- lm(consumption ~ lweight + engine.size + horsepower, data = CarsNow, x = TRUE)
summary(m3)
##
## Call:
## lm(formula = consumption ~ lweight + engine.size + horsepower,
## data = CarsNow, x = TRUE)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3.1174 -0.6923 -0.1127 0.5473 5.2275
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -42.353265 2.948614 -14.364 < 2e-16 ***
## lweight 6.935604 0.428971 16.168 < 2e-16 ***
## engine.size 0.352687 0.096730 3.646 0.000301 ***
## horsepower 0.003983 0.001085 3.672 0.000273 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.9706 on 405 degrees of freedom
## Multiple R-squared: 0.7946, Adjusted R-squared: 0.793
## F-statistic: 522.1 on 3 and 405 DF, p-value: < 2.2e-16
par(mfrow = c(2, 2), bty = BTY, mar = c(5, 4, 3, 1) + 0.1)
### Residuals versus fitted values (+ lowess)
plot(fitted(m3), residuals(m3), pch = 21, col = "blue4", bg = "skyblue",
xlab = "Fitted values", ylab = "Residuals", main = "Residuals vs Fitted")
lines(lowess(fitted(m3), residuals(m3)), col = "red")
### Residuals versus lweight (included covariate)
plot(m3[["x"]][, "lweight"], residuals(m3), pch = 21, col = "blue4", bg = "skyblue",
xlab = "Log(weight) [log(kg)]", ylab = "Residuals", main = "Residuals vs Covariate 1")
lines(lowess(m3[["x"]][, "lweight"], residuals(m3)), col = "red")
### Residuals versus engine.size (included covariate)
plot(m3[["x"]][, "engine.size"], residuals(m3), pch = 21, col = "blue4", bg = "skyblue",
xlab = "Engine size [l]", ylab = "Residuals", main = "Residuals vs Covariate 2")
lines(lowess(m3[["x"]][, "engine.size"], residuals(m3)), col = "red")
### Residuals versus horsepower (included covariate)
plot(m3[["x"]][, "horsepower"], residuals(m3), pch = 21, col = "blue4", bg = "skyblue",
xlab = "Horsepower", ylab = "Residuals", main = "Residuals vs Covariate 3")
lines(lowess(m3[["x"]][, "horsepower"], residuals(m3)), col = "red")
ybar <- with(CarsNow, mean(consumption))
prY <- residuals(m3, type = "partial") + ybar
YLIM <- range(c(CarsNow[, "consumption"], prY))
par(mfrow = c(1, 3), bty = BTY, mar = c(5, 4, 3, 1) + 0.1)
plot(CarsNow[, "lweight"], prY[, "lweight"], ylim = YLIM, main = "Log(weight)",
xlab = "Log(weight) [log(kg)]", ylab = "Response-mean partial residuals [l/100 km]", pch = PCH3, col = COL3, bg = BGC3)
lines(lowess(CarsNow[, "lweight"], prY[, "lweight"]), col = "red3")
#
plot(CarsNow[, "engine.size"], prY[, "engine.size"], ylim = YLIM, main = "Engine size",
xlab = "Engine size", ylab = "Response-mean partial residuals [l/100 km]", pch = PCH3, col = COL3, bg = BGC3)
lines(lowess(CarsNow[, "engine.size"], prY[, "engine.size"]), col = "red3")
#
plot(CarsNow[, "horsepower"], prY[, "horsepower"], ylim = YLIM, main = "Horsepower",
xlab = "Horsepower", ylab = "Response-mean partial residuals [l/100 km]", pch = PCH3, col = COL3, bg = BGC3)
lines(lowess(CarsNow[, "horsepower"], prY[, "horsepower"]), col = "red3")
m: consumption ~ lweight
mW <- lm(consumption ~ lweight, data = CarsNow)
mWE <- lm(consumption ~ lweight + engine.size, data = CarsNow)
mWH <- lm(consumption ~ lweight + horsepower, data = CarsNow)
mWEH <- lm(consumption ~ lweight + engine.size + horsepower, data = CarsNow)
engine size
omitted important covariate in model m
?anova(mW, mWE)
## Analysis of Variance Table
##
## Model 1: consumption ~ lweight
## Model 2: consumption ~ lweight + engine.size
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 407 435.68
## 2 406 394.26 1 41.415 42.648 1.962e-10 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
horsepower
omitted important covariate in model m
?anova(mW, mWH)
## Analysis of Variance Table
##
## Model 1: consumption ~ lweight
## Model 2: consumption ~ lweight + horsepower
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 407 435.68
## 2 406 394.08 1 41.595 42.853 1.785e-10 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
engine.size
or horsepower
or both omitted important covariate in model m
?anova(mW, mWEH)
## Analysis of Variance Table
##
## Model 1: consumption ~ lweight
## Model 2: consumption ~ lweight + engine.size + horsepower
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 407 435.68
## 2 405 381.56 2 54.119 28.722 2.163e-12 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
This confirms that the (by now) final model m3
\(=\) mWEH
is significantly better than the initial model m
\(=\) mW
.
m3
?par(mfrow = c(2, 2), bty = BTY, mar = c(5, 4, 3, 1) + 0.1)
#
plot(m3, which = 3, pch = 21, col = "blue4", bg = "skyblue", lwd = 2)
#
XLAB <- c("Weight [kg]", "Engine size [l]", "Horsepower")
for (j in 1:3){
plot(m3[["x"]][, j + 1], sqrt(abs(rstandard(m3))), pch = 21, col = "blue4",
bg = "skyblue", xlab = XLAB[j],
ylab = as.expression(substitute(sqrt(abs(yL)),
list(yL = as.name("Standardized residuals")))),
main = paste("Scale-Location (", colnames(m3[["x"]])[j + 1], ")", sep = ""))
lines(lowess(m3[["x"]][, j + 1], sqrt(abs(rstandard(m3)))), col = "red3", lwd = 2)
}
library("lmtest")
bptest(m3, varformula = ~fitted(m3))
##
## studentized Breusch-Pagan test
##
## data: m3
## BP = 2.56, df = 1, p-value = 0.1096
bptest(m3, varformula = ~lweight, data = CarsNow)
##
## studentized Breusch-Pagan test
##
## data: m3
## BP = 16.255, df = 1, p-value = 5.535e-05
bptest(m3, varformula = ~engine.size, data = CarsNow)
##
## studentized Breusch-Pagan test
##
## data: m3
## BP = 13.006, df = 1, p-value = 0.0003104
bptest(m3, varformula = ~horsepower, data = CarsNow)
##
## studentized Breusch-Pagan test
##
## data: m3
## BP = 2.9875, df = 1, p-value = 0.08391
m3
?par(mfrow = c(1, 1), bty = BTY, mar = c(4, 4, 3, 1) + 0.1)
plot(m3, which = 2, pch = 21, col = "blue4", bg = "skyblue")
shapiro.test(residuals(m3))
##
## Shapiro-Wilk normality test
##
## data: residuals(m3)
## W = 0.95388, p-value = 5.276e-10
shapiro.test(rstandard(m3))
##
## Shapiro-Wilk normality test
##
## data: rstandard(m3)
## W = 0.95313, p-value = 4.144e-10
library("nortest")
lillie.test(residuals(m3))
##
## Lilliefors (Kolmogorov-Smirnov) normality test
##
## data: residuals(m3)
## D = 0.074135, p-value = 1.276e-05
ad.test(residuals(m3))
##
## Anderson-Darling normality test
##
## data: residuals(m3)
## A = 3.433, p-value = 1.349e-08