Transformation of response: Regression with log-transformed response to stabilize the variance, Box-Cox transformation
Data Cars2004nh
data(Cars2004nh, package = "mffSM")
head(Cars2004nh)
## vname type drive price.retail price.dealer price cons.city cons.highway
## 1 Chevrolet.Aveo.4dr 1 1 11690 10965 11327.5 8.4 6.9
## 2 Chevrolet.Aveo.LS.4dr.hatch 1 1 12585 11802 12193.5 8.4 6.9
## 3 Chevrolet.Cavalier.2dr 1 1 14610 13697 14153.5 9.0 6.4
## 4 Chevrolet.Cavalier.4dr 1 1 14810 13884 14347.0 9.0 6.4
## 5 Chevrolet.Cavalier.LS.2dr 1 1 16385 15357 15871.0 9.0 6.4
## 6 Dodge.Neon.SE.4dr 1 1 13670 12849 13259.5 8.1 6.5
## consumption engine.size ncylinder horsepower weight iweight lweight wheel.base length width
## 1 7.65 1.6 4 103 1075 0.0009302326 6.980076 249 424 168
## 2 7.65 1.6 4 103 1065 0.0009389671 6.970730 249 389 168
## 3 7.70 2.2 4 140 1187 0.0008424600 7.079184 264 465 175
## 4 7.70 2.2 4 140 1214 0.0008237232 7.101676 264 465 173
## 5 7.70 2.2 4 140 1187 0.0008424600 7.079184 264 465 175
## 6 7.30 2.0 4 132 1171 0.0008539710 7.065613 267 442 170
## ftype fdrive
## 1 personal front
## 2 personal front
## 3 personal front
## 4 personal front
## 5 personal front
## 6 personal front
dim(Cars2004nh)
## [1] 425 20
summary(Cars2004nh)
## vname type drive price.retail price.dealer
## Length:425 Min. :1.000 Min. :1.000 Min. : 10280 Min. : 9875
## Class :character 1st Qu.:1.000 1st Qu.:1.000 1st Qu.: 20370 1st Qu.: 18973
## Mode :character Median :1.000 Median :1.000 Median : 27905 Median : 25672
## Mean :2.219 Mean :1.692 Mean : 32866 Mean : 30096
## 3rd Qu.:3.000 3rd Qu.:2.000 3rd Qu.: 39235 3rd Qu.: 35777
## Max. :6.000 Max. :3.000 Max. :192465 Max. :173560
##
## price cons.city cons.highway consumption engine.size ncylinder
## Min. : 10078 Min. : 6.20 Min. : 5.100 Min. : 5.65 Min. :1.300 Min. :-1.000
## 1st Qu.: 19600 1st Qu.:11.20 1st Qu.: 8.100 1st Qu.: 9.65 1st Qu.:2.400 1st Qu.: 4.000
## Median : 26656 Median :12.40 Median : 9.000 Median :10.70 Median :3.000 Median : 6.000
## Mean : 31481 Mean :12.36 Mean : 9.142 Mean :10.75 Mean :3.208 Mean : 5.791
## 3rd Qu.: 37514 3rd Qu.:13.80 3rd Qu.: 9.800 3rd Qu.:11.65 3rd Qu.:3.900 3rd Qu.: 6.000
## Max. :183012 Max. :23.50 Max. :19.600 Max. :21.55 Max. :8.300 Max. :12.000
## NA's :14 NA's :14 NA's :14
## horsepower weight iweight lweight wheel.base length
## Min. :100.0 Min. : 923 Min. :0.0003067 Min. :6.828 Min. :226.0 Min. :363.0
## 1st Qu.:165.0 1st Qu.:1412 1st Qu.:0.0005542 1st Qu.:7.253 1st Qu.:262.0 1st Qu.:450.0
## Median :210.0 Median :1577 Median :0.0006341 Median :7.363 Median :272.0 Median :472.0
## Mean :216.8 Mean :1626 Mean :0.0006412 Mean :7.373 Mean :274.9 Mean :470.6
## 3rd Qu.:255.0 3rd Qu.:1804 3rd Qu.:0.0007082 3rd Qu.:7.498 3rd Qu.:284.0 3rd Qu.:490.0
## Max. :500.0 Max. :3261 Max. :0.0010834 Max. :8.090 Max. :366.0 Max. :577.0
## NA's :2 NA's :2 NA's :2 NA's :2 NA's :26
## width ftype fdrive
## Min. :163.0 personal:242 front:223
## 1st Qu.:175.0 wagon : 30 rear :110
## Median :180.0 SUV : 60 4x4 : 92
## Mean :181.1 pickup : 24
## 3rd Qu.:185.0 sport : 49
## Max. :206.0 minivan : 20
## NA's :28
Cars2004nh <- transform(Cars2004nh, weightTon = weight/1000)
To be able to compare a model fitted here with other models where also other covariates will be included, we restrict ourselves to a subset of the dataset where all variables consumption
, lweight
and engine.size
are known.
isComplete <- complete.cases(Cars2004nh[, c("consumption", "lweight", "engine.size")])
sum(!isComplete)
## [1] 16
CarsNow <- subset(Cars2004nh, isComplete, select = c("consumption", "drive", "fdrive", "weight", "lweight", "weightTon", "engine.size"))
dim(CarsNow)
## [1] 409 7
summary(CarsNow)
## consumption drive fdrive weight lweight weightTon
## Min. : 5.65 Min. :1.000 front:212 Min. : 923 Min. :6.828 Min. :0.923
## 1st Qu.: 9.65 1st Qu.:1.000 rear :108 1st Qu.:1415 1st Qu.:7.255 1st Qu.:1.415
## Median :10.70 Median :1.000 4x4 : 89 Median :1577 Median :7.363 Median :1.577
## Mean :10.75 Mean :1.699 Mean :1622 Mean :7.371 Mean :1.622
## 3rd Qu.:11.65 3rd Qu.:2.000 3rd Qu.:1804 3rd Qu.:7.498 3rd Qu.:1.804
## Max. :21.55 Max. :3.000 Max. :2903 Max. :7.973 Max. :2.903
## engine.size
## Min. :1.300
## 1st Qu.:2.400
## Median :3.000
## Mean :3.178
## 3rd Qu.:3.800
## Max. :6.000
COL3 <- c("red3", "darkgreen", "blue3")
LCOL3 <- c("red", "green3", "blue")
BG3 <- rainbow_hcl(3)
PCH3 <- c(21, 22, 23)
names(COL3) <- names(BG3) <- names(PCH3) <- levels(Cars2004nh[, "fdrive"])
consumption
on weightTon
and fdrive
plot(consumption ~ weightTon, data = CarsNow, col = COL3[fdrive], bg = BG3[fdrive], pch = PCH3[fdrive],
xlab = "Weight [t]", ylab = "Consumption [l/100 km]")
legend(1, 20, legend = levels(CarsNow[, "fdrive"]), pt.bg = BG3, col = COL3, pch = PCH3)
m1 <- lm(consumption ~ weightTon + fdrive + weightTon:fdrive, data = CarsNow)
summary(m1)
##
## Call:
## lm(formula = consumption ~ weightTon + fdrive + weightTon:fdrive,
## data = CarsNow)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3.2952 -0.5924 -0.0984 0.5009 3.5622
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.1824 0.3412 3.465 0.000588 ***
## weightTon 5.6996 0.2233 25.522 < 2e-16 ***
## fdriverear 3.8936 0.7062 5.514 6.29e-08 ***
## fdrive4x4 1.2177 0.6046 2.014 0.044677 *
## weightTon:fdriverear -1.9296 0.4330 -4.456 1.08e-05 ***
## weightTon:fdrive4x4 -0.3105 0.3437 -0.904 0.366797
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.9191 on 403 degrees of freedom
## Multiple R-squared: 0.8167, Adjusted R-squared: 0.8144
## F-statistic: 359.1 on 5 and 403 DF, p-value: < 2.2e-16
anova(m1)
## Analysis of Variance Table
##
## Response: consumption
## Df Sum Sq Mean Sq F value Pr(>F)
## weightTon 1 1445.71 1445.71 1711.481 < 2.2e-16 ***
## fdrive 2 54.01 27.00 31.968 1.299e-13 ***
## weightTon:fdrive 2 17.11 8.55 10.125 5.124e-05 ***
## Residuals 403 340.42 0.84
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
lpred <- 500
wpred <- seq(0.9, 3.3, length = lpred)
dataPred <- data.frame(weightTon = rep(wpred, 3),
fdrive = gl(3, lpred, labels = levels(CarsNow[, "fdrive"])))
print(dataPred[1:10,])
## weightTon fdrive
## 1 0.9000000 front
## 2 0.9048096 front
## 3 0.9096192 front
## 4 0.9144289 front
## 5 0.9192385 front
## 6 0.9240481 front
## 7 0.9288577 front
## 8 0.9336673 front
## 9 0.9384770 front
## 10 0.9432866 front
print(dataPred[(lpred + 1):(lpred + 10),])
## weightTon fdrive
## 501 0.9000000 rear
## 502 0.9048096 rear
## 503 0.9096192 rear
## 504 0.9144289 rear
## 505 0.9192385 rear
## 506 0.9240481 rear
## 507 0.9288577 rear
## 508 0.9336673 rear
## 509 0.9384770 rear
## 510 0.9432866 rear
print(dataPred[(2*lpred + 1):(2*lpred + 10),])
## weightTon fdrive
## 1001 0.9000000 4x4
## 1002 0.9048096 4x4
## 1003 0.9096192 4x4
## 1004 0.9144289 4x4
## 1005 0.9192385 4x4
## 1006 0.9240481 4x4
## 1007 0.9288577 4x4
## 1008 0.9336673 4x4
## 1009 0.9384770 4x4
## 1010 0.9432866 4x4
pm1 <- predict(m1, newdata = dataPred)
print(pm1[1:10])
## 1 2 3 4 5 6 7 8 9 10
## 6.311984 6.339397 6.366810 6.394223 6.421635 6.449048 6.476461 6.503874 6.531286 6.558699
print(pm1[(lpred + 1):(lpred + 10)])
## 501 502 503 504 505 506 507 508 509 510
## 8.468907 8.487039 8.505172 8.523304 8.541436 8.559568 8.577700 8.595832 8.613964 8.632097
print(pm1[(2*lpred + 1):(2*lpred + 10)])
## 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010
## 7.250234 7.276153 7.302072 7.327992 7.353911 7.379830 7.405750 7.431669 7.457588 7.483508
plot(consumption ~ weightTon, data = CarsNow, col = COL3[fdrive], bg = BG3[fdrive], pch = PCH3[fdrive],
xlab = "Weight [t]", ylab = "Consumption [l/100 km]")
for (l in 1:3){
lines(wpred, pm1[((l-1)*lpred + 1):(l*lpred)], col = LCOL3[l], lwd = 2)
}
legend(1, 20, legend = levels(CarsNow[, "fdrive"]), pt.bg = BG3, col = LCOL3, pch = PCH3, lty = 1, lwd = 2)
library("mffSM")
plotLM(m1)
library("lmtest")
bptest(m1, varformula = ~fitted(m1))
##
## studentized Breusch-Pagan test
##
## data: m1
## BP = 3.5537, df = 1, p-value = 0.05941
m2 <- lm(log(consumption) ~ weightTon + fdrive + weightTon:fdrive, data = CarsNow)
summary(m2)
##
## Call:
## lm(formula = log(consumption) ~ weightTon + fdrive + weightTon:fdrive,
## data = CarsNow)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.44734 -0.05304 -0.00676 0.05508 0.30350
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.39228 0.03155 44.129 < 2e-16 ***
## weightTon 0.57762 0.02065 27.975 < 2e-16 ***
## fdriverear 0.46467 0.06529 7.117 5.08e-12 ***
## fdrive4x4 0.34690 0.05590 6.205 1.36e-09 ***
## weightTon:fdriverear -0.23764 0.04003 -5.936 6.31e-09 ***
## weightTon:fdrive4x4 -0.16658 0.03178 -5.242 2.56e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.08498 on 403 degrees of freedom
## Multiple R-squared: 0.8162, Adjusted R-squared: 0.814
## F-statistic: 358 on 5 and 403 DF, p-value: < 2.2e-16
anova(m2)
## Analysis of Variance Table
##
## Response: log(consumption)
## Df Sum Sq Mean Sq F value Pr(>F)
## weightTon 1 11.9938 11.9938 1661.012 < 2.2e-16 ***
## fdrive 2 0.5914 0.2957 40.953 < 2.2e-16 ***
## weightTon:fdrive 2 0.3405 0.1702 23.577 2.073e-10 ***
## Residuals 403 2.9100 0.0072
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
pm2 <- predict(m2, newdata = dataPred)
plot(log(consumption) ~ weightTon, col = COL3[fdrive], bg = BG3[fdrive], pch = PCH3[fdrive], data = CarsNow,
xlab = "Weight [t]", ylab = "Log(consumption) [log(l/100 km)]")
for (l in 1:3){
lines(wpred, pm2[((l-1)*lpred + 1):(l*lpred)], col = LCOL3[l], lwd = 2)
}
legend(1, 3, legend = levels(CarsNow[, "fdrive"]), pt.bg = BG3, col = LCOL3, pch = PCH3, lty = 1, lwd = 2)
consumption
and not as estimates of the expected consumption
.plot(consumption ~ weightTon, col = COL3[fdrive], bg = BG3[fdrive], pch = PCH3[fdrive], data = CarsNow,
xlab = "Weight [t]", ylab = "Consumption [l/100 km]")
for (l in 1:3){
lines(wpred, exp(pm2[((l-1)*lpred + 1):(l*lpred)]), col = LCOL3[l], lwd = 2)
}
legend(1, 20, legend = levels(CarsNow[, "fdrive"]), pt.bg = BG3, col = LCOL3, pch = PCH3, lty = 1, lwd = 2)
plotLM(m2)
bptest(m2, varformula = ~fitted(m2))
##
## studentized Breusch-Pagan test
##
## data: m2
## BP = 0.21273, df = 1, p-value = 0.6446
detach("package:lmtest")
weight
= 1.5 tcoef(m2)
## (Intercept) weightTon fdriverear fdrive4x4
## 1.3922801 0.5776185 0.4646743 0.3468986
## weightTon:fdriverear weightTon:fdrive4x4
## -0.2376359 -0.1665767
fdrive
ldrive <- levels(CarsNow[, "fdrive"])
C <- contr.treatment(length(ldrive))
rownames(C) <- ldrive
colnames(C) <- names(coef(m2))[3:4]
print(C)
## fdriverear fdrive4x4
## front 0 0
## rear 1 0
## 4x4 0 1
weightTon
= 0L.diff <- rbind(C[2,] - C[1,],
C[3,] - C[1,],
C[3,] - C[2,])
rownames(L.diff) <- paste(ldrive[c(2, 3, 3)], "-", ldrive[c(1, 1, 2)], sep = "")
print(L.diff)
## fdriverear fdrive4x4
## rear-front 1 0
## 4x4-front 0 1
## 4x4-rear -1 1
weightTon
= 1.5x <- 1.5
L <- cbind(0, 0, L.diff, x*L.diff)
colnames(L) <- names(coef(m2))
print(L)
## (Intercept) weightTon fdriverear fdrive4x4 weightTon:fdriverear weightTon:fdrive4x4
## rear-front 0 0 1 0 1.5 0.0
## 4x4-front 0 0 0 1 0.0 1.5
## 4x4-rear 0 0 -1 1 -1.5 1.5
LSest(m2, L)
## Estimate Std. Error t value P value Lower Upper
## rear-front 0.10822048 0.01127677 9.5967640 < 2.22e-16 0.08605185 0.13038912
## 4x4-front 0.09703352 0.01402729 6.9174829 1.8142e-11 0.06945773 0.12460932
## 4x4-rear -0.01118696 0.01599398 -0.6994485 0.48468 -0.04262901 0.02025508
ls2 <- LSest(m2, L)
ls2$Estimate <- exp(ls2$Estimate)
ls2$Lower <- exp(ls2$Lower)
ls2$Upper <- exp(ls2$Upper)
print(ls2)
## Estimate Std. Error t value P value Lower Upper
## rear-front 1.1142934 0.01127677 9.5967640 < 2.22e-16 1.0898628 1.139272
## 4x4-front 1.1018973 0.01402729 6.9174829 1.8142e-11 1.0719267 1.132706
## 4x4-rear 0.9888754 0.01599398 -0.6994485 0.48468 0.9582668 1.020462
print(ls2, digits = 4)
## Estimate Std. Error t value P value Lower Upper
## rear-front 1.1143 0.01128 9.5968 < 2.22e-16 1.0899 1.139
## 4x4-front 1.1019 0.01403 6.9175 1.8142e-11 1.0719 1.133
## 4x4-rear 0.9889 0.01599 -0.6994 0.48468 0.9583 1.020
NOTE: Standard errors do not correspond to the estimated ratios!!!
library("multcomp")
mcp2 <- glht(m2, linfct = L)
set.seed(221913282)
(smpc2 <- summary(mcp2))
##
## Simultaneous Tests for General Linear Hypotheses
##
## Fit: lm(formula = log(consumption) ~ weightTon + fdrive + weightTon:fdrive,
## data = CarsNow)
##
## Linear Hypotheses:
## Estimate Std. Error t value Pr(>|t|)
## rear-front == 0 0.10822 0.01128 9.597 <1e-05 ***
## 4x4-front == 0 0.09703 0.01403 6.917 <1e-05 ***
## 4x4-rear == 0 -0.01119 0.01599 -0.699 0.76
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## (Adjusted p values reported -- single-step method)
(ci2 <- confint(mcp2))
##
## Simultaneous Confidence Intervals
##
## Fit: lm(formula = log(consumption) ~ weightTon + fdrive + weightTon:fdrive,
## data = CarsNow)
##
## Quantile = 2.3396
## 95% family-wise confidence level
##
##
## Linear Hypotheses:
## Estimate lwr upr
## rear-front == 0 0.10822 0.08184 0.13460
## 4x4-front == 0 0.09703 0.06421 0.12985
## 4x4-rear == 0 -0.01119 -0.04861 0.02623
names(ci2)
## [1] "model" "linfct" "rhs" "coef" "vcov" "df"
## [7] "alternative" "type" "confint"
exp(ci2$confint)
## Estimate lwr upr
## rear-front 1.1142934 1.0852787 1.144084
## 4x4-front 1.1018973 1.0663213 1.138660
## 4x4-rear 0.9888754 0.9525552 1.026580
## attr(,"conf.level")
## [1] 0.95
## attr(,"calpha")
## [1] 2.339644
m1 <- lm(consumption ~ weightTon + fdrive + weightTon:fdrive, data = CarsNow)
summary(m1)
##
## Call:
## lm(formula = consumption ~ weightTon + fdrive + weightTon:fdrive,
## data = CarsNow)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3.2952 -0.5924 -0.0984 0.5009 3.5622
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.1824 0.3412 3.465 0.000588 ***
## weightTon 5.6996 0.2233 25.522 < 2e-16 ***
## fdriverear 3.8936 0.7062 5.514 6.29e-08 ***
## fdrive4x4 1.2177 0.6046 2.014 0.044677 *
## weightTon:fdriverear -1.9296 0.4330 -4.456 1.08e-05 ***
## weightTon:fdrive4x4 -0.3105 0.3437 -0.904 0.366797
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.9191 on 403 degrees of freedom
## Multiple R-squared: 0.8167, Adjusted R-squared: 0.8144
## F-statistic: 359.1 on 5 and 403 DF, p-value: < 2.2e-16
library("MASS")
boxcox(m1)
boxcox(m1, lambda = seq(-0.1, 0.8, by = 0.05))
λ=0 is just outside the 95% confidence interval. Since it provides an interpretable model, it is still reasonable to use it.
boxcox(m1, lambda = seq(0.2, 0.4, by = 0.025), plotit = FALSE)
## $x
## [1] 0.200 0.225 0.250 0.275 0.300 0.325 0.350 0.375 0.400
##
## $y
## [1] -215.8314 -215.6701 -215.5455 -215.4575 -215.4061 -215.3915 -215.4135 -215.4722 -215.5676