Categorical ordinal covariate
Data Cars2004nh
data(Cars2004nh, package = "mffSM")
head(Cars2004nh)
## vname type drive price.retail price.dealer price cons.city cons.highway
## 1 Chevrolet.Aveo.4dr 1 1 11690 10965 11327.5 8.4 6.9
## 2 Chevrolet.Aveo.LS.4dr.hatch 1 1 12585 11802 12193.5 8.4 6.9
## 3 Chevrolet.Cavalier.2dr 1 1 14610 13697 14153.5 9.0 6.4
## 4 Chevrolet.Cavalier.4dr 1 1 14810 13884 14347.0 9.0 6.4
## 5 Chevrolet.Cavalier.LS.2dr 1 1 16385 15357 15871.0 9.0 6.4
## 6 Dodge.Neon.SE.4dr 1 1 13670 12849 13259.5 8.1 6.5
## consumption engine.size ncylinder horsepower weight iweight lweight wheel.base length width
## 1 7.65 1.6 4 103 1075 0.0009302326 6.980076 249 424 168
## 2 7.65 1.6 4 103 1065 0.0009389671 6.970730 249 389 168
## 3 7.70 2.2 4 140 1187 0.0008424600 7.079184 264 465 175
## 4 7.70 2.2 4 140 1214 0.0008237232 7.101676 264 465 173
## 5 7.70 2.2 4 140 1187 0.0008424600 7.079184 264 465 175
## 6 7.30 2.0 4 132 1171 0.0008539710 7.065613 267 442 170
## ftype fdrive
## 1 personal front
## 2 personal front
## 3 personal front
## 4 personal front
## 5 personal front
## 6 personal front
dim(Cars2004nh)
## [1] 425 20
summary(Cars2004nh)
## vname type drive price.retail price.dealer
## Length:425 Min. :1.000 Min. :1.000 Min. : 10280 Min. : 9875
## Class :character 1st Qu.:1.000 1st Qu.:1.000 1st Qu.: 20370 1st Qu.: 18973
## Mode :character Median :1.000 Median :1.000 Median : 27905 Median : 25672
## Mean :2.219 Mean :1.692 Mean : 32866 Mean : 30096
## 3rd Qu.:3.000 3rd Qu.:2.000 3rd Qu.: 39235 3rd Qu.: 35777
## Max. :6.000 Max. :3.000 Max. :192465 Max. :173560
##
## price cons.city cons.highway consumption engine.size ncylinder
## Min. : 10078 Min. : 6.20 Min. : 5.100 Min. : 5.65 Min. :1.300 Min. :-1.000
## 1st Qu.: 19600 1st Qu.:11.20 1st Qu.: 8.100 1st Qu.: 9.65 1st Qu.:2.400 1st Qu.: 4.000
## Median : 26656 Median :12.40 Median : 9.000 Median :10.70 Median :3.000 Median : 6.000
## Mean : 31481 Mean :12.36 Mean : 9.142 Mean :10.75 Mean :3.208 Mean : 5.791
## 3rd Qu.: 37514 3rd Qu.:13.80 3rd Qu.: 9.800 3rd Qu.:11.65 3rd Qu.:3.900 3rd Qu.: 6.000
## Max. :183012 Max. :23.50 Max. :19.600 Max. :21.55 Max. :8.300 Max. :12.000
## NA's :14 NA's :14 NA's :14
## horsepower weight iweight lweight wheel.base length
## Min. :100.0 Min. : 923 Min. :0.0003067 Min. :6.828 Min. :226.0 Min. :363.0
## 1st Qu.:165.0 1st Qu.:1412 1st Qu.:0.0005542 1st Qu.:7.253 1st Qu.:262.0 1st Qu.:450.0
## Median :210.0 Median :1577 Median :0.0006341 Median :7.363 Median :272.0 Median :472.0
## Mean :216.8 Mean :1626 Mean :0.0006412 Mean :7.373 Mean :274.9 Mean :470.6
## 3rd Qu.:255.0 3rd Qu.:1804 3rd Qu.:0.0007082 3rd Qu.:7.498 3rd Qu.:284.0 3rd Qu.:490.0
## Max. :500.0 Max. :3261 Max. :0.0010834 Max. :8.090 Max. :366.0 Max. :577.0
## NA's :2 NA's :2 NA's :2 NA's :2 NA's :26
## width ftype fdrive
## Min. :163.0 personal:242 front:223
## 1st Qu.:175.0 wagon : 30 rear :110
## Median :180.0 SUV : 60 4x4 : 92
## Mean :181.1 pickup : 24
## 3rd Qu.:185.0 sport : 49
## Max. :206.0 minivan : 20
## NA's :28
To be able to compare a model fitted here with other models where also other covariates will be included, we restrict ourselves to a subset of the dataset where all variables consumption
, lweight
and engine.size
are known.
isComplete <- complete.cases(Cars2004nh[, c("consumption", "lweight", "engine.size")])
sum(!isComplete)
## [1] 16
CarsNow <- subset(Cars2004nh, isComplete, select = c("consumption", "drive", "fdrive", "weight", "lweight", "engine.size"))
dim(CarsNow)
## [1] 409 6
summary(CarsNow)
## consumption drive fdrive weight lweight engine.size
## Min. : 5.65 Min. :1.000 front:212 Min. : 923 Min. :6.828 Min. :1.300
## 1st Qu.: 9.65 1st Qu.:1.000 rear :108 1st Qu.:1415 1st Qu.:7.255 1st Qu.:2.400
## Median :10.70 Median :1.000 4x4 : 89 Median :1577 Median :7.363 Median :3.000
## Mean :10.75 Mean :1.699 Mean :1622 Mean :7.371 Mean :3.178
## 3rd Qu.:11.65 3rd Qu.:2.000 3rd Qu.:1804 3rd Qu.:7.498 3rd Qu.:3.800
## Max. :21.55 Max. :3.000 Max. :2903 Max. :7.973 Max. :6.000
weight
CarsNow <- transform(CarsNow, fweight = cut(weight, breaks = c(900, 1250, 1500, 1750, 2000, 3300),
labels = c("<=1250", "1250-1500", "1500-1750", "1750-2000", ">2000")))
summary(CarsNow[["fweight"]])
## <=1250 1250-1500 1500-1750 1750-2000 >2000
## 57 95 137 71 49
consumption
on categorized weight
consumption
print(grandMean <- with(CarsNow, mean(consumption)))
## [1] 10.75134
print(gMean <- with(CarsNow, tapply(consumption, fweight, mean, na.rm = TRUE)))
## <=1250 1250-1500 1500-1750 1750-2000 >2000
## 7.77193 9.84000 10.73905 11.82676 14.46020
ng <- length(gMean)
par(mfrow = c(1, 1), bty = BTY, mar = c(4, 4, 1, 1) + 0.1)
plot(consumption ~ fweight, data = CarsNow, col = rainbow_hcl(ng), xlab = "Weight [kg]", ylab = "Consumption [l/100 km]")
par(mfrow = c(1, 1), bty = BTY, mar = c(4, 4, 1, 1) + 0.1)
plot(consumption ~ as.numeric(fweight), data = CarsNow, pch = 23, col = "red3", bg = "orange",
xlab = "Weight [kg]", ylab = "Consumption [l/100 km]", xlim = c(0.5, ng + 0.5), xaxt = "n")
axis(1, at = 1:ng, labels = levels(CarsNow[, "fweight"]))
points(1:ng, gMean, pch = 21, col = "yellow", bg = "blue", cex = 2.5)
lines(1:ng, gMean, col = "blue", lwd = 2)
set.seed(20010911)
CarsNow <- transform(CarsNow, jfweight = as.numeric(fweight) + runif(nrow(CarsNow), -0.25, 0.25))
par(mfrow = c(1, 1), bty = BTY, mar = c(4, 4, 1, 1) + 0.1)
plot(consumption ~ jfweight, data = CarsNow, pch = 23, col = "red3", bg = "orange",
xlab = "Weight [kg]", ylab = "Consumption [l/100 km]", xlim = c(0.5, ng + 0.5), xaxt = "n")
axis(1, at = 1:ng, labels = levels(CarsNow[, "fweight"]))
points(1:ng, gMean, pch = 21, col = "yellow", bg = "blue", cex = 2.5)
lines(1:ng, gMean, col = "blue", lwd = 2)
set.seed(20010911)
CarsNow <- transform(CarsNow, jfweight = as.numeric(fweight) + runif(nrow(CarsNow), -0.25, 0.25))
par(mfrow = c(1, 1), bty = BTY, mar = c(4, 4, 1, 1) + 0.1)
plot(consumption ~ jfweight, data = CarsNow, pch = 23, col = "red3", bg = "orange",
xlab = "Weight [kg]", ylab = "Consumption [l/100 km]", xlim = c(0.5, ng + 0.5), xaxt = "n")
axis(1, at = 1:ng, labels = levels(CarsNow[, "fweight"]))
points(1:ng, gMean, pch = 21, col = "yellow", bg = "blue", cex = 2.5)
##lines(1:ng, gMean, col = "blue", lwd = 2)
a2 <- aov(consumption ~ fweight, data = CarsNow)
summary(a2)
## Df Sum Sq Mean Sq F value Pr(>F)
## fweight 4 1341.0 335.3 262.4 <2e-16 ***
## Residuals 404 516.2 1.3
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
mTrt <- lm(consumption ~ fweight, data = CarsNow)
summary(mTrt)
##
## Call:
## lm(formula = consumption ~ fweight, data = CarsNow)
##
## Residuals:
## Min 1Q Median 3Q Max
## -4.1900 -0.7102 -0.0400 0.6232 7.0898
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 7.7719 0.1497 51.91 <2e-16 ***
## fweight1250-1500 2.0681 0.1894 10.92 <2e-16 ***
## fweight1500-1750 2.9671 0.1782 16.65 <2e-16 ***
## fweight1750-2000 4.0548 0.2010 20.17 <2e-16 ***
## fweight>2000 6.6883 0.2202 30.37 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.13 on 404 degrees of freedom
## Multiple R-squared: 0.7221, Adjusted R-squared: 0.7193
## F-statistic: 262.4 on 4 and 404 DF, p-value: < 2.2e-16
fweight
levels(CarsNow[["fweight"]])
## [1] "<=1250" "1250-1500" "1500-1750" "1750-2000" ">2000"
mPoly <- lm(consumption ~ fweight, data = CarsNow, contrasts = list(fweight = contr.poly))
summary(mPoly)
##
## Call:
## lm(formula = consumption ~ fweight, data = CarsNow, contrasts = list(fweight = contr.poly))
##
## Residuals:
## Min 1Q Median 3Q Max
## -4.1900 -0.7102 -0.0400 0.6232 7.0898
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.093e+01 5.975e-02 182.876 < 2e-16 ***
## fweight.L 4.858e+00 1.501e-01 32.359 < 2e-16 ***
## fweight.Q 3.526e-01 1.370e-01 2.574 0.0104 *
## fweight.C 8.585e-01 1.320e-01 6.503 2.33e-10 ***
## fweight^4 -7.193e-05 1.126e-01 -0.001 0.9995
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.13 on 404 degrees of freedom
## Multiple R-squared: 0.7221, Adjusted R-squared: 0.7193
## F-statistic: 262.4 on 4 and 404 DF, p-value: < 2.2e-16
(CPoly <- contr.poly(5))
## .L .Q .C ^4
## [1,] -6.324555e-01 0.5345225 -3.162278e-01 0.1195229
## [2,] -3.162278e-01 -0.2672612 6.324555e-01 -0.4780914
## [3,] -3.287978e-17 -0.5345225 1.595204e-16 0.7171372
## [4,] 3.162278e-01 -0.2672612 -6.324555e-01 -0.4780914
## [5,] 6.324555e-01 0.5345225 3.162278e-01 0.1195229
round(CPoly, digits = 15)
## .L .Q .C ^4
## [1,] -0.6324555 0.5345225 -0.3162278 0.1195229
## [2,] -0.3162278 -0.2672612 0.6324555 -0.4780914
## [3,] 0.0000000 -0.5345225 0.0000000 0.7171372
## [4,] 0.3162278 -0.2672612 -0.6324555 -0.4780914
## [5,] 0.6324555 0.5345225 0.3162278 0.1195229
coef(mPoly)["(Intercept)"] + CPoly %*% coef(mPoly)[-1]
## [,1]
## [1,] 7.77193
## [2,] 9.84000
## [3,] 10.73905
## [4,] 11.82676
## [5,] 14.46020
muhat <- as.numeric(coef(mPoly)["(Intercept)"] + CPoly %*% coef(mPoly)[-1])
names(muhat) <- levels(CarsNow[, "fweight"])
print(muhat)
## <=1250 1250-1500 1500-1750 1750-2000 >2000
## 7.77193 9.84000 10.73905 11.82676 14.46020
tgrid <- seq(0.75, 5.25, length = 100)
be <- coef(mPoly)
fitl <- be[1] + apply(be[-1] * t(predict(poly(1:5, degree = 4), newdata = tgrid)), 2, sum)
par(mfrow = c(1, 1), bty = BTY, mar = c(4, 4, 1, 1) + 0.1)
plot(consumption ~ jfweight, data = CarsNow, pch = 23, col = "red3", bg = "orange",
xlab = "Weight [kg]", ylab = "Consumption [l/100 km]", xlim = c(0.5, ng + 0.5), xaxt = "n")
axis(1, at = 1:ng, labels = levels(CarsNow[, "fweight"]))
points(1:ng, gMean, pch = 21, col = "yellow", bg = "blue", cex = 2.5)
lines(tgrid, fitl, col = "blue", lwd = 2)
nweight
taking values 1, 2, 3, 4, 5CarsNow <- transform(CarsNow, nweight = as.numeric(fweight))
with(CarsNow, table(fweight, nweight))
## nweight
## fweight 1 2 3 4 5
## <=1250 57 0 0 0 0
## 1250-1500 0 95 0 0 0
## 1500-1750 0 0 137 0 0
## 1750-2000 0 0 0 71 0
## >2000 0 0 0 0 49
p4 <- lm(consumption ~ nweight + I(nweight^2) + I(nweight^3) + I(nweight^4), data = CarsNow)
#p4 <- lm(consumption ~ poly(nweight, 4), data = CarsNow)
summary(p4)
##
## Call:
## lm(formula = consumption ~ nweight + I(nweight^2) + I(nweight^3) +
## I(nweight^4), data = CarsNow)
##
## Residuals:
## Min 1Q Median 3Q Max
## -4.1900 -0.7102 -0.0400 0.6232 7.0898
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.177e+00 1.820e+00 1.745 0.0818 .
## nweight 6.312e+00 3.274e+00 1.928 0.0546 .
## I(nweight^2) -1.943e+00 1.947e+00 -0.998 0.3190
## I(nweight^3) 2.265e-01 4.687e-01 0.483 0.6292
## I(nweight^4) -2.507e-05 3.925e-02 -0.001 0.9995
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.13 on 404 degrees of freedom
## Multiple R-squared: 0.7221, Adjusted R-squared: 0.7193
## F-statistic: 262.4 on 4 and 404 DF, p-value: < 2.2e-16
#
p3 <- lm(consumption ~ nweight + I(nweight^2) + I(nweight^3), data = CarsNow)
#p3 <- lm(consumption ~ poly(nweight, 3), data = CarsNow)
summary(p3)
##
## Call:
## lm(formula = consumption ~ nweight + I(nweight^2) + I(nweight^3),
## data = CarsNow)
##
## Residuals:
## Min 1Q Median 3Q Max
## -4.1900 -0.7102 -0.0400 0.6233 7.0898
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.17764 0.66606 4.771 2.57e-06 ***
## nweight 6.30991 0.84064 7.506 3.91e-13 ***
## I(nweight^2) -1.94184 0.31116 -6.241 1.10e-09 ***
## I(nweight^3) 0.22623 0.03456 6.545 1.80e-10 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.129 on 405 degrees of freedom
## Multiple R-squared: 0.7221, Adjusted R-squared: 0.72
## F-statistic: 350.7 on 3 and 405 DF, p-value: < 2.2e-16
#
p2 <- lm(consumption ~ nweight + I(nweight^2), data = CarsNow)
#p2 <- lm(consumption ~ poly(nweight, 2), data = CarsNow)
summary(p2)
##
## Call:
## lm(formula = consumption ~ nweight + I(nweight^2), data = CarsNow)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3.7283 -0.8288 -0.0907 0.6712 7.4817
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 7.06590 0.31634 22.337 < 2e-16 ***
## nweight 0.99339 0.22734 4.370 1.58e-05 ***
## I(nweight^2) 0.08142 0.03732 2.182 0.0297 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.186 on 406 degrees of freedom
## Multiple R-squared: 0.6927, Adjusted R-squared: 0.6912
## F-statistic: 457.5 on 2 and 406 DF, p-value: < 2.2e-16
#
p1 <- lm(consumption ~ nweight, data = CarsNow)
#p1 <- lm(consumption ~ poly(nweight, 1), data = CarsNow)
summary(p1)
##
## Call:
## lm(formula = consumption ~ nweight, data = CarsNow)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3.7682 -0.7959 -0.1235 0.6595 7.6988
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 6.4628 0.1545 41.84 <2e-16 ***
## nweight 1.4777 0.0492 30.03 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.191 on 407 degrees of freedom
## Multiple R-squared: 0.6891, Adjusted R-squared: 0.6883
## F-statistic: 901.9 on 1 and 407 DF, p-value: < 2.2e-16
anova(p1, p4)
## Analysis of Variance Table
##
## Model 1: consumption ~ nweight
## Model 2: consumption ~ nweight + I(nweight^2) + I(nweight^3) + I(nweight^4)
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 407 577.49
## 2 404 516.20 3 61.291 15.99 7.667e-10 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
anova(p1, mPoly)
## Analysis of Variance Table
##
## Model 1: consumption ~ nweight
## Model 2: consumption ~ fweight
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 407 577.49
## 2 404 516.20 3 61.291 15.99 7.667e-10 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1