Categorical nominal covariate
Data Cars2004nh
data(Cars2004nh, package = "mffSM")
head(Cars2004nh)
## vname type drive price.retail price.dealer price cons.city cons.highway
## 1 Chevrolet.Aveo.4dr 1 1 11690 10965 11327.5 8.4 6.9
## 2 Chevrolet.Aveo.LS.4dr.hatch 1 1 12585 11802 12193.5 8.4 6.9
## 3 Chevrolet.Cavalier.2dr 1 1 14610 13697 14153.5 9.0 6.4
## 4 Chevrolet.Cavalier.4dr 1 1 14810 13884 14347.0 9.0 6.4
## 5 Chevrolet.Cavalier.LS.2dr 1 1 16385 15357 15871.0 9.0 6.4
## 6 Dodge.Neon.SE.4dr 1 1 13670 12849 13259.5 8.1 6.5
## consumption engine.size ncylinder horsepower weight iweight lweight wheel.base length width
## 1 7.65 1.6 4 103 1075 0.0009302326 6.980076 249 424 168
## 2 7.65 1.6 4 103 1065 0.0009389671 6.970730 249 389 168
## 3 7.70 2.2 4 140 1187 0.0008424600 7.079184 264 465 175
## 4 7.70 2.2 4 140 1214 0.0008237232 7.101676 264 465 173
## 5 7.70 2.2 4 140 1187 0.0008424600 7.079184 264 465 175
## 6 7.30 2.0 4 132 1171 0.0008539710 7.065613 267 442 170
## ftype fdrive
## 1 personal front
## 2 personal front
## 3 personal front
## 4 personal front
## 5 personal front
## 6 personal front
dim(Cars2004nh)
## [1] 425 20
summary(Cars2004nh)
## vname type drive price.retail price.dealer
## Length:425 Min. :1.000 Min. :1.000 Min. : 10280 Min. : 9875
## Class :character 1st Qu.:1.000 1st Qu.:1.000 1st Qu.: 20370 1st Qu.: 18973
## Mode :character Median :1.000 Median :1.000 Median : 27905 Median : 25672
## Mean :2.219 Mean :1.692 Mean : 32866 Mean : 30096
## 3rd Qu.:3.000 3rd Qu.:2.000 3rd Qu.: 39235 3rd Qu.: 35777
## Max. :6.000 Max. :3.000 Max. :192465 Max. :173560
##
## price cons.city cons.highway consumption engine.size ncylinder
## Min. : 10078 Min. : 6.20 Min. : 5.100 Min. : 5.65 Min. :1.300 Min. :-1.000
## 1st Qu.: 19600 1st Qu.:11.20 1st Qu.: 8.100 1st Qu.: 9.65 1st Qu.:2.400 1st Qu.: 4.000
## Median : 26656 Median :12.40 Median : 9.000 Median :10.70 Median :3.000 Median : 6.000
## Mean : 31481 Mean :12.36 Mean : 9.142 Mean :10.75 Mean :3.208 Mean : 5.791
## 3rd Qu.: 37514 3rd Qu.:13.80 3rd Qu.: 9.800 3rd Qu.:11.65 3rd Qu.:3.900 3rd Qu.: 6.000
## Max. :183012 Max. :23.50 Max. :19.600 Max. :21.55 Max. :8.300 Max. :12.000
## NA's :14 NA's :14 NA's :14
## horsepower weight iweight lweight wheel.base length
## Min. :100.0 Min. : 923 Min. :0.0003067 Min. :6.828 Min. :226.0 Min. :363.0
## 1st Qu.:165.0 1st Qu.:1412 1st Qu.:0.0005542 1st Qu.:7.253 1st Qu.:262.0 1st Qu.:450.0
## Median :210.0 Median :1577 Median :0.0006341 Median :7.363 Median :272.0 Median :472.0
## Mean :216.8 Mean :1626 Mean :0.0006412 Mean :7.373 Mean :274.9 Mean :470.6
## 3rd Qu.:255.0 3rd Qu.:1804 3rd Qu.:0.0007082 3rd Qu.:7.498 3rd Qu.:284.0 3rd Qu.:490.0
## Max. :500.0 Max. :3261 Max. :0.0010834 Max. :8.090 Max. :366.0 Max. :577.0
## NA's :2 NA's :2 NA's :2 NA's :2 NA's :26
## width ftype fdrive
## Min. :163.0 personal:242 front:223
## 1st Qu.:175.0 wagon : 30 rear :110
## Median :180.0 SUV : 60 4x4 : 92
## Mean :181.1 pickup : 24
## 3rd Qu.:185.0 sport : 49
## Max. :206.0 minivan : 20
## NA's :28
To be able to compare a model fitted here with other models where also other covariates will be included, we restrict ourselves to a subset of the dataset where all variables consumption
, lweight
and engine.size
are known.
isComplete <- complete.cases(Cars2004nh[, c("consumption", "lweight", "engine.size")])
sum(!isComplete)
## [1] 16
CarsNow <- subset(Cars2004nh, isComplete, select = c("consumption", "drive", "fdrive", "weight", "lweight", "engine.size"))
dim(CarsNow)
## [1] 409 6
summary(CarsNow)
## consumption drive fdrive weight lweight engine.size
## Min. : 5.65 Min. :1.000 front:212 Min. : 923 Min. :6.828 Min. :1.300
## 1st Qu.: 9.65 1st Qu.:1.000 rear :108 1st Qu.:1415 1st Qu.:7.255 1st Qu.:2.400
## Median :10.70 Median :1.000 4x4 : 89 Median :1577 Median :7.363 Median :3.000
## Mean :10.75 Mean :1.699 Mean :1622 Mean :7.371 Mean :3.178
## 3rd Qu.:11.65 3rd Qu.:2.000 3rd Qu.:1804 3rd Qu.:7.498 3rd Qu.:3.800
## Max. :21.55 Max. :3.000 Max. :2903 Max. :7.973 Max. :6.000
with(CarsNow, tapply(consumption, fdrive, mean))
## front rear 4x4
## 9.741274 11.293981 12.498876
consumption
on drive
par(mfrow = c(1, 1), bty = BTY, mar = c(4, 4, 1, 1) + 0.1)
plot(consumption ~ drive, data = CarsNow, xaxt = "n", col = COL, bg = BGC, pch = PCH,
xlab = "Drive", ylab = "Consumption [l/100 km]", cex.lab = 1.2, cex.axis = 1.2)
axis(1, at = 1:3, labels = levels(CarsNow[, "fdrive"]), cex.axis = 1.2)
set.seed(20010911)
par(mfrow = c(1, 1), bty = BTY, mar = c(4, 4, 1, 1) + 0.1)
plot(CarsNow[, "drive"] + runif(nrow(CarsNow), -0.2, 0.2), CarsNow[, "consumption"], xaxt = "n", col = COL, bg = BGC, pch = PCH,
xlab = "Drive", ylab = "Consumption [l/100 km]", cex.lab = 1.2, cex.axis = 1.2)
axis(1, at = 1:3, labels = levels(CarsNow[, "fdrive"]), cex.axis = 1.2)
consumption
(ybar <- with(CarsNow, mean(consumption)))
## [1] 10.75134
(ybargr <- with(CarsNow, tapply(consumption, fdrive, mean)))
## front rear 4x4
## 9.741274 11.293981 12.498876
layout(matrix(c(1, 2,2,2,2), nrow = 1))
par(bty = BTY, mar = c(4, 4, 1, 1) + 0.1)
set.seed(20010911)
plot(runif(nrow(CarsNow), -0.1, 0.1), CarsNow[, "consumption"], xaxt = "n", col = "darkblue", bg = "skyblue", pch = PCH,
xlab = "", ylab = "Consumption [l/100 km]", xlim = c(-0.2, 0.2), cex.lab = 1.2, cex.axis = 1.2)
axis(1, at = 0, labels = "All")
points(0, ybar, pch = 22, col = "red", bg = "yellow", cex = 3)
set.seed(20010911)
plot(CarsNow[, "drive"] + runif(nrow(CarsNow), -0.1, 0.1), CarsNow[, "consumption"], xaxt = "n", col = COL, bg = BGC, pch = PCH,
xlab = "Drive", ylab = "Consumption [l/100 km]", cex.lab = 1.2, cex.axis = 1.2)
points(1:3, ybargr, pch = 22, col = "darkgreen", bg = "seagreen", cex = 3)
axis(1, at = 1:3, labels = levels(CarsNow[, "fdrive"]), cex.axis = 1.2)
consumption
by fdrive
FCOL <- rainbow_hcl(3)
names(FCOL) <- levels(CarsNow[, "fdrive"])
par(mfrow = c(1, 1), bty = BTY, mar = c(4, 4, 1, 1) + 0.1)
plot(consumption ~ fdrive, data = CarsNow, col = FCOL, xlab = "Drive", ylab = "Consumption [l/100 km]", cex.lab = 1.2, cex.axis = 1.2)
consumption
of all datapoints, boxplots of consumption
by fdrive
layout(matrix(c(1, 2,2,2,2), nrow = 1))
par(bty = BTY, mar = c(4, 4, 1, 1) + 0.1)
boxplot(CarsNow[, "consumption"], col = "sandybrown", ylab = "Consumption [l/100 km]", cexc.lab = 1.2, cex.axis = 1.2)
plot(consumption ~ fdrive, data = CarsNow, col = FCOL, xlab = "Drive", ylab = "Consumption [l/100 km]", cex.lab = 1.2, cex.axis = 1.2)
consumption
by fdrive
XLIM <- with(CarsNow, range(consumption))
layout(matrix(c(0,1,1,0, 2,2,3,3), nrow = 2, byrow = TRUE))
par(bty = BTY, mar = c(4, 4, 4, 1) + 0.1)
for (fl in levels(CarsNow[, "fdrive"])){
hist(subset(CarsNow, fdrive == fl)[, "consumption"], prob = TRUE, col = FCOL[fl],
xlab = "Consumption [l/100 km]", xlim = XLIM, ylim = c(0, 0.42), main = fl, cex.main = 1.6, cex.lab = 1.4, cex.axis = 1.4)
}
par(mfrow = c(1, 1))
consumption
by fdrive
layout(matrix(c(0,1,1,0, 2,2,3,3), nrow = 2, byrow = TRUE))
par(bty = BTY, mar = c(4, 4, 4, 2) + 0.1)
for (fl in levels(CarsNow[, "fdrive"])){
qqnorm(subset(CarsNow, fdrive == fl)[, "consumption"], main = fl, pch = PCH, col = COL, bg = BGC, cex.main = 1.6, cex.lab = 1.4, cex.axis = 1.4)
qqline(subset(CarsNow, fdrive == fl)[, "consumption"], lwd = 2, col = "blue")
}
par(mfrow = c(1, 1))
In this parameterization, regression coefficients provide the estimated group means, i.e., the group sample means.
mF <- lm(consumption ~ -1 + fdrive, data = CarsNow)
summary(mF)
##
## Call:
## lm(formula = consumption ~ -1 + fdrive, data = CarsNow)
##
## Residuals:
## Min 1Q Median 3Q Max
## -4.0913 -1.2489 -0.0440 0.9587 9.0511
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## fdrivefront 9.7413 0.1247 78.15 <2e-16 ***
## fdriverear 11.2940 0.1746 64.67 <2e-16 ***
## fdrive4x4 12.4989 0.1924 64.97 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.815 on 406 degrees of freedom
## Multiple R-squared: 0.9728, Adjusted R-squared: 0.9726
## F-statistic: 4837 on 3 and 406 DF, p-value: < 2.2e-16
deviance(mF) ## SS_e
## [1] 1337.355
m0 <- lm(consumption ~ 1, data = CarsNow)
summary(m0)
##
## Call:
## lm(formula = consumption ~ 1, data = CarsNow)
##
## Residuals:
## Min 1Q Median 3Q Max
## -5.1013 -1.1013 -0.0513 0.8987 10.7987
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 10.7513 0.1055 101.9 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.134 on 408 degrees of freedom
deviance(m0) ## SS_T
## [1] 1857.242
deviance(m0) - deviance(mF) ## SS_R
## [1] 519.8869
a1 <- aov(consumption ~ fdrive, data = CarsNow)
summary(a1)
## Df Sum Sq Mean Sq F value Pr(>F)
## fdrive 2 519.9 259.94 78.92 <2e-16 ***
## Residuals 406 1337.4 3.29
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
consumption
(ybar <- with(CarsNow, mean(consumption)))
## [1] 10.75134
(ybargr <- with(CarsNow, tapply(consumption, fdrive, mean)))
## front rear 4x4
## 9.741274 11.293981 12.498876
(meanybargr <- mean(ybargr))
## [1] 11.17804
(TAB <- with(CarsNow, table(fdrive)))
## fdrive
## front rear 4x4
## 212 108 89
(PTAB <- prop.table(TAB))
## fdrive
## front rear 4x4
## 0.5183374 0.2640587 0.2176039
(wmeanybargr <- sum(ybargr * PTAB))
## [1] 10.75134
factor
fdrive
levels(CarsNow[["fdrive"]]) ## Which group is the first one?
## [1] "front" "rear" "4x4"
(G <- length(levels(CarsNow[, "fdrive"])))
## [1] 3
mTrt <- lm(consumption ~ fdrive, data = CarsNow)
summary(mTrt)
##
## Call:
## lm(formula = consumption ~ fdrive, data = CarsNow)
##
## Residuals:
## Min 1Q Median 3Q Max
## -4.0913 -1.2489 -0.0440 0.9587 9.0511
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 9.7413 0.1247 78.149 < 2e-16 ***
## fdriverear 1.5527 0.2146 7.237 2.32e-12 ***
## fdrive4x4 2.7576 0.2292 12.030 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.815 on 406 degrees of freedom
## Multiple R-squared: 0.2799, Adjusted R-squared: 0.2764
## F-statistic: 78.91 on 2 and 406 DF, p-value: < 2.2e-16
(CTrt <- contr.treatment(G))
## 2 3
## 1 0 0
## 2 1 0
## 3 0 1
coef(mTrt)["(Intercept)"] + CTrt %*% coef(mTrt)[-1]
## [,1]
## 1 9.741274
## 2 11.293981
## 3 12.498876
muhat <- as.numeric(coef(mTrt)["(Intercept)"] + CTrt %*% coef(mTrt)[-1])
names(muhat) <- levels(CarsNow[, "fdrive"])
print(muhat)
## front rear 4x4
## 9.741274 11.293981 12.498876
mSAS <- lm(consumption ~ fdrive, data = CarsNow, contrasts = list(fdrive = contr.SAS))
summary(mSAS)
##
## Call:
## lm(formula = consumption ~ fdrive, data = CarsNow, contrasts = list(fdrive = contr.SAS))
##
## Residuals:
## Min 1Q Median 3Q Max
## -4.0913 -1.2489 -0.0440 0.9587 9.0511
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 12.4989 0.1924 64.969 < 2e-16 ***
## fdrive1 -2.7576 0.2292 -12.030 < 2e-16 ***
## fdrive2 -1.2049 0.2598 -4.637 4.77e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.815 on 406 degrees of freedom
## Multiple R-squared: 0.2799, Adjusted R-squared: 0.2764
## F-statistic: 78.91 on 2 and 406 DF, p-value: < 2.2e-16
(CSAS <- contr.SAS(G))
## 1 2
## 1 1 0
## 2 0 1
## 3 0 0
coef(mSAS)["(Intercept)"] + CSAS %*% coef(mSAS)[-1]
## [,1]
## 1 9.741274
## 2 11.293981
## 3 12.498876
muhat <- as.numeric(coef(mSAS)["(Intercept)"] + CSAS %*% coef(mSAS)[-1])
names(muhat) <- levels(CarsNow[, "fdrive"])
print(muhat)
## front rear 4x4
## 9.741274 11.293981 12.498876
mSum <- lm(consumption ~ fdrive, data = CarsNow, contrasts = list(fdrive = contr.sum))
summary(mSum)
##
## Call:
## lm(formula = consumption ~ fdrive, data = CarsNow, contrasts = list(fdrive = contr.sum))
##
## Residuals:
## Min 1Q Median 3Q Max
## -4.0913 -1.2489 -0.0440 0.9587 9.0511
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 11.17804 0.09606 116.365 <2e-16 ***
## fdrive1 -1.43677 0.12003 -11.970 <2e-16 ***
## fdrive2 0.11594 0.13926 0.833 0.406
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.815 on 406 degrees of freedom
## Multiple R-squared: 0.2799, Adjusted R-squared: 0.2764
## F-statistic: 78.91 on 2 and 406 DF, p-value: < 2.2e-16
(CSum <- contr.sum(G))
## [,1] [,2]
## 1 1 0
## 2 0 1
## 3 -1 -1
coef(mSum)["(Intercept)"] + CSum %*% coef(mSum)[-1]
## [,1]
## 1 9.741274
## 2 11.293981
## 3 12.498876
muhat <- as.numeric(coef(mSum)["(Intercept)"] + CSum %*% coef(mSum)[-1])
names(muhat) <- levels(CarsNow[, "fdrive"])
print(muhat)
## front rear 4x4
## 9.741274 11.293981 12.498876
alphaSum <- as.numeric(CSum %*% coef(mSum)[-1])
names(alphaSum) <- levels(CarsNow[, "fdrive"])
print(alphaSum)
## front rear 4x4
## -1.4367702 0.1159377 1.3208326
LSest
comes from package mffSM
.library("mffSM")
LSum <- cbind(0, CSum)
rownames(LSum) <- levels(CarsNow[["fdrive"]])
colnames(LSum) <- names(coef(mSum))
print(LSum)
## (Intercept) fdrive1 fdrive2
## front 0 1 0
## rear 0 0 1
## 4x4 0 -1 -1
LSest(mSum, L = LSum)
## Estimate Std. Error t value P value Lower Upper
## front -1.4367702 0.1200284 -11.9702552 < 2e-16 -1.672725 -1.2008156
## rear 0.1159377 0.1392631 0.8325084 0.40561 -0.157829 0.3897043
## 4x4 1.3208326 0.1468489 8.9945021 < 2e-16 1.032153 1.6095117
(ng <- with(CarsNow, table(fdrive)))
## fdrive
## front rear 4x4
## 212 108 89
CwSum <- rbind(diag(G - 1), - ng[-G] / ng[G])
rownames(CwSum) <- levels(CarsNow[["fdrive"]])
print(CwSum)
## front rear
## front 1.000000 0.000000
## rear 0.000000 1.000000
## 4x4 -2.382022 -1.213483
mwSum <- lm(consumption ~ fdrive, data = CarsNow, contrasts = list(fdrive = CwSum))
summary(mwSum)
##
## Call:
## lm(formula = consumption ~ fdrive, data = CarsNow, contrasts = list(fdrive = CwSum))
##
## Residuals:
## Min 1Q Median 3Q Max
## -4.0913 -1.2489 -0.0440 0.9587 9.0511
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 10.75134 0.08974 119.802 < 2e-16 ***
## fdrivefront -1.01007 0.08651 -11.676 < 2e-16 ***
## fdriverear 0.54264 0.14982 3.622 0.000329 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.815 on 406 degrees of freedom
## Multiple R-squared: 0.2799, Adjusted R-squared: 0.2764
## F-statistic: 78.91 on 2 and 406 DF, p-value: < 2.2e-16
coef(mwSum)["(Intercept)"] + CwSum %*% coef(mwSum)[-1]
## [,1]
## front 9.741274
## rear 11.293981
## 4x4 12.498876
muhat <- as.numeric(coef(mwSum)["(Intercept)"] + CwSum %*% coef(mwSum)[-1])
names(muhat) <- levels(CarsNow[, "fdrive"])
print(muhat)
## front rear 4x4
## 9.741274 11.293981 12.498876
alphawSum <- as.numeric(CwSum %*% coef(mwSum)[-1])
names(alphawSum) <- levels(CarsNow[, "fdrive"])
print(alphawSum)
## front rear 4x4
## -1.0100712 0.5426367 1.7475317
LwSum <- cbind(0, CwSum)
rownames(LwSum) <- levels(CarsNow[["fdrive"]])
colnames(LwSum) <- names(coef(mwSum))
print(LwSum)
## (Intercept) fdrivefront fdriverear
## front 0 1.000000 0.000000
## rear 0 0.000000 1.000000
## 4x4 0 -2.382022 -1.213483
LSest(mwSum, L = LwSum)
## Estimate Std. Error t value P value Lower Upper
## front -1.0100712 0.08650951 -11.675840 < 2.22e-16 -1.1801336 -0.8400087
## rear 0.5426367 0.14982008 3.621923 0.00032946 0.2481168 0.8371567
## 4x4 1.7475317 0.17016829 10.269432 < 2.22e-16 1.4130107 2.0820526
mHelmert <- lm(consumption ~ fdrive, data = CarsNow, contrasts = list(fdrive = contr.helmert))
summary(mHelmert)
##
## Call:
## lm(formula = consumption ~ fdrive, data = CarsNow, contrasts = list(fdrive = contr.helmert))
##
## Residuals:
## Min 1Q Median 3Q Max
## -4.0913 -1.2489 -0.0440 0.9587 9.0511
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 11.17804 0.09606 116.365 < 2e-16 ***
## fdrive1 0.77635 0.10728 7.237 2.32e-12 ***
## fdrive2 0.66042 0.07342 8.995 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.815 on 406 degrees of freedom
## Multiple R-squared: 0.2799, Adjusted R-squared: 0.2764
## F-statistic: 78.91 on 2 and 406 DF, p-value: < 2.2e-16
(CHelmert <- contr.helmert(G))
## [,1] [,2]
## 1 -1 -1
## 2 1 -1
## 3 0 2
coef(mHelmert)["(Intercept)"] + CHelmert %*% coef(mHelmert)[-1]
## [,1]
## 1 9.741274
## 2 11.293981
## 3 12.498876
muhat <- as.numeric(coef(mHelmert)["(Intercept)"] + CHelmert %*% coef(mHelmert)[-1])
names(muhat) <- levels(CarsNow[, "fdrive"])
print(muhat)
## front rear 4x4
## 9.741274 11.293981 12.498876