NMSA407 Linear Regression: Tutorial

Categorical ordinal covariate

Data Cars2004nh

Introduction

Load used data and calculate basic summaries

data(Cars2004nh, package = "mffSM")
head(Cars2004nh)

##                         vname type drive price.retail price.dealer   price cons.city cons.highway
## 1          Chevrolet.Aveo.4dr    1     1        11690        10965 11327.5       8.4          6.9
## 2 Chevrolet.Aveo.LS.4dr.hatch    1     1        12585        11802 12193.5       8.4          6.9
## 3      Chevrolet.Cavalier.2dr    1     1        14610        13697 14153.5       9.0          6.4
## 4      Chevrolet.Cavalier.4dr    1     1        14810        13884 14347.0       9.0          6.4
## 5   Chevrolet.Cavalier.LS.2dr    1     1        16385        15357 15871.0       9.0          6.4
## 6           Dodge.Neon.SE.4dr    1     1        13670        12849 13259.5       8.1          6.5
##   consumption engine.size ncylinder horsepower weight      iweight  lweight wheel.base length width
## 1        7.65         1.6         4        103   1075 0.0009302326 6.980076        249    424   168
## 2        7.65         1.6         4        103   1065 0.0009389671 6.970730        249    389   168
## 3        7.70         2.2         4        140   1187 0.0008424600 7.079184        264    465   175
## 4        7.70         2.2         4        140   1214 0.0008237232 7.101676        264    465   173
## 5        7.70         2.2         4        140   1187 0.0008424600 7.079184        264    465   175
## 6        7.30         2.0         4        132   1171 0.0008539710 7.065613        267    442   170
##      ftype fdrive
## 1 personal  front
## 2 personal  front
## 3 personal  front
## 4 personal  front
## 5 personal  front
## 6 personal  front

dim(Cars2004nh)

## [1] 425  20

summary(Cars2004nh)

##     vname                type           drive        price.retail     price.dealer   
##  Length:425         Min.   :1.000   Min.   :1.000   Min.   : 10280   Min.   :  9875  
##  Class :character   1st Qu.:1.000   1st Qu.:1.000   1st Qu.: 20370   1st Qu.: 18973  
##  Mode  :character   Median :1.000   Median :1.000   Median : 27905   Median : 25672  
##                     Mean   :2.219   Mean   :1.692   Mean   : 32866   Mean   : 30096  
##                     3rd Qu.:3.000   3rd Qu.:2.000   3rd Qu.: 39235   3rd Qu.: 35777  
##                     Max.   :6.000   Max.   :3.000   Max.   :192465   Max.   :173560  
##                                                                                      
##      price          cons.city      cons.highway     consumption     engine.size      ncylinder     
##  Min.   : 10078   Min.   : 6.20   Min.   : 5.100   Min.   : 5.65   Min.   :1.300   Min.   :-1.000  
##  1st Qu.: 19600   1st Qu.:11.20   1st Qu.: 8.100   1st Qu.: 9.65   1st Qu.:2.400   1st Qu.: 4.000  
##  Median : 26656   Median :12.40   Median : 9.000   Median :10.70   Median :3.000   Median : 6.000  
##  Mean   : 31481   Mean   :12.36   Mean   : 9.142   Mean   :10.75   Mean   :3.208   Mean   : 5.791  
##  3rd Qu.: 37514   3rd Qu.:13.80   3rd Qu.: 9.800   3rd Qu.:11.65   3rd Qu.:3.900   3rd Qu.: 6.000  
##  Max.   :183012   Max.   :23.50   Max.   :19.600   Max.   :21.55   Max.   :8.300   Max.   :12.000  
##                   NA's   :14      NA's   :14       NA's   :14                                      
##    horsepower        weight        iweight             lweight        wheel.base        length     
##  Min.   :100.0   Min.   : 923   Min.   :0.0003067   Min.   :6.828   Min.   :226.0   Min.   :363.0  
##  1st Qu.:165.0   1st Qu.:1412   1st Qu.:0.0005542   1st Qu.:7.253   1st Qu.:262.0   1st Qu.:450.0  
##  Median :210.0   Median :1577   Median :0.0006341   Median :7.363   Median :272.0   Median :472.0  
##  Mean   :216.8   Mean   :1626   Mean   :0.0006412   Mean   :7.373   Mean   :274.9   Mean   :470.6  
##  3rd Qu.:255.0   3rd Qu.:1804   3rd Qu.:0.0007082   3rd Qu.:7.498   3rd Qu.:284.0   3rd Qu.:490.0  
##  Max.   :500.0   Max.   :3261   Max.   :0.0010834   Max.   :8.090   Max.   :366.0   Max.   :577.0  
##                  NA's   :2      NA's   :2           NA's   :2       NA's   :2       NA's   :26     
##      width            ftype       fdrive   
##  Min.   :163.0   personal:242   front:223  
##  1st Qu.:175.0   wagon   : 30   rear :110  
##  Median :180.0   SUV     : 60   4x4  : 92  
##  Mean   :181.1   pickup  : 24              
##  3rd Qu.:185.0   sport   : 49              
##  Max.   :206.0   minivan : 20              
##  NA's   :28

Complete cases subset used here

To be able to compare a model fitted here with other models where also other covariates will be included, we restrict ourselves to a subset of the dataset where all variables consumption, lweight and engine.size are known.

isComplete <- complete.cases(Cars2004nh[, c("consumption", "lweight", "engine.size")])
sum(!isComplete)

## [1] 16

CarsNow <- subset(Cars2004nh, isComplete, select = c("consumption", "drive", "fdrive", "weight", "lweight", "engine.size"))
dim(CarsNow)

## [1] 409   6

summary(CarsNow)

##   consumption        drive         fdrive        weight        lweight       engine.size   
##  Min.   : 5.65   Min.   :1.000   front:212   Min.   : 923   Min.   :6.828   Min.   :1.300  
##  1st Qu.: 9.65   1st Qu.:1.000   rear :108   1st Qu.:1415   1st Qu.:7.255   1st Qu.:2.400  
##  Median :10.70   Median :1.000   4x4  : 89   Median :1577   Median :7.363   Median :3.000  
##  Mean   :10.75   Mean   :1.699               Mean   :1622   Mean   :7.371   Mean   :3.178  
##  3rd Qu.:11.65   3rd Qu.:2.000               3rd Qu.:1804   3rd Qu.:7.498   3rd Qu.:3.800  
##  Max.   :21.55   Max.   :3.000               Max.   :2903   Max.   :7.973   Max.   :6.000

Categorized `weight`

CarsNow <- transform(CarsNow, fweight = cut(weight, breaks = c(900, 1250, 1500, 1750, 2000, 3300),
                                                    labels = c("<=1250", "1250-1500", "1500-1750", "1750-2000", ">2000")))
summary(CarsNow[["fweight"]])

##    <=1250 1250-1500 1500-1750 1750-2000     >2000 
##        57        95       137        71        49

Dependence of `consumption` on categorized `weight`

Sample means of `consumption`

print(grandMean <- with(CarsNow, mean(consumption)))

## [1] 10.75134

print(gMean <- with(CarsNow, tapply(consumption, fweight, mean, na.rm = TRUE)))

##    <=1250 1250-1500 1500-1750 1750-2000     >2000 
##   7.77193   9.84000  10.73905  11.82676  14.46020

ng <- length(gMean)

Boxplots

par(mfrow = c(1, 1), bty = BTY, mar = c(4, 4, 1, 1) + 0.1)
plot(consumption ~ fweight, data = CarsNow, col = rainbow_hcl(ng), xlab = "Weight [kg]", ylab = "Consumption [l/100 km]")

plot of chunk fig-ParamCovar-04-01

Scatterplot with group means connected by a line

par(mfrow = c(1, 1), bty = BTY, mar = c(4, 4, 1, 1) + 0.1)
plot(consumption ~ as.numeric(fweight), data = CarsNow, pch = 23, col = "red3", bg = "orange",
     xlab = "Weight [kg]", ylab = "Consumption [l/100 km]", xlim = c(0.5, ng + 0.5), xaxt = "n")
axis(1, at = 1:ng, labels = levels(CarsNow[, "fweight"]))
points(1:ng, gMean, pch = 21, col = "yellow", bg = "blue", cex = 2.5)
lines(1:ng, gMean, col = "blue", lwd = 2)

plot of chunk fig-ParamCovar-04-02

Scatterplot with horizontally jittered values

set.seed(20010911)
CarsNow <- transform(CarsNow, jfweight = as.numeric(fweight) + runif(nrow(CarsNow), -0.25, 0.25))
par(mfrow = c(1, 1), bty = BTY, mar = c(4, 4, 1, 1) + 0.1)
plot(consumption ~ jfweight, data = CarsNow, pch = 23, col = "red3", bg = "orange",
     xlab = "Weight [kg]", ylab = "Consumption [l/100 km]", xlim = c(0.5, ng + 0.5), xaxt = "n")
axis(1, at = 1:ng, labels = levels(CarsNow[, "fweight"]))
points(1:ng, gMean, pch = 21, col = "yellow", bg = "blue", cex = 2.5)
lines(1:ng, gMean, col = "blue", lwd = 2)

plot of chunk fig-ParamCovar-04-03

Scatterplot with horizontally jittered values, sample means not connected

set.seed(20010911)
CarsNow <- transform(CarsNow, jfweight = as.numeric(fweight) + runif(nrow(CarsNow), -0.25, 0.25))
par(mfrow = c(1, 1), bty = BTY, mar = c(4, 4, 1, 1) + 0.1)
plot(consumption ~ jfweight, data = CarsNow, pch = 23, col = "red3", bg = "orange",
     xlab = "Weight [kg]", ylab = "Consumption [l/100 km]", xlim = c(0.5, ng + 0.5), xaxt = "n")
axis(1, at = 1:ng, labels = levels(CarsNow[, "fweight"]))
points(1:ng, gMean, pch = 21, col = "yellow", bg = "blue", cex = 2.5)

plot of chunk fig-ParamCovar-04-04

##lines(1:ng, gMean, col = "blue", lwd = 2)

One-way analysis of variance

ANOVA table

a2 <- aov(consumption ~ fweight, data = CarsNow)
summary(a2)

##              Df Sum Sq Mean Sq F value Pr(>F)    
## fweight       4 1341.0   335.3   262.4 <2e-16 ***
## Residuals   404  516.2     1.3                   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Reference group pseudocontrast parameterization

Fit

mTrt <- lm(consumption ~ fweight, data = CarsNow)
summary(mTrt)

## 
## Call:
## lm(formula = consumption ~ fweight, data = CarsNow)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -4.1900 -0.7102 -0.0400  0.6232  7.0898 
## 
## Coefficients:
##                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)        7.7719     0.1497   51.91   <2e-16 ***
## fweight1250-1500   2.0681     0.1894   10.92   <2e-16 ***
## fweight1500-1750   2.9671     0.1782   16.65   <2e-16 ***
## fweight1750-2000   4.0548     0.2010   20.17   <2e-16 ***
## fweight>2000       6.6883     0.2202   30.37   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.13 on 404 degrees of freedom
## Multiple R-squared:  0.7221, Adjusted R-squared:  0.7193 
## F-statistic: 262.4 on 4 and 404 DF,  p-value: < 2.2e-16

Orthonormal polynomials parameterization

Levels of an ordinal covariate `fweight`

levels(CarsNow[["fweight"]])

## [1] "<=1250"    "1250-1500" "1500-1750" "1750-2000" ">2000"

Fit

mPoly <- lm(consumption ~ fweight, data = CarsNow, contrasts = list(fweight = contr.poly))
summary(mPoly)

## 
## Call:
## lm(formula = consumption ~ fweight, data = CarsNow, contrasts = list(fweight = contr.poly))
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -4.1900 -0.7102 -0.0400  0.6232  7.0898 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  1.093e+01  5.975e-02 182.876  < 2e-16 ***
## fweight.L    4.858e+00  1.501e-01  32.359  < 2e-16 ***
## fweight.Q    3.526e-01  1.370e-01   2.574   0.0104 *  
## fweight.C    8.585e-01  1.320e-01   6.503 2.33e-10 ***
## fweight^4   -7.193e-05  1.126e-01  -0.001   0.9995    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.13 on 404 degrees of freedom
## Multiple R-squared:  0.7221, Adjusted R-squared:  0.7193 
## F-statistic: 262.4 on 4 and 404 DF,  p-value: < 2.2e-16

Contrast matrix

(CPoly <- contr.poly(5))

##                 .L         .Q            .C         ^4
## [1,] -6.324555e-01  0.5345225 -3.162278e-01  0.1195229
## [2,] -3.162278e-01 -0.2672612  6.324555e-01 -0.4780914
## [3,] -3.287978e-17 -0.5345225  1.595204e-16  0.7171372
## [4,]  3.162278e-01 -0.2672612 -6.324555e-01 -0.4780914
## [5,]  6.324555e-01  0.5345225  3.162278e-01  0.1195229

round(CPoly, digits = 15)

##              .L         .Q         .C         ^4
## [1,] -0.6324555  0.5345225 -0.3162278  0.1195229
## [2,] -0.3162278 -0.2672612  0.6324555 -0.4780914
## [3,]  0.0000000 -0.5345225  0.0000000  0.7171372
## [4,]  0.3162278 -0.2672612 -0.6324555 -0.4780914
## [5,]  0.6324555  0.5345225  0.3162278  0.1195229

Estimated group means from the linear model

coef(mPoly)["(Intercept)"] + CPoly %*% coef(mPoly)[-1]

##          [,1]
## [1,]  7.77193
## [2,]  9.84000
## [3,] 10.73905
## [4,] 11.82676
## [5,] 14.46020

muhat <- as.numeric(coef(mPoly)["(Intercept)"] + CPoly %*% coef(mPoly)[-1])
names(muhat) <- levels(CarsNow[, "fweight"])
print(muhat)

##    <=1250 1250-1500 1500-1750 1750-2000     >2000 
##   7.77193   9.84000  10.73905  11.82676  14.46020

Scatterplot with horizontally jittered values equipped by the fitted polynomial

tgrid <- seq(0.75, 5.25, length = 100)
be <- coef(mPoly)
fitl <- be[1] + apply(be[-1] * t(predict(poly(1:5, degree = 4), newdata = tgrid)), 2, sum)
par(mfrow = c(1, 1), bty = BTY, mar = c(4, 4, 1, 1) + 0.1)
plot(consumption ~ jfweight, data = CarsNow, pch = 23, col = "red3", bg = "orange",
     xlab = "Weight [kg]", ylab = "Consumption [l/100 km]", xlim = c(0.5, ng + 0.5), xaxt = "n")
axis(1, at = 1:ng, labels = levels(CarsNow[, "fweight"]))
points(1:ng, gMean, pch = 21, col = "yellow", bg = "blue", cex = 2.5)
lines(tgrid, fitl, col = "blue", lwd = 2)

plot of chunk fig-ParamCovar-04-05

Smaller degree polynomial trend?

Variable `nweight` taking values 1, 2, 3, 4, 5

CarsNow <- transform(CarsNow, nweight = as.numeric(fweight))
with(CarsNow, table(fweight, nweight))

##            nweight
## fweight       1   2   3   4   5
##   <=1250     57   0   0   0   0
##   1250-1500   0  95   0   0   0
##   1500-1750   0   0 137   0   0
##   1750-2000   0   0   0  71   0
##   >2000       0   0   0   0  49

Polynomial trends of degree 1, 2, 3, 4

p4 <- lm(consumption ~ nweight + I(nweight^2) + I(nweight^3) + I(nweight^4), data = CarsNow)
#p4 <- lm(consumption ~ poly(nweight, 4), data = CarsNow)
summary(p4)

## 
## Call:
## lm(formula = consumption ~ nweight + I(nweight^2) + I(nweight^3) + 
##     I(nweight^4), data = CarsNow)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -4.1900 -0.7102 -0.0400  0.6232  7.0898 
## 
## Coefficients:
##                Estimate Std. Error t value Pr(>|t|)  
## (Intercept)   3.177e+00  1.820e+00   1.745   0.0818 .
## nweight       6.312e+00  3.274e+00   1.928   0.0546 .
## I(nweight^2) -1.943e+00  1.947e+00  -0.998   0.3190  
## I(nweight^3)  2.265e-01  4.687e-01   0.483   0.6292  
## I(nweight^4) -2.507e-05  3.925e-02  -0.001   0.9995  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.13 on 404 degrees of freedom
## Multiple R-squared:  0.7221, Adjusted R-squared:  0.7193 
## F-statistic: 262.4 on 4 and 404 DF,  p-value: < 2.2e-16

#
p3 <- lm(consumption ~ nweight + I(nweight^2) + I(nweight^3), data = CarsNow)
#p3 <- lm(consumption ~ poly(nweight, 3), data = CarsNow)
summary(p3)

## 
## Call:
## lm(formula = consumption ~ nweight + I(nweight^2) + I(nweight^3), 
##     data = CarsNow)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -4.1900 -0.7102 -0.0400  0.6233  7.0898 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   3.17764    0.66606   4.771 2.57e-06 ***
## nweight       6.30991    0.84064   7.506 3.91e-13 ***
## I(nweight^2) -1.94184    0.31116  -6.241 1.10e-09 ***
## I(nweight^3)  0.22623    0.03456   6.545 1.80e-10 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.129 on 405 degrees of freedom
## Multiple R-squared:  0.7221, Adjusted R-squared:   0.72 
## F-statistic: 350.7 on 3 and 405 DF,  p-value: < 2.2e-16

#
p2 <- lm(consumption ~ nweight + I(nweight^2), data = CarsNow)
#p2 <- lm(consumption ~ poly(nweight, 2), data = CarsNow)
summary(p2)

## 
## Call:
## lm(formula = consumption ~ nweight + I(nweight^2), data = CarsNow)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -3.7283 -0.8288 -0.0907  0.6712  7.4817 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   7.06590    0.31634  22.337  < 2e-16 ***
## nweight       0.99339    0.22734   4.370 1.58e-05 ***
## I(nweight^2)  0.08142    0.03732   2.182   0.0297 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.186 on 406 degrees of freedom
## Multiple R-squared:  0.6927, Adjusted R-squared:  0.6912 
## F-statistic: 457.5 on 2 and 406 DF,  p-value: < 2.2e-16

#
p1 <- lm(consumption ~ nweight, data = CarsNow)
#p1 <- lm(consumption ~ poly(nweight, 1), data = CarsNow)
summary(p1)

## 
## Call:
## lm(formula = consumption ~ nweight, data = CarsNow)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -3.7682 -0.7959 -0.1235  0.6595  7.6988 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   6.4628     0.1545   41.84   <2e-16 ***
## nweight       1.4777     0.0492   30.03   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.191 on 407 degrees of freedom
## Multiple R-squared:  0.6891, Adjusted R-squared:  0.6883 
## F-statistic: 901.9 on 1 and 407 DF,  p-value: < 2.2e-16

Is a linear trend adequate?

anova(p1, p4)

## Analysis of Variance Table
## 
## Model 1: consumption ~ nweight
## Model 2: consumption ~ nweight + I(nweight^2) + I(nweight^3) + I(nweight^4)
##   Res.Df    RSS Df Sum of Sq     F    Pr(>F)    
## 1    407 577.49                                 
## 2    404 516.20  3    61.291 15.99 7.667e-10 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

anova(p1, mPoly)

## Analysis of Variance Table
## 
## Model 1: consumption ~ nweight
## Model 2: consumption ~ fweight
##   Res.Df    RSS Df Sum of Sq     F    Pr(>F)    
## 1    407 577.49                                 
## 2    404 516.20  3    61.291 15.99 7.667e-10 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1