NMSA407 Linear Regression: Tutorial

Multiple regression model

Data Cars2004nh




Introduction

Load used data and calculate basic summaries

data(Cars2004nh, package = "mffSM")
head(Cars2004nh)
##                         vname type drive price.retail price.dealer   price cons.city cons.highway
## 1          Chevrolet.Aveo.4dr    1     1        11690        10965 11327.5       8.4          6.9
## 2 Chevrolet.Aveo.LS.4dr.hatch    1     1        12585        11802 12193.5       8.4          6.9
## 3      Chevrolet.Cavalier.2dr    1     1        14610        13697 14153.5       9.0          6.4
## 4      Chevrolet.Cavalier.4dr    1     1        14810        13884 14347.0       9.0          6.4
## 5   Chevrolet.Cavalier.LS.2dr    1     1        16385        15357 15871.0       9.0          6.4
## 6           Dodge.Neon.SE.4dr    1     1        13670        12849 13259.5       8.1          6.5
##   consumption engine.size ncylinder horsepower weight      iweight  lweight wheel.base length width
## 1        7.65         1.6         4        103   1075 0.0009302326 6.980076        249    424   168
## 2        7.65         1.6         4        103   1065 0.0009389671 6.970730        249    389   168
## 3        7.70         2.2         4        140   1187 0.0008424600 7.079184        264    465   175
## 4        7.70         2.2         4        140   1214 0.0008237232 7.101676        264    465   173
## 5        7.70         2.2         4        140   1187 0.0008424600 7.079184        264    465   175
## 6        7.30         2.0         4        132   1171 0.0008539710 7.065613        267    442   170
##      ftype fdrive
## 1 personal  front
## 2 personal  front
## 3 personal  front
## 4 personal  front
## 5 personal  front
## 6 personal  front
dim(Cars2004nh)
## [1] 425  20
summary(Cars2004nh)
##     vname                type           drive        price.retail     price.dealer   
##  Length:425         Min.   :1.000   Min.   :1.000   Min.   : 10280   Min.   :  9875  
##  Class :character   1st Qu.:1.000   1st Qu.:1.000   1st Qu.: 20370   1st Qu.: 18973  
##  Mode  :character   Median :1.000   Median :1.000   Median : 27905   Median : 25672  
##                     Mean   :2.219   Mean   :1.692   Mean   : 32866   Mean   : 30096  
##                     3rd Qu.:3.000   3rd Qu.:2.000   3rd Qu.: 39235   3rd Qu.: 35777  
##                     Max.   :6.000   Max.   :3.000   Max.   :192465   Max.   :173560  
##                                                                                      
##      price          cons.city      cons.highway     consumption     engine.size      ncylinder     
##  Min.   : 10078   Min.   : 6.20   Min.   : 5.100   Min.   : 5.65   Min.   :1.300   Min.   :-1.000  
##  1st Qu.: 19600   1st Qu.:11.20   1st Qu.: 8.100   1st Qu.: 9.65   1st Qu.:2.400   1st Qu.: 4.000  
##  Median : 26656   Median :12.40   Median : 9.000   Median :10.70   Median :3.000   Median : 6.000  
##  Mean   : 31481   Mean   :12.36   Mean   : 9.142   Mean   :10.75   Mean   :3.208   Mean   : 5.791  
##  3rd Qu.: 37514   3rd Qu.:13.80   3rd Qu.: 9.800   3rd Qu.:11.65   3rd Qu.:3.900   3rd Qu.: 6.000  
##  Max.   :183012   Max.   :23.50   Max.   :19.600   Max.   :21.55   Max.   :8.300   Max.   :12.000  
##                   NA's   :14      NA's   :14       NA's   :14                                      
##    horsepower        weight        iweight             lweight        wheel.base        length     
##  Min.   :100.0   Min.   : 923   Min.   :0.0003067   Min.   :6.828   Min.   :226.0   Min.   :363.0  
##  1st Qu.:165.0   1st Qu.:1412   1st Qu.:0.0005542   1st Qu.:7.253   1st Qu.:262.0   1st Qu.:450.0  
##  Median :210.0   Median :1577   Median :0.0006341   Median :7.363   Median :272.0   Median :472.0  
##  Mean   :216.8   Mean   :1626   Mean   :0.0006412   Mean   :7.373   Mean   :274.9   Mean   :470.6  
##  3rd Qu.:255.0   3rd Qu.:1804   3rd Qu.:0.0007082   3rd Qu.:7.498   3rd Qu.:284.0   3rd Qu.:490.0  
##  Max.   :500.0   Max.   :3261   Max.   :0.0010834   Max.   :8.090   Max.   :366.0   Max.   :577.0  
##                  NA's   :2      NA's   :2           NA's   :2       NA's   :2       NA's   :26     
##      width            ftype       fdrive   
##  Min.   :163.0   personal:242   front:223  
##  1st Qu.:175.0   wagon   : 30   rear :110  
##  Median :180.0   SUV     : 60   4x4  : 92  
##  Mean   :181.1   pickup  : 24              
##  3rd Qu.:185.0   sport   : 49              
##  Max.   :206.0   minivan : 20              
##  NA's   :28

Complete cases subset used here

To be able to compare a model fitted here with other models where also other covariates will be included, we restrict ourselves to a subset of the dataset where all variables consumption, fdrive', andlweight are known.

isComplete <- complete.cases(Cars2004nh[, c("consumption", "fdrive", "lweight", "engine.size")])
sum(!isComplete)
## [1] 16
CarsNow <- subset(Cars2004nh, isComplete, select = c("consumption", "drive", "fdrive", "weight", "lweight", "engine.size"))
dim(CarsNow)
## [1] 409   6
summary(CarsNow)
##   consumption        drive         fdrive        weight        lweight       engine.size   
##  Min.   : 5.65   Min.   :1.000   front:212   Min.   : 923   Min.   :6.828   Min.   :1.300  
##  1st Qu.: 9.65   1st Qu.:1.000   rear :108   1st Qu.:1415   1st Qu.:7.255   1st Qu.:2.400  
##  Median :10.70   Median :1.000   4x4  : 89   Median :1577   Median :7.363   Median :3.000  
##  Mean   :10.75   Mean   :1.699               Mean   :1622   Mean   :7.371   Mean   :3.178  
##  3rd Qu.:11.65   3rd Qu.:2.000               3rd Qu.:1804   3rd Qu.:7.498   3rd Qu.:3.800  
##  Max.   :21.55   Max.   :3.000               Max.   :2903   Max.   :7.973   Max.   :6.000




Dependence of consumption on fdrive',lweightandengine.size

Basic scatterplots to start

library("car")
## Loading required package: carData
palette(c("darkblue", "red3", "olivedrab", rainbow_hcl(5)))
scatterplotMatrix(~consumption + fdrive + lweight + engine.size, pch = 16, col = "olivedrab",
                  regLine = list(method = lm, lty = 1, lwd = 2, col = "red3"), smooth = FALSE,
          diagonal = list(method = "histogram"), data = CarsNow)

plot of chunk fig-MultipleRegr-01-01

Model with all covariates additively

mAddit <- lm(consumption ~ fdrive + engine.size + lweight, data = CarsNow)
summary(mAddit)
## 
## Call:
## lm(formula = consumption ~ fdrive + engine.size + lweight, data = CarsNow)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -3.1058 -0.5892 -0.0899  0.5472  4.9148 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -35.84930    3.08092 -11.636  < 2e-16 ***
## fdriverear    0.46260    0.11715   3.949 9.26e-05 ***
## fdrive4x4     0.98198    0.13019   7.543 3.07e-13 ***
## engine.size   0.56908    0.08361   6.807 3.62e-11 ***
## lweight       6.03099    0.44795  13.464  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.9223 on 404 degrees of freedom
## Multiple R-squared:  0.8149, Adjusted R-squared:  0.8131 
## F-statistic: 444.8 on 4 and 404 DF,  p-value: < 2.2e-16

Significance of each term (if additivity can be assumed)

drop1(mAddit, test = "F")
## Single term deletions
## 
## Model:
## consumption ~ fdrive + engine.size + lweight
##             Df Sum of Sq    RSS     AIC F value    Pr(>F)    
## <none>                   343.69 -61.161                      
## fdrive       2    50.574 394.26  -9.012  29.725 9.046e-13 ***
## engine.size  1    39.413 383.10 -18.758  46.330 3.625e-11 ***
## lweight      1   154.205 497.89  88.436 181.267 < 2.2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Model with all covariates within two-way interactions

mInter <- lm(consumption ~ (fdrive + engine.size + lweight)^2, data = CarsNow)
summary(mInter)
## 
## Call:
## lm(formula = consumption ~ (fdrive + engine.size + lweight)^2, 
##     data = CarsNow)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -3.1489 -0.5982 -0.1019  0.4863  3.5918 
## 
## Coefficients:
##                          Estimate Std. Error t value Pr(>|t|)    
## (Intercept)            -26.124609   5.776121  -4.523 8.06e-06 ***
## fdriverear              26.875936   7.367167   3.648 0.000299 ***
## fdrive4x4               13.308169   8.311915   1.601 0.110147    
## engine.size             -5.391862   1.746264  -3.088 0.002158 ** 
## lweight                  4.757609   0.817131   5.822 1.19e-08 ***
## fdriverear:engine.size   0.009665   0.182958   0.053 0.957895    
## fdrive4x4:engine.size    0.315489   0.216880   1.455 0.146547    
## fdriverear:lweight      -3.571144   1.061146  -3.365 0.000839 ***
## fdrive4x4:lweight       -1.818723   1.189560  -1.529 0.127081    
## engine.size:lweight      0.790111   0.233312   3.386 0.000778 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.8726 on 399 degrees of freedom
## Multiple R-squared:  0.8364, Adjusted R-squared:  0.8327 
## F-statistic: 226.7 on 9 and 399 DF,  p-value: < 2.2e-16

Compare additive and all-two-way-interaction models

anova(mAddit, mInter)
## Analysis of Variance Table
## 
## Model 1: consumption ~ fdrive + engine.size + lweight
## Model 2: consumption ~ (fdrive + engine.size + lweight)^2
##   Res.Df    RSS Df Sum of Sq      F    Pr(>F)    
## 1    404 343.69                                  
## 2    399 303.78  5    39.906 10.483 1.813e-09 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

All two-way interactions needed?

drop1(mInter, test = "F")
## Single term deletions
## 
## Model:
## consumption ~ (fdrive + engine.size + lweight)^2
##                     Df Sum of Sq    RSS      AIC F value    Pr(>F)    
## <none>                           303.78 -101.642                      
## fdrive:engine.size   2    1.9215 305.70 -103.064  1.2619 0.2842440    
## fdrive:lweight       2    8.6863 312.46  -94.112  5.7045 0.0036085 ** 
## engine.size:lweight  1    8.7315 312.51  -92.052 11.4684 0.0007782 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Model without the fdrive:engine.size' interaction

mInter2 <- lm(consumption ~ fdrive + engine.size + lweight + fdrive:lweight + engine.size:lweight, data = CarsNow)
summary(mInter2)
## 
## Call:
## lm(formula = consumption ~ fdrive + engine.size + lweight + fdrive:lweight + 
##     engine.size:lweight, data = CarsNow)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -3.1094 -0.6157 -0.0946  0.4900  3.5782 
## 
## Coefficients:
##                     Estimate Std. Error t value Pr(>|t|)    
## (Intercept)         -22.8398     4.9687  -4.597 5.76e-06 ***
## fdriverear           27.3567     4.9219   5.558 4.98e-08 ***
## fdrive4x4             4.3904     5.5249   0.795 0.427287    
## engine.size          -5.8845     1.6945  -3.473 0.000571 ***
## lweight               4.2821     0.6873   6.230 1.18e-09 ***
## fdriverear:lweight   -3.6356     0.6675  -5.446 8.98e-08 ***
## fdrive4x4:lweight    -0.4836     0.7425  -0.651 0.515241    
## engine.size:lweight   0.8662     0.2270   3.817 0.000157 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.8731 on 401 degrees of freedom
## Multiple R-squared:  0.8354, Adjusted R-squared:  0.8325 
## F-statistic: 290.7 on 7 and 401 DF,  p-value: < 2.2e-16

Still both two-way interactions needed?

drop1(mInter2, test = "F")
## Single term deletions
## 
## Model:
## consumption ~ fdrive + engine.size + lweight + fdrive:lweight + 
##     engine.size:lweight
##                     Df Sum of Sq    RSS      AIC F value    Pr(>F)    
## <none>                           305.70 -103.064                      
## fdrive:lweight       2    24.150 329.85  -75.966  15.839 2.395e-07 ***
## engine.size:lweight  1    11.105 316.81  -90.469  14.567 0.0001566 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Model with only fdrive:lweight' interaction

mInter1 <- lm(consumption ~ fdrive + engine.size + lweight + fdrive:lweight, data = CarsNow)
summary(mInter1)
## 
## Call:
## lm(formula = consumption ~ fdrive + engine.size + lweight + fdrive:lweight, 
##     data = CarsNow)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -3.0825 -0.6328 -0.0868  0.4760  4.2210 
## 
## Coefficients:
##                     Estimate Std. Error t value Pr(>|t|)    
## (Intercept)        -37.44459    3.22260 -11.619  < 2e-16 ***
## fdriverear          22.90273    4.86163   4.711 3.40e-06 ***
## fdrive4x4           -8.59853    4.42520  -1.943   0.0527 .  
## engine.size          0.57588    0.08125   7.088 6.16e-12 ***
## lweight              6.24702    0.46296  13.494  < 2e-16 ***
## fdriverear:lweight  -3.03731    0.65971  -4.604 5.57e-06 ***
## fdrive4x4:lweight    1.26748    0.59358   2.135   0.0333 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.8877 on 402 degrees of freedom
## Multiple R-squared:  0.8294, Adjusted R-squared:  0.8269 
## F-statistic: 325.8 on 6 and 402 DF,  p-value: < 2.2e-16

What can be removed now?

drop1(mInter1, test = "F")
## Single term deletions
## 
## Model:
## consumption ~ fdrive + engine.size + lweight + fdrive:lweight
##                Df Sum of Sq    RSS     AIC F value    Pr(>F)    
## <none>                      316.81 -90.469                      
## engine.size     1    39.590 356.40 -44.308  50.236 6.159e-12 ***
## fdrive:lweight  2    26.879 343.69 -61.161  17.054 7.782e-08 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Models comparisons

anova(mAddit, mInter)
## Analysis of Variance Table
## 
## Model 1: consumption ~ fdrive + engine.size + lweight
## Model 2: consumption ~ (fdrive + engine.size + lweight)^2
##   Res.Df    RSS Df Sum of Sq      F    Pr(>F)    
## 1    404 343.69                                  
## 2    399 303.78  5    39.906 10.483 1.813e-09 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
anova(mInter1, mInter)
## Analysis of Variance Table
## 
## Model 1: consumption ~ fdrive + engine.size + lweight + fdrive:lweight
## Model 2: consumption ~ (fdrive + engine.size + lweight)^2
##   Res.Df    RSS Df Sum of Sq      F    Pr(>F)    
## 1    402 316.81                                  
## 2    399 303.78  3    13.027 5.7034 0.0007864 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
anova(mInter2, mInter)
## Analysis of Variance Table
## 
## Model 1: consumption ~ fdrive + engine.size + lweight + fdrive:lweight + 
##     engine.size:lweight
## Model 2: consumption ~ (fdrive + engine.size + lweight)^2
##   Res.Df    RSS Df Sum of Sq      F Pr(>F)
## 1    401 305.70                           
## 2    399 303.78  2    1.9215 1.2619 0.2842

Basic diagnostic plot for the four models

par(mfrow = c(2, 2), mar = c(4, 4, 2, 1) + 0.1, bty = BTY)
plot(mAddit, which = 1, caption = "Additive", pch = 21, col = "blue4", bg = "skyblue")
plot(mInter1, which = 1, caption = "One Interaction", pch = 21, col = "blue4", bg = "skyblue")
plot(mInter2, which = 1, caption = "Two Interactions", pch = 21, col = "blue4", bg = "skyblue")
plot(mInter, which = 1, caption = "All Interactions", pch = 21, col = "blue4", bg = "skyblue")

plot of chunk fig-MultipleRegr-01-02

par(mfrow = c(1, 1), mar = c(4, 4, 4, 1) + 0.1)

Function to create additional useful diagnostic plots

funDiag <- function(fit){
  FCOL <- rainbow_hcl(3)
  FCOL2 <- c("red3", "darkgreen", "darkblue")
  FPCH <- c(21, 23, 24)
  names(FCOL) <- names(FCOL2) <- names(FPCH) <- levels(CarsNow[, "fdrive"])

  resid <- residuals(fit)
  PAR <- par(mfrow = c(2, 3), bty = "n", mar = c(4, 4, 4, 1) + 0.1)
  on.exit(par(PAR))
  for (XV in c("engine.size", "lweight")){
    for (DRV in levels(CarsNow[, "fdrive"])){
      y <- resid[CarsNow[, "fdrive"] == DRV]
      x <- CarsNow[CarsNow[, "fdrive"] == DRV, XV]
      plot(y ~ x, xlab = XV, ylab = "Residuals", main = DRV, type = "n")
      abline(h = 0, col = "grey80")
      points(x, y, pch = FPCH[DRV], col = FCOL[DRV], bg = FCOL2[DRV], )
      lines(lowess(x, y), col = "orange", lwd = 2)
    }
  }

  return(invisible(fit))
}

A bit more detailed diagnostics

funDiag(mAddit)

plot of chunk fig-MultipleRegr-01-03

funDiag(mInter1)

plot of chunk fig-MultipleRegr-01-04

funDiag(mInter2)

plot of chunk fig-MultipleRegr-01-05

funDiag(mInter)

plot of chunk fig-MultipleRegr-01-06