NMSA407 Linear Regression: Tutorial

Multicollinearity

Data IQ




Introduction

Load used data and calculate basic summaries

data(IQ, package = "mffSM")
head(IQ)
##   gender fgender  iq  zn7  zn8
## 1      0    girl  94 1.84 1.81
## 2      0    girl 134 1.00 1.00
## 3      0    girl 108 1.69 1.90
## 4      1     boy  96 2.60 2.90
## 5      0    girl 101 1.92 1.72
## 6      1     boy  97 2.00 2.25
dim(IQ)
## [1] 111   5
summary(IQ)
##      gender       fgender         iq             zn7             zn8       
##  Min.   :0.0000   girl:54   Min.   : 72.0   Min.   :1.000   Min.   :1.000  
##  1st Qu.:0.0000   boy :57   1st Qu.:101.0   1st Qu.:1.300   1st Qu.:1.270  
##  Median :1.0000             Median :108.0   Median :1.620   Median :1.720  
##  Mean   :0.5135             Mean   :109.2   Mean   :1.746   Mean   :1.782  
##  3rd Qu.:1.0000             3rd Qu.:118.0   3rd Qu.:2.070   3rd Qu.:2.090  
##  Max.   :1.0000             Max.   :141.0   Max.   :3.300   Max.   :3.360

Summary by gender

Boys

summary(subset(IQ, fgender == "boy", select = c("iq", "zn7", "zn8")))
##        iq             zn7             zn8       
##  Min.   : 72.0   Min.   :1.000   Min.   :1.000  
##  1st Qu.: 97.0   1st Qu.:1.380   1st Qu.:1.270  
##  Median :106.0   Median :1.840   Median :1.900  
##  Mean   :107.5   Mean   :1.968   Mean   :2.012  
##  3rd Qu.:117.0   3rd Qu.:2.610   3rd Qu.:2.550  
##  Max.   :141.0   Max.   :3.300   Max.   :3.360

Girls

summary(subset(IQ, fgender == "girl", select = c("iq", "zn7", "zn8")))
##        iq             zn7             zn8       
##  Min.   : 80.0   Min.   :1.000   Min.   :1.000  
##  1st Qu.:102.2   1st Qu.:1.208   1st Qu.:1.270  
##  Median :111.0   Median :1.460   Median :1.540  
##  Mean   :111.1   Mean   :1.511   Mean   :1.538  
##  3rd Qu.:118.5   3rd Qu.:1.830   3rd Qu.:1.810  
##  Max.   :141.0   Max.   :2.230   Max.   :2.450




Dependence of IQ on grades from 7th and 8th year and on gender

Scatterplots (and histograms) of all numeric variables and gender represented by zeros and ones

library("psych")
pairs.panels(subset(IQ, select = c("iq", "gender", "zn7", "zn8")), bg = "palegoldenrod", col = "red3", pch = 21,
             ellipses = FALSE, smooth = FALSE, lm = TRUE, hist.col = "lightblue", rug = FALSE)

plot of chunk fig-ProblRegrSpace-01-01

Correlation coefficients (again)

round(cor(IQ[, c("iq", "gender", "zn7", "zn8")]), 2)
##           iq gender   zn7   zn8
## iq      1.00  -0.12 -0.69 -0.66
## gender -0.12   1.00  0.37  0.38
## zn7    -0.69   0.37  1.00  0.95
## zn8    -0.66   0.38  0.95  1.00

One more useful plot

COLS <- c("red3", "darkblue")
BGCOLS <- rainbow_hcl(2, start = 30, end = 230)
PCH <- c(25, 24)
names(COLS) <- names(BGCOLS) <- names(PCH) <- c("girl", "boy")
layout(matrix(c(0,1,1,0, 2,2,3,3), nrow = 2, byrow = TRUE))
par(mar = c(5, 4, 1, 1) + 0.1)
plot(iq ~ fgender, col = BGCOLS, data=IQ)
plot(iq ~ zn7, col = COLS[fgender], bg = BGCOLS[fgender], pch = PCH[fgender], data = IQ)
plot(iq ~ zn8, col = COLS[fgender], bg = BGCOLS[fgender], pch = PCH[fgender], data = IQ)
legend(2.5, 140, legend = c("Girl", "Boy"), col = COLS, pt.bg = BGCOLS, pch = PCH)

plot of chunk fig-ProblRegrSpace-01-02

#par(mfrow = c(1, 1))




Linear model with all regressors included additively

LSE fit

(sm1 <- summary(m1 <- lm(iq ~ gender + zn7 + zn8, data = IQ)))
## 
## Call:
## lm(formula = iq ~ gender + zn7 + zn8, data = IQ)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -22.1677  -7.5243  -0.4338   7.1780  26.4095 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  138.222      3.119  44.314  < 2e-16 ***
## gender         4.563      2.221   2.055  0.04232 *  
## zn7          -16.767      5.536  -3.029  0.00308 ** 
## zn8           -1.149      5.557  -0.207  0.83658    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 10.81 on 107 degrees of freedom
## Multiple R-squared:  0.4943, Adjusted R-squared:  0.4801 
## F-statistic: 34.87 on 3 and 107 DF,  p-value: 8.472e-16

Standardized variables (regressors of unity standard deviation)

(sm1st <- summary(m1st <- lm(scale(iq) ~ scale(gender) + scale(zn7) + scale(zn8), data = IQ)))
## 
## Call:
## lm(formula = scale(iq) ~ scale(gender) + scale(zn7) + scale(zn8), 
##     data = IQ)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.47790 -0.50164 -0.02892  0.47855  1.76069 
## 
## Coefficients:
##                 Estimate Std. Error t value Pr(>|t|)   
## (Intercept)    1.169e-16  6.844e-02   0.000  1.00000   
## scale(gender)  1.528e-01  7.434e-02   2.055  0.04232 * 
## scale(zn7)    -6.989e-01  2.308e-01  -3.029  0.00308 **
## scale(zn8)    -4.800e-02  2.321e-01  -0.207  0.83658   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.721 on 107 degrees of freedom
## Multiple R-squared:  0.4943, Adjusted R-squared:  0.4801 
## F-statistic: 34.87 on 3 and 107 DF,  p-value: 8.472e-16

Standardized variables (regressors of unity norm)

n <- nrow(IQ)
IQ <- transform(IQ, siq     = scale(iq) / sqrt(n - 1),
                    sgender = scale(gender) / sqrt(n - 1),
                    szn7    = scale(zn7) / sqrt(n - 1),
                    szn8    = scale(zn8) / sqrt(n - 1))
(sm1st2 <- summary(m1st2 <- lm(siq ~ sgender + szn7 + szn8, data = IQ)))
## 
## Call:
## lm(formula = siq ~ sgender + szn7 + szn8, data = IQ)
## 
## Residuals:
##       Min        1Q    Median        3Q       Max 
## -0.140912 -0.047829 -0.002757  0.045628  0.167875 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)   
## (Intercept)  9.731e-18  6.525e-03   0.000  1.00000   
## sgender      1.528e-01  7.434e-02   2.055  0.04232 * 
## szn7        -6.989e-01  2.308e-01  -3.029  0.00308 **
## szn8        -4.800e-02  2.321e-01  -0.207  0.83658   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.06875 on 107 degrees of freedom
## Multiple R-squared:  0.4943, Adjusted R-squared:  0.4801 
## F-statistic: 34.87 on 3 and 107 DF,  p-value: 8.472e-16

Variance inflation factors

detach("package:psych")
library("car")
vif(m1)
##   gender      zn7      zn8 
##  1.16923 11.26866 11.40240
vif(lm(iq ~ fgender + zn7 + zn8, data = IQ))
##  fgender      zn7      zn8 
##  1.16923 11.26866 11.40240




Linear models where gender and only either grade from the 7th year of from the 8th year included

Gender and grade from the 7th year included

(sm27 <- summary(m27 <- lm(iq ~ gender + zn7, data = IQ)))
## 
## Call:
## lm(formula = iq ~ gender + zn7, data = IQ)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -21.9606  -7.4290  -0.1927   7.0047  26.5244 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  138.093      3.043  45.376   <2e-16 ***
## gender         4.513      2.198   2.054   0.0424 *  
## zn7          -17.852      1.765 -10.116   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 10.77 on 108 degrees of freedom
## Multiple R-squared:  0.4941, Adjusted R-squared:  0.4848 
## F-statistic: 52.74 on 2 and 108 DF,  p-value: < 2.2e-16
vif(m27)
##  gender     zn7 
## 1.15531 1.15531

Gender and grade from the 8th year included

(sm28 <- summary(m28 <- lm(iq ~ gender + zn8, data = IQ)))
## 
## Call:
## lm(formula = iq ~ gender + zn8, data = IQ)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -25.5378  -7.9585  -0.0763   7.1273  31.0778 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  137.402      3.223  42.634  < 2e-16 ***
## gender         4.474      2.303   1.943   0.0547 .  
## zn8          -17.095      1.846  -9.263 2.21e-15 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 11.22 on 108 degrees of freedom
## Multiple R-squared:  0.451,  Adjusted R-squared:  0.4408 
## F-statistic: 44.36 on 2 and 108 DF,  p-value: 8.673e-15
vif(m28)
##   gender      zn8 
## 1.169022 1.169022

Fitted regression functions

pzn <- c(1, 3.4)
pdata <- data.frame(zn7 = rep(pzn, 2), zn8 = rep(pzn, 2), gender = rep(c(0, 1), each = 2))

pm27 <- predict(m27, newdata = pdata)
pm28 <- predict(m28, newdata = pdata)

Plot with fitted regression functions

par(mfrow = c(1, 2), mar = c(5, 4, 1, 1) + 0.1)
plot(iq ~ zn7, col = COLS[fgender], bg = BGCOLS[fgender], pch = PCH[fgender], data = IQ)
lines(pzn, pm27[1:2], col = COLS["girl"], lwd = 2)
lines(pzn, pm27[3:4], col = COLS["boy"], lwd = 2)
legend(2.5, 140, legend = c("Girl", "Boy"), col = COLS, pt.bg = BGCOLS, pch = PCH, lty = 1, lwd = 2)
plot(iq ~ zn8, col = COLS[fgender], bg = BGCOLS[fgender], pch = PCH[fgender], data = IQ)
lines(pzn, pm28[1:2], col = COLS["girl"], lwd = 2)
lines(pzn, pm28[3:4], col = COLS["boy"], lwd = 2)
legend(2.5, 140, legend = c("Girl", "Boy"), col = COLS, pt.bg = BGCOLS, pch = PCH, lty = 1, lwd = 2)

plot of chunk fig-ProblRegrSpace-01-03

par(mfrow = c(1, 1))