### -------------------------------------------------------------------------###
### Winter Term 2019/2020 ###
### NMSA407 - Linear Regression ###
### ###
### EXERCISE #13 Confounders and effect modifiers ###
### Sandwich estimators, partial residuals ###
### ###
### -------------------------------------------------------------------------###
### Working directory
setwd("H:/nmsa407_LinRegr/") # To be changed to something existing on your PC.
rm(list=ls())
### Load needed packages
library("mffSM")
library("splines")
library("lmtest")
#####
data(Policie, package = "mffSM")
head(Policie)
dim(Policie)
summary(Policie)
# ?Policie
### Our main interest is how 'fat' [percentage of the applicant's body fat]
### is related to the 'height' of the applicant.
### Basic scatterplots to start
###
palette(c("darkblue", "red3", "olivedrab", rainbow_hcl(5)))
scatterplotMatrix(~fat + height + weight, smooth = FALSE, regLine = FALSE,
data = Policie, pch = 16)
# 3d visualisation
library(rgl)
with(Policie, plot3d(height,weight,fat, main="Policie data", size=5, col=2))
#### Three models
#### ----------------
m1 <- lm(fat ~ height, data = Policie)
summary(m1)
plot(fat ~ height, data = Policie, main=
"Percentage of fat vs. height of the applicant")
lines(lowess(Policie$fat ~ Policie$height), lty="dashed",lwd=2)
abline(a=m1$coef[1], b=m1$coef[2], col="blue",lwd=2)
legend("bottomright", lty=c("solid", "dashed"), col=c("blue", "black"),
legend=c("linear fit", "lowess"), lwd=2)
# Interpret the effect of height on the percentage of body fat.
# Is this effect in agreement with your experience?
par(mfrow = c(1,2));
plot(fat~weight, data=Policie, main="Fat vs. weight");
plot(height~weight, data=Policie, main="Height vs. weight");
par(mfrow = c(1,1));
# Explain the effect of 'height' on 'fat' in the following model [m2].
# Is 'weight' a confounder of this relationship?
m2 <- lm(fat ~ height + weight, data = Policie)
summary(m2)
# Is 'weight' an effect modifier of the effect of 'height' on 'fat'?
m3 <- lm(fat ~ height + weight + height:weight, data = Policie)
summary(m3)
anova(m1,m3)
anova(m2,m3)
summary(m4 <- lm(fat ~ weight, data = Policie))
anova(m4,m3)
#### Visualization (of models m1 and m2)
#### ------------------------------------
summary(Policie[, "weight"])
# For the following weights the relationship between the
# fat and height will be shown more explicitly
weight.show <- c(70, 80, 90) ### approx. quartiles + median
Policie <- transform(Policie, fweight = cut(weight, breaks = c(50, 75, 85, 120),
labels = c("<=75", "75-85", ">85")))
PALwe <- diverge_hcl(length(weight.show))
LINwe <- c("blue", "grey50", "red")
names(PALwe) <- names(LINwe) <- levels(Policie[, "fweight"])
PCH <- 23
BTY <- "o";
par(mfrow = c(1, 1), bty = BTY, mar = c(4, 4, 1, 1) + 0.1)
plot(fat ~ height, data = Policie, pch = PCH, col = LINwe, bg = PALwe[fweight],
xlab = "Height [cm]", ylab = "Body fat [%]")
for (i in 1:length(weight.show)){
# in the additive model
abline(a = coef(m2)["(Intercept)"] + coef(m2)["weight"] * weight.show[i],
b = coef(m2)["height"], col = LINwe[i], lwd = 2)
# in model with interactions change to the following
# abline(a = coef(m3)["(Intercept)"] + coef(m3)["weight"] * weight.show[i],
# b = coef(m3)["height"] + coef(m3)["height:weight"] * weight.show[i],
# col = LINwe[i], lwd = 2)
}
abline(m1, col = "orange", lwd = 2)
legend(185, 27, legend = levels(Policie[, "fweight"]), pch = PCH, col = LINwe,
pt.bg = PALwe, bty = "n")
legend(188, 27, legend = weight.show, col = LINwe, lty = 1, lwd = 2, bty = "n")
text(185, 27.5, labels = "Weight [kg]", pos = 4)
legend(188, 23.5, legend = "Marginal", col = "orange",lty = 1,lwd = 2,bty = "n")
## 3D visualisation in the additive model
with(Policie, plot3d(height,weight,fat, main="Additive model", size=5))
hgrid = seq(min(Policie$height), max(Policie$height), length=100)
wgrid = seq(min(Policie$weight), max(Policie$weight), length=100)
fgrid = outer(hgrid, wgrid, function(x,y) coef(m2)["(Intercept)"] +
x*coef(m2)["height"] + y*coef(m2)["weight"])
surface3d(hgrid,wgrid,fgrid, alpha = .4)
points3d(Policie$height, Policie$weight, fitted(m2),col=2, size=3)
segments3d(rep(Policie$height, each=2), rep(Policie$weight, each=2),
rbind(fitted(m2),Policie$fat),col=2,alpha=.4)
## 3D visualisation in the model without weight
with(Policie, plot3d(height,weight,fat, main="Model without weight", size=5))
hgrid = seq(min(Policie$height), max(Policie$height), length=100)
wgrid = seq(min(Policie$weight), max(Policie$weight), length=100)
fgrid = outer(hgrid, wgrid, function(x,y) coef(m1)["(Intercept)"] +
x*coef(m1)["height"])
surface3d(hgrid,wgrid,fgrid, alpha = .4)
points3d(Policie$height, Policie$weight, fitted(m1),col=2, size=3)
segments3d(rep(Policie$height, each=2), rep(Policie$weight, each=2),
rbind(fitted(m1),Policie$fat),col=2,alpha=.4)
# ---------------------------------------------------------------------
# ---------------------------------------------------------------------
# # Pribram dataset
# ---------------------------------------------------------------------
# The sampling area (about 5 km2) was located in the vicinity of
# a lead smelter (pec na olovo) in Pribram, Czech Republic. The smelter
# has been operating since 1786. The primary smelting of lead ores ceased
# in 1972 and was replaced by industry processing of secondary lead
# sources such as car batteries and other lead-bearing materials.
# Soil samples were taken from 121 sites selected in the studied area
# surrounding the smelter.
# The dataset contains measurements of the concentration of several metals:
# As, Pb, Zn (these measurements are already logarithmically transformed
# with the base 2, and then centered by the sample mean)
# Measurements of some parameters of organic activity (also already log2
# transformed but not centered):
# dehydro ... dehydrogenase activity
# respir ... respiratory activity
# biomass
# Measurements of some other characteristics of the soil
# cox ... Soil microbial carbon (log2 transformed and centered)
# pH ... pH-factor of the soil
# dist ... distance from the smelter
# The primary interest is in the effect of the metals on the organic activity.
# ----------------------------------------------------------------------------
DATA <- read.table("Data/pribram.csv", sep=";", header=TRUE);
head(DATA);
summary(DATA);
### Consider first the relationship of dehydrogenase activity (dehydro) and Zn
summary(m0 <- lm(dehydro ~ Zn, data=DATA))
plot(dehydro ~ Zn, data=DATA, xlab="Zn", ylab="dehydro", main="Dehydro vs. Zn");
lines(lowess(DATA$dehydro~DATA$Zn), lty="dashed", lwd=2)
abline(a=m0$coef[1], b=m0$coef[2], col="blue", lwd=2)
legend(1.5,7, lty=c("solid", "dashed"), col=c("blue", "black"),
legend=c("linear fit", "lowess"), lwd=2)
# The relationship of Zn and dehydro is not as expected...
# The problem might be in the fact that this is an OBSERVATIONAL STUDY
# and not a DESIGNED EXPERIMENT. In observational studies one has to
# be careful about possible confounding factors (confounders).
# In this study the problem was that due to the contamination the soil near
# the smelter was less used which resulted in higher values of the organic
# activity.
par(mfrow = c(1,2));
plot(DATA$dist, DATA$Zn, xlab="Distance from the smelter", ylab="Zn",
main="Zn vs. distance");
lines(lowess(DATA$Zn~DATA$dist), lty="dashed", lwd=2)
plot(DATA$dist, DATA$dehydro, xlab="Distance from the smelter",
ylab="dehydrogenase activity", main="Dehydro vs. distance");
lines(lowess(DATA$dehydro~DATA$dist), lty="dashed", lwd=2)
par(mfrow = c(1,1));
with(DATA, plot3d(Zn, dist, dehydro, main="Pribram", size=5))
# Thus we can try to include distance from the smelter as a covariate
summary(lm(dehydro ~ Zn + dist, data=DATA))
# Explain the effect of Zn on dehydro estimated by this model
# One can also try...
summary(lm(dehydro ~ Zn*dist, data=DATA))
## function used for 3D visualisation of models with at most two regressors
plotlm.3d = function(x,y,z,m){
plot3d(x,y,z,xlab=names(coef(m))[2], ylab=names(coef(m))[3],zlab = "Reponse",
size=5)
xgrid = seq(min(x), max(x), length=100)
ygrid = seq(min(y), max(y), length=100)
nc = length(coef(m))
if(nc==1) zgrid = outer(xgrid, ygrid, function(a,b) coef(m)[1])
if(nc==2) zgrid = outer(xgrid, ygrid, function(a,b) coef(m)[1] + a*coef(m)[2])
if(nc==3) zgrid = outer(xgrid, ygrid, function(a,b) coef(m)[1] + a*coef(m)[2]
+ b*coef(m)[3])
if(nc==4) zgrid = outer(xgrid, ygrid, function(a,b) coef(m)[1] + a*coef(m)[2]
+ b*coef(m)[3] + a*b*coef(m)[4])
surface3d(xgrid,ygrid,zgrid, alpha = .4)
points3d(x,y,fitted(m),col=2, size=3)
segments3d(rep(x, each=2), rep(y, each=2), rbind(fitted(m),z),col=2,alpha=.4)
}
m = lm(dehydro ~ Zn+dist, data=DATA)
with(DATA,plotlm.3d(Zn, dist, dehydro,m))
# The problem is that although the distance from the smelter seems to be an
# important variable, it is from the biological point of view not the most
# appropriate variable to control for.
# After discussion with the researchers we decided to control for cox
# (soil microbial carbon) as it is known that organic activity depends on the
# amount of the available carbon in the soil.
par(mfrow = c(1,2));
plot(DATA$cox, DATA$Zn, xlab="Cox", ylab="Zn", main="Zn vs. cox");
lines(lowess(DATA$Zn~DATA$cox), lty="dashed",lwd=2)
plot(DATA$cox, DATA$dehydro, xlab="Cox", ylab="dehydrogenase activity",
main="Dehydro vs. cox");
lines(lowess(DATA$dehydro~DATA$cox), lty="dashed",lwd=2)
par(mfrow = c(1,1));
#
summary(m1 <- lm(dehydro ~ Zn + cox, data=DATA))
# Interpret the effect of Zn on dehydro in this model.
# Compare this effect with the effect of Zn in a model m0
# (without cox).
summary(m0)
# Is cox a confounder for the relationship of dehydro vs Zn?
# One can be also interested if cox is an effect modifier of the relationship
# of dehydro vs Zn.
summary(m2 <- lm(dehydro ~ Zn*cox, data=DATA))
anova(m2)
### Plot of effect of the Zn on dehydro as estimated by model m2
plot(DATA$cox, coef(m2)['Zn'] + coef(m2)['Zn:cox']*DATA$cox, xlab="cox",
main="Effect of the Zn on dehydro [m2]", type="l");
abline(h=0, lty="dotted")
with(DATA,plotlm.3d(Zn,cox,dehydro,m2))
# Sometimes the effect modification disappears when we use a more flexible
# function of the variable that we need to control.
# To visualize if the effect of cox is only linear in model
# 'm1: dehydro~Zn + cox', one can use the partial residuals
head(residuals(m1, type="partial"))
# manual computation
par.res = residuals(m1)+
matrix(coef(m1)[2:3],nrow=nrow(DATA),ncol=2,byrow=TRUE)*model.matrix(m1)[,2:3]
head(par.res)
max(abs(par.res-residuals(m1,type="partial")))
# the partial residuals are already zero-mean centred (cox and Zn are centred)
colSums(par.res)
#
coef(lm(par.res[,2]~DATA$cox))
coef(m1)[3]
#
coef(lm(par.res[,1]~DATA$Zn))
coef(m1)[2]
parc.res.cox <- residuals(m1, type="partial")[,2];
plot(DATA$cox, parc.res.cox, xlab = "Cox",
ylab="The zero-mean partial residuals for 'cox'",
main="Partial residuals [cox] - model m1");
abline(a=0, b=m1$coef[3], col="blue", lwd=2);
lines(lowess(parc.res.cox~DATA$cox), lty="dashed", lwd=2)
# compare the plots of cox against residuals and partial residuals, respectively
par(mfrow=c(1,2))
plot(residuals(m1)~cox, data=DATA, main="Residuals ~ cox")
abline(a=0,b=0,lwd=2)
# (recall that b=0 in the plot above is apropriate because)
lm(residuals(m1)~cox,data=DATA)
#
plot(residuals(m1)+coef(m1)["cox"]*cox~cox, data=DATA,
main = "Partial residuals ~ cox")
abline(a=0,b=coef(m1)["cox"],lwd=2)
par(mfrow=c(1,1))
# the plot with partial residuals also from function termplot
termplot(m1, terms="cox", partial.resid=TRUE, smooth=panel.smooth, lwd.term=2,
col.res=1)
# compare with the predictions for different values of Zn
with(DATA,plot(dehydro~cox))
with(DATA,lines(lowess(dehydro~cox), lty=2, lwd=2))
Zn0 = 0
coxgrid = with(DATA,seq(min(cox),max(cox),length=101))
lines(coxgrid,predict(m1,newdata=data.frame(Zn=Zn0, cox=coxgrid)),lwd=2,col=2)
Zn0 = 1
coxgrid = with(DATA,seq(min(cox),max(cox),length=101))
lines(coxgrid,predict(m1,newdata=data.frame(Zn=Zn0, cox=coxgrid)),lwd=2,col=3)
legend("topleft",c("Zn=0","Zn=1"),col=2:3,lwd=2)
# compare with the next plot
plot(DATA$cox,residuals(m0))
abline(lm(residuals(m0)~cox,data=DATA),col=2,lwd=2)
### for the Policie data
###
mP0 = lm(fat~weight, data=Policie)
mP1 = lm(fat~height, data=Policie)
mP2 = lm(fat~height+weight, data=Policie)
plot(fat ~ height, data = Policie,
main="Percentage of fat vs. height of the applicant")
lines(lowess(Policie$fat ~ Policie$height), lty="dashed",lwd=2)
w0 = 80
hgrid = with(Policie,seq(min(height),max(height),length=101))
lines(hgrid,predict(mP2,newdata=data.frame(weight=w0,height=hgrid)),lwd=2,col=2)
abline(mP1, col=3,lwd=2)
legend(180,6, col=c(2,3), legend=c("height + weight (weight = 80)", "height"),
lwd=2)
termplot(mP2, terms="height", partial.resid=TRUE, smooth=panel.smooth,
lwd.term=2, col.res=1)
plot(Policie$height,residuals(mP0))
abline(lm(residuals(mP0)~height,data=Policie),col=2,lwd=2)
# Let us consider the following relatively simple spline parametrization of cox
DATA$cox.sp <- bs(DATA$cox, knots = 0, degree = 2);
qqq <- order(DATA$cox);
plot(DATA$cox[qqq], DATA$cox.sp[qqq,1], type="l", ylim=range(DATA$cox.sp));
lines(DATA$cox[qqq], DATA$cox.sp[qqq,2], type="l", col="blue");
lines(DATA$cox[qqq], DATA$cox.sp[qqq,3], type="l", col="red");
#
summary(m3 <- lm(dehydro ~ Zn+cox.sp, data=DATA))
anova(m1, m3)
# What is the effect of Zn on dehydro as estimated by m3?
# Do we still need to have an interaction?
anova(lm(dehydro ~ Zn * cox.sp, data=DATA))
# Let us take m3 as the final model
mFin <- m3;
# Basic diagnostic plot
plotLM(mFin)
# Test of the normality of the errors
shapiro.test(mFin$residuals)
# Test of homoscedasticity - Koenker's studentized version
bptest(mFin, ~fitted(mFin))
bptest(mFin)
### Durbin-Watson test of correlation of errors
# p-value based on some asymptotic approximations
dwtest(mFin)
# p-value with the help of bootstrap
library(car)
durbinWatsonTest(mFin)
# The 'manual' calculation of the test statistic of Durbin-Watson test
U <- residuals(mFin);
n <- length(U)
(DW <- sum((U[-1] - U[-n])^2)/sum(U^2));
plot(U[-n], U[-1], xlab=expression(U[i]), ylab=expression(U[i+1]),
main=paste("Autocorrelation with lag = 1 :", round(cor(U[-n], U[-1]), 2)));
abline(reg=lm(U[-1]~U[-n]), lwd=2)
# Do you have any idea why does the DW-test give a significant result?
plot(DATA$dist, main="Distance vs the index of the observation")
plot(DATA$Zn, main="Zn vs the index of the observation")
plot(residuals(mFin), type="l",main="Residuals vs the index of the observation")
abline(0,0,lty=3,lwd=2)
# The results about homoscedasticity are borderline and the sample size
# is moderate. Sandwich estimator might be of interest
library("sandwich")
# Estimate of the variance matrix of the regression coefficients
(VHC <- vcovHC(mFin, type = "HC3"));
# type = "HC3" It uses residuals(mFin)^2/(1 - hatvalues(mFin))^2
# on the diagonal of the "meat" matrix of the sandwich.
## Manual calculation of the sandwich estimator - start
X <- model.matrix(mFin);
Bread <- solve(t(X) %*% X) %*% t(X)
Meat <- diag(residuals(mFin)^2 / (1 - hatvalues(mFin))^2)
Bread %*% Meat %*% t(Bread)
round(Bread %*% Meat %*% t(Bread) - VHC, 10)
## Manual calculation of the sandwich estimator - end
# Summary table using the sandwich estimator of variance
coeftest(mFin, vcov = VHC)
# manual computation
(sighat<-sqrt(diag(VHC)))
(Tstat<-coef(mFin)/sighat)
(2*pt(abs(Tstat), df=mFin$df.res, lower=FALSE))
# Summary table using the standard estimator of variance under homoscedasticity
summary(mFin)
# # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # #
# # # # Soil data # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # #
# # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # #
load("Data/SoilData.RData")
# The data comes from an experiment focused on the evaluation
# of the phosporus concentration in some specific types of soil after
# adding one of two available ash-based fertilizers.
head(soilData);
summary(soilData);
# # Variables of interest for this exercise class
# P ... phosphorus concentration in a soil sample measured in ug per kg
# after adding an ash-based fertilizer;
# SoilP ... initial phosphorus concentration in a soil sample [ug per kg];
# SoilpH ... pH acidity of the soil sample;
# SoilType ... factor variable with four levels for four different soil types;
DATA <- soilData;
# Considering Soiltype as a factor
DATA$fSoilType <- factor(DATA$SoilType);
# Response: Increase in the concentration of phosphorus
DATA$Y <- log(DATA$P/DATA$SoilP)
# Suppose we are interested in the relation of the response on pH of the soil
summary(m0 <- lm(Y ~ SoilpH, data=DATA))
plot(Y ~ SoilpH, data=DATA, xlab="SoilpH", ylab="Y",
main="log(P/SoilP) vs. SoilpH");
lines(lowess(DATA$Y~DATA$SoilpH), lty="dashed", lwd=2)
abline(reg=m0, col="blue", lwd=2)
# or equivalently: abline(a=m0$coef[1], b=m0$coef[2], col="blue")
legend(4.75,-4, lty=c("solid", "dashed"), col=c("blue", "black"),
legend=c("linear fit", "lowess"), lwd=2)
# Taking fSoilType into consideration
summary(m1 <- lm(Y ~ SoilpH+fSoilType, data=DATA))
# Is type of the soil (SoilType) an effect modifier of the relationship
# of the response and pH?
anova(m2 <- lm(Y ~ SoilpH*fSoilType, data=DATA))
# Can SoilpH be removed from the model altogether?
Anova(m2)
# How is such a result possible?
plot(Y ~ SoilpH, data=DATA, xlab="SoilpH", ylab="Y",
main="log(P/SoilP) vs. SoilpH", pch=SoilType, col=SoilType, lwd=2);
i <- 0;
for(f in levels(DATA$fSoilType)){
i <- i+1;
qqq <- as.logical(DATA$SoilType == f);
lines(lowess(DATA$Y[qqq]~DATA$SoilpH[qqq]), lty="dashed", col=i, lwd=2)
b <- coef(lm(fitted(m2)[qqq]~DATA$SoilpH[qqq]))[2]; # Sign of the coefficient
if(b > 0){
yrange <- range(fitted(m2)[qqq])
}
else{
yrange <- (range(fitted(m2)[qqq]))[c(2,1)]
}
lines(range(DATA$SoilpH[qqq]), yrange, col=i, lwd=2)
}
legend(x=5, y=-4, pch=1:4, col=1:4, legend=paste("Soil type",
levels(DATA$fSoilType)), lwd=2)
legend(x=5, y=-4.72, lty=c("solid", "dashed"),
legend=c("linear fit [m2]", "lowess"), lwd=2)
# Do you have any idea why does the DW test give a significant result?
dwtest(m2); # using asymptotic approximation
durbinWatsonTest(m2); # using bootstrap
plot(residuals(m2),main="Residulas vs index: observed data")
## what if the rows of the data were randomly shuffled?
DATA2 = DATA[sample.int(nrow(DATA)),]
m2shuf = lm(Y ~ SoilpH*fSoilType, data = DATA2)
dwtest(m2shuf)
plot(residuals(m2shuf),main="Residulas vs index: random shuffle")
## in the other extreme case, what if the rows are ordered according to
# the values of residuals?
DATA3 = DATA[order(residuals(m2)),]
m2ord = lm(Y ~ SoilpH*fSoilType, data = DATA3)
dwtest(m2ord)
plot(residuals(m2ord),main="Residulas vs index: ordered residuals")
# Observed values of Y as a function of the index of the observations
plot(DATA$Y, col=DATA$SoilType, pch=DATA$SoilType, lwd=2, main="Y vs index")
legend("bottomright",paste("soil type ", 1:4, sep=""), col=1:4, pch=1:4, lty=0,
lwd=2)
plotLM(m2)
# heteroscedasticity issues?
bptest(m2, ~fitted(m2))
bptest(m2)
# residuals appear to have different variances
boxplot(residuals(m2)~DATA$SoilType)