### --------------------------------------------------------------------------------------###
### Winter Term 2017/2018 ###
### NMSA407 - Linear Regression ###
### ###
### EXERCISE #13 Confounders and effect modifiers ###
### Sandwich estimators, partial residuals ###
### ###
### --------------------------------------------------------------------------------------###
### Working directory
setwd("H:/nmsa407_LinRegr/") # To be changed to something which exists on your PC.
rm(list=ls())
### Load needed packages
library("mffSM")
library("splines")
library("colorspace")
library("lmtest")
# # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # #
data(Policie, package = "mffSM")
head(Policie)
dim(Policie)
summary(Policie)
# ?Policie
### Our basic interest is how 'fat' [percentage of the applicant's body fat]
### is related to the 'height' of the applicant.
### Basic scatterplots to start
###
palette(c("darkblue", "red3", "olivedrab", rainbow_hcl(5)))
scatterplotMatrix(~fat + height + weight, reg.line = lm, smooth = FALSE, spread = TRUE, diagonal = "histogram", data = Policie, pch = 16)
#### Three models
#### ----------------
m1 <- lm(fat ~ height, data = Policie)
summary(m1)
plot(fat ~ height, data = Policie, main="Percentage of fat vs. height of the applicant")
lines(lowess(Policie$fat ~ Policie$height), lty="dashed",lwd=2)
abline(a=m1$coef[1], b=m1$coef[2], col="blue",lwd=2)
legend(188,6, lty=c("solid", "dashed"), col=c("blue", "black"), legend=c("linear fit", "lowess"), lwd=2)
# Interpret the effect of height on the percentage of body fat.
# Is this effect in agreement with your experience?
par(mfrow = c(1,2));
plot(fat~weight, data=Policie, main="Fat vs. weight");
plot(height~weight, data=Policie, main="Height vs. weight");
par(mfrow = c(1,1));
# Explain the effect of 'height' on 'fat' in the following model [m2].
# Is 'weight' a confounder of this relationship?
m2 <- lm(fat ~ height + weight, data = Policie)
summary(m2)
# Is 'weight' an effect modifier of the effect of 'height' on 'fat'?
m3 <- lm(fat ~ height + weight + height:weight, data = Policie)
summary(m3)
anova(m1,m3)
anova(m2,m3)
summary(m4 <- lm(fat ~ weight, data = Policie))
anova(m4,m3)
#### Visualization (of models m1 and m2)
#### ------------------------------------
summary(Policie[, "weight"])
# For the following weights the relationship between the
# fat and height will be shown more explicitly
weight.show <- c(70, 80, 90) ### approx. quartiles + median
Policie <- transform(Policie, fweight = cut(weight, breaks = c(50, 75, 85, 120), labels = c("<=75", "75-85", ">85")))
PALwe <- diverge_hcl(length(weight.show))
LINwe <- c("blue", "grey50", "red")
names(PALwe) <- names(LINwe) <- levels(Policie[, "fweight"])
PCH <- 23
# PCH <- c(1,2,3)
BTY <- "o";
par(mfrow = c(1, 1), bty = BTY, mar = c(4, 4, 1, 1) + 0.1)
plot(fat ~ height, data = Policie, pch = PCH, col = LINwe, bg = PALwe[fweight], xlab = "Height [cm]", ylab = "Body fat [%]")
for (i in 1:length(weight.show)){
# in the additive model
abline(a = coef(m2)["(Intercept)"] + coef(m2)["weight"] * weight.show[i], b = coef(m2)["height"], col = LINwe[i], lwd = 2)
# in model with interactions
# abline(a = coef(m3)["(Intercept)"] + coef(m3)["weight"] * weight.show[i], b = coef(m3)["height"] + coef(m3)["height:weight"] * weight.show[i], col = LINwe[i], lwd = 2)
}
abline(m1, col = "orange", lwd = 2)
legend(185, 27, legend = levels(Policie[, "fweight"]), pch = PCH, col = LINwe, pt.bg = PALwe, bty = "n")
legend(188, 27, legend = weight.show, col = LINwe, lty = 1, lwd = 2, bty = "n")
text(185, 27.5, labels = "Weight [kg]", pos = 4)
legend(188, 23.5, legend = "Marginal", col = "orange", lty = 1, lwd = 2, bty = "n")
# ---------------------------------------------------------------------
# ---------------------------------------------------------------------
# # Pribram dataset
# ---------------------------------------------------------------------
# The sampling area (about 5 km2) was located in the vicinity of
# a lead smelter (pec na olovo) in Pribram, Czech Republic. The smelter
# has been operating since 1786. The primary smelting of lead ores ceased
# in 1972 and was replaced by industry processing of secondary lead
# sources such as car batteries and other lead-bearing materials.
# Soil samples were taken from 121 sites selected in the studied area
# surrounding the smelter.
# The dataset contains measurements of the concentration of several metals:
# As, Pb, Zn (these measurements are already logarithmically transformed
# with the base 2, and then centered by the sample mean)
# Measurements of some parameters of organic activity (also already log2 transformed
# but not centered):
# dehydro ... dehydrogenase activity
# respir ... respiratory activity
# biomass
# # Measurements of some other characteristics of the soil
# cox ... Soil microbial carbon (log2 transformed and centered)
# pH ... pH-factor of the soil
# dist ... distance from the smelter
# The primary interest is in the effect of the metals on the organic activity.
# ----------------------------------------------------------------------------
DATA <- read.table("Data/pribram.csv", sep=";", header=TRUE);
head(DATA);
summary(DATA);
### Consider first the relationship of dehydrogenase activity (dehydro) and Zn
summary(m0 <- lm(dehydro ~ Zn, data=DATA))
plot(dehydro ~ Zn, data=DATA, xlab="Zn", ylab="dehydro", main="Dehydro vs. Zn");
lines(lowess(DATA$dehydro~DATA$Zn), lty="dashed", lwd=2)
abline(a=m0$coef[1], b=m0$coef[2], col="blue", lwd=2)
legend(1.5,7, lty=c("solid", "dashed"), col=c("blue", "black"), legend=c("linear fit", "lowess"), lwd=2)
# The relationship of Zn and dehydro is not as expected...
# The problem might be in the fact that this is an OBSERVATIONAL STUDY
# and not a DESIGNED EXPERIMENT. In observational studies one has to
# be careful about possible confounding factors (confounders).
# In this study the problem was that due to the contamination the soil near
# the smelter was less used which resulted in higher values of the organic
# activity.
par(mfrow = c(1,2));
plot(DATA$dist, DATA$Zn, xlab="Distance from the smelter", ylab="Zn", main="Zn vs. distance");
lines(lowess(DATA$Zn~DATA$dist), lty="dashed", lwd=2)
plot(DATA$dist, DATA$dehydro, xlab="Distance from the smelter", ylab="dehydrogenase activity",
main="Dehydro vs. distance");
lines(lowess(DATA$dehydro~DATA$dist), lty="dashed", lwd=2)
par(mfrow = c(1,1));
# Thus we can try to include distance from the smelter as a covariate
summary(lm(dehydro ~ Zn + dist, data=DATA))
# Explain the effect of Zn on dehydro estimated by this model
# One can also try...
summary(lm(dehydro ~ Zn*dist, data=DATA))
# The problem is that although the distance from the smelter seems to be an important
# variable, it is from the biological point of view not the most appropriate variable
# to control for.
# After discussion with the researchers we decided to control for
# cox (soil microbial carbon) as it is known that organic activity depends on the
# amount of the available carbon in the soil.
par(mfrow = c(1,2));
plot(DATA$cox, DATA$Zn, xlab="Cox", ylab="Zn", main="Zn vs. cox");
lines(lowess(DATA$Zn~DATA$cox), lty="dashed",lwd=2)
plot(DATA$cox, DATA$dehydro, xlab="Cox", ylab="dehydrogenase activity",
main="Dehydro vs. cox");
lines(lowess(DATA$dehydro~DATA$cox), lty="dashed",lwd=2)
par(mfrow = c(1,1));
#
summary(m1 <- lm(dehydro ~ Zn + cox, data=DATA))
# Interpret the effect of Zn on dehydro in this model.
# Compare this effect with the effect of Zn in a model m0
# (without cox).
summary(m0)
# Is cox a confounder for the relationship of dehydro vs Zn?
# One can be also interested if cox is even an effect modifier of the relationship
# of dehydro vs Zn.
summary(m2 <- lm(dehydro ~ Zn*cox, data=DATA))
anova(m2)
Anova(m2)
# # Plot of effect of the Zn on dehydro as estimated by model m2
plot(DATA$cox, coef(m2)['Zn'] + coef(m2)['Zn:cox']*DATA$cox, xlab="cox", main="Effect of the Zn on dehydro [m2]", type="l");
abline(h=0, lty="dotted")
# Sometimes the effect modification disappears when we use a more flexible function of
# the variable that we need to control.
# To visualize if the effect of cox is only linear in model 'm1: dehydro~Zn + cox',
# one can use the partial residuals
head(residuals(m1, type="partial"))
parc.res.cox <- residuals(m1, type="partial")[,2];
plot(DATA$cox, parc.res.cox, xlab = "Cox", ylab="The zero-mean partial residuals for 'cox'", main="Partial residuals [cox] - model m1");
abline(a=0, b=m1$coef[3], col="blue", lwd=2);
lines(lowess(parc.res.cox~DATA$cox), lty="dashed", lwd=2)
# Let us consider the following relatively simple spline parametrization of cox
DATA$cox.sp <- bs(DATA$cox, knots = 0, degree = 2);
qqq <- order(DATA$cox);
plot(DATA$cox[qqq], DATA$cox.sp[qqq,1], type="l", ylim=range(DATA$cox.sp));
lines(DATA$cox[qqq], DATA$cox.sp[qqq,2], type="l", col="blue");
lines(DATA$cox[qqq], DATA$cox.sp[qqq,3], type="l", col="red");
#
summary(m3 <- lm(dehydro ~ Zn+cox.sp, data=DATA))
anova(m1, m3)
# What is the effect of Zn on dehydro as estimated by m3?
# Do we still need to have an interaction?
anova(lm(dehydro ~ Zn * cox.sp, data=DATA))
# Let us take m3 as the final model
mFin <- m3;
# Basic diagnostic plot
plotLM(mFin)
# Test of the normality of the errors
shapiro.test(mFin$residuals)
# Test of homoscedasticity - Koenker's studentized version
bptest(mFin, ~fitted(mFin))
bptest(mFin)
### Durbin-Watson test of correlation of errors
# p-value based on some asymptotic approximations
dwtest(mFin)
# p-value with the help of bootstrap
library(car)
durbinWatsonTest(mFin)
# The 'manual' calculation of the test statistic of Durbin-Watson test
U <- residuals(mFin);
n <- length(U)
(DW <- sum((U[-1] - U[-n])^2)/sum(U^2));
plot(U[-n], U[-1], xlab=expression(U[i]), ylab=expression(U[i+1]), main=paste("Autocorrelation with lag = 1 :", round(cor(U[-n], U[-1]), 2)));
abline(reg=lm(U[-1]~U[-n]))
# Do you have any idea why does the DW-test give a significant result?
plot(DATA$dist)
# The results about homoscedasticity are borderline and the sample size
# is moderate. Sandwich estimator might be of interest
library("sandwich")
# Estimate of the variance matrix of the regression coefficients
(VHC <- vcovHC(mFin, type = "HC3"));
# type = "HC3" It uses residuals(mFin)^2/(1 - hatvalues(mFin))^2
# on the diagonal of the "meat" matrix of the sandwich.
## Manual calculation of the sandwich estimator - start
X <- model.matrix(mFin);
Bread <- solve(t(X) %*% X) %*% t(X)
Meat <- diag(residuals(mFin)^2 / (1 - hatvalues(mFin))^2)
Bread %*% Meat %*% t(Bread)
round(Bread %*% Meat %*% t(Bread) - VHC, 10)
## Manual calculation of the sandwich estimator - end
# Summary table using the sandwich estimator of variance
coeftest(mFin, vcov = VHC)
# Summary table using the standard estimator of variance that assumes homoscedasticity
summary(mFin)
# # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # #
# # # # Soil data # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # #
# # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # #
load("Data/SoilData.RData")
# The data comes from an experiment focused on the evaluation
# of the phosporus concentration in some specific types of soil after
# adding one of two available ash-based fertilizers.
head(soilData);
summary(soilData);
# # Variables of interest for this exercise class
# P ... phosphorus concentration in a soil sample measured in ug per kg after adding an ash-based fertilizer;
# SoilP ... initial phosphorus concentration in a soil sample measured in ug per kg;
# SoilpH ... pH acidity of the soil sample;
# SoilType ... factor variable with four levels for four different soil types;
DATA <- soilData;
# Considering Soiltype as a factor
DATA$fSoilType <- factor(DATA$SoilType);
# Response - Concentration of phosphorus
DATA$Y <- log(DATA$P/DATA$SoilP)
# Suppose we are interested in the relationship of the response and pH of the soil
summary(m0 <- lm(Y ~ SoilpH, data=DATA))
plot(Y ~ SoilpH, data=DATA, xlab="SoilpH", ylab="Y", main="log(P/SoilP) vs. SoilpH");
lines(lowess(DATA$Y~DATA$SoilpH), lty="dashed", lwd=2)
abline(reg=m0, col="blue", lwd=2)
# or equivalently: abline(a=m0$coef[1], b=m0$coef[2], col="blue")
legend(4.75,-4, lty=c("solid", "dashed"), col=c("blue", "black"), legend=c("linear fit", "lowess"), lwd=2)
# Taking fSoilType into consideration
summary(m1 <- lm(Y ~ SoilpH+fSoilType, data=DATA))
# Is type of the soil (SoilType) an effect modifier of the relationship of the response and pH?
anova(m2 <- lm(Y ~ SoilpH*fSoilType, data=DATA))
plot(Y ~ SoilpH, data=DATA, xlab="SoilpH", ylab="Y", main="log(P/SoilP) vs. SoilpH", pch=SoilType, col=SoilType, lwd=2);
i <- 0;
for(f in levels(DATA$fSoilType)){
i <- i+1;
qqq <- as.logical(DATA$SoilType == f);
lines(lowess(DATA$Y[qqq]~DATA$SoilpH[qqq]), lty="dashed", col=i, lwd=2)
b <- coef(lm(fitted(m2)[qqq]~DATA$SoilpH[qqq]))[2]; # Sign of the coefficient
if(b > 0){
yrange <- range(fitted(m2)[qqq])
}
else{
yrange <- (range(fitted(m2)[qqq]))[c(2,1)]
}
lines(range(DATA$SoilpH[qqq]), yrange, col=i, lwd=2)
}
legend(x=5, y=-4, pch=1:4, col=1:4, legend=paste("Soil type", levels(DATA$fSoilType)), lwd=2)
legend(x=5, y=-4.72, lty=c("solid", "dashed"), legend=c("linear fit [m2]", "lowess"), lwd=2)
# Do you have any idea why does the DW test give a significant result?
dwtest(m2); # using asymptotic approximation
durbinWatsonTest(m2); # using bootstrap
#
plot(DATA$Y, col=DATA$SoilType, pch=DATA$SoilType, lwd=2)
legend("bottomright",paste("soil type ", 1:4, sep=""), col=1:4, pch=1:4, lty=0, lwd=2)
plotLM(m2)
bptest(m2, ~fitted(m2))
bptest(m2)
boxplot(residuals(m2)~DATA$SoilType)