### -------------------------------------------------------------------------### ### NMSA407 - Linear Regression ### ### Winter Term 2020/2021 | Online ### ### ### ### EXERCISE #12 Multiple Comparisons and Sandwich Estimator ### ### -------------------------------------------------------------------------### ### -------------------------------------------------------------------------### ### Block 1: TEACHING MATERIAL ### ### -------------------------------------------------------------------------### library("mffSM") library("lmtest") library("multcomp") ### working data file load(url("http://msekce.karlin.mff.cuni.cz/~maciak/NMSA407/mana.RData")) ### B R I E F D A T A D E S C R I P T I O N ### Data comes from the project MANA (Maturita na necisto) in launched in ### the Czech Republic in 2005. This project was a part of a bigger project ### whose goal was to prepare a new form of graduation exam ("statni maturita"). ### This dataset contains the results in English language at grammar schools. ### ### VARIABLES ### region: Region of the Czech Republic ### 1 = Praha ### 2 = Stredocesky ### 3 = Jihocesky ### 4 = Plzensky ### 5 = Karlovarsky ### 6 = Ustecky ### 7 = Liberecky ### 8 = Kralovehradecky ### 9 = Pardubicky ### 10 = Vysocina ### 11 = Jihomoravsky ### 12 = Olomoucky ### 13 = Zlinsky ### 14 = Moravskoslezsky ### ### specialization: The specialization of the intended university ### 1 = education ### 2 = social science ### 3 = languages ### 4 = law ### 5 = math-physics ### 6 = engineering ### 7 = science ### 8 = medicine ### 9 = economics ### 10 = agriculture ### 11 = art ### ### gender: Gender of the student, ### 0 = female, 1 = male ### ### graduation: Is the student going to graduate in English? ### 0 = no, 1 = yes ### ### entry.exam: Is an exam on English knowledge part of an entry exam at the ### university (where the student is going to apply)? ### 0 = no, 1 = yes ### ### score: The score in English language in MANA test. ### ### avg.mark: Average mark (grade) from all subjects at the last report card ### (vysvedceni). ### ### mark.eng: Average mark (grade) from the English language at the last ### 5 report cards (vysvedceni). head(mana) dim(mana) summary(mana) with(mana, table(region)) with(mana, table(specialization)) with(mana, table(gender)) with(mana, table(graduation)) with(mana, table(entry.exam)) ### Instead of the numbers appropriate codes are used ### A - Capital City Prague (Praha) ### B - South Moravia - Jihomoravsky kraj (Brno) ### C - South Bohemia - Jihocesky kraj (Ceske Budejovice) ### E - Pardubice - Pardubicky kraj ### H - Hradec Kralove - Kralovehradecky kraj ### J - Highland (Vysocina) Region - Kraj Vysocina (Jihlava) ### K - Karlovy Vary - Karlovarsky kraj ### L - Liberec - Liberecky kraj ### M - Olomouc - Olomoucky kraj ### P - Plzen - Plzensky kraj ### S - Central Bohemia (Prague) - Stredocesky kraj ### T - Moravia-Silesia - Moravskoslezsky kraj (Ostrava) ### U - Usti nad Labem - Ustecky kraj ### Z - Zlin - Zlinsky kraj mana <- transform(mana, fregion = factor(region, levels = 1:14, labels = c("A", "S", "C", "P", "K", "U", "L", "H", "E", "J", "B", "M", "Z", "T")), fspec = factor(specialization, levels = 1:11, labels = c("Educ", "Social", "Lang", "Law", "Mat-Phys", "Engin", "Science", "Medic", "Econom", "Agric", "Art")), fgender = factor(gender, levels = 0:1, labels = c("Female", "Male")), fgrad = factor(graduation, levels = 0:1, labels = c("No", "Yes")), fentry = factor(entry.exam, levels = 0:1, labels = c("No", "Yes"))) summary(mana) with(mana, table(fregion)) with(mana, table(fspec)) ### We are interested in possible differences in grading among Czech regions. ### The quality of education is (at least for our purposes) expexted to be ### expressed by an average mark on the last certificate [avg.mark]. ### The prospective specializations are taken into account. ### Exploratory analysis lSPEC <- levels(mana[, "fspec"]) colSPEC <- diverge_hcl(length(lSPEC)) names(colSPEC) <- lSPEC lREG <- levels(mana[, "fregion"]) colREG <- diverge_hcl(length(lREG)) names(colREG) <- lREG with(mana, interaction.plot(fregion, fspec, avg.mark, col = colSPEC, type = "b", lwd = 2, lty = 1, xlab = "Region", trace.label = "Specialization", ylab = "Mean of avg.mark")) with(mana, interaction.plot(fspec, fregion, avg.mark, col = colREG, type = "b", lwd = 2, lty = 1, xlab = "Specialization", trace.label = "Region", ylab = "Mean of avg.mark")) ### Not really insightful with so many groups... But still, do you think that ### an additive model will be a "maybe wrong but useful" model? ### Absolute counts for the given categories addmargins(with(mana, table(fregion, fspec))) ### Note, there are some zeros in the Agric column. Are there any consequences ### for the two-way ANOVA linear model with interactions? mInter <- lm(avg.mark ~ fregion*fspec, data = mana) summary(mInter) ### Do you have an idea why those NA's occur in the output? ### Is this a usefull model or it is worth of going for some simplification? Anova(mInter) Anova(mInter, type = "II") Anova(mInter, type = "III") with(mana, tapply(avg.mark,list(fregion, fspec),mean)) # cellwise averages ### Interaction just not significant, but with our (quite huge) sample size, ### can we assume that the interaction terms are practically unimportant? ### Additive model mAddit <- lm(avg.mark ~ fregion + fspec, data = mana) summary(mAddit) ### Why there are no NA's observations now? ### Basic diagnostics plotLM(mAddit) ### Breusch-Pagan test for homoscedasticity ncvTest(mAddit, var.formula = ~fitted(mAddit)) bptest(mAddit, varformula = ~fitted(mAddit)) # Koenker's studentized version bptest(mAddit) ### Shapiro-Wilk test for normality of errors based on raw residuals shapiro.test(residuals(mAddit)) ## The implemented algorithm does not allow for more than 5000 observations. ## Are we actually really interested in testing normality? ### Another commonly used test of normality is D'Agostino test library(fBasics); dagoTest(residuals(mAddit)) ### for more information see ### https://ncss-wpengine.netdna-ssl.com/wp-content/themes/ncss/pdf/Procedures/NCSS/Normality_Tests.pdf ### Tests for partial effects of region and specialization Anova(mAddit, type = "II") Anova(mAddit, type = "III") ### 1. Multiple comparisons: TukeyHSD aAddit <- aov(avg.mark ~ fregion + fspec, data = mana) TukeyHSD(aAddit) all.equal(coef(aAddit),coef(mAddit)) ### Which regions are different (given specialization)? (treg <- TukeyHSD(aAddit)[["fregion"]]) (treg <- treg[order(treg[, "p adj"]),]) treg[treg[, "p adj"] < 0.05,] ### Compare the results with treg2 <- TukeyHSD(aov(avg.mark ~ fregion, data = mana))[["fregion"]] treg2 <- treg2[order(treg2[, "p adj"]),] treg2[treg2[, "p adj"] < 0.05,] sort(reg.means <- tapply(mana$avg.mark, mana$fregion, mean)); (diff.reg.means <- outer(reg.means, reg.means, "-")); diff.reg.means["E","C"] treg[treg[, "p adj"] < 0.05,][1,1] treg2[treg2[, "p adj"] < 0.05,][1,1] ### Tukey HSD procedure in a one-way ANOVA model G = nrow(diff.reg.means) n = nrow(mana) nG = table(mana$fregion) s2 = sum(aov(avg.mark ~ fregion, data = mana)$res^2)/(n-G) diff.reg.means["E", "C"] + c(-1,1) * qtukey(0.95,G,n-G) * sqrt(1/2*(1/nG["E"]+1/nG["C"])*s2) treg2["E-C",2:3] ### Tukey HSD procedure in a two-way ANOVA model H = length(levels(mana$fspec)) s2 = sum(aAddit$res^2)/(n-G-H+1) diff.reg.means["E", "C"] + c(-1,1) * qtukey(0.95,G,n-G-H+1) * sqrt(1/2*(1/nG["E"]+1/nG["C"])*s2) treg["E-C",2:3] ### 2. Multiple comparisons: Hothorn-Bretz-Westfall MCP ### Now, we provide the corresponding estimates of the partial effects ### (differences in means of avg.mark between regions given specialization). coef(mAddit) ### Contrast matrix for the regions CX <- contr.treatment(length(lREG)) rownames(CX) <- lREG colnames(CX) <- names(coef(mAddit))[2:length(lREG)] print(CX) ### Contrast matrix for the specializations CZ <- contr.treatment(length(lSPEC)) rownames(CZ) <- lSPEC colnames(CZ) <- names(coef(mAddit))[(length(lREG) + 1):(length(coef(mAddit)))] print(CZ) ### Estimating the group means under the assumption of the additive model LmuAddit <- cbind(1, kronecker(rep(1, length(lSPEC)), CX), kronecker(CZ, rep(1, length(lREG)))) colnames(LmuAddit) <- names(coef(mAddit)) rownames(LmuAddit) <- outer(lREG, lSPEC, paste, sep = ":") print(LmuAddit[1:3,]) print(LmuAddit[15:17,]) LSest(mAddit, L = LmuAddit) ### The estimated group means reported in a table muAddit <- matrix(LSest(mAddit, L = LmuAddit)[["Estimate"]], nrow = length(lREG), ncol = length(lSPEC)) rownames(muAddit) <- lREG colnames(muAddit) <- lSPEC print(muAddit) ### This is not the same as the cell-wise averages. Why? with(mana, tapply(avg.mark,list(fregion, fspec),mean)) ### Observed group means in a plot with(mana, interaction.plot(fspec, fregion, avg.mark, col = colREG, type = "b", lwd = 2, lty = 1, xlab = "Specialization", trace.label = "Region", ylab = "Mean of avg.mark")) ### Model based group means in a plot dfrAddit <- data.frame(avg.mark = as.numeric(muAddit), fregion = rep(lREG, length(lSPEC)), fspec = rep(lSPEC, each = length(lREG))) print(dfrAddit) dfrAddit$fspec = factor(dfrAddit$fspec, levels=levels(mana$fspec)) with(dfrAddit, interaction.plot(fspec, fregion, avg.mark, col = colREG, type = "b", lwd = 2, lty = 1, xlab = "Specialization", pch=levels(fregion), cex=0.5, trace.label = "Region", ylab = "Model based mean avg.mark")) ### What is the link between the observed and the model based group means? ### What are we trying to do when estimating pairwise differences with respect ### to region (partial effects of region given specialization)? ### The "L" matrix to calculate all pairwise differences with respect to regions Lcore <- matrix(nrow = 0, ncol = ncol(CX)) ## the core part colnames(Lcore) <- colnames(CX) print(Lcore) ## matrix with 0 rows rname <- character(0) for (g1 in 1:(length(lREG) - 1)){ for (g2 in (g1 + 1):length(lREG)){ Lcore <- rbind(Lcore, CX[g2,] - CX[g1,]) rname <- c(rname, paste(lREG[g2], "-", lREG[g1])) } } rownames(Lcore) <- rname dim(Lcore) print(Lcore) L <- cbind(0, Lcore, matrix(0, nrow = nrow(Lcore), ## the whole matrix ncol = length(lSPEC) - 1)) colnames(L) <- names(coef(mAddit)) rownames(L) <- rownames(Lcore) dim(L) print(L) ### Unadjusted estimation of the partial effects | based on the additive model temp <- LSest(mAddit, L = L) class(temp) <- c("data.frame") temp <- round(temp,4) temp <- temp[order(temp[["P value"]]),]; temp[temp[["P value"]] <= 0.05,] temp["T - Z",] ## for example, the difference T-Z not significant ### Bretz-Hothorn-Westfall procedure - all functions from package multcomp mcp <- glht(mAddit, linfct = L, rhs = 0) print(mcp) ### The above can be done more easily with the help of the built-in option mcp <- glht(mAddit, linfct = mcp(fregion = "Tukey"), rhs = 0) print(mcp) ### Note, that the output is the same. The word "Tukey" in the function call ### is just a keyword to ask for all pairwise differences with respect to the ### factor covariate fregion. ### With respect to the underlying theory, the Tukey procedure is not involved ### in the calculations above. ### Sandwich estimator to adjust also for possible heteroscedasticity library(sandwich) ### Estimate of the variance matrix of the regression coefficients (VHC <- vcovHC(mAddit, type = "HC3")); ### The option 'type = "HC3"' uses residuals(mFin)^2/(1 - hatvalues(mFin))^2 ### on the diagonal of the "meat" matrix of the sandwich. ### Manual calculation X <- model.matrix(mAddit); Bread <- solve(t(X) %*% X) %*% t(X) Meat <- diag(residuals(mAddit)^2 / (1 - hatvalues(mAddit))^2) Bread %*% Meat %*% t(Bread) all.equal(Bread %*% Meat %*% t(Bread), VHC) ### Summary table using the standard error estimator under the homoscedasticity summary(mAddit) ### Summary table using the sandwich estimator to adjust for heteroscedasticity coeftest(mAddit, vcov = VHC) ### Hothorn-Bretz-Westfall MCP with the sandwich estimator mcp_sandwich <- glht(mAddit, linfct = mcp(fregion = "Tukey"), rhs = 0, vcov=sandwich) ### compare the partial effects under the homoscedasticity/heteroscedasticity print(mcp) print(mcp_sandwich) ### P-values adjusted for MCP will now be calculated. ### Multivariate integrals now must be computed by a Monte Carlo method. ### This may take some time... (load(url("http://msekce.karlin.mff.cuni.cz/~maciak/NMSA407/nmsa407-mult-comparison.RData"))) ## path to this file might need some adjustment ### set.seed(20010911) ## (cca 15 sec. @ 3.20GH, 2 x 8 GB RAM) ### system.time(smcp <- summary(mcp)) ### set.seed(20010911) ## (cca 15 sec. @ 3.20GH, 2 x 8 GB RAM) ### system.time(smcp_sandwich <- summary(mcp_sandwich)) smcp # == summary(mcp) smcp_sandwich # == summary(mcp_sandwich) ### Manual computation of std. errors in summary(mcp) X = model.matrix(mAddit) V = L %*% solve(t(X) %*% X) %*% t(L) D = diag(V) s = summary(mAddit)$sigma (std1 = s*sqrt(D)) (std2 = smcp$test$sigma) all.equal(std1, std2) ### Simultaneous confidence intervals will now be calculated. ### For this, only one quantile of the max-abs-t distribution ### must be calculated (again by Monte Carlo method). ### Again, some time (but less) is needed. ### set.seed(20010911) ## (cca 5 sec. @ 3.20GH, 2 x 8 GB RAM) ### system.time(cimcp <- confint(mcp, level = 0.95)) ### set.seed(20010911) ## (cca 5 sec. @ 3.20GH, 2 x 8 GB RAM) ### system.time(cimcp_sandwich <- confint(mcp_sandwich, level = 0.95)) ### print(cimcp) ### print(cimcp_sandwich) ### Put both intervals and P-values into one data.frame MCP <- data.frame(Estimate = cimcp[["confint"]][, "Estimate"], Lower = cimcp[["confint"]][, "lwr"], Upper = cimcp[["confint"]][, "upr"], Pvalue = smcp[["test"]][["pvalues"]]) MCP <- MCP[order(MCP[, "Pvalue"]),] round(MCP[MCP[, "Pvalue"] < 0.05,], digits=4) ### For comparison when the sandwich estimator is used MCP_sandwich <- data.frame(Estimate = cimcp_sandwich[["confint"]][, "Estimate"], Lower = cimcp_sandwich[["confint"]][, "lwr"], Upper = cimcp_sandwich[["confint"]][, "upr"], Pvalue = smcp_sandwich[["test"]][["pvalues"]]) MCP_sandwich <- MCP_sandwich[order(MCP_sandwich[, "Pvalue"]),] round(MCP_sandwich[MCP_sandwich[, "Pvalue"] < 0.05,], digits=4) ### Compare with model based means of regions given specialisation == Educ round(sort(muAddit[,1]), 3) ### For comparison Tukey HSD round(treg[treg[, "p adj"] < 0.05,], 4) ### -------------------------------------------------------------------------### ### Block 2: INDIVIDUAL WORK ### ### -------------------------------------------------------------------------### ### How would the calculations of the TukeyHSD and Hothorn-Bretz-Westfall MCP ### change if some different parametrization of the factor covariates would be ### used? For instance, consider the contrast sum parametrization. mAdditSum <- lm(avg.mark ~ fregion + fspec, data = mana, contrasts = list(fregion = "contr.sum", fspec = "contr.sum")) summary(mAdditSum) ### Interpret the estimated parameters - for instance, for intercept: newdata = with(mana,data.frame(expand.grid(levels(fregion),levels(fspec)))) colnames(newdata) = c("fregion", "fspec") mean(predict(mAdditSum,newdata=newdata)) mAdditSum$coef[1] ### Slightly more complicated model, now with [score] as the response m11Sum <- lm(score ~ fgender + avg.mark + fspec*mark.eng, data = mana, contrasts = list(fspec = "contr.sum")) summary(m11Sum) drop1(m11Sum, test = "F") ### Specialization is an effect modifier of the relationship between [score] ### and [mark.eng]. Which specializations differ with respect to the size of the ### effect of [mark.eng] on [score]? ### => Pairwise comparisons of slopes pertaining to the [mark.eng] regressor ### for different specializations. ### THe underlying ontrast matrix to parameterize categorical covariate "fspec" CZ <- contr.sum(lSPEC) rownames(CZ) <- lSPEC colnames(CZ) <- names(coef(m11Sum))[4:13] print(CZ) ### "L" matrix to calculate [mark.eng] slopes for different specializations Lslope <- cbind(matrix(0, nrow = nrow(CZ), ncol = 13), 1, CZ) rownames(Lslope) <- lSPEC colnames(Lslope) <- names(coef(m11Sum)) print(Lslope) LSest(m11Sum, L = Lslope) (slps <- LSest(m11Sum, L = Lslope)[["Estimate"]]) ### Is there any specialization for which there is no significant relationship ### between [mark.eng] (English at school) and [score] (English at MANA test) ### given gender and [avg.mark]? mean(slps) coef(m11Sum)["mark.eng"] ## the same as mean(slps) ### Core part of the "L" matrix to calculate all pairwise differences between ### the slope parameter estimates Lcore2 <- matrix(nrow = 0, ncol = ncol(CZ)) colnames(Lcore2) <- colnames(CZ) print(Lcore2) ## matrix with 0 rows rname <- character(0) for (h1 in 1:(length(lSPEC) - 1)){ for (h2 in (h1 + 1):length(lSPEC)){ Lcore2 <- rbind(Lcore2, CZ[h2,] - CZ[h1,]) rname <- c(rname, paste(lSPEC[h2], "-", lSPEC[h1])) } } rownames(Lcore2) <- rname print(Lcore2) ### Complete "L" matrix to calculate all pairwise differences in slopes ### Can you explain why is the "L" matrix constructed in this way? coef(m11Sum) LslopeDiff <- cbind(matrix(0, nrow = nrow(Lcore2), ncol = 14), Lcore2) colnames(LslopeDiff) <- names(coef(m11Sum)) rownames(LslopeDiff) <- rownames(Lcore2) print(LslopeDiff) ### Unadjusted inference temp <- LSest(m11Sum, LslopeDiff); class(temp) <- c("data.frame") temp <- round(temp,4) temp <- temp[order(temp[["P value"]]),]; temp[temp[["P value"]] <= 0.05,] ### Bretz-Hothorn-Westfall MCP mcpSlp <- glht(m11Sum, linfct = LslopeDiff, rhs = 0) mcpSlp_sandwich <- glht(m11Sum, linfct = LslopeDiff, rhs = 0, vcov=sandwich) ### With the manual calculations ### set.seed(20010911) ### smcpSlp <- summary(mcpSlp) ## 7 sec. ### cimcpSlp <- confint(mcpSlp, level = 0.95) ## 4 sec. ### set.seed(20010911) ### smcpSlp_sandwich <- summary(mcpSlp_sandwich) ## 7 sec. ### cimcpSlp_sandwich <- confint(mcpSlp_sandwich, level = 0.95) ## 4 sec. print(smcpSlp, 4) print(smcpSlp_sandwich, 4) print(cimcpSlp, 2) print(cimcpSlp_sandwich, 2) ### When MCP tis aken into account, are there any two specializations ### with significantly different [mark.eng] slopes? ### Which specializations differ with respect to the mean [score] ### when [mark.eng] is equal to 2 (median of [mark.eng]) while comparing ### students of the same gender and the same value of [avg.mark]. ### ### => Pairwise comparisons of specializations for a particular value ### of [mark.eng] adjusted for a possible effect of gender and [avg.mark]. median(mana[, "mark.eng"]) ### Complete "L" matrix to calculate all requested pairwise differences from the ### fitted linear model. coef(m11Sum) mmark.eng <- 2 LspecDiff <- cbind(0, 0, 0, Lcore2, 0, mmark.eng * Lcore2) colnames(LspecDiff) <- names(coef(m11Sum)) rownames(LspecDiff) <- rownames(Lcore2) print(LspecDiff) print(LspecDiff[1:5,]) ### Unadjusted inference temp <- LSest(m11Sum, LspecDiff) class(temp) <- c("data.frame") temp <- round(temp,4) temp <- temp[order(temp[["P value"]]),]; temp[temp[["P value"]] <= 0.05,] ### Bretz-Hothorn-Westfall MCP mcpSpec <- glht(m11Sum, linfct = LspecDiff, rhs = 0) mcpSpec_sandwich <- glht(m11Sum, linfct = LspecDiff, rhs = 0, vcov=sandwich) set.seed(20010911) smcpSpec <- summary(mcpSpec) cimcpSpec <- confint(mcpSpec, level = 0.95) set.seed(20010911) smcpSpec_sandwich <- summary(mcpSpec_sandwich) cimcpSpec_sandwich <- confint(mcpSpec_sandwich, level = 0.95) print(smcpSpec) print(smcpSpec_sandwich) print(cimcpSpec) print(cimcpSpec_sandwich) ### Put both intervals and P-values into one data.frame MCPSpec <- data.frame(Estimate = cimcpSpec[["confint"]][, "Estimate"], Lower = cimcpSpec[["confint"]][, "lwr"], Upper = cimcpSpec[["confint"]][, "upr"], Pvalue = smcpSpec[["test"]][["pvalues"]]) MCPSpec <- MCPSpec[order(MCPSpec[, "Pvalue"]),] MCPSpec[, "PvalueF"] <- format.pval(MCPSpec[, "Pvalue"], digits = 2, eps = 1e-4) MCPSpec[MCPSpec[, "Pvalue"] < 0.05, c("Estimate", "Lower", "Upper", "PvalueF")] MCPSpec_sandwich <- data.frame( Estimate = cimcpSpec_sandwich[["confint"]][,"Estimate"], Lower = cimcpSpec_sandwich[["confint"]][, "lwr"], Upper = cimcpSpec_sandwich[["confint"]][, "upr"], Pvalue = smcpSpec_sandwich[["test"]][["pvalues"]]) MCPSpec_sandwich <- MCPSpec_sandwich[order(MCPSpec_sandwich[, "Pvalue"]),] MCPSpec_sandwich[, "PvalueF"] <- format.pval(MCPSpec_sandwich[, "Pvalue"], digits = 2, eps = 1e-4) MCPSpec_sandwich[MCPSpec_sandwich[, "Pvalue"] < 0.05, c("Estimate", "Lower", "Upper", "PvalueF")] ### -------------------------------------------------------------------------### ### Block 3: ADDITIONAL MATERIAL ### ### -------------------------------------------------------------------------### ### Consider different options for the 'type' parameter in the call of ### the vcovHC function and try to understand the underlying model. (vcovHC(mAddit, type = "const")) ### For instance, compare the following: all.equal(vcov(mAddit), vcovHC(mAddit, type = "const")) ### Use the help for the vcovHC function to get an idea about different options.