### Initial operations ### ++++++++++++++++++++++++++++++++++++++++++++ setwd("/home/komarek/teach/mff_2020/nmst432_AdvRegr/Problem_4/") # library("colorspace") library("xtable") # print(load("AdvRegr_4_nels.RData")) ### Basic exploration ### ++++++++++++++++++++++++++++++++++++++++++++ ### Marginal frequencies with(nels, table(ses, useNA = "ifany")) with(nels, table(sesmed, useNA = "ifany")) with(nels, table(parents, useNA = "ifany")) with(nels, table(foreign, useNA = "ifany")) with(nels, table(fa.educ, useNA = "ifany")) with(nels, table(mo.educ, useNA = "ifany")) with(nels, table(region, useNA = "ifany")) with(nels, table(fa.wrk, useNA = "ifany")) with(nels, table(mo.wrk, useNA = "ifany")) ### (i) ses vs. fa.educ ### ++++++++++++++++++++++++++++++++++++++++++++ ### Contingency table (xtab1 <- with(nels, table(ses, fa.educ))) ### Exploration: column proportions prop.table(xtab1, margin = 2) ### Formatted numbers (ptab1 <- round(prop.table(xtab1, margin = 2) * 100, 1)) ### Table in LaTeX print(xtable(ptab1, digits = c(0, 1, 1, 1)), floating = FALSE) ### Plot (two slightly different versions) par(mfrow = c(1, 2), mar = c(3, 3, 3, 1) + 0.1) plot(t(xtab1), col = rainbow_hcl(4), main = "SES by father's education") plot(ses ~ fa.educ, data = nels, col = rainbow_hcl(4), main = "SES by father's education") par(mfrow = c(1, 1)) ### Chi^2 test chisq.test(xtab1) ### Data frame for loglinear modelling qq1 <- as.data.frame(xtab1, responseName = "N") print(qq1) ### Saturated model fit1 <- glm(N ~ (ses + fa.educ)^2, family = poisson, data = qq1) fit1 <- glm(N ~ ses + fa.educ + ses:fa.educ, family = poisson, data = qq1) ## the same as above summary(fit1) ### Surprised by (numerically) zero residual deviance? ### Independence model fit0 <- glm(N ~ ses + fa.educ, family = poisson, data = qq1) summary(fit0) ### Deviance (likelihood-ratio) test of independence anova(fit0, fit1, test = "LRT") ### Score test of independence chisq.test(xtab1) ### Yes, the classical chi-sq test of independence ### is the score test in the corresponding loglinear model anova(fit0, fit1, test = "Rao") ### Also the Wald test can be considered (be1 <- coef(fit1)) V1 <- vcov(fit1) (interIndex <- grep(":fa.educ", names(be1))) W <- as.numeric(be1[interIndex] %*% solve(V1[interIndex, interIndex], be1[interIndex])) pW <- pchisq(W, df = length(interIndex), lower.tail = FALSE) Wald <- data.frame(W = W, df = length(interIndex), Pvalue = pW) print(Wald) ### Coef interpretation exp(coef(fit1)[-1]) round(exp(coef(fit1)[-1]), 2) ### Meaning of each coefficient? ### Check levels of the two factors with(nels, table(ses, useNA = "ifany")) with(nels, table(fa.educ, useNA = "ifany")) contr.treatment(3) ### ODDS on better ses (compared to ses = 1) ### given father's education ### ---------------------------------------- # ### fa.educ = Elementary be1[paste("ses", 2:4, sep = "")] (oddsElem <- exp(be1[paste("ses", 2:4, sep = "")])) # ### fa.educ = High be1[paste("ses", 2:4, sep = "")] be1[paste("ses", 2:4, ":fa.educHigh", sep = "")] (oddsHigh <- exp(be1[paste("ses", 2:4, sep = "")] + be1[paste("ses", 2:4, ":fa.educHigh", sep = "")])) # ### fa.educ = College be1[paste("ses", 2:4, sep = "")] be1[paste("ses", 2:4, ":fa.educCollege", sep = "")] (oddsColl <- exp(be1[paste("ses", 2:4, sep = "")] + be1[paste("ses", 2:4, ":fa.educCollege", sep = "")])) ### All in one table ODDSbetterSES <- data.frame(Elementary = oddsElem, High = oddsHigh, College = oddsColl) print(ODDSbetterSES) ### ODDS on better education (compared to fa.educ = Elementary) ### given family SES ### ----------------------------------------------------------------------- # ### ses = 1 (odds1 <- exp(be1[paste("fa.educ", c("High", "College"), sep = "")])) # ### ses = 2 (odds2 <- exp(be1[paste("fa.educ", c("High", "College"), sep = "")] + be1[paste("ses2:fa.educ", c("High", "College"), sep = "")])) # ### ses = 3 (odds3 <- exp(be1[paste("fa.educ", c("High", "College"), sep = "")] + be1[paste("ses3:fa.educ", c("High", "College"), sep = "")])) # ### ses = 4 (odds4 <- exp(be1[paste("fa.educ", c("High", "College"), sep = "")] + be1[paste("ses4:fa.educ", c("High", "College"), sep = "")])) ### All in one table ODDShigherEduc <- data.frame(ses1 = odds1, ses2 = odds2, ses3 = odds3, ses4 = odds4) print(ODDShigherEduc) ### ODDS RATIOS (ratios of odds on higher ses compared to ses = 1 ### when comparing higher levels of education with Elementary one) ### ### = ODDS RATIOS (ratios of odds on higher level of education ### compared to Elementary one) ### when comparing higher ses with ses = 1) ### ### ----------------------------------------------------------------------------- exp(be1[grep(":fa.educ", names(be1))]) ### ODDS RATIOS (ratios of odds on higher ses compared to ses = 1 ### when comparing College education with High education) ### ### = ODDS RATIOS (ratios of odds on College education compared to High education ### when comparing higher ses with ses = 1 ### ------------------------------------------------------------------------------ exp(be1[grep(":fa.educCollege", names(be1))] - be1[grep(":fa.educHigh", names(be1))]) ### Are fit0 and fit1 the only reasonable models in this situation? ### ### How about the model below? ### Does it have reasonable interpretation? ### ---------------------------------------------------------------------- qq1 <- transform(qq1, nses = as.numeric(ses)) print(qq1) summary(qq1) ### Linear trend? fit1n <- glm(N ~ (nses + fa.educ)^2, family = poisson, data = qq1) summary(fit1n) be1n <- coef(fit1n) ### Odds on better ses (by 1) ### ---------------------------------------- # ### fa.educ = Elementary exp(be1n["nses"]) # (oddsnElem <- exp(be1n["nses"] * 1:3)) ## trend exp(be1[paste("ses", 2:4, sep = "")]) ## saturated # ### fa.educ = High exp(be1n["nses"] + be1n["nses:fa.educHigh"]) # (oddsnHigh <- exp((be1n["nses"] + be1n["nses:fa.educHigh"]) * 1:3)) ## trend exp(be1[paste("ses", 2:4, sep = "")] + ## saturated be1[paste("ses", 2:4, ":fa.educHigh", sep = "")]) # ### fa.educ = College exp(be1n["nses"] + be1n["nses:fa.educCollege"]) # (oddsnColl <- exp((be1n["nses"] + be1n["nses:fa.educCollege"]) * 1:3)) ## trend exp(be1[paste("ses", 2:4, sep = "")] + ## saturated be1[paste("ses", 2:4, ":fa.educCollege", sep = "")]) ### All in one table ODDSnbetterSES <- data.frame(Elementary = oddsnElem, High = oddsnHigh, College = oddsnColl) # ### Compare print(ODDSnbetterSES) ## SES ordinal (numeric) print(ODDSbetterSES) ## SES nominal ### Is (saturated) fit1 significantly better than fit1n? ### ------------------------------------------------------- anova(fit1n, fit1, test = "LRT") ### Odds on better education (compared to fa.educ = Elementary) ### ----------------------------------------------------------------------- # ### ses = 1 (oddsn1 <- exp(be1n[paste("fa.educ", c("High", "College"), sep = "")] + be1n[paste("nses:fa.educ", c("High", "College"), sep = "")])) # exp(be1[paste("fa.educ", c("High", "College"), sep = "")]) ## Compare (saturated) # ### ses = 2 (oddsn2 <- exp(be1n[paste("fa.educ", c("High", "College"), sep = "")] + 2 * be1n[paste("nses:fa.educ", c("High", "College"), sep = "")])) # exp(be1[paste("fa.educ", c("High", "College"), sep = "")] + be1[paste("ses2:fa.educ", c("High", "College"), sep = "")]) # ### ses = 3 (oddsn3 <- exp(be1n[paste("fa.educ", c("High", "College"), sep = "")] + 3 * be1n[paste("nses:fa.educ", c("High", "College"), sep = "")])) # exp(be1[paste("fa.educ", c("High", "College"), sep = "")] + be1[paste("ses3:fa.educ", c("High", "College"), sep = "")]) # ### ses = 4 (oddsn4 <- exp(be1n[paste("fa.educ", c("High", "College"), sep = "")] + 4 * be1n[paste("nses:fa.educ", c("High", "College"), sep = "")])) # exp(be1[paste("fa.educ", c("High", "College"), sep = "")] + be1[paste("ses4:fa.educ", c("High", "College"), sep = "")]) ### All in one table ODDSnhigherEduc <- data.frame(ses1 = oddsn1, ses2 = oddsn2, ses3 = oddsn3, ses4 = oddsn4) # ### Compare print(ODDSnhigherEduc) ## SES ordinal (numeric) print(ODDShigherEduc) ## SES nominal ##### (Useful?) function to calculate ##### a goodness-of-fit test ##### - perhaps useful for multi-dimensional tables ##### ============================================== gof <- function(m){ DD <- deviance(m) df <- m$df.residual pval <- pchisq(DD, df, lower.tail = FALSE) nparm <- length(coef(m)) LowCount <- sum(fitted(m) <= 5) cat("Goodness-of-fit test, model with ", nparm, " parameters\n", sep = "") cat("Deviance = ", DD, ", df = ", df, "\n", sep = "") cat("P-value: ", ifelse(pval < 0.001, "<0.001", format(round(pval, 3), nsmall = 3)), "\n\n", sep = "") if (LowCount){ cat("Number of cells with low fitted counts: ", LowCount, "\n\n") print(summary(fitted(m))) } } gof(fit1n) ## Test of a linear trend for X again # gof(fit1) ### Hmmmm... ### Do you have a better name for chi^2 distribution ### with 0 degrees of freedom? ### Correct p-value: pchisq(0, df = 0, lower.tail = FALSE) # gof(fit0) ## Test of independence again ### (ii) ses vs. fa.educ vs. region ### ++++++++++++++++++++++++++++++++++++++++++++ ### Table and related data.frame (xtab2 <- with(nels, table(ses, fa.educ, region))) qq2 <- as.data.frame(xtab2, responseName = "N") ### Saturated model fit2 <- glm(N ~ (ses + fa.educ + region)^3, family = poisson, data = qq2) summary(fit2) ### Deviance test on the three-way interaction drop1(fit2, test = "LRT") ### Coefficients (be2 <- coef(fit2)) pv2 <- summary(fit2)$coefficients[, "Pr(>|z|)"] (pv2 <- round(pv2, 3)) ### Some odds ratios ### Odds ratios (odds on higher ses compared to ses = 1) when comparing higher ### educations with elementary ones ### = odds ratios (odds on higher education compared to elementary one) when ### comparing higher ses with ses = 1 ### --> conditional ones (given region) # ## Region = Northeast (reference) ## ------------------------------ (lorNE <- c(be2[grep("^ses[0-9]:fa.educHigh$", names(be2))], be2[grep("^ses[0-9]:fa.educCollege$", names(be2))])) (orNE <- exp(lorNE)) (orNE <- matrix(round(orNE, 2), nrow = 3)) rownames(orNE) <- paste("ses", 2:4, sep = "") colnames(orNE) <- paste("educ", c("High", "College")) print(orNE) ## Region = Midwest ## ------------------------------ ### Try by yourself and report in your document. ### Some three-way interactions must be involved in your calculations ;-) # ## Region = South ## ------------------------------ ### Try by yourself, no need to report in your document. # ## Region = West ## ------------------------------ ### Try by yourself, no need to report in your document. ### Three-way interactions: changes of above conditional odds ratios ### when comparing given region with Northeast (reference) ### ------------------------------------------------------------------ # ### Midwest - Northeast (reference) ### ++++++++++++++++++++++++++++++++ (orChangeMW <- exp(be2[grep("^ses[0-9]:fa.educ(College|High):regionMidwest", names(be2))])) (pvChangeMW <- pv2[grep("^ses[0-9]:fa.educ(College|High):regionMidwest", names(pv2))]) ### West - Northeast (reference) ### +++++++++++++++++++++++++++++++ (orChangeWE <- exp(be2[grep("^ses[0-9]:fa.educ(College|High):regionWest", names(be2))])) (pvChangeWE <- pv2[grep("^ses[0-9]:fa.educ(College|High):regionWest", names(pv2))]) ### South and Northeast (reference) ### ++++++++++++++++++++++++++++++++ ### Try by yourself, just for yourself. ### Also Midwest-South, Midwest-West, South-West comparisons can be done... ### ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ ### Would you know how? ### Try by yourself, just for yourself.