###
### NMST432: Advanced Regression Models
###
### Exercise class assignment #1
### Some general steps.
###
### ==============================================
### NOTE(review): clearing the workspace and a hard-coded, machine-specific
### working directory are fragile in shared scripts; adjust the path locally.
rm(list = ls())
setwd("/home/komarek/teach/mff_2020/nmst432_AdvRegr/Problem_1")
#.libPaths("L:/Statistika/R/library")
library("Epi")     ## provides Relevel() used below to merge factor levels
library("mffSM")   ## course package, provides plotLM() diagnostics
##### -------------------------
##### Dataset
##### -------------------------
### load() returns the names of the objects it created; the outer parentheses
### print them (the file is expected to supply 'nelsNE' and 'varlabels').
(load("AdvRegr_1_nelsNE.RData"))
### Basic information
dim(nelsNE)        ## number of rows (subjects) and columns (variables)
str(nelsNE)        ## classes/structure of all variables
head(nelsNE)       ## first few rows
summary(nelsNE)    ## marginal summaries, including NA counts
print(varlabels)   ## textual descriptions of the variable names
### +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
###
### SOME HINTS
###
### +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
##### ----------------------------------------------------------
##### Illustration of use of the Relevel function (package Epi)
##### to merge or reorder factor levels.
##### ----------------------------------------------------------
help("Relevel", package = "Epi")
### Frequency table of TV watching; the second call also counts missing values.
with(nelsNE, table(f2.tv))
with(nelsNE, table(f2.tv, useNA = "ifany"))
levels(nelsNE[, "f2.tv"])
### Underlying integer codes of the factor (1 = first level, 2 = second, ...);
### these codes are what the list given to Relevel() below refers to.
yy <- as.integer(nelsNE[, "f2.tv"])
with(nelsNE, table(f2.tv, yy, useNA = "ifany"))
with(nelsNE, table(f2.tv, useNA = "ifany"))
### Merge six original levels into four: codes 3+4 -> "Normal",
### codes 5+6 -> "Much"; stored as a new variable f2.tv2.
nelsNE <- transform(nelsNE, f2.tv2 = Relevel(f2.tv, list("Not at all" = 1,
Little = 2,
Normal = c(3, 4),
Much = c(5, 6))))
### Cross-tabulate old vs. new factor to verify the merge did what we wanted.
with(nelsNE, table(f2.tv, f2.tv2, useNA = "ifany"))
##### ------------------------------------------------------------
##### Easily check numbers of NA's per row/column of a data.frame:
##### ------------------------------------------------------------
colSums(is.na(nelsNE))          ## number of NA's in each column
table(rowSums(is.na(nelsNE))) ## What is this?
### --> distribution of the per-row NA counts
### (how many subjects have 0, 1, 2, ... missing values)
##### ------------------------------------------------------------
##### Easy selection of a subset of data with all non-NA's
##### on requested variables:
##### ------------------------------------------------------------
VARS <- c("f2.perc.math", "f2.sco.math", "fam.educ", "fam.mar")
Data <- subset(nelsNE, select = VARS)
dim(Data)
### complete.cases(): TRUE for rows with no NA on the selected variables
complete.cases(Data)
sum(complete.cases(Data))       ## number of fully observed rows
dim(na.omit(Data))              ## the same count, obtained via na.omit()
DataNonNA <- na.omit(Data)      ## complete-case dataset
summary(DataNonNA)
summary(Data)                   ## compare with the summary including NA's
### +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
###
### STEP 1:
### Check all potential x-variables
### * Decide which of them are clearly useless (too much missingness,
### factor with almost all subjects in one category, ...), report
### which factors are no longer considered.
### * Simplify factors if you think it is useful (Relevel).
### * In some cases, values of some factors may suggest to consider a bit smaller
### population. That is, some subjects may be removed from data
### based on a value of some factor(s). This should, however, have a good reason
### and should be reported.
### * Also here: think about possible multicollinearity,
### do not jointly consider variables providing (almost) the same information
### (factors with similar composition of subgroups) or highly correlated
### numeric variables.
### * A set of (highly correlated) numeric variables may serve as a basis
### for calculation of a new (useful) numeric variable.
###
### +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
colnames(nelsNE)          ## all available variable names
sapply(nelsNE, class)     ## class (numeric/factor/...) of each variable
### For each potential x-variable, start with its frequency table:
table(nelsNE[, "fam.sz"], useNA = "ifany")
### --> might be useful?
### --> in this or different form
### --> numeric or factor (with less levels)?
### A small function may simplify the task...
### Print, for each named variable, its frequency table including NA counts.
###
### Args:
###   vars: character vector of column names to tabulate (may be empty).
###   data: data.frame holding the columns; defaults to the global 'nelsNE'
###         (lazily evaluated) so the original call printTabs(varNo1)
###         keeps working unchanged.
###
### Value: invisible NULL; the function is called for its printed output.
printTabs <- function(vars, data = nelsNE)
{
  ## seq_along() — unlike the 1:length(vars) footgun — iterates zero times
  ## for an empty 'vars' instead of trying indices 1 and 0
  for (i in seq_along(vars)) {
    cat("\n\n", vars[i], ":")
    print(table(data[, vars[i]], useNA = "ifany"))
  }
  invisible(NULL)
}
### Apply the above function to a first batch of candidate variables...
varNo1 <- c("b.mo", "b.yr", "f2.sch.reg", "f2.yr.left", "f2.grade")
printTabs(varNo1)
### For numeric variables, a scatterplot and pairwise correlations
### might be useful to decide on what to do...
### xnum1: overall GPA plus the six subject-grade variables
xnum1 <- c("f2.gpa", paste("f2.gr", c("comp", "eng", "lang", "math", "sci", "soc"), sep = "."))
### Scatterplot matrix with histograms and pairwise correlations
psych::pairs.panels(subset(nelsNE, select = xnum1),
bg = "palegoldenrod", col = "red3", pch = 21,
ellipses = FALSE, smooth = FALSE, lm = TRUE, hist.col = "lightblue", rug = FALSE)
### Recode sex (numeric codes 1/2) into a labeled factor 'gender'
table(nelsNE[, "sex"], useNA = "ifany")
nelsNE <- transform(nelsNE, gender = factor(sex, levels = 1:2, labels = c("Male", "Female")))
table(nelsNE[, "gender"], useNA = "ifany")   ## check the recoding
### MANY OTHER STEPS ARE PROBABLY NEEDED...
### At the end, a list of potential x-variables to be considered for modelling
### should be created, e.g., as below.
###
### The set below is just an example, not necessarily containing only
### useful variables!!! Analogously, some useful variables have perhaps been omitted!!!
xvars <- c("fam.sz", "fam.comp", "fam.educ", "ses.perc", "race", "gender", paste("f2", c("mo.left", "gr.eng", "gr.math", "gr.sci", "gr.soc", "moved", "othsch", "menrol", "games", "tv", "teach", "disc", "cig", "alco", "drunk", "weed", "skip", "arrest"), sep = "."))
length(xvars)   ## number of candidate covariates
### +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
###
### STEP 2:
### - data.frame containing only variables that may be used in modelling
### - only "Urban" schools that should be used to develop the model
###
### +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
### Dataset used to develop the model: Urban schools only, with id,
### the two candidate responses, and the candidate x-variables.
DataMod <- subset(nelsNE, f2.sch.loc == "Urban",
select = c("id", "f2.sco.math", "f2.perc.math", xvars))
dim(DataMod)
### Only rows with no missing values
### (to be able to perform submodel testing in a reasonable way)
DataMod <- na.omit(DataMod)
dim(DataMod)   ## compare with the previous dim() to see the loss
### How many subjects dropped? --> report
summary(DataMod)
### +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
###
### STEP 3:
### Initial model, decide about the response variable
### and perhaps its transformation. Initial refinement
### of x-variables.
###
### +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
### Quick view on a marginal distribution of potential y-variables
par(mfrow = c(1, 2))
hist(DataMod[, "f2.sco.math"], col = "palegreen", xlab = "Score", main = "Score", prob = TRUE)
hist(DataMod[, "f2.perc.math"], col = "palegreen", xlab = "Percentile", main = "Percentile", prob = TRUE)
par(mfrow = c(1, 1))
### Initial model (with score in mathematics);
### the formula is built as a string "f2.sco.math ~ x1+x2+..."
paste(xvars, collapse = "+")
m0 <- lm(paste("f2.sco.math ~", paste(xvars, collapse = "+")), data = DataMod)
summary(m0) ## not really useful now
drop1(m0, test = "F") ## also not really useful at this stage
mffSM::plotLM(m0) ## basic residual diagnostics
### --> decide on the response used and perhaps its transformation
### Warning message:
### not plotting observations with leverage one:
###   47, 66, 85, 308
### -------------------------------------------------
### Leverage 1, why?
### Show observations with leverage (essentially) equal to one:
hatvalues(m0)[hatvalues(m0) >= 0.9]
### Inspect those rows of the model frame (row names are subject id's):
m0[["model"]]["4553",]
m0[["model"]]["4944",]
m0[["model"]]["9851",]
m0[["model"]]["21395",]
### Likely reason: value of a categorical covariate that appears only once in the dataset
#
with(m0[["model"]], table(fam.comp, useNA = "ifany"))
with(m0[["model"]], table(fam.educ, useNA = "ifany"))
with(m0[["model"]], table(f2.games, useNA = "ifany")) ## Here, only 1 obs. in a cell
## --> what is a consequence?
with(m0[["model"]], table(f2.tv, useNA = "ifany"))
### etc.
m0[["model"]]["4553", "f2.games"] ## leverage 1 because of f2.games
m0[["model"]]["4944", "f2.games"] ## other variable is responsible for leverage 1
m0[["model"]]["9851", "f2.games"] ## other variable is responsible for leverage 1
m0[["model"]]["21395", "f2.games"] ## other variable is responsible for leverage 1
### still should something be done with some factors
### Multicollinearity: variance inflation factors
car::vif(m0) ## not really multicollinearity problems here, we are lucky...
### +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
###
### STEP 4:
### Model building, mainly by means of submodel F-tests
### performed while also using a natural intelligence (NI)
###
### +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
### Look for candidates for removal: single-term F-tests of deletion
(D0 <- drop1(m0, test = "F"))
### Extract the P-values and attach the term names to them
pv0 <- D0[, "Pr(>F)"]
names(pv0) <- rownames(D0)
print(pv0)
pv0[pv0 > 0.6]   ## clearly non-significant terms
### Try to remove more x's at once
xrem0 <- c("f2.gr.soc", "f2.teach", "f2.disc")
paste(xrem0, collapse = "+")
### update() formula ". ~ . -(a+b+c)" removes the listed terms
paste(". ~ . -(", paste(xrem0, collapse = "+"), ")")
m1 <- update(m0, paste(". ~ . -(", paste(xrem0, collapse = "+"), ")"))
#summary(m1)
anova(m1, m0) ## is not a simpler model significantly worse?
mffSM::plotLM(m1) ## is basic diagnostics still OK?
### How about if anything from removed would again improve the model
### if added separately (anti-multicollinearity measure).
add1(m1, scope = m0, test = "F")
### What else to remove?
(D1 <- drop1(m1, test = "F"))
pv1 <- D1[, "Pr(>F)"]
names(pv1) <- rownames(D1)
print(pv1)
pv1[pv1 > 0.5]
xrem1 <- c("fam.sz", "f2.weed", "f2.arrest")
m2 <- update(m1, paste(". ~ . -(", paste(xrem1, collapse = "+"), ")"))
#summary(m1)
anova(m2, m1) ## is not a simpler model significantly worse compared to the previous one?
anova(m2, m0) ## is not a simpler model significantly worse also compared to the starting one?
mffSM::plotLM(m2) ## is basic diagnostics still OK?
add1(m2, scope = m1, test = "F") ## try to add only last removed
add1(m2, scope = m0, test = "F") ## try to add all removed
### And so on...
### At some point, we may also want to check changes in estimated beta's.
### Removal of some variables should not cause too big changes of estimated beta's
### being kept in the model.
### How about change of estimated beta's?
cf2 <- coef(m2)
### Align m1's coefficients with those kept in m2 (same names, same order)
cf1 <- coef(m1)[names(cf2)]
(data.frame(m1 = cf1, m2 = cf2))
### Relative absolute change of each coefficient between m1 and m2
(adif12 <- abs(cf2 - cf1)/abs(cf2))
adif12[adif12 > 1]   ## coefficients that changed by more than 100%