### ---------------------------------------------------------------------###
### Winter Term 2018/2019 ###
### NMSA407 - Linear Regression ###
### ###
### EXERCISE #1 Statistical analysis of dependence Y ~ X ###
### for different types of Y and X. ###
### ###
### Review of topics from the 3rd grade statistical courses ###
### ---------------------------------------------------------------------###
### Setting the working directory
### =============================
setwd("H:/nmsa407_LinRegr/")
### Loading necessary packages
### ==========================
### In K10/K11, we have to add a net directory L:/Statistika/R/library to the path
### where the R packages are looked for:
### .libPaths("L:/Statistika/R/library")
### This is not needed on most private PC's where R and all extension packages are
### installed on one place.
### If you encounter problems when loading this package in K4/K10/K11 because of a low
### version of R software (< 3.2.0), then you can run R software from the
### network disc L, where the version 3.2.2. is installed. The path is
### L:\Statistika\R\bin\x64\Rgui.exe
# install.packages("mffSM_1.1.zip", repos = NULL)
library("mffSM")
### Loading the data we will be working with this exercise
data(Cars2004nh, package = "mffSM")
head(Cars2004nh)
summary(Cars2004nh)
### Excluding cars where consumption is missing.
Cars2004nh <- Cars2004nh[!is.na(Cars2004nh$consumption),]
### Creating a subsample (to increase some of the subsamples)
set.seed(201617)
iii <- c(sample((1:nrow(Cars2004nh))[Cars2004nh$ftype=="personal"], size=100), sample((1:nrow(Cars2004nh))[Cars2004nh$ftype!="personal"], size=120))
Cars2004nh <- Cars2004nh[iii,]
### Deriving some additional variables
### fdrive2: factor which distinguishes whether the drive is front or other (rear or 4x4)
Cars2004nh <- transform(Cars2004nh, fdrive2 = factor(1*(fdrive == "front") + 2*(fdrive != "front"), labels = c("front", "other")))
with(Cars2004nh, table(fdrive, fdrive2))
### cons10: 0/1 variable which is 1 if consumption <= 10 l/100 km and 0 otherwise
### fcons10: factor No/Yes derived from cons10 (0 -> No, 1 -> Yes)
Cars2004nh <- transform(Cars2004nh, cons10 = 1 * (consumption <= 10),
fcons10 = factor(1 * (consumption <= 10), labels = c("No", "Yes")))
summary(Cars2004nh)
### QUESTIONS:
### Does consumption depend on a drive (distinguishing only front and other)?
###
### Y = consumption (continuous)
### X = fdrive2 (dichotomous)
COL2 <- rainbow_hcl(2)
### Boxplot - usually the most useful plot for this problem
plot(consumption ~ fdrive2, data = Cars2004nh, col = COL2)
with(Cars2004nh, tapply(consumption, fdrive2, summary))
with(Cars2004nh, tapply(consumption, fdrive2, sd, na.rm = TRUE))
# Y1 = subset(Cars2004nh, fdrive2 == "front")$consumption
# Y2 = subset(Cars2004nh, fdrive2 == "other")$consumption
# var.test(Y1,Y2)
# (ts<-var(subset(Cars2004nh, fdrive2 == "front")$consumption)/var(subset(Cars2004nh, fdrive2 == "other")$consumption))
# (df1<-length(subset(Cars2004nh, fdrive2 == "front")$consumption-1))
# (df2<-length(subset(Cars2004nh, fdrive2 == "other")$consumption-1))
# p<-pf(ts,df1=df1-1,df2=df2-1)
# 2*(min(p,1-p))
### Histograms - useful to see the shapes of distributions
par(mfrow = c(1, 2))
hist(subset(Cars2004nh, fdrive2 == "front")$consumption, prob = TRUE, main = "Front", col = COL2[1], xlab = "Consumption [l/100 km]")
hist(subset(Cars2004nh, fdrive2 == "other")$consumption, prob = TRUE, main = "Other", col = COL2[2], xlab = "Consumption [l/100 km]")
par(mfrow = c(1, 1))
Y1 = subset(Cars2004nh, fdrive2 == "front")$consumption
Y2 = subset(Cars2004nh, fdrive2 == "other")$consumption
#
par(mfrow = c(1, 2))
hist(Y1, prob = TRUE, main = "Front", col = COL2[1], xlab = "Consumption [l/100 km]")
curve(dnorm(x,mean=mean(Y1),sd=sd(Y1)),add=TRUE,col=2,lwd=2)
hist(Y2, prob = TRUE, main = "Other", col = COL2[2], xlab = "Consumption [l/100 km]")
curve(dnorm(x,mean=mean(Y2),sd=sd(Y2)),add=TRUE,col=2,lwd=2)
par(mfrow = c(1, 1))
### QUESTIONS:
### What are we testing by each of the tests?
### What is the model behind each of these tests?
### Does the model seem to be good enough to be useful?
### Which artefacts of the model might be relatively safely ignored and why?
### What is the conclusion?
### What else can be learned from the output?
### Welch two-sample t-test (not assuming equal variances)
t.test(consumption ~ fdrive2, data = Cars2004nh)
(T=(mean(Y1)-mean(Y2))/sqrt(var(Y1)/length(Y1) + var(Y2)/length(Y2)))
2*min(pnorm(T),1-pnorm(T))
### Standard two-sample t-test (assuming equal variances)
t.test(consumption ~ fdrive2, data = Cars2004nh, var.equal = TRUE)
n1 = length(Y1)
n2 = length(Y2)
(T=(mean(Y1)-mean(Y2))/sqrt(((n1-1)*var(Y1)+(n2-1)*var(Y2))/(n1+n2-2)*(1/n1+1/n2)))
2*(1-pnorm(abs(T))) # p-value for the exact test
2*(1-pt(abs(T),df=n1+n2-2)) # p-value for the asymptotic test
### Wilcoxon two-sample rank test
wilcox.test(consumption ~ fdrive2, data = Cars2004nh, conf.int = TRUE)
median(Y1) - median(Y2)
### Kolmogorov-Smirnov test
ks.test(subset(Cars2004nh, fdrive2 == "front")$consumption, subset(Cars2004nh, fdrive2 == "other")$consumption, data = Cars2004nh)
plot(ecdf(Y1),main="ECDF")
plot(ecdf(Y2),col=2,add=TRUE)
### One-sided alternative
t.test(consumption ~ fdrive2, data = Cars2004nh, alternative = "less", mu = -2)
t.test(consumption ~ fdrive2, data = Cars2004nh, alternative = "less", mu = -2, var.equal = TRUE)
wilcox.test(consumption ~ fdrive2, data = Cars2004nh, alternative = "less", mu = -2, conf.int = TRUE)
### QUESTIONS:
### Does consumption depend on a drive (distinguishing front/rear/4x4)?
###
### Y = consumption (continuous)
### X = fdrive (general categorical)
COL3 <- rainbow_hcl(3)
### Boxplot - usually the most useful plot for this problem
plot(consumption ~ fdrive, data = Cars2004nh, col = COL3)
with(Cars2004nh, tapply(consumption, fdrive, summary))
with(Cars2004nh, tapply(consumption, fdrive, sd, na.rm = TRUE))
### Histograms - useful to see the shapes of distributions
par(layout(matrix(c(0,1,1,0, 2,2,3,3), nrow = 2, byrow = TRUE)))
hist(subset(Cars2004nh, fdrive == "front")$consumption, prob = TRUE, main = "Front", col = COL3[1], xlab = "Consumption [l/100 km]")
hist(subset(Cars2004nh, fdrive == "rear")$consumption, prob = TRUE, main = "Rear", col = COL3[2], xlab = "Consumption [l/100 km]")
hist(subset(Cars2004nh, fdrive == "4x4")$consumption, prob = TRUE, main = "4x4", col = COL3[3], xlab = "Consumption [l/100 km]")
par(mfrow = c(1, 1))
### QUESTIONS:
### What are we testing by each of the tests?
### What is the model behind each of these tests?
### Does the model seem to be good enough to be useful?
### Which artefacts of the model might be relatively safely ignored and why?
### What is the conclusion?
### What else can be learned from the output?
summary(aov(consumption ~ fdrive, data = Cars2004nh)) # Standard ANOVA
oneway.test(consumption ~ fdrive, data = Cars2004nh) # ANOVA without assuming equal variances
kruskal.test(consumption ~ fdrive, data = Cars2004nh) # Kruskal-Wallis Rank Sum Test
# same test statistics as in the standard ANOVA above
summary(m0<-lm(consumption~fdrive,data=Cars2004nh))
anova(m0)
oneway.test(consumption ~ fdrive, data = Cars2004nh,var.equal=TRUE)
X = subset(Cars2004nh, fdrive == "front")$consumption
Y = subset(Cars2004nh, fdrive == "rear")$consumption
Z = subset(Cars2004nh, fdrive == "4x4")$consumption
nX = length(X)
nY = length(Y)
nZ = length(Z)
mX = mean(X)
mY = mean(Y)
mZ = mean(Z)
m = mean(c(X,Y,Z))
nX*(m - mX)^2+nY*(m-mY)^2+nZ*(m-mZ)^2
sum((X - mX)^2)+sum((Y-mY)^2)+sum((Z-mZ)^2)
sum((c(X,Y,Z)-m)^2)
nX+nY+nZ
### QUESTIONS:
### Does consumption depend on a length of the car?
###
### Y = consumption (continuous)
### X = length (continuous)
### Scatter plot
plot(consumption ~ length, data = Cars2004nh, col = "red3", bg = "orange", pch = 23)
### Histograms
par(mfrow = c(1, 2))
hist(Cars2004nh$consumption, prob = TRUE, main = "", col = "seagreen", xlab = "Consumption [l/100 km]")
hist(Cars2004nh$length, prob = TRUE, main = "", col = "seagreen", xlab = "Length [cm]")
par(mfrow = c(1, 1))
with(Cars2004nh, cor(consumption, length))
with(Cars2004nh, cor(consumption, length, use = "complete.obs"))
with(Cars2004nh, cor(consumption, length, method = "spearman", use = "complete.obs"))
### QUESTIONS:
### What are we testing by each of the tests?
### What is the model behind each of these tests?
### Does the model seem to be good enough to be useful?
### Which artefacts of the model might be relatively safely ignored and why?
### What is the conclusion?
### What else can be learned from the output?
with(Cars2004nh, cor.test(consumption, length))
n = sum(complete.cases(cbind(Cars2004nh$consumption, Cars2004nh$length)))
r = cor(Cars2004nh$consumption,Cars2004nh$length, use = "complete.obs")
(T = sqrt(n-2)*r/sqrt(1-r^2))
2*(1-pt(abs(T),df=n-2))
2*(1-pnorm(abs(T)))
with(Cars2004nh, cor.test(consumption, length, method = "spearman"))
### QUESTIONS:
### Is there any link between the test on a row "length"
### in the output from summary(m1) and any of the previous two tests?
m1 <- lm(consumption ~ length, data = Cars2004nh)
summary(m1)
X = Cars2004nh$length
Y = Cars2004nh$consumption
I = (!is.na(X)) & (!is.na(Y))
X = X[I]
Y = Y[I]
n = length(X)
# point estimates of regression coefficients
(b<-sum((X-mean(X))*(Y-mean(Y)))/sum((X-mean(X))^2))
(a = mean(Y) - mean(X)*b)
# b is just a re-scaled correlation coefficient
cor(X,Y)*sqrt(var(Y)/var(X))
# in matrix notation
Xm = cbind(1,X)
solve(t(Xm)%*%Xm)%*%t(Xm)%*%matrix(Y,ncol=1)
# residuals and the estimate of residual variance
r = Y - (a+b*X)
summary(r)
(sig = sqrt(sum(r^2)/(n-2)))
n - 2
# standard deviations
sqrt(sig^2*solve(t(Xm)%*%Xm))
# multiple R^2
cor(X,Y)^2
### QUESTIONS:
### Is there any link of the above output to the lines in the plot below?
### When does the correlations coefficient equal to the slope parameter?
plot(consumption ~ length, data = Cars2004nh, col = "red3", bg = "orange", pch = 23)
abline(m1, col = "blue", lwd = 2)
abline(h = mean(Cars2004nh$consumption, na.rm = TRUE), col = "darkgreen", lwd = 2)
X = X/sd(X)
Y = Y/sd(Y)
summary(lm(Y~X))
cor(X,Y)
X = X - mean(X)
Y = Y - mean(Y)
summary(m2<-lm(Y~X))
cor(X,Y)
plot(Y~X, col = "red3", bg = "orange", pch = 23)
abline(m2, col = "blue", lwd = 2)
abline(h = mean(Y), col = "darkgreen", lwd = 2)
### QUESTIONS:
### Does the fact that the car is economical (in the U.S. sense -> consumption <= 10)
### depend on the drive (while distinguishing only front and other)
###
### Y = cons10 or fcons10 (dichotomous)
### X = fdrive2 (dichotomous)
COL2 <- diverge_hcl(2)
plot(fcons10 ~ fdrive2, data = Cars2004nh, col = COL2)
(TAB2 <- with(Cars2004nh, table(fdrive2, fcons10)))
round(prop.table(TAB2, margin = 1), 2)
### QUESTIONS:
### What are we testing by each of the tests?
### What is the model behind each of these tests?
### Does the model seem to be good enough to be useful?
### Which artefacts of the model might be relatively safely ignored and why?
### What is the conclusion?
chisq.test(TAB2) # Using continuity correction to be more conservative
chisq.test(TAB2, correct = FALSE); # Not using continuity correction.
print(chisq.test(TAB2)$expected) # Expected observed counts under the null hypothesis
(T = chisq.test(TAB2,correct=FALSE))
n = sum(TAB2) # number of observations
R = rowSums(TAB2)/n # observed proportion in rows
C = colSums(TAB2)/n # observed proportion in columns
R%*%t(C)*n # expected counts under independence
(T$observed - T$expected)/sqrt(T$expected) # residuals
T$residuals
(ts<-sum(T$res^2)) # test statistic
pchisq(ts,df=1,lower.tail=FALSE) # p-value
### QUESTIONS:
### Are the tests below different from those above (with the same 'correct' value)?
### What else can now be learned from the output?
### Can you reconstruct the test statistics in the test below?
(x <- TAB2[, "Yes"])
(n <- apply(TAB2, 1, sum))
prop.test(x, n)
prop.test(x, n, correct = FALSE)
p2 = x/n # proportion estimates in groups
p = sum(x)/sum(n) # common proportion estimate
(ts<-(diff(p2)/sqrt(p*(1-p)*sum(1/n)))^2) # test statistic
pchisq(ts,df=1,lower.tail=FALSE)
### QUESTIONS:
### What are we testing now?
### Note: also here, version without continuity correction might be used.
prop.test(x, n, alternative = "greater")
### QUESTIONS:
### Would it be possible to find out whether the proportion of economical cars among those with the front drive
### is by more than 40 percentage points higher than the proportion of economical cars among those with the other drive?
### Even if it is not possible to get a p-value by just one call to some function, is it at least possible
### to decide on a 5% significance level?
### QUESTIONS:
### Does the fact that the car is economical (in the U.S. sense -> consumption <= 10)
### depend on the drive (while distinguishing front/rear/4x4)
###
### Y = cons10 or fcons10 (dichotomous)
### X = fdrive (general categorical)
COL2 <- diverge_hcl(2)
plot(fcons10 ~ fdrive, data = Cars2004nh, col = COL2)
(TAB3 <- with(Cars2004nh, table(fdrive, fcons10)))
prop.table(TAB3, margin = 1)
### QUESTIONS:
### What are we testing by this test?
### What is the model behind this test?
### Does the model seem to be good enough to be useful?
### Which artefacts of the model might be relatively safely ignored and why?
### What is the conclusion?
chisq.test(TAB3)
TAB3
print(chisq.test(TAB3)$expected)
print(chisq.test(TAB3)$residuals)
print(chisq.test(TAB3)$stdres)
# standardized residuals, Agresti (2007, Section 2.4.5)
n = sum(TAB3) # number of observations
R = rowSums(TAB3)/n # observed proportion in rows
C = colSums(TAB3)/n # observed proportion in columns
CP = (1-R)%*%t((1-C)) # complementary cell probabilities under independence
(TAB3 - chisq.test(TAB3)$expected)/sqrt(chisq.test(TAB3)$expected*CP)
### QUESTIONS:
### Does the drive (front/rear/4x4) depends
### on the type of the car (personal/wagon/SUV/pickup/sport/minivan)?
###
### Y = fdrive (general categorical)
### X = ftype (general categorical)
COL3 <- diverge_hcl(3)
plot(fdrive ~ ftype, data = Cars2004nh, col = COL3)
(TAB4 <- with(Cars2004nh, table(ftype, fdrive)))
prop.table(TAB4, margin = 1)
### QUESTIONS:
### What are we testing by this test?
### What is the model behind this test?
### Does the model seem to be good enough to be useful?
### Which artefacts of the model might be relatively safely ignored and why?
### What is the conclusion?
chisq.test(TAB4)
print(chisq.test(TAB4)$expected)
### QUESTIONS:
### Minivans and pickups might be considered as just one category.
### Will it help in improving the chi^2 approximation of the test statistic?
### Can you answer this in advance before running the code below?
Cars2004nh <- transform(Cars2004nh, ftype5 = factor(ifelse(!(ftype %in% c("pickup", "minivan")), ftype, "pm"), labels = c(levels(ftype)[c(1, 2, 3, 5)], "pickup/minivan")))
with(Cars2004nh, table(ftype, ftype5))
(TAB4modif <- with(Cars2004nh, table(ftype5, fdrive)))
prop.table(TAB4modif, margin = 1)
chisq.test(TAB4modif)
print(chisq.test(TAB4modif)$expected)
print(chisq.test(TAB4modif)$residuals)
print(chisq.test(TAB4modif)$stdres)
### QUESTIONS:
### Can you say something to describe the dependency between type and drive?
###
### Situations: Y = dichotomous, X = continuous
### Y = general categorical, X = continuous
###
### Not treated by standard courses at a bachelor level at MFF UK.
### Something will be in a summer term (NMST432: Advanced Regression Models)
### END OF LAB SESSION 1