# NMST 434: Exercise session 12
# May 7 and 14, 2020

## Resampling procedures II
# 0. Ex. 70: Two sample Kolmogorov-Smirnov test
# 1. Ex. 71: Permutation test of independence
# 2. Heteroscedastic normal linear model
# 3. Durbin-Watson's test
# 4. Ex. 66: Bootstrap in AR processes
# 5. Confidence interval for median
# 6. Bootstrap in logit regression

rm(list=ls());
pb = txtProgressBar(style=3); # progress bar

###
### Example 70: Two-sample permutation Kolmogorov-Smirnov test
###

K = 100
pRes = rep(NA,K);pKS = rep(NA,K)
k = 1

# uncomment the loop for a simulation study
# for(k in 1:K){
n = 10
m = 40
X = rnorm(n)
Y = rnorm(m)
(ksT=ks.test(X,Y))
T = ksT$stat
# asymptotic Kolmogorov-Smirnov's test

# permutation approach
B = 9999
Z = c(X,Y)
T. = rep(NA,B)
for(b in 1:B){
  setTxtProgressBar(pb,b/B)
  Z. = sample(Z)  # random permutation of the pooled sample
  T.[b] = ks.test(Z.[1:n],Z.[-(1:n)])$stat
}
hist(T.,B/100,prob=TRUE)
abline(v=T,lwd=2,col=2)
pRes[k] = (1+sum(T.>T))/(B+1)      # resampling p-value
pKS[k] = ksT$p.value               # p-value from the ks.test
# }

# sum(pRes<=.05);sum(pKS<=0.05)

pB = (1+cumsum(T.>T))/(1:B+1)
plot(1:B,pB,type="l",main="Resampled p-value as a function of B",ylab="p-value",
     xlab="Permutations")
abline(h=ksT$p.value,col=2,lwd=2)
legend("topright",c("K-S p-value", "Resampled p-value"),col=c(2,1),lwd=c(2,1))

### 
### Example 71: Permutation test of independence 
### 

A <- c(25, 35, 24, 19, 21, 24);
B <- c(9, 18, 6, 10, 17,  24); 
C <- c(9, 6, 10, 12, 17, 14); 

# Three players are rolling a standard die. The results are 
# found in the table 'TAB'. Based on these results 
# can we say that the result of rolling is dependent on the person? 

TAB <- rbind(A, B, C);
colnames(TAB) <- 1:6;
TAB

plot(as.table(TAB))

# Asymptotic chi-squared test
chisq.test(TAB)

# # Permutation approach 
player <- rep(c("A", "B", "C"), rowSums(TAB));
dice <- rep(rep(1:6,3), t(TAB));

# # Check 
# table(player, dice) - TAB

B <- 9999;
stat.perm <- numeric(B)

for(b in 1:B){
  setTxtProgressBar(pb,b/B)
	dice.perm <- 	sample(dice);
	stat.perm[b] <- chisq.test(table(player, dice.perm))$stat
}

(1+sum(stat.perm >= chisq.test(TAB)$stat))/(B+1)

# The above permutation approach is (hopefully) the same as when you run in R 
chisq.test(TAB, simulate.p.value=T)
chisq.test(TAB, simulate.p.value=T, B=100000) # B is the number of resamples
# ?chisq.test

###
### Example: Heteroscedastic normal linear models 
### 

library(sandwich)

n <- 100;
x0 <- rep(1, n);        # regressor for the intercept  
x1 <- 2*runif(n);       # regressor for the slope
X <- cbind(x0, x1);     # the regression matrix X
B <- 1000;

beta <- c(0,1);         # true values of the parameters  

# # Heteroscedastic normal errors 
sigma <- function(x1) exp(x1)
eps <- rnorm(n)*sigma(x1);
Y <- X%*%beta + eps;

# Summary 
summary(fit.lm <- lm(Y~x1));

# Plot 
plot(x1, Y, cex=0.6,pch=16)
abline(fit.lm$coef, col="blue", lwd=2)


# # Bootstrap 
beta0.boot.NB <- numeric(B); 
beta1.boot.NB <- numeric(B);
beta0.stud.NB <- numeric(B); 
beta1.stud.NB <- numeric(B);
beta0.boot.MB <- numeric(B); 
beta1.boot.MB <- numeric(B);

alfa <- 0.05;

# Estimates from the data 
beta0 <- fit.lm$coef[1];
beta1 <- fit.lm$coef[2];

res <- residuals(fit.lm)/sqrt(1-hatvalues(fit.lm)); # standardized residuals
Yhat <- fitted(fit.lm);

indexy <- 1:n;

for(b in 1:B){
  setTxtProgressBar(pb,b/B)
  # #   Nonparametric bootstrap 
  iii <- sample(indexy, size=n, replace=TRUE);  
  x1s <- x1[iii];
  Ys <- Y[iii];
  fit <- lm(Ys~x1s);
  beta0.boot.NB[b] <- fit$coef[1];
  beta1.boot.NB[b] <- fit$coef[2];
  
	(sd.sandwich <- sqrt(diag(vcovHC(fit, type = "HC3"))));
	
  # # 	Studentized nonparametric bootstrap 
  beta0.stud.NB[b] <- (fit$coef[1] - beta0)/sd.sandwich[1];
  beta1.stud.NB[b] <- (fit$coef[2] - beta1)/sd.sandwich[2];

  # #   Model based bootstrap 
  resb <- sample(res, size=n, replace=TRUE);    
  Ys <- Yhat + resb;
  fit <- lm(Ys~x1);
  beta0.boot.MB[b] <- fit$coef[1];
  beta1.boot.MB[b] <- fit$coef[2];  
}

kvant0.NB <- quantile(beta0.boot.NB-beta0, c(1-alfa/2,alfa/2));
kvant1.NB <- quantile(beta1.boot.NB-beta1, c(1-alfa/2,alfa/2));

kvant0.NB.stud <- quantile(beta0.stud.NB, c(1-alfa/2,alfa/2));
kvant1.NB.stud <- quantile(beta1.stud.NB, c(1-alfa/2,alfa/2));

kvant0.MB <- quantile(beta0.boot.MB-beta0, c(1-alfa/2,alfa/2));
kvant1.MB <- quantile(beta1.boot.MB-beta1, c(1-alfa/2,alfa/2));

# Comparison of bootstrap distributions 
par(mfrow=c(1,2))
hist(beta1.boot.MB, main="beta1 - model based bootstrap",prob=TRUE)
abline(v=beta[2],col=2,lwd=2,lty=2)
abline(v=beta1,col=3,lwd=2,lty=2)
#
hist(beta1.boot.NB, main="beta1 - nonparametric bootstrap",prob=TRUE);
abline(v=beta[2],col=2,lwd=2,lty=2)
abline(v=beta1,col=3,lwd=2,lty=2)
legend("topleft",c("True","Estimate"),col=c(2,3),lwd=2,lty=2)
par(mfrow=c(1,1))

# # Standard deviation of estimators via bootstrap 
SE.model.boot <- c(sd(beta0.boot.MB), sd(beta1.boot.MB)); # Model bootstrap 
SE.nonp.boot <- c(sd(beta0.boot.NB), sd(beta1.boot.NB));  # Nonparametric bootstrap 

# # Estimated asymptotic standard deviations  
SE.lm <- sqrt(diag(vcov(fit.lm)))
SE.sandwich <- sqrt(diag(vcovHC(fit.lm, type = "HC3")))

res <- rbind(SE.lm, SE.model.boot, SE.nonp.boot, SE.sandwich);
colnames(res) <- c("beta0", "beta1");
print(res, digits = 3)

#
#

# # Basic confidence intervals based on nonparametric bootstrap 
CI.boot.basic <- beta1 - kvant1.NB
# # Confidence interval based on model based bootstrap
CI.boot.model <- beta1 - kvant1.MB
# # Studentized bootstrap confidence intervals (based on nonparametric bootstrap)  
CI.boot.stud <- beta1 - kvant1.NB.stud * SE.sandwich[2]
# # Confidence intervals based on normal approximation + sandwich 
CI.sandwich <- (beta1 + c(-1,1)*qt(1-alfa/2, df=n-1)*sd.sandwich[2])
# # Confidence intervals based on normal approximation and bootstrap variance estimation
CI.var.boot <- beta1 + c(-1,1)*qt(1-alfa/2, df=n-1)*sd(beta1.boot.NB)
# Standard asymptotic confidence interval ignoring heteroscedasticity 
CI.lm <- confint(fit.lm)[2,]

print(rbind(CI.lm, CI.sandwich, CI.var.boot, CI.boot.basic, CI.boot.stud, CI.boot.model), digits=3)

# 
#

# # # Simulation study 
# # Confidence interval for beta_1, n = 100, heteroscedastic normal errors
#                             coverage cover.lower cover.upper aver.length
# Stand. asympt. lm              0.903       0.954       0.949        2.50
# Sandwich                       0.940       0.966       0.974        2.88
# As. + var. estim. via boot.    0.947       0.972       0.975        2.89
# Boot. basic                    0.939       0.968       0.971        2.84
# Boot. student                  0.943       0.971       0.972        3.03

# # Confidence interval for beta_1, n = 100, homoscedastic normal errors
#                             coverage cover.lower cover.upper aver.length
# Stand. asympt. lm              0.945       0.969       0.976        2.20
# Sandwich                       0.938       0.967       0.971        2.21
# As. + var. estim. via boot.    0.942       0.967       0.975        2.19
# Boot. basic                    0.935       0.965       0.970        2.16
# Boot. student                  0.945       0.969       0.976        2.22



### 
### Example: the Durbin-Watson test 
### 

library(splines)
# con<-url('http://msekce.karlin.mff.cuni.cz/~nagy/NMST434/Data/pribram.csv');
# DATA = read.table(con, sep=";", header=TRUE)
DATA <- read.table("pribram.csv", sep=";", header=TRUE);
head(DATA);
summary(DATA);

# Let us consider the following relatively simple spline parametrization of cox
DATA$cox.sp <- bs(DATA$cox, knots = 0, degree = 2);
summary(mFin <- lm(dehydro ~ Zn+cox.sp, data=DATA))

# # Durbin-Watson test of correlation of errors 
# p-value based on some asymptotic numerical approximations 
library(lmtest)
dwtest(mFin)
# p-value with the help of bootstrap 
library(car)
durbinWatsonTest(mFin)
durbinWatsonTest(mFin, reps=10000)


# # Assesing the significance of DW with the help of bootstrap 
n <- nrow(DATA);
B <- 999;
stat.boot.nonp <- numeric(B);
stat.boot.model <- numeric(B);
u <- residuals(mFin);
f <- fitted(mFin)

for(b in 1:B){
  setTxtProgressBar(pb, b/B)
	iii <- sample(n, replace=T)
	DATA.boot <- DATA[iii,];
	m.boot <- lm(dehydro ~ Zn+cox.sp, data=DATA.boot); 
	stat.boot.nonp[b] <- durbinWatsonTest(m.boot, simulate=FALSE)$dw;
  
  u.boot <- u[iii]; 
  m.boot <- lm(f + u.boot ~ Zn+cox.sp, data=DATA); 
  stat.boot.model[b] <- durbinWatsonTest(m.boot, simulate=FALSE)$dw;  
}

# Estimated p-value 
dw <- durbinWatsonTest(mFin, simulate=FALSE)$dw

# Boot nonparam
2*(1+ min(sum(stat.boot.nonp >= dw), sum(stat.boot.nonp <= dw)))/(1+B)

# Boot model based (probably the one implemented in 'durbinWatsonTest')
2*(1+ min(sum(stat.boot.model >= dw), sum(stat.boot.model <= dw)))/(1+B)

###
### Example 66: Bootstrap in autoregression
###

# dataset lh, see ?lh
plot(lh)

# lh <- arima.sim(n = 1000, list(ar = c(0.5)))

# estimation of the parameters using OLS for AR models
(fit <- ar.ols(lh, aic=FALSE, order.max=1, intercept=FALSE))

# Bootstrap 
n <- length(lh);
B <- 500;
rho.boot <- numeric(B);

# 
resid <- fit$resid; # residuals 

resid.boot <- matrix(sample(resid[-1], replace=T, n*B), n, B);
X.boot <- matrix(NA, n+1, B);
X.boot[1,] <- 0;
rhohat <- fit$ar[1];

# Generating B-time series 
for(i in 1:n){
  X.boot[i+1,] <- rhohat*X.boot[i,] + resid.boot[i,];  
}

for(b in 1:B){
  print(b)
  rho.boot[b] <- ar.ols(X.boot[-1,b], aic=FALSE, order.max=1, intercept=FALSE)$ar[1]
}

alpha <- 0.05;

# # Basic boot CI
2*rhohat - quantile(rho.boot, c(1-alpha/2, alpha/2))
# # Asymptotic CI
rhohat + c(-1,1)*qnorm(1-alpha/2)*fit$asy.se.coef$ar

### 
### Example: Confidence interval for a median 
###

library(MASS);
data(galaxies);

#     A numeric vector of velocities in km/sec of 82 galaxies from 6
#     well-separated conic sections of an unfilled survey of the
#     Corona Borealis region. 

# Find the 95% confidence interval for the median velocity.  

summary(galaxies);

x <- galaxies;
n <- length(x);
hist(x, breaks=10)
rug(x)

# # A.  CI based on asymptotic normality 
# Estimator of f(median), where f is the density function 
(f.estim <- density(galaxies, from=median(x), to=median(x))$y[1])
# Alternatively one can use the estimator of the sparsity function as 
# discussed in quantile regression 

sd.estim <- sqrt(1/(4*f.estim^2));

# Confidence interval 
(CI.as <- median(x) + c(-1,1)*qt(1-alfa/2, df=n-1)*sd.estim/sqrt(n));


# B. Alternatively one can try to use the standard nonparametric bootstrap 
B <- 1000;
Median.boot <- numeric(B); # medians from bootstrapped samples 
Rb <- numeric(B);  # boot median - median
Rb.stud <- numeric(B); # (boot median - median)/sd(median)

Tn <- median(x);

for(i in 1:B){
  xs <- sample(x, size=n, replace=TRUE);
  Median.boot[i] <- median(xs); 
  Rb[i] <- sqrt(n)*(Median.boot[i] - Tn);
  fboot.estim <- density(xs, from=median(x), to=median(x))$y[1]; 
  sd.estim.boot <- sqrt(1/(4*fboot.estim^2));
  Rb.stud[i] <- sqrt(n)*(Median.boot[i] - Tn)/sd.estim.boot;
}

kvant <- quantile(Rb, c(1-alfa/2,alfa/2));
kvant.stud <- quantile(Rb.stud, c(1-alfa/2,alfa/2));

# CI based on bootstrapping sqrt(n)(theta(F_{n}) - theta(F))
(CI.boot <- Tn - kvant/sqrt(n)); 
(CI.boot.stud <- Tn - kvant.stud*sd.estim/sqrt(n)); 
# 
CI.as


par(mfrow=c(1,2));
hist(Median.boot, freq=FALSE); 
rug(jitter(Median.boot)); 
# lines(density(Median.boot));
abline(v=Tn, lty="dotted")
# # 
hist(Rb.stud, freq=FALSE); 
rug(jitter(Rb.stud)); 
# lines(density(Rb.stud));
# 
par(mfrow=c(1,1));

# # C. Smoothed bootstrap 

# hs <- min(bw.nrd(x), bw.ucv(x), bw.bcv(x), bw.SJ(x))*n^(1/5)/n^(1/4);
hs <- sd(x)/n^(1/5);

x <- galaxies;
n <- length(x);
Median.boot.smoothed <- numeric(B); # medians from bootstrapped samples 
Rb <- numeric(B);                   # boot median - median
Rb.stud <- numeric(B);              # (boot median - median)/sd(median)

Tn <- median(x);

for(i in 1:B){
  xs <- sample(x, size=n, replace=TRUE);
  xs.smoothed <- xs + hs*rnorm(n);
  Median.boot.smoothed[i] <- median(xs.smoothed);
  Rb[i] <- sqrt(n)*(Median.boot.smoothed[i] - Tn);
#   studentized 
  fboot.estim <- density(xs, from=median(x), to=median(x))$y[1]; 
  sd.estim.boot <- sqrt(1/(4*fboot.estim^2));
  Rb.stud[i] <- sqrt(n)*(Median.boot.smoothed[i] - Tn)/sd.estim.boot;   
}


kvant.smoothed <- quantile(Rb, c(1-alfa/2,alfa/2));
kvant.smoothed.stud <- quantile(Rb.stud, c(1-alfa/2,alfa/2));

# CI based on bootstrapping sqrt(n)(Tn - median)
(CI.boot.smoothed <- Tn - kvant.smoothed/sqrt(n)); 
(CI.boot.smoothed.stud <- Tn - kvant.smoothed.stud*sd.estim/sqrt(n)); 

res <- rbind(CI.as, CI.boot, CI.boot.smoothed, CI.boot.stud, CI.boot.smoothed.stud)
print(res, digits=4)

# # 

par(mfrow=c(1,2));
hist(Median.boot, freq=FALSE); 
rug(jitter(Median.boot)); 
# lines(density(Median.boot));
abline(v=Tn, lty="dotted")
# 
hist(Median.boot.smoothed, freq=FALSE); 
rug(jitter(Median.boot.smoothed)); 
# lines(density(Median.boot.smoothed));
abline(v=Tn, lty="dotted")
# 
par(mfrow=c(1,1));


### 
### Example: Logit model
### 

# The data 'stillbirth' contains data about deliveries 
# in Quensland (Australia) in years 1987-1992. 
# birth - stillbirth - a fetal death after 20 weeks of gestation 
#         live-birth - the baby was born alive 
# age - age of the fetus/baby (in weeks)
# race - white/aborigine 
# gender - male/female 

# con<-url('http://msekce.karlin.mff.cuni.cz/~nagy/NMST434/Data/stillbirth.csv');
# DATA = read.table(con, sep=";", header=TRUE);
DATA <- read.table("stillbirth.csv", sep=";", header=TRUE);
head(DATA)

# # Preparing data for logistic regression  
DATAa <- subset(DATA, birth=="stillbirth", select=-c(birth));
DATAb <- subset(DATA, birth=="live-birth", select=-c(birth));

DATA2 <- merge(DATAa, DATAb, by=c("race", "age", "gender"))
colnames(DATA2)[4:5] <- c("still", "live");
DATA2

# ni <- DATA2$live + DATA2$still;
# empir.prob <- round(DATA2$still/ni, 3);
# 
# data.frame(DATA2, empir.prob);

# # logit model
fit.logit <- glm(cbind(still,live)~race + age+gender, data=DATA2, family="binomial");

summary(fit.logit)

# #
# Dataset with each row corresponding to one delivery 
DATA.big <- data.frame(rep(DATA[,1], DATA$count), rep(DATA[,2], DATA$count), rep(DATA[,3], DATA$count), rep(DATA[,4], DATA$count))
names(DATA.big) <- names(DATA)[-5];
head(DATA.big)

# The following takes some time  
(fit.logit.big <- glm(birth~race + age+gender, data=DATA.big, family="binomial"));

# # Bootstrap 
p <- length(fit.logit$coefficients); # number 
n <- nrow(DATA2);
B <- 50;

beta.boot <- matrix(NA, nrow=B, ncol=p)
betahat <- fit.logit$coefficients; 

# Indices of the rows 
indexy <- c(rep(1:n, DATA2$still), rep((n+1):(2*n), DATA2$live));
N <- length(indexy);
DATAb <- DATA2;

for(i in 1:B){
  print(i)
  iii <- sample(indexy, size=N, replace=TRUE);  
  Yb <- matrix(colSums(outer(iii, 1:40, "==")), n, 2); 
  DATAb[,4:5] <- Yb;
  fit <- glm(cbind(still,live)~race + age+gender, data=DATAb, family="binomial");
  beta.boot[i,] <- fit$coef;  
}

# # Estimated standard deviation via nonparametric bootstrap
print(apply(beta.boot,2,sd),4)

# Comparison with the estimated asymptotoic standard errors 
sd.compare <- cbind(apply(beta.boot,2,sd), summary(fit.logit)$coef[,2]);
colnames(sd.compare) <- c("boot", "MLE-Wald");
print(sd.compare, digits=4)