# NMST 434: Exercise session 12 # May 7 and 14, 2020 ## Resampling procedures II # 0. Ex. 70: Two sample Kolmogorov-Smirnov test # 1. Ex. 71: Permutation test of independence # 2. Heteroscedastic normal linear model # 3. Durbin-Watson's test # 4. Ex. 66: Bootstrap in AR processes # 5. Confidence interval for median # 6. Bootstrap in logit regression rm(list=ls()); pb = txtProgressBar(style=3); # progress bar ### ### Example 70: Two-sample permutation Kolmogorov-Smirnov test ### K = 100 pRes = rep(NA,K);pKS = rep(NA,K) k = 1 # uncomment the loop for a simulation study # for(k in 1:K){ n = 10 m = 40 X = rnorm(n) Y = rnorm(m) (ksT=ks.test(X,Y)) T = ksT$stat # asymptotic Kolmogorov-Smirnov's test # permutation approach B = 9999 Z = c(X,Y) T. = rep(NA,B) for(b in 1:B){ setTxtProgressBar(pb,b/B) Z. = sample(Z) # random permutation of the pooled sample T.[b] = ks.test(Z.[1:n],Z.[-(1:n)])$stat } hist(T.,B/100,prob=TRUE) abline(v=T,lwd=2,col=2) pRes[k] = (1+sum(T.>T))/(B+1) # resampling p-value pKS[k] = ksT$p.value # p-value from the ks.test # } # sum(pRes<=.05);sum(pKS<=0.05) pB = (1+cumsum(T.>T))/(1:B+1) plot(1:B,pB,type="l",main="Resampled p-value as a function of B",ylab="p-value", xlab="Permutations") abline(h=ksT$p.value,col=2,lwd=2) legend("topright",c("K-S p-value", "Resampled p-value"),col=c(2,1),lwd=c(2,1)) ### ### Example 71: Permutation test of independence ### A <- c(25, 35, 24, 19, 21, 24); B <- c(9, 18, 6, 10, 17, 24); C <- c(9, 6, 10, 12, 17, 14); # Three players are rolling a standard die. The results are # found in the table 'TAB'. Based on these results # can we say that the result of rolling is dependent on the person? TAB <- rbind(A, B, C); colnames(TAB) <- 1:6; TAB plot(as.table(TAB)) # Asymptotic chi-squared test chisq.test(TAB) # # Permutation approach player <- rep(c("A", "B", "C"), rowSums(TAB)); dice <- rep(rep(1:6,3), t(TAB)); # # Check # table(player, dice) - TAB B <- 9999; stat.perm <- numeric(B) for(b in 1:B){ setTxtProgressBar(pb,b/B) dice.perm <- sample(dice); stat.perm[b] <- chisq.test(table(player, dice.perm))$stat } (1+sum(stat.perm >= chisq.test(TAB)$stat))/(B+1) # The above permutation approach is (hopefully) the same as when you run in R chisq.test(TAB, simulate.p.value=T) chisq.test(TAB, simulate.p.value=T, B=100000) # B is the number of resamples # ?chisq.test ### ### Example: Heteroscedastic normal linear models ### library(sandwich) n <- 100; x0 <- rep(1, n); # regressor for the intercept x1 <- 2*runif(n); # regressor for the slope X <- cbind(x0, x1); # the regression matrix X B <- 1000; beta <- c(0,1); # true values of the parameters # # Heteroscedastic normal errors sigma <- function(x1) exp(x1) eps <- rnorm(n)*sigma(x1); Y <- X%*%beta + eps; # Summary summary(fit.lm <- lm(Y~x1)); # Plot plot(x1, Y, cex=0.6,pch=16) abline(fit.lm$coef, col="blue", lwd=2) # # Bootstrap beta0.boot.NB <- numeric(B); beta1.boot.NB <- numeric(B); beta0.stud.NB <- numeric(B); beta1.stud.NB <- numeric(B); beta0.boot.MB <- numeric(B); beta1.boot.MB <- numeric(B); alfa <- 0.05; # Estimates from the data beta0 <- fit.lm$coef[1]; beta1 <- fit.lm$coef[2]; res <- residuals(fit.lm)/sqrt(1-hatvalues(fit.lm)); # standardized residuals Yhat <- fitted(fit.lm); indexy <- 1:n; for(b in 1:B){ setTxtProgressBar(pb,b/B) # # Nonparametric bootstrap iii <- sample(indexy, size=n, replace=TRUE); x1s <- x1[iii]; Ys <- Y[iii]; fit <- lm(Ys~x1s); beta0.boot.NB[b] <- fit$coef[1]; beta1.boot.NB[b] <- fit$coef[2]; (sd.sandwich <- sqrt(diag(vcovHC(fit, type = "HC3")))); # # Studentized nonparametric bootstrap beta0.stud.NB[b] <- (fit$coef[1] - beta0)/sd.sandwich[1]; beta1.stud.NB[b] <- (fit$coef[2] - beta1)/sd.sandwich[2]; # # Model based bootstrap resb <- sample(res, size=n, replace=TRUE); Ys <- Yhat + resb; fit <- lm(Ys~x1); beta0.boot.MB[b] <- fit$coef[1]; beta1.boot.MB[b] <- fit$coef[2]; } kvant0.NB <- quantile(beta0.boot.NB-beta0, c(1-alfa/2,alfa/2)); kvant1.NB <- quantile(beta1.boot.NB-beta1, c(1-alfa/2,alfa/2)); kvant0.NB.stud <- quantile(beta0.stud.NB, c(1-alfa/2,alfa/2)); kvant1.NB.stud <- quantile(beta1.stud.NB, c(1-alfa/2,alfa/2)); kvant0.MB <- quantile(beta0.boot.MB-beta0, c(1-alfa/2,alfa/2)); kvant1.MB <- quantile(beta1.boot.MB-beta1, c(1-alfa/2,alfa/2)); # Comparison of bootstrap distributions par(mfrow=c(1,2)) hist(beta1.boot.MB, main="beta1 - model based bootstrap",prob=TRUE) abline(v=beta[2],col=2,lwd=2,lty=2) abline(v=beta1,col=3,lwd=2,lty=2) # hist(beta1.boot.NB, main="beta1 - nonparametric bootstrap",prob=TRUE); abline(v=beta[2],col=2,lwd=2,lty=2) abline(v=beta1,col=3,lwd=2,lty=2) legend("topleft",c("True","Estimate"),col=c(2,3),lwd=2,lty=2) par(mfrow=c(1,1)) # # Standard deviation of estimators via bootstrap SE.model.boot <- c(sd(beta0.boot.MB), sd(beta1.boot.MB)); # Model bootstrap SE.nonp.boot <- c(sd(beta0.boot.NB), sd(beta1.boot.NB)); # Nonparametric bootstrap # # Estimated asymptotic standard deviations SE.lm <- sqrt(diag(vcov(fit.lm))) SE.sandwich <- sqrt(diag(vcovHC(fit.lm, type = "HC3"))) res <- rbind(SE.lm, SE.model.boot, SE.nonp.boot, SE.sandwich); colnames(res) <- c("beta0", "beta1"); print(res, digits = 3) # # # # Basic confidence intervals based on nonparametric bootstrap CI.boot.basic <- beta1 - kvant1.NB # # Confidence interval based on model based bootstrap CI.boot.model <- beta1 - kvant1.MB # # Studentized bootstrap confidence intervals (based on nonparametric bootstrap) CI.boot.stud <- beta1 - kvant1.NB.stud * SE.sandwich[2] # # Confidence intervals based on normal approximation + sandwich CI.sandwich <- (beta1 + c(-1,1)*qt(1-alfa/2, df=n-1)*sd.sandwich[2]) # # Confidence intervals based on normal approximation and bootstrap variance estimation CI.var.boot <- beta1 + c(-1,1)*qt(1-alfa/2, df=n-1)*sd(beta1.boot.NB) # Standard asymptotic confidence interval ignoring heteroscedasticity CI.lm <- confint(fit.lm)[2,] print(rbind(CI.lm, CI.sandwich, CI.var.boot, CI.boot.basic, CI.boot.stud, CI.boot.model), digits=3) # # # # # Simulation study # # Confidence interval for beta_1, n = 100, heteroscedastic normal errors # coverage cover.lower cover.upper aver.length # Stand. asympt. lm 0.903 0.954 0.949 2.50 # Sandwich 0.940 0.966 0.974 2.88 # As. + var. estim. via boot. 0.947 0.972 0.975 2.89 # Boot. basic 0.939 0.968 0.971 2.84 # Boot. student 0.943 0.971 0.972 3.03 # # Confidence interval for beta_1, n = 100, homoscedastic normal errors # coverage cover.lower cover.upper aver.length # Stand. asympt. lm 0.945 0.969 0.976 2.20 # Sandwich 0.938 0.967 0.971 2.21 # As. + var. estim. via boot. 0.942 0.967 0.975 2.19 # Boot. basic 0.935 0.965 0.970 2.16 # Boot. student 0.945 0.969 0.976 2.22 ### ### Example: the Durbin-Watson test ### library(splines) # con<-url('http://msekce.karlin.mff.cuni.cz/~nagy/NMST434/Data/pribram.csv'); # DATA = read.table(con, sep=";", header=TRUE) DATA <- read.table("pribram.csv", sep=";", header=TRUE); head(DATA); summary(DATA); # Let us consider the following relatively simple spline parametrization of cox DATA$cox.sp <- bs(DATA$cox, knots = 0, degree = 2); summary(mFin <- lm(dehydro ~ Zn+cox.sp, data=DATA)) # # Durbin-Watson test of correlation of errors # p-value based on some asymptotic numerical approximations library(lmtest) dwtest(mFin) # p-value with the help of bootstrap library(car) durbinWatsonTest(mFin) durbinWatsonTest(mFin, reps=10000) # # Assesing the significance of DW with the help of bootstrap n <- nrow(DATA); B <- 999; stat.boot.nonp <- numeric(B); stat.boot.model <- numeric(B); u <- residuals(mFin); f <- fitted(mFin) for(b in 1:B){ setTxtProgressBar(pb, b/B) iii <- sample(n, replace=T) DATA.boot <- DATA[iii,]; m.boot <- lm(dehydro ~ Zn+cox.sp, data=DATA.boot); stat.boot.nonp[b] <- durbinWatsonTest(m.boot, simulate=FALSE)$dw; u.boot <- u[iii]; m.boot <- lm(f + u.boot ~ Zn+cox.sp, data=DATA); stat.boot.model[b] <- durbinWatsonTest(m.boot, simulate=FALSE)$dw; } # Estimated p-value dw <- durbinWatsonTest(mFin, simulate=FALSE)$dw # Boot nonparam 2*(1+ min(sum(stat.boot.nonp >= dw), sum(stat.boot.nonp <= dw)))/(1+B) # Boot model based (probably the one implemented in 'durbinWatsonTest') 2*(1+ min(sum(stat.boot.model >= dw), sum(stat.boot.model <= dw)))/(1+B) ### ### Example 66: Bootstrap in autoregression ### # dataset lh, see ?lh plot(lh) # lh <- arima.sim(n = 1000, list(ar = c(0.5))) # estimation of the parameters using OLS for AR models (fit <- ar.ols(lh, aic=FALSE, order.max=1, intercept=FALSE)) # Bootstrap n <- length(lh); B <- 500; rho.boot <- numeric(B); # resid <- fit$resid; # residuals resid.boot <- matrix(sample(resid[-1], replace=T, n*B), n, B); X.boot <- matrix(NA, n+1, B); X.boot[1,] <- 0; rhohat <- fit$ar[1]; # Generating B-time series for(i in 1:n){ X.boot[i+1,] <- rhohat*X.boot[i,] + resid.boot[i,]; } for(b in 1:B){ print(b) rho.boot[b] <- ar.ols(X.boot[-1,b], aic=FALSE, order.max=1, intercept=FALSE)$ar[1] } alpha <- 0.05; # # Basic boot CI 2*rhohat - quantile(rho.boot, c(1-alpha/2, alpha/2)) # # Asymptotic CI rhohat + c(-1,1)*qnorm(1-alpha/2)*fit$asy.se.coef$ar ### ### Example: Confidence interval for a median ### library(MASS); data(galaxies); # A numeric vector of velocities in km/sec of 82 galaxies from 6 # well-separated conic sections of an unfilled survey of the # Corona Borealis region. # Find the 95% confidence interval for the median velocity. summary(galaxies); x <- galaxies; n <- length(x); hist(x, breaks=10) rug(x) # # A. CI based on asymptotic normality # Estimator of f(median), where f is the density function (f.estim <- density(galaxies, from=median(x), to=median(x))$y[1]) # Alternatively one can use the estimator of the sparsity function as # discussed in quantile regression sd.estim <- sqrt(1/(4*f.estim^2)); # Confidence interval (CI.as <- median(x) + c(-1,1)*qt(1-alfa/2, df=n-1)*sd.estim/sqrt(n)); # B. Alternatively one can try to use the standard nonparametric bootstrap B <- 1000; Median.boot <- numeric(B); # medians from bootstrapped samples Rb <- numeric(B); # boot median - median Rb.stud <- numeric(B); # (boot median - median)/sd(median) Tn <- median(x); for(i in 1:B){ xs <- sample(x, size=n, replace=TRUE); Median.boot[i] <- median(xs); Rb[i] <- sqrt(n)*(Median.boot[i] - Tn); fboot.estim <- density(xs, from=median(x), to=median(x))$y[1]; sd.estim.boot <- sqrt(1/(4*fboot.estim^2)); Rb.stud[i] <- sqrt(n)*(Median.boot[i] - Tn)/sd.estim.boot; } kvant <- quantile(Rb, c(1-alfa/2,alfa/2)); kvant.stud <- quantile(Rb.stud, c(1-alfa/2,alfa/2)); # CI based on bootstrapping sqrt(n)(theta(F_{n}) - theta(F)) (CI.boot <- Tn - kvant/sqrt(n)); (CI.boot.stud <- Tn - kvant.stud*sd.estim/sqrt(n)); # CI.as par(mfrow=c(1,2)); hist(Median.boot, freq=FALSE); rug(jitter(Median.boot)); # lines(density(Median.boot)); abline(v=Tn, lty="dotted") # # hist(Rb.stud, freq=FALSE); rug(jitter(Rb.stud)); # lines(density(Rb.stud)); # par(mfrow=c(1,1)); # # C. Smoothed bootstrap # hs <- min(bw.nrd(x), bw.ucv(x), bw.bcv(x), bw.SJ(x))*n^(1/5)/n^(1/4); hs <- sd(x)/n^(1/5); x <- galaxies; n <- length(x); Median.boot.smoothed <- numeric(B); # medians from bootstrapped samples Rb <- numeric(B); # boot median - median Rb.stud <- numeric(B); # (boot median - median)/sd(median) Tn <- median(x); for(i in 1:B){ xs <- sample(x, size=n, replace=TRUE); xs.smoothed <- xs + hs*rnorm(n); Median.boot.smoothed[i] <- median(xs.smoothed); Rb[i] <- sqrt(n)*(Median.boot.smoothed[i] - Tn); # studentized fboot.estim <- density(xs, from=median(x), to=median(x))$y[1]; sd.estim.boot <- sqrt(1/(4*fboot.estim^2)); Rb.stud[i] <- sqrt(n)*(Median.boot.smoothed[i] - Tn)/sd.estim.boot; } kvant.smoothed <- quantile(Rb, c(1-alfa/2,alfa/2)); kvant.smoothed.stud <- quantile(Rb.stud, c(1-alfa/2,alfa/2)); # CI based on bootstrapping sqrt(n)(Tn - median) (CI.boot.smoothed <- Tn - kvant.smoothed/sqrt(n)); (CI.boot.smoothed.stud <- Tn - kvant.smoothed.stud*sd.estim/sqrt(n)); res <- rbind(CI.as, CI.boot, CI.boot.smoothed, CI.boot.stud, CI.boot.smoothed.stud) print(res, digits=4) # # par(mfrow=c(1,2)); hist(Median.boot, freq=FALSE); rug(jitter(Median.boot)); # lines(density(Median.boot)); abline(v=Tn, lty="dotted") # hist(Median.boot.smoothed, freq=FALSE); rug(jitter(Median.boot.smoothed)); # lines(density(Median.boot.smoothed)); abline(v=Tn, lty="dotted") # par(mfrow=c(1,1)); ### ### Example: Logit model ### # The data 'stillbirth' contains data about deliveries # in Quensland (Australia) in years 1987-1992. # birth - stillbirth - a fetal death after 20 weeks of gestation # live-birth - the baby was born alive # age - age of the fetus/baby (in weeks) # race - white/aborigine # gender - male/female # con<-url('http://msekce.karlin.mff.cuni.cz/~nagy/NMST434/Data/stillbirth.csv'); # DATA = read.table(con, sep=";", header=TRUE); DATA <- read.table("stillbirth.csv", sep=";", header=TRUE); head(DATA) # # Preparing data for logistic regression DATAa <- subset(DATA, birth=="stillbirth", select=-c(birth)); DATAb <- subset(DATA, birth=="live-birth", select=-c(birth)); DATA2 <- merge(DATAa, DATAb, by=c("race", "age", "gender")) colnames(DATA2)[4:5] <- c("still", "live"); DATA2 # ni <- DATA2$live + DATA2$still; # empir.prob <- round(DATA2$still/ni, 3); # # data.frame(DATA2, empir.prob); # # logit model fit.logit <- glm(cbind(still,live)~race + age+gender, data=DATA2, family="binomial"); summary(fit.logit) # # # Dataset with each row corresponding to one delivery DATA.big <- data.frame(rep(DATA[,1], DATA$count), rep(DATA[,2], DATA$count), rep(DATA[,3], DATA$count), rep(DATA[,4], DATA$count)) names(DATA.big) <- names(DATA)[-5]; head(DATA.big) # The following takes some time (fit.logit.big <- glm(birth~race + age+gender, data=DATA.big, family="binomial")); # # Bootstrap p <- length(fit.logit$coefficients); # number n <- nrow(DATA2); B <- 50; beta.boot <- matrix(NA, nrow=B, ncol=p) betahat <- fit.logit$coefficients; # Indices of the rows indexy <- c(rep(1:n, DATA2$still), rep((n+1):(2*n), DATA2$live)); N <- length(indexy); DATAb <- DATA2; for(i in 1:B){ print(i) iii <- sample(indexy, size=N, replace=TRUE); Yb <- matrix(colSums(outer(iii, 1:40, "==")), n, 2); DATAb[,4:5] <- Yb; fit <- glm(cbind(still,live)~race + age+gender, data=DATAb, family="binomial"); beta.boot[i,] <- fit$coef; } # # Estimated standard deviation via nonparametric bootstrap print(apply(beta.boot,2,sd),4) # Comparison with the estimated asymptotoic standard errors sd.compare <- cbind(apply(beta.boot,2,sd), summary(fit.logit)$coef[,2]); colnames(sd.compare) <- c("boot", "MLE-Wald"); print(sd.compare, digits=4)