# NMST 434: Exercise session 3 # March 5, 2020 library(mffSM) ### # Example: Nonparametric MLE ### library(fdrtool) n = 30 X = rexp(n) plot(ecdf(X), lty=3, main="Estimates of the cdf of X") legend("bottomright",c("ECDF", "Least Concave Majorant","True"),lwd=c(2,2,1),col=1:3,pch=c(16,NA,NA),lty=c(3,1,1)) ll = gcmlcm(sort(c(0,X)),(0:n)/n, type="lcm") lines(ll$x.knots, ll$y.knots, col=2, lwd=2) lines(t<-seq(0,max(X)+1,length=101),pexp(t),col=3,lwd=1) n.knots = length(ll$x.knots) f.y = (ll$y.knots[-1]-ll$y.knots[-n.knots])/(ll$x.knots[-1]-ll$x.knots[-n.knots]) f.y = c(f.y,tail(f.y,1)) f.x = ll$x.knots plot(f.x,f.y,type="s",lwd=2,lty=3,xlim=c(0,max(X)),ylim=c(0,max(f.y)), main="Grenander's density estimator",xlab="x",ylab="f(x)") abline(h=0,col="gray") rug(X) legend("topright",c("LCM", "LCM2","True"), lwd=2, lty=c(3,1,1),col=c(1,2,3)) lines(t,dexp(t),col=3,lwd=2) lines(f.x,f.y,col=2,lwd=2) hist(X,prob=TRUE,add=TRUE,col=adjustcolor("orange",alpha.f=.25),border="orange") ### # Example 18: MLE in a mixture of normal distributions ### # mixture N(0,1)*1/2 + N(mu,sigma^2)*1/2 n = 101 # change to 11 and see what happens C = rbinom(n,1,1/2) mu = 5 sigma = 1 X = rep(NA,n) X[C==1] = rnorm(sum(C==1)) X[C==0] = rnorm(sum(C==0),mean=mu,sd=sigma) hist(X,prob=TRUE,ylim=c(0,max(dnorm(0)/2,dnorm(mu,mean=mu,sd=sigma)/2))) f = function(x,mu,sigma) return(1/2*dnorm(x)+1/2*dnorm(x,mean=mu,sd=sigma)) lines(t<-seq(min(X),max(X),length=1001), f(t,mu,sigma),lwd=2) l = function(par) -sum(log(f(X,par[1],par[2]))) # minus log-likelihood (minus because of optim that minimizes) # numerical MLE (MLE = optim(c(X[1],sigma),l)) # evaluate the log-likelihood of mu and sigma in a grid of points in R^2 eval = 101 smax = max(sigma,MLE$par[2],2) # smax = .001 # uncomment to see the grid near sigma = 0 s = seq(0,smax,length=eval) m = seq(min(X),max(X),length=eval) L = matrix(nrow=eval,ncol=eval) for(i in 1:eval) for(j in 1:eval) L[i,j] = -l(c(m[i],s[j])) # plot the log-likelihood contour(m,s,L, xlab="mu", ylab="sigma", main="Log-likelihood of mu and sigma") if(n<=11) abline(v=X,lty=2) points(mu,sigma,col=2,pch=16) points(MLE$par[1],MLE$par[2],col=3,pch=16) legend("bottomleft",c("True parameter","Numertically estimated MLE"),col=c(2,3),pch=16) L1 = L2 = rep(NA,eval) L3 = apply(L,2,max) # profile log-likelihood for(i in 1:eval) {L1[i] = -l(c(X[1],s[i]));L2[i] = -l(c(mu,s[i]))} plot(c(s,s,s),c(L1,L2,L3),type="n",xlab="sigma",ylab="log-likelihood") legend("bottomright",c("at mu=X[1]", "at true mu", "profile log-lik."),col=c(1,2,3),lwd=2) lines(s,L1,lwd=2) lines(s,L2,col=2,lwd=2) lines(s,L3,col=3,lwd=2) abline(v=sigma,lty=2) abline(v=MLE$par[2],col=3,lwd=2) # numerical maximizer of log-likelihood ### # Example 17: Inconsistent MLE ### # Data is i.i.d., distributed as the mixture N(0,1)*1/2 + N(theta,exp(-2/theta^2))*1/2, # where theta is positive. # # CREDIT TO: # https://radfordneal.wordpress.com/2008/08/09/inconsistent-maximum-likelihood-estimation-an-ordinary-example/ # GENERATE DATA FROM THE MODEL WITH A GIVEN PARAMETER VALUE. Arguments are # the parameter value, t, and the number of data points to generate, m. gen <- function (t,n) { m <- rep(0,n) s <- rep(1,n) w <- runif(n) < 1/2 m[w] <- t s[w] <- exp(-1/t^2) rnorm(n,m,s) } # COMPUTE LOG DENSITIES OF DATA VALUES FOR A GIVEN PARAMETER VALUE. Arguments # are a vector of data values, x, and the parameter value, t. density <- function (x,t) { ll1 <- 0.5*dnorm(x,0,1) ll2 <- 0.5*dnorm(x,t,exp(-1/t^2)) ll1 + ll2 } # Special care is taken to compute the log of extreme density values without # overflow. Data points exactly equal to the parameter value are treated # specially, since for them the log of the exponential part of the density # vanishes, producing results that don't overflow even when the standard # deviation, exp(-1/t^2), overflows. log.density <- function (x,t) { ll1 <- dnorm(x,0,1,log=TRUE) + log(0.5) ll2 <- dnorm(x,t,exp(-1/t^2),log=TRUE) + log(0.5) ll2[x==t] <- -0.5*log(2*pi) + 1/t^2 + log(0.5) ll <- rep(NA,length(x)) for (i in 1:length(x)) { ll[i] <- add.logs(ll1[i],ll2[i]) } ll } # COMPUTE THE LOG LIKELIHOOD GIVEN A DATA VECTOR AND PARAMETER VALUE. Arguments # are the vector of data values, x, and the parameter value, t. log.lik <- function (x,t) { sum(log.density(x,t)) } # PLOT THE DENSITY FUNCTION FOR A GIVEN PARAMETER VALUE. Arguments are the # parameter value, t, and the grid of data values at which to evaluate the # density. The grid defaults to 200 points from -3 to 5 (or the parameter # value, if greater), plus the value of the parameter, where there may be a # narrow peak in the density that would be missed by the grid. The vertical # scale is scaled by a power of ten close to the maximum density, since the # density values might otherwise overflow. plot.density <- function (t, grid=sort(c(t,seq(-3,max(5,t),length=200)))) { ld <- log.density(grid,t) max.ld <- round(max(ld)/log(10)) ld <- ld - max.ld*log(10) plot(grid,exp(ld),type="l", xlab="data value", ylab=paste("density (x 10^",max.ld,")",sep="")) title(paste("Density with parameter",t)) } # PLOT THE LIKELIHOOD FUNCTION FOR GIVEN DATA, AND FIND MLE. The argument # is a vector of data values, x. Additional arguments may be given, and are # passed on the the plot funcition. The likelihood is plotted for a grid of # parameter values from 0.01 to 3, in steps of 0.01, plus all positive # data values less than 3. An approximation to the MLE is found by taking # the maximum over values on this grid. Data values in (-3,3) are shown at # the bottom of the plot. The vertical scale is scaled by a power of ten close # to the maximum likelihood, since likelihood values might otherwise overflow. plot.lik <- function (x,...) { grid <- sort(c(x[x>0 & x<3],seq(0.01,3,by=0.01))) ll <- rep(NA,length(grid)) for (i in 1:length(grid)) { ll[i] <- log.lik(x,grid[i]) } mlv <- max(ll) mle <- grid[ll==mlv][1] # max.ll <- round(mlv/log(10)) # max.ll <- round(log(mlv)/log(10)); # ll <- ll - max.ll*log(10) lik <- exp(ll); # lik <- ll; # plot(c(-3,3),c(0,max(lik)),type="n", # plot(c(-3,3),c(-100,mlv),type="n", # plot(c(-3,3),range(ll),type="n", plot(c(0,3),range(ll),type="n", # xlab="data / parameter", xlab="data", # ylab=paste("Log-likelihood/10^",max.ll,")",sep=""), ylab=paste("Log-likelihood",sep=""), ...) # points(x,rep(min(ll),length(x)),lty=19) lines(grid,ll) title(paste("Log-Likelihood -",length(x),"data points,", "MLE approximately",round(mle,4))) lines(rep(2,2), range(ll), col="blue", lty="dotted") } # ADD NUMBERS REPRESENTED BY LOGS. Computes log(exp(a)+exp(b)) in a way # that avoids overflow/underflow problems. add.logs <- function (a,b) { if (a>b) { a + log(1+exp(b-a)) } else { b + log(1+exp(a-b)) } } # PRODUCE DEMO PLOTS. Leaves the generated data in the global variable x for # possible later examination. demo <- function() { par(mfrow=c(2,3)) plot.density(2.5) plot.density(0.6) plot.density(0.2) # set.seed(1) fce <- function(x) min(x[x>0]) x <<- gen(2,100) # true parameter theta = 2 plot.lik(x[1:10]); m1 = fce(x[1:10]) m2 = fce(x[1:30]) m3 = fce(x) abline(v=m1,lwd=2,lty=2,col=2) plot.lik(x[1:30]) abline(v=m2,lwd=2,lty=2,col=2) plot.lik(x) abline(v=m3,lwd=2,lty=2,col=2) par(mfrow=c(1,1)) } demo() # # n = 1000 plot.lik(gen(2,10)) ### # Example 21: Breusch-Pagan test ### ### # Boston data from Linear Regression Data <- read.csv("http://www.karlin.mff.cuni.cz/~maciak/NMSA407/Boston2.csv", header = TRUE, sep = ",", na.strings = "") Data <- transform(Data, fcrime = factor(crime, levels = 1:3, labels = c("None", "Some", "High"))) Data <- subset(Data, select = c("medv", "nox", "fcrime", "ldis", "ptratio", "indus", "age", "black")) m1 <- lm(medv ~ nox*(fcrime + ldis + ptratio + indus + age + black) + I(nox^2), data = Data) summary(m1) plotLM(m1) ### Breusch-Pagan test - assuming normality of errors library("lmtest") library("car") ncvTest(m1); # ?ncvTest ncvTest(m1,~fitted(m1)) bptest(m1, ~fitted(m1), studentize = FALSE); ### ncvTest : var.formula: "a one-sided formula for the error variance; if omitted, the error variance depends on the fitted values." ### bptest : var.formula: "by default the same explanatory variables are taken as in the main regression model." ### 'Manual calculation' U <- residuals(m1); U2 <- U^2 Yhat <- fitted(m1); n <- length(U) ### Test statistic (BP <- sum(U2 * (Yhat - mean(Yhat)))^2 / (2 * (mean(U2))^2 * sum((Yhat-mean(Yhat))^2))) ### Test statistic as regression sum of squares m = lm(U2~Yhat) crossprod(fitted(m)-mean(fitted(m)))/(2 * mean(U2)^2) ### Test statistic as another regression sum of squares m2 = lm(U2/mean(U2)~Yhat) crossprod(fitted(m2)-mean(fitted(m2)))/2 ### p-value 1-pchisq(BP, df=1) bptest(m1, ~fitted(m1), studentize = FALSE)$p.value ### Up to some minor (and asymptotically negligible) differences ### this test can be also performed by considering the overall test ### of the following 'linear model' (A = anova(lm(U2~Yhat))) A[1,2] # regression sum of squares in model U2~Yhat crossprod(fitted(m)-mean(fitted(m))) # regression sum of squares in model m above ### Koenker's studentized variant of Breusch-Pagan test - normality of errors not assumed bptest(m1, ~fitted(m1)) ### Manual calculation (BP.Koenker <- (sqrt(n)*cor(U2, Yhat))^2) 1-pchisq(BP.Koenker, df=1) bptest(m1, ~fitted(m1))$p.value ### One can also try to consider all the covariates bptest(m1, studentize=FALSE); ### Manual calculation (Zvara, p. 127) Z <- model.matrix(m1)[,-1]; sigma2 <- mean(U^2); Suz <- colMeans(U^2/sigma2*(Z - matrix(colMeans(Z), nrow(Z), ncol(Z), byrow=T))); n <- nrow(Z); ### Test statistic (stat<-n*t(Suz)%*%solve(2*var(Z)*(n-1)/n)%*%Suz) ### Manual calculation (Breusch and Pagan, p. 1289) Z2 <- model.matrix(m1) 1/2*colSums(Z2*(U^2/sigma2-1))%*%solve(t(Z2)%*%Z2)%*%colSums(Z2*(U^2/sigma2-1)) pchisq(stat,df=ncol(Z),lower=FALSE) bptest(m1, studentize=FALSE)$p.val ### Studentized version bptest(m1); ### Manual calculation kurt.estim <- mean((U^2-sigma2)^2)/sigma2^2 n*t(Suz)%*%solve(kurt.estim*var(Z)*(n-1)/n)%*%Suz ### Again, up to some minor (and asymptotically negligible) differences ### this test can be also performed by considering the overall test ### of the following 'linear model anova(lm(U^2~model.matrix(m1)))