# NMST 434: Exercise session 3 
# March 5, 2020

library(mffSM)

###
# Example: Nonparametric MLE
###

library(fdrtool)
n = 30
X = rexp(n)

plot(ecdf(X), lty=3, main="Estimates of the cdf of X")
legend("bottomright",c("ECDF", "Least Concave Majorant","True"),lwd=c(2,2,1),col=1:3,pch=c(16,NA,NA),lty=c(3,1,1))
ll = gcmlcm(sort(c(0,X)),(0:n)/n, type="lcm")
lines(ll$x.knots, ll$y.knots, col=2, lwd=2)
lines(t<-seq(0,max(X)+1,length=101),pexp(t),col=3,lwd=1)

n.knots = length(ll$x.knots)
f.y = (ll$y.knots[-1]-ll$y.knots[-n.knots])/(ll$x.knots[-1]-ll$x.knots[-n.knots])
f.y = c(f.y,tail(f.y,1))
f.x = ll$x.knots
plot(f.x,f.y,type="s",lwd=2,lty=3,xlim=c(0,max(X)),ylim=c(0,max(f.y)), 
     main="Grenander's density estimator",xlab="x",ylab="f(x)")
abline(h=0,col="gray")
rug(X)
legend("topright",c("LCM", "LCM2","True"), lwd=2, lty=c(3,1,1),col=c(1,2,3))
lines(t,dexp(t),col=3,lwd=2)
lines(f.x,f.y,col=2,lwd=2)
hist(X,prob=TRUE,add=TRUE,col=adjustcolor("orange",alpha.f=.25),border="orange")

### 
# Example 18: MLE in a mixture of normal distributions
###

# mixture N(0,1)*1/2 + N(mu,sigma^2)*1/2

n = 101             # change to 11 and see what happens
C = rbinom(n,1,1/2)
mu = 5
sigma = 1
X = rep(NA,n)
X[C==1] = rnorm(sum(C==1))
X[C==0] = rnorm(sum(C==0),mean=mu,sd=sigma)
hist(X,prob=TRUE,ylim=c(0,max(dnorm(0)/2,dnorm(mu,mean=mu,sd=sigma)/2)))
f = function(x,mu,sigma) return(1/2*dnorm(x)+1/2*dnorm(x,mean=mu,sd=sigma))
lines(t<-seq(min(X),max(X),length=1001), f(t,mu,sigma),lwd=2)
l = function(par) -sum(log(f(X,par[1],par[2])))  # minus log-likelihood (minus because of optim that minimizes)

# numerical MLE
(MLE = optim(c(X[1],sigma),l))

# evaluate the log-likelihood of mu and sigma in a grid of points in R^2
eval = 101
smax = max(sigma,MLE$par[2],2)
# smax = .001       # uncomment to see the grid near sigma = 0
s = seq(0,smax,length=eval)
m = seq(min(X),max(X),length=eval)
L = matrix(nrow=eval,ncol=eval)
for(i in 1:eval) for(j in 1:eval) L[i,j] = -l(c(m[i],s[j]))

# plot the log-likelihood
contour(m,s,L, xlab="mu", ylab="sigma", main="Log-likelihood of mu and sigma")
if(n<=11) abline(v=X,lty=2)
points(mu,sigma,col=2,pch=16)
points(MLE$par[1],MLE$par[2],col=3,pch=16)
legend("bottomleft",c("True parameter","Numertically estimated MLE"),col=c(2,3),pch=16)

L1 = L2 = rep(NA,eval)
L3 = apply(L,2,max)    # profile log-likelihood
for(i in 1:eval) {L1[i] = -l(c(X[1],s[i]));L2[i] = -l(c(mu,s[i]))}
plot(c(s,s,s),c(L1,L2,L3),type="n",xlab="sigma",ylab="log-likelihood")
legend("bottomright",c("at mu=X[1]", "at true mu", "profile log-lik."),col=c(1,2,3),lwd=2)
lines(s,L1,lwd=2)
lines(s,L2,col=2,lwd=2)
lines(s,L3,col=3,lwd=2)
abline(v=sigma,lty=2)
abline(v=MLE$par[2],col=3,lwd=2)  # numerical maximizer of log-likelihood

### 
# Example 17: Inconsistent MLE
###

# Data is i.i.d., distributed as the mixture N(0,1)*1/2 + N(theta,exp(-2/theta^2))*1/2,
# where theta is positive.

# # CREDIT TO: 
# https://radfordneal.wordpress.com/2008/08/09/inconsistent-maximum-likelihood-estimation-an-ordinary-example/ 

# GENERATE DATA FROM THE MODEL WITH A GIVEN PARAMETER VALUE.  Arguments are
# the parameter value, t, and the number of data points to generate, m.

gen <- function (t,n)
{
  m <- rep(0,n)
  s <- rep(1,n)
  
  w <- runif(n) < 1/2
  m[w] <- t
  s[w] <- exp(-1/t^2)

  rnorm(n,m,s)
}

# COMPUTE LOG DENSITIES OF DATA VALUES FOR A GIVEN PARAMETER VALUE.  Arguments
# are a vector of data values, x, and the parameter value, t.  

density <- function (x,t)
{
  ll1 <- 0.5*dnorm(x,0,1)
  ll2 <- 0.5*dnorm(x,t,exp(-1/t^2))
  ll1 + ll2 
}


# Special care is taken to compute the log of extreme density values without 
# overflow.  Data points exactly equal to the parameter value are treated
# specially, since for them the log of the exponential part of the density 
# vanishes, producing results that don't overflow even when the standard
# deviation, exp(-1/t^2), overflows.

log.density <- function (x,t)
{
  ll1 <- dnorm(x,0,1,log=TRUE) + log(0.5)

  ll2 <- dnorm(x,t,exp(-1/t^2),log=TRUE) + log(0.5)
  ll2[x==t] <- -0.5*log(2*pi) + 1/t^2 + log(0.5)

  ll <- rep(NA,length(x))
  for (i in 1:length(x))
  { ll[i] <- add.logs(ll1[i],ll2[i])
  }

  ll
}


# COMPUTE THE LOG LIKELIHOOD GIVEN A DATA VECTOR AND PARAMETER VALUE.  Arguments
# are the vector of data values, x, and the parameter value, t.

log.lik <- function (x,t)
{
  sum(log.density(x,t))
}


# PLOT THE DENSITY FUNCTION FOR A GIVEN PARAMETER VALUE.  Arguments are the
# parameter value, t, and the grid of data values at which to evaluate the
# density.  The grid defaults to 200 points from -3 to 5 (or the parameter
# value, if greater), plus the value of the parameter, where there may be a
# narrow peak in the density that would be missed by the grid.  The vertical
# scale is scaled by a power of ten close to the maximum density, since the
# density values might otherwise overflow.

plot.density <- function (t, grid=sort(c(t,seq(-3,max(5,t),length=200))))
{
  ld <- log.density(grid,t)
  max.ld <- round(max(ld)/log(10))
  ld <- ld - max.ld*log(10)

  plot(grid,exp(ld),type="l",
    xlab="data value",
    ylab=paste("density (x 10^",max.ld,")",sep=""))

  title(paste("Density with parameter",t))
}


# PLOT THE LIKELIHOOD FUNCTION FOR GIVEN DATA, AND FIND MLE.  The argument 
# is a vector of data values, x.  Additional arguments may be given, and are
# passed on the the plot funcition.  The likelihood is plotted for a grid of
# parameter values from 0.01 to 3, in steps of 0.01, plus all positive
# data values less than 3.  An approximation to the MLE is found by taking 
# the maximum over values on this grid.  Data values in (-3,3) are shown at 
# the bottom of the plot. The vertical scale is scaled by a power of ten close 
# to the maximum likelihood, since likelihood values might otherwise overflow.

plot.lik <- function (x,...)
{
  grid <- sort(c(x[x>0 & x<3],seq(0.01,3,by=0.01)))

  ll <- rep(NA,length(grid))
  for (i in 1:length(grid))
  { ll[i] <- log.lik(x,grid[i])
  }

  mlv <- max(ll)
  mle <- grid[ll==mlv][1]
#   max.ll <- round(mlv/log(10))
#   max.ll <- round(log(mlv)/log(10));

#   ll <- ll - max.ll*log(10)
  lik <- exp(ll);
#   lik <- ll;

#   plot(c(-3,3),c(0,max(lik)),type="n",
#   plot(c(-3,3),c(-100,mlv),type="n",
#   plot(c(-3,3),range(ll),type="n",
  plot(c(0,3),range(ll),type="n",
#     xlab="data / parameter",
    xlab="data",
#     ylab=paste("Log-likelihood/10^",max.ll,")",sep=""),
    ylab=paste("Log-likelihood",sep=""),
    ...)
#   points(x,rep(min(ll),length(x)),lty=19)
  lines(grid,ll)

  title(paste("Log-Likelihood -",length(x),"data points,",
              "MLE approximately",round(mle,4)))
	
	lines(rep(2,2), range(ll), col="blue", lty="dotted")
}


# ADD NUMBERS REPRESENTED BY LOGS.  Computes log(exp(a)+exp(b)) in a way
# that avoids overflow/underflow problems.

add.logs <- function (a,b)
{ if (a>b)
  { a + log(1+exp(b-a))
  }
  else
  { b + log(1+exp(a-b))
  }
}


# PRODUCE DEMO PLOTS.  Leaves the generated data in the global variable x for
# possible later examination.

demo <- function()
{ 
  par(mfrow=c(2,3))

  plot.density(2.5)
  plot.density(0.6)
  plot.density(0.2)

#   set.seed(1)
  fce <- function(x) min(x[x>0])
  
  
  x <<- gen(2,100)  # true parameter theta = 2
  plot.lik(x[1:10]);
  m1 = fce(x[1:10])
  m2 = fce(x[1:30])
  m3 = fce(x)
  abline(v=m1,lwd=2,lty=2,col=2)
  plot.lik(x[1:30])
  abline(v=m2,lwd=2,lty=2,col=2)
  plot.lik(x)
  abline(v=m3,lwd=2,lty=2,col=2)
	
	par(mfrow=c(1,1))
}

demo()

# # n = 1000
plot.lik(gen(2,10))

### 
# Example 21: Breusch-Pagan test                                                                    ###
###

# Boston data from Linear Regression

Data <- read.csv("http://www.karlin.mff.cuni.cz/~maciak/NMSA407/Boston2.csv", header = TRUE, sep = ",", na.strings = "")
Data <- transform(Data, fcrime = factor(crime, levels = 1:3, labels = c("None", "Some", "High")))
Data <- subset(Data, select = c("medv", "nox", "fcrime", "ldis", "ptratio", "indus", "age", "black"))
m1 <- lm(medv ~ nox*(fcrime + ldis + ptratio + indus + age + black) + I(nox^2), data = Data)
summary(m1)
plotLM(m1)

### Breusch-Pagan test - assuming normality of errors 

library("lmtest")
library("car")

ncvTest(m1); # ?ncvTest
ncvTest(m1,~fitted(m1))

bptest(m1, ~fitted(m1), studentize = FALSE);

### ncvTest : var.formula: "a one-sided formula for the error variance; if omitted, the error variance depends on the fitted values."
### bptest : var.formula:  "by default the same explanatory variables are taken as in the main regression model."

### 'Manual calculation' 
U <- residuals(m1);
U2 <- U^2
Yhat <- fitted(m1);
n <- length(U)

### Test statistic 
(BP <- sum(U2 * (Yhat - mean(Yhat)))^2 / (2 * (mean(U2))^2 * sum((Yhat-mean(Yhat))^2)))

### Test statistic as regression sum of squares
m = lm(U2~Yhat)
crossprod(fitted(m)-mean(fitted(m)))/(2 * mean(U2)^2)

### Test statistic as another regression sum of squares
m2 = lm(U2/mean(U2)~Yhat)
crossprod(fitted(m2)-mean(fitted(m2)))/2

### p-value 
1-pchisq(BP, df=1)
bptest(m1, ~fitted(m1), studentize = FALSE)$p.value

### Up to some minor (and asymptotically negligible) differences 
### this test can be also performed by considering the overall test 
### of the following 'linear model'
(A = anova(lm(U2~Yhat)))
A[1,2] # regression sum of squares in model U2~Yhat
crossprod(fitted(m)-mean(fitted(m))) # regression sum of squares in model m above


### Koenker's studentized variant of Breusch-Pagan test - normality of errors not assumed 
bptest(m1, ~fitted(m1))

### Manual calculation 
(BP.Koenker <- (sqrt(n)*cor(U2, Yhat))^2)
1-pchisq(BP.Koenker, df=1)
bptest(m1, ~fitted(m1))$p.value

### One can also try to consider all the covariates 
bptest(m1, studentize=FALSE);

### Manual calculation (Zvara, p. 127)
Z <- model.matrix(m1)[,-1];
sigma2 <- mean(U^2); 
Suz <- colMeans(U^2/sigma2*(Z - matrix(colMeans(Z), nrow(Z), ncol(Z), byrow=T)));
n <- nrow(Z);

### Test statistic 
(stat<-n*t(Suz)%*%solve(2*var(Z)*(n-1)/n)%*%Suz)

### Manual calculation (Breusch and Pagan, p. 1289)
Z2 <- model.matrix(m1)
1/2*colSums(Z2*(U^2/sigma2-1))%*%solve(t(Z2)%*%Z2)%*%colSums(Z2*(U^2/sigma2-1))

pchisq(stat,df=ncol(Z),lower=FALSE)
bptest(m1, studentize=FALSE)$p.val

### Studentized version 
bptest(m1);

### Manual calculation 
kurt.estim <- mean((U^2-sigma2)^2)/sigma2^2
n*t(Suz)%*%solve(kurt.estim*var(Z)*(n-1)/n)%*%Suz

### Again, up to some minor (and asymptotically negligible) differences 
### this test can be also performed by considering the overall test 
### of the following 'linear model
anova(lm(U^2~model.matrix(m1)))