##### 
#####  NMST440: Advanced Aspects of the R Environment
#####
#####  ---------------------------------------------------------------------
#####
#####  Work out results of simulations performed on a cluster (or the local PC)
#####
#####  ---------------------------------------------------------------------
#####
#####  Arnošt Komárek
#####  https://www2.karlin.mff.cuni.cz/~komarek
#####  komarek@karlin.mff.cuni.cz
#####
##### ======================================================================

rm(list = ls())
setwd("/home/komarek/teach/mff_2021/nmst440_AdvRko/Tutorial11")

RESDIR <- "./SimulIndTest_2021/RData/"          ### results of the local calculations (2021), M = 1000, total CPU time = about 4 min   
#RESDIR <- "./SimulIndTest_2017/RData/"           ### results of the cluster calculations (2017), M = 10000, total CPU time = about 13 days

(FILES <- dir(RESDIR))       ### Result files that are available


### Scenarios that were simulated
### --------------------------------

  ### Local (2021)
N <- c(10, 50, 100)
A <- c(0.1, 0.5, 1)
RHO <- c(0, 0.25, 0.50, 0.75, 0.95)
Mmax <- 1000

  ### Cluster (2017)
N <- c(10, 50, 100, 250)
A <- c(0.1, 0.5, 1)
RHO <- c(0, 0.25, 0.50, 0.75, 0.95)
Mmax <- 10000


### For each a and each rho, we create a matrix
### with columns corresponding to sample sizes.
### Matrices will be stored in a list Tsamp.
### ---------------------------------------------
Tsamp <- list()
for (a in A){
  for (rho in RHO){
    LNAME <- paste(a*10, "_", rho*100, sep = "")   
    Tsamp[[LNAME]] <- matrix(NA, nrow = Mmax, ncol = length(N))
    colnames(Tsamp[[LNAME]]) <- paste(N)

    for (n in N){
      FNAME <- paste("I_", a*10, "_", rho*100, "_", n, ".RData", sep = "")
      if (!(FNAME %in% FILES)){
        cat("*** ", FNAME, " not (yet) available.\n", sep = "")
        next
      }
      load(paste(RESDIR, FNAME, sep = ""))
      Tsamp[[LNAME]][, paste(n)] <- T      
    }      
  }  
}


### Check first few rows of each matrix
### ----------------------------------------------
  ### a = 0.1, rho = 0, 0.25, 0.50, 0.75, 0.95
Tsamp[["1_0"]][1:10,]
Tsamp[["1_25"]][1:10,]
Tsamp[["1_50"]][1:10,]
Tsamp[["1_75"]][1:10,]
Tsamp[["1_95"]][1:10,]

  ### a = 0.5, rho = 0, 0.25, 0.50, 0.75, 0.95
Tsamp[["5_0"]][1:10,]
Tsamp[["5_25"]][1:10,]
Tsamp[["5_50"]][1:10,]
Tsamp[["5_75"]][1:10,]
Tsamp[["5_95"]][1:10,]

  ### a = 1.0, rho = 0, 0.25, 0.50, 0.75, 0.95
Tsamp[["10_0"]][1:10,]
Tsamp[["10_25"]][1:10,]
Tsamp[["10_50"]][1:10,]
Tsamp[["10_75"]][1:10,]
Tsamp[["10_95"]][1:10,]


### Distribution of the test statistic for given a and sample size n when rho changes
### ----------------------------------------------------------------------------------------
library("colorspace")
COL <- rainbow_hcl(5)   ## colors for different rho's
names(COL) <- paste(RHO)

n <- 100

layout(matrix(c(4,4,1,1, 2,2,3,3), nrow = 2, byrow = TRUE))
for (a in A){

  ### Vector of all T's (to get its range for plot)
  Tall <- numeric()
  for (rho in RHO){
    Tall <- c(Tall, Tsamp[[paste(a*10, "_", rho*100, sep = "")]][, paste(n)])
  }  

  ### Empirical cdf's
  plot(ecdf(Tsamp[[paste(a*10, "_", 0, sep = "")]][, paste(n)]), col = COL[1], xlim = range(Tall), xlab = "T", main = paste("a = ", a, sep = ""))
  for (rho in RHO[-1]){
    plot(ecdf(Tsamp[[paste(a*10, "_", rho*100, sep = "")]][, paste(n)]), col = COL[paste(rho)], add = TRUE)
  }
}

### Legend
plot(c(0, 100), c(0, 100), type = "n", xlab = "", ylab = "", xaxt = "n", yaxt = "n", bty = "n")
legend(10, 70, legend = RHO, lty = 1, lwd = 2, col = COL)
text(10, 80, labels = expression(rho), pos = 4, font = 2)
par(mfrow = c(1, 1))


### Empirical critical values (for a test on 5% significance level)
### -----------------------------------------------------------------
C <- matrix(NA, nrow = length(A), ncol = length(N))
rownames(C) <- paste(A)
colnames(C) <- paste(N)
print(C)

for (a in A){
  C[paste(a),] <- apply(Tsamp[[paste(a*10, "_0", sep = "")]], 2, quantile, 0.95, na.rm = TRUE)
}  

print(C)


### Empirical power (for a test on 5% significance level)
### ------------------------------------------------------
Power <- matrix(NA, nrow = length(A) * length(RHO), ncol = length(N))
rownames(Power) <- paste(rep(RHO, each = length(A)), " - ", rep(A, length(RHO)), sep = "")
colnames(Power) <- paste(N)
print(Power)

for (rho in RHO){
  for (a in A){
    Ms <- apply(!is.na(Tsamp[[paste(a*10, "_", rho*100, sep = "")]]), 2, sum, na.rm = TRUE)
    Cs <- matrix(rep(C[paste(a),], each = nrow(Tsamp[[paste(a*10, "_", rho*100, sep = "")]])), nrow = nrow(Tsamp[[paste(a*10, "_", rho*100, sep = "")]]))
    Power[paste(rho, " - ", a, sep = ""),] <- apply(Tsamp[[paste(a*10, "_", rho*100, sep = "")]] >= Cs, 2, sum, na.rm = TRUE) / Ms
  }
}  

print(Power)

  ### Which value of a (among those tried) seems to be optimal?
  ### Surprised by 0.05 in all columns of the first three rows (rho = 0)?