#####
#####  NMST440: Computational Environment for Statistical Data Analysis
#####
#####  ---------------------------------------------------------------------
#####
#####  Working with big(ger) data,
#####  various sorts of apply
#####
#####  ---------------------------------------------------------------------
#####
#####  Arnošt Komárek
#####  http://msekce.karlin.mff.cuni.cz/~komarek
#####  komarek@karlin.mff.cuni.cz
#####
##### ======================================================================

### To run the following code run, do the following:
###  1) set the ROOT variable below to a path to some existing directory
###     on your computer;
###  2) create 'Data' subdirectory of the ROOT directory
###

rm(list = ls())
ROOT <- "/home/komarek/teach/mff_2020/nmst440_VypocProstr/Tutorial06/"
setwd(ROOT)


##### Read (slightly) bigger data
##### =====================================

  ### Create random (bigger) dataset
  ### (will be stored in a subdirectory 'Data' of the ROOT directory)
set.seed(951553282)
data <- rnorm(3e6)     
#data <- rnorm(3e5)     ### students with poorer computers

sink("./Data/bigdata.dat")
cat("## Random big data\n")
cat("x y z\n")
cat(data, sep = c(" ", " ", "\n"))
sink()
  ### --- end of code which just creates a random bigger dataset ---
  ### --- file 'bigdata.dat' should now exist in the 'Data'      ---
  ### --- subdirectory of ROOT                                   ---

  ### NOTE: The first row in bigdata.dat contains just a comment,
  ###       the second row contains variable names (x, y, z),
  ###       data values on a row are separated by space.

  ### Read data classically (using read.table)
?system.time
?proc.time

    ### using system.time around some COMMAND just runs the COMMAND
    ### and informs us about needed time to process the COMMAND
system.time(d1 <- read.table("./Data/bigdata.dat", skip = 1, header = TRUE))

?read.table

  ### Problem with 'read.table' and related functions ('read.csv' etc.) is that it works as
  ### follows:
  ###  1) file is read into memory as one (perhaps big) character variable (string)
  ###  2) using provided values of header, sep, quote, dec etc. (big) character
  ###     is parsed and splitted into various pieces of information to create
  ###     a final data.frame
  ###
  ### Read also Memory usage section in the above help to 'read.table'
  ###

  ### Bigger/big files can be read into R memory using, e.g., 'scan' function.

  ### Read using 'scan' function
  ### - by 'what' argument, we must say what type of input is expected
?scan
system.time(d2 <- scan("./Data/bigdata.dat", skip = 2, what = list(numeric(0), numeric(0), numeric(0))))

  ### d1 (read by data.frame) is a classical data.frame
class(d1)
head(d1)

  ### d2 is not a data.frame...
class(d2)
length(d2)     
c(length(d2[[1]]), length(d2[[2]]), length(d2[[3]]))

    ### each column from bigdata.dat --> one element of the list
d2[[1]][1:10]
d2[[2]][1:10]
d2[[3]][1:10]
#
d3 <- as.data.frame(d2)
head(d3)                   ### almost what we want, only column names are not really appealing...

colnames(d3) <- c("x", "y", "z")
head(d3)

  ### Read data directly including the column names
cnames <- scan("./Data/bigdata.dat", skip = 1, nline = 1, what = character(0))
print(cnames)

d2 <- scan("./Data/bigdata.dat", skip = 2, what = list(numeric(0), numeric(0), numeric(0)))
names(d2) <- cnames
d2[["x"]][1:10]
d2[["y"]][1:10]
d2[["z"]][1:10]

d3 <- as.data.frame(d2)
head(d3)

  ### Some list (to be used for illustrations below)
ld3 <- list(x = d2[[1]][1:100], y = d2[[2]][1:200], z = d2[[3]])

  ### Delete the ASCII file bigdata.dat
  ### (will not be needed anymore)
file.remove("./Data/bigdata.dat")


##### Vectorized calculation
##### =======================================

  ### apply: Rectangle structures
  ### ------------------------------------
rowMean <- apply(d3, 1, mean)                    ## apply some FUNCTION (mean) by rows (second argument = 1)
length(rowMean)
print(rowMean[1:20])
hist(rowMean, col = "darkolivegreen2")     ## by the way: illustration of CLT (or properties of normal distribution here)

colMean <- apply(d3, 2, mean, na.rm = TRUE)     ## apply some FUNCTION (mean) by columns (second argument = 2),
                  ## if used FUNCTION needs/wants additional arguments,
                  ## here na.rm = TRUE, those are just given to apply
                  ## as its 4th, 5th, ... argument
 
print(colMean)

  ### Compare time needed to calculate
  ### (only the first 10000 row means will be calculated)
howMany <- 1e4
system.time(rowMean <- apply(d3[1:howMany,], 1, mean))
system.time({
  rowMean <- numeric(howMany)
  for (i in 1:howMany) rowMean[i] <- mean(as.numeric(d3[i,]))
}) 


  ### sapply/lapply: apply something on elements of a list
  ### (or columns of a data.frame)
  ###
  ### NOTE: data.frame is internally a list whose elements
  ###       are columns of data.frame
  ###    
  ### --------------------------------------------------------

   ### Five ways on ho to access the first 10 values of the (first)
   ### column 'x' of data.frame d3
d3[1:10, "x"]
d3[1:10, 1]
d3[[1]][1:10]
d3[["x"]][1:10]
d3$x[1:10]

   ### sapply on data.frame
   ### - some FUNCTION (here mean) is applied to each element of list d3,
   ### - if FUNCTION needs/wants additional arguments, those are given
   ###   as the 2nd, 3rd, ... argument to sapply
   ### - if it's possible, sapply tries to return vector or matrix
sapply(d3, mean)       
sapply(d3, mean, na.rm = TRUE)
sapply(d3, mean, na.rm = TRUE, trim = 0.2)

    ### sapply on list
sapply(ld3, mean)

   ### lapply
   ### - pretty the same as 'sapply', nevertheless, it always returns a 'list'
lapply(d3, mean)
lapply(ld3, mean)

    ### sapply, FUN returns more than one value
sapply(d3, quantile, prob = c(0.25, 0.50, 0.75))
sapply(d3, quantile, prob = c(0.25, 0.50, 0.75), na.rm = TRUE)

sapply(ld3, quantile, prob = c(0.25, 0.50, 0.75))

    ### lapply, FUN returns more than one value
lapply(d3, quantile, prob = c(0.25, 0.50, 0.75))
lapply(ld3, quantile, prob = c(0.25, 0.50, 0.75))

    ### some other...
sapply(d3, length)
sapply(ld3, length)

apply(sapply(d3, is.na), 2, sum)   ## number of NA's in each column of the data.frame d3
sapply(lapply(d3, is.na), sum)     ## number of NA's in each column of the data.frame d3
sapply(sapply(ld3, is.na), sum)    ## number of NA's in each element of the list ld3

  ### sapply/lapply with operators

      ### for illustrations, three linear models are fitted
fit <- list()
fit[["y-x"]] <- lm(y ~ x, data = d3)
fit[["z-x"]] <- lm(z ~ x, data = d3)
fit[["z-y"]] <- lm(z ~ y, data = d3)

      ### "by hand" summary on each fitted model
summary(fit[["y-x"]])
summary(fit[["z-x"]])
summary(fit[["z-y"]])

      ### all at once
sapply(fit, summary)    ### not really appealing...
lapply(fit, summary)    ### much better :-)

sfit <- lapply(fit, summary)
sfit[["y-x"]]
sfit[["z-x"]]
sfit[["z-y"]]

sfit[["y-x"]]$r.squared
sfit[["y-x"]][["r.squared"]]
sfit[["z-x"]][["r.squared"]]
sfit[["z-y"]][["r.squared"]]

      ### also some operator (here 'list indexing' operator [[]]) can be used
      ### with sapply/lapply
      ### Here: on each element of a list, operator [[]] is used with its argument "r.squared"
sapply(sfit, "[[", "r.squared")
lapply(sfit, "[[", "r.squared")

sapply(fit, coef)
lapply(fit, coef)


  ### tapply: Categorized calculation
  ### --------------------------------------------------------

    ### It is now assumed that supplied data 'Kojeni.RData' are in
    ### 'Data' subdirectory of your working directory
getwd()        ## current working directory

print(load("./Data/Kojeni.RData"))        
summary(Kojeni)
dim(Kojeni)

    ### FUNCTION (here 'mean' is applied to values of some vector (here Kojeni$mage)
    ###     divided into subgroups determined by some factor (here Kojeni$feduc)
     ###
tapply(Kojeni$mage, Kojeni$feduc, mean)
with(Kojeni, tapply(mage, feduc, mean))      ## the same

tapply(Kojeni$mage, Kojeni$feduc, mean, na.rm = TRUE)   ### again, additional arguments to FUNCTION can be given within 'tapply'
with(Kojeni, tapply(mage, feduc, mean, na.rm = TRUE))

with(Kojeni, tapply(mage, feduc:fbplace, mean))    ## it possible to cross-classify by two factors
                                                   ## (two-way classification)
with(Kojeni, tapply(mage, feduc:fbplace:factor(n.child), mean))   ## or by three factors
                                                   ## NA = group was empty

with(Kojeni, by(mage, feduc:fbplace, mean))
with(Kojeni, by(mage, feduc:fbplace, mean, na.rm = TRUE))


  ### tapply: Basic tool to aggregate data
  ###         (quite common task especially in econometrics applications)
  ### --------------------------------------------------------------------

    ### Again some random dataset
set.seed(20010911)
n <- 15 + 365 + 25
daySeries <- data.frame(yield  = rnorm(n),
                        year   = rep(c(2013, 2014, 2015), c(15, 365, 25)),
                        month  = rep(c(12, 1:12, 1), c(15, 31, 28, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31, 25)),
                        day    = c(17:31, 1:31, 1:28, 1:31, 1:30, 1:31, 1:30, 1:31, 1:31, 1:30, 1:31, 1:30, 1:31, 1:25))
head(daySeries)

     ### Add date variable
daySeries <- transform(daySeries, date = as.Date(paste(year, month, day, sep = "-")))
head(daySeries)
summary(daySeries)
plot(yield ~ date, data = daySeries, type = "l", col = "darkblue", xlab = "Date", ylab = "Yield")

     ### Add time variable (1, 2, ...)
daySeries <- transform(daySeries, t = 1:nrow(daySeries))
plot(yield ~ t, data = daySeries, type = "l", col = "darkblue", xlab = "Time", ylab = "Yield")

     ### Year and Month factors
daySeries <- transform(daySeries, Year = factor(year), Month = factor(month))
print(daySeries)

summary(daySeries)

     ### Some frequency tables
with(daySeries, table(Year:Month))
with(daySeries, table(Year, Month))

      ### Monthly yield means
with(daySeries, tapply(yield, Year:Month, mean))

monthYield <- with(daySeries, tapply(yield, Year:Month, mean))
print(monthYield)

        ### only months we have some data
monthYield <- monthYield[!is.na(monthYield)]
print(monthYield)

names(monthYield)
substr(names(monthYield), 1, 4)
substr(names(monthYield), 6, nchar(names(monthYield)))

  ### Do you want to see only particular months?
charVec <- c("2013:12", "2014:1", "2014:2", "2014:10", "2014:11", "2014:12")
grep("12", charVec)     ## Decembers
grep("1", charVec)      ## January's?
grep(":1", charVec)     ## January's?
grep(":1$", charVec)    ## January's, yes!

(ind1 <- grep(":1$", names(monthYield)))
monthYield[ind1]

  ### month series in a data.frame
substr(names(monthYield), 1, 4)
substr(names(monthYield), 6, nchar(names(monthYield)))
monthRada <- data.frame(yield = monthYield,
                        year  = as.numeric(substr(names(monthYield), 1, 4)),
                        month = as.numeric(substr(names(monthYield), 6, nchar(names(monthYield)))))
print(monthRada)
rownames(monthRada) <- 1:nrow(monthRada)
print(monthRada)

monthRada <- transform(monthRada, Year = factor(year), Month = factor(month))
print(monthRada)
summary(monthRada)



##### Parallel calculation (intro)
##### =======================================
###
### Most (practically all) recent computers have multicore processors
### and often, each core is composed of several (often two) threads.
### That is, in fact, each (recent) computer is equal to several
### computers (in past). The "only" restriction is that all cores/threads
### share just one amount of the RAM memory.
###
### Anyway, many calculations can (easily) be parallelized by using
### more than one thread for calculations
###
library("parallel")    ### basic R package to calculate in parallel

  ### How many cores (in fact threads) we have?
(nCore <- detectCores())

  ### Ask for parallel calculation on 4 threads
  ### (if 4 available)
myCluster <- makeCluster(4)

sapply(d3, mean, na.rm = TRUE)                  ### not parallel
parSapply(myCluster, d3, mean, na.rm = TRUE)    ### parallel calculation

system.time(colMeans <- parSapply(myCluster, d3, mean, na.rm = TRUE))
print(colMeans)
system.time(colMeans <- sapply(d3, mean, na.rm = TRUE))
print(colMeans)
  ### here still too high price for core management, so (not parallel) sapply
  ### is faster than the parallel parSapply

stopCluster(myCluster)    ### stop the cluster

  ### --------------------------------------------------
  ### some additional illustrations

rm(list = ls())

myCluster <- makeCluster(4)

nbig <- 1e8
set.seed(20170316)
d3big <- data.frame(x = rnorm(nbig), y = rnorm(nbig), z = rnorm(nbig))
system.time(parSapply(myCluster, d3big, mean, na.rm = TRUE))
system.time(sapply(d3big, mean, na.rm = TRUE))

system.time(colMeans <- parLapply(myCluster, d3big, mean, na.rm = TRUE))
print(colMeans)
system.time(colMeans <- lapply(d3big, mean, na.rm = TRUE))
print(colMeans)

system.time(colMeans <- parApply(myCluster, d3big, 2, mean))
##   user  system elapsed 
##  4.096   0.606   5.843 
print(colMeans)

system.time(colMeans <- apply(d3big, 2, mean))
##   user  system elapsed 
##  2.402   0.323   2.725 
print(colMeans)

stopCluster(myCluster)

###
### In illustrations above, practically no advantage of parallel calculation was seen.
### Remember principles and wait till Tutorial xx (Computationally efficient simulations,
### cluster computation) to see really significant improvement when parallel calculation
### used as compared to non-parallel versions.
###