##### ##### NMST440: Computational Environment for Statistical Data Analysis ##### ##### --------------------------------------------------------------------- ##### ##### Working with big(ger) data, ##### various sorts of apply ##### ##### --------------------------------------------------------------------- ##### ##### Arnošt Komárek ##### http://msekce.karlin.mff.cuni.cz/~komarek ##### komarek@karlin.mff.cuni.cz ##### ##### ====================================================================== ### To run the following code run, do the following: ### 1) set the ROOT variable below to a path to some existing directory ### on your computer; ### 2) create 'Data' subdirectory of the ROOT directory ### rm(list = ls()) ROOT <- "/home/komarek/teach/mff_2020/nmst440_VypocProstr/Tutorial06/" setwd(ROOT) ##### Read (slightly) bigger data ##### ===================================== ### Create random (bigger) dataset ### (will be stored in a subdirectory 'Data' of the ROOT directory) set.seed(951553282) data <- rnorm(3e6) #data <- rnorm(3e5) ### students with poorer computers sink("./Data/bigdata.dat") cat("## Random big data\n") cat("x y z\n") cat(data, sep = c(" ", " ", "\n")) sink() ### --- end of code which just creates a random bigger dataset --- ### --- file 'bigdata.dat' should now exist in the 'Data' --- ### --- subdirectory of ROOT --- ### NOTE: The first row in bigdata.dat contains just a comment, ### the second row contains variable names (x, y, z), ### data values on a row are separated by space. ### Read data classically (using read.table) ?system.time ?proc.time ### using system.time around some COMMAND just runs the COMMAND ### and informs us about needed time to process the COMMAND system.time(d1 <- read.table("./Data/bigdata.dat", skip = 1, header = TRUE)) ?read.table ### Problem with 'read.table' and related functions ('read.csv' etc.) is that it works as ### follows: ### 1) file is read into memory as one (perhaps big) character variable (string) ### 2) using provided values of header, sep, quote, dec etc. (big) character ### is parsed and splitted into various pieces of information to create ### a final data.frame ### ### Read also Memory usage section in the above help to 'read.table' ### ### Bigger/big files can be read into R memory using, e.g., 'scan' function. ### Read using 'scan' function ### - by 'what' argument, we must say what type of input is expected ?scan system.time(d2 <- scan("./Data/bigdata.dat", skip = 2, what = list(numeric(0), numeric(0), numeric(0)))) ### d1 (read by data.frame) is a classical data.frame class(d1) head(d1) ### d2 is not a data.frame... class(d2) length(d2) c(length(d2[[1]]), length(d2[[2]]), length(d2[[3]])) ### each column from bigdata.dat --> one element of the list d2[[1]][1:10] d2[[2]][1:10] d2[[3]][1:10] # d3 <- as.data.frame(d2) head(d3) ### almost what we want, only column names are not really appealing... colnames(d3) <- c("x", "y", "z") head(d3) ### Read data directly including the column names cnames <- scan("./Data/bigdata.dat", skip = 1, nline = 1, what = character(0)) print(cnames) d2 <- scan("./Data/bigdata.dat", skip = 2, what = list(numeric(0), numeric(0), numeric(0))) names(d2) <- cnames d2[["x"]][1:10] d2[["y"]][1:10] d2[["z"]][1:10] d3 <- as.data.frame(d2) head(d3) ### Some list (to be used for illustrations below) ld3 <- list(x = d2[[1]][1:100], y = d2[[2]][1:200], z = d2[[3]]) ### Delete the ASCII file bigdata.dat ### (will not be needed anymore) file.remove("./Data/bigdata.dat") ##### Vectorized calculation ##### ======================================= ### apply: Rectangle structures ### ------------------------------------ rowMean <- apply(d3, 1, mean) ## apply some FUNCTION (mean) by rows (second argument = 1) length(rowMean) print(rowMean[1:20]) hist(rowMean, col = "darkolivegreen2") ## by the way: illustration of CLT (or properties of normal distribution here) colMean <- apply(d3, 2, mean, na.rm = TRUE) ## apply some FUNCTION (mean) by columns (second argument = 2), ## if used FUNCTION needs/wants additional arguments, ## here na.rm = TRUE, those are just given to apply ## as its 4th, 5th, ... argument print(colMean) ### Compare time needed to calculate ### (only the first 10000 row means will be calculated) howMany <- 1e4 system.time(rowMean <- apply(d3[1:howMany,], 1, mean)) system.time({ rowMean <- numeric(howMany) for (i in 1:howMany) rowMean[i] <- mean(as.numeric(d3[i,])) }) ### sapply/lapply: apply something on elements of a list ### (or columns of a data.frame) ### ### NOTE: data.frame is internally a list whose elements ### are columns of data.frame ### ### -------------------------------------------------------- ### Five ways on ho to access the first 10 values of the (first) ### column 'x' of data.frame d3 d3[1:10, "x"] d3[1:10, 1] d3[[1]][1:10] d3[["x"]][1:10] d3$x[1:10] ### sapply on data.frame ### - some FUNCTION (here mean) is applied to each element of list d3, ### - if FUNCTION needs/wants additional arguments, those are given ### as the 2nd, 3rd, ... argument to sapply ### - if it's possible, sapply tries to return vector or matrix sapply(d3, mean) sapply(d3, mean, na.rm = TRUE) sapply(d3, mean, na.rm = TRUE, trim = 0.2) ### sapply on list sapply(ld3, mean) ### lapply ### - pretty the same as 'sapply', nevertheless, it always returns a 'list' lapply(d3, mean) lapply(ld3, mean) ### sapply, FUN returns more than one value sapply(d3, quantile, prob = c(0.25, 0.50, 0.75)) sapply(d3, quantile, prob = c(0.25, 0.50, 0.75), na.rm = TRUE) sapply(ld3, quantile, prob = c(0.25, 0.50, 0.75)) ### lapply, FUN returns more than one value lapply(d3, quantile, prob = c(0.25, 0.50, 0.75)) lapply(ld3, quantile, prob = c(0.25, 0.50, 0.75)) ### some other... sapply(d3, length) sapply(ld3, length) apply(sapply(d3, is.na), 2, sum) ## number of NA's in each column of the data.frame d3 sapply(lapply(d3, is.na), sum) ## number of NA's in each column of the data.frame d3 sapply(sapply(ld3, is.na), sum) ## number of NA's in each element of the list ld3 ### sapply/lapply with operators ### for illustrations, three linear models are fitted fit <- list() fit[["y-x"]] <- lm(y ~ x, data = d3) fit[["z-x"]] <- lm(z ~ x, data = d3) fit[["z-y"]] <- lm(z ~ y, data = d3) ### "by hand" summary on each fitted model summary(fit[["y-x"]]) summary(fit[["z-x"]]) summary(fit[["z-y"]]) ### all at once sapply(fit, summary) ### not really appealing... lapply(fit, summary) ### much better :-) sfit <- lapply(fit, summary) sfit[["y-x"]] sfit[["z-x"]] sfit[["z-y"]] sfit[["y-x"]]$r.squared sfit[["y-x"]][["r.squared"]] sfit[["z-x"]][["r.squared"]] sfit[["z-y"]][["r.squared"]] ### also some operator (here 'list indexing' operator [[]]) can be used ### with sapply/lapply ### Here: on each element of a list, operator [[]] is used with its argument "r.squared" sapply(sfit, "[[", "r.squared") lapply(sfit, "[[", "r.squared") sapply(fit, coef) lapply(fit, coef) ### tapply: Categorized calculation ### -------------------------------------------------------- ### It is now assumed that supplied data 'Kojeni.RData' are in ### 'Data' subdirectory of your working directory getwd() ## current working directory print(load("./Data/Kojeni.RData")) summary(Kojeni) dim(Kojeni) ### FUNCTION (here 'mean' is applied to values of some vector (here Kojeni$mage) ### divided into subgroups determined by some factor (here Kojeni$feduc) ### tapply(Kojeni$mage, Kojeni$feduc, mean) with(Kojeni, tapply(mage, feduc, mean)) ## the same tapply(Kojeni$mage, Kojeni$feduc, mean, na.rm = TRUE) ### again, additional arguments to FUNCTION can be given within 'tapply' with(Kojeni, tapply(mage, feduc, mean, na.rm = TRUE)) with(Kojeni, tapply(mage, feduc:fbplace, mean)) ## it possible to cross-classify by two factors ## (two-way classification) with(Kojeni, tapply(mage, feduc:fbplace:factor(n.child), mean)) ## or by three factors ## NA = group was empty with(Kojeni, by(mage, feduc:fbplace, mean)) with(Kojeni, by(mage, feduc:fbplace, mean, na.rm = TRUE)) ### tapply: Basic tool to aggregate data ### (quite common task especially in econometrics applications) ### -------------------------------------------------------------------- ### Again some random dataset set.seed(20010911) n <- 15 + 365 + 25 daySeries <- data.frame(yield = rnorm(n), year = rep(c(2013, 2014, 2015), c(15, 365, 25)), month = rep(c(12, 1:12, 1), c(15, 31, 28, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31, 25)), day = c(17:31, 1:31, 1:28, 1:31, 1:30, 1:31, 1:30, 1:31, 1:31, 1:30, 1:31, 1:30, 1:31, 1:25)) head(daySeries) ### Add date variable daySeries <- transform(daySeries, date = as.Date(paste(year, month, day, sep = "-"))) head(daySeries) summary(daySeries) plot(yield ~ date, data = daySeries, type = "l", col = "darkblue", xlab = "Date", ylab = "Yield") ### Add time variable (1, 2, ...) daySeries <- transform(daySeries, t = 1:nrow(daySeries)) plot(yield ~ t, data = daySeries, type = "l", col = "darkblue", xlab = "Time", ylab = "Yield") ### Year and Month factors daySeries <- transform(daySeries, Year = factor(year), Month = factor(month)) print(daySeries) summary(daySeries) ### Some frequency tables with(daySeries, table(Year:Month)) with(daySeries, table(Year, Month)) ### Monthly yield means with(daySeries, tapply(yield, Year:Month, mean)) monthYield <- with(daySeries, tapply(yield, Year:Month, mean)) print(monthYield) ### only months we have some data monthYield <- monthYield[!is.na(monthYield)] print(monthYield) names(monthYield) substr(names(monthYield), 1, 4) substr(names(monthYield), 6, nchar(names(monthYield))) ### Do you want to see only particular months? charVec <- c("2013:12", "2014:1", "2014:2", "2014:10", "2014:11", "2014:12") grep("12", charVec) ## Decembers grep("1", charVec) ## January's? grep(":1", charVec) ## January's? grep(":1$", charVec) ## January's, yes! (ind1 <- grep(":1$", names(monthYield))) monthYield[ind1] ### month series in a data.frame substr(names(monthYield), 1, 4) substr(names(monthYield), 6, nchar(names(monthYield))) monthRada <- data.frame(yield = monthYield, year = as.numeric(substr(names(monthYield), 1, 4)), month = as.numeric(substr(names(monthYield), 6, nchar(names(monthYield))))) print(monthRada) rownames(monthRada) <- 1:nrow(monthRada) print(monthRada) monthRada <- transform(monthRada, Year = factor(year), Month = factor(month)) print(monthRada) summary(monthRada) ##### Parallel calculation (intro) ##### ======================================= ### ### Most (practically all) recent computers have multicore processors ### and often, each core is composed of several (often two) threads. ### That is, in fact, each (recent) computer is equal to several ### computers (in past). The "only" restriction is that all cores/threads ### share just one amount of the RAM memory. ### ### Anyway, many calculations can (easily) be parallelized by using ### more than one thread for calculations ### library("parallel") ### basic R package to calculate in parallel ### How many cores (in fact threads) we have? (nCore <- detectCores()) ### Ask for parallel calculation on 4 threads ### (if 4 available) myCluster <- makeCluster(4) sapply(d3, mean, na.rm = TRUE) ### not parallel parSapply(myCluster, d3, mean, na.rm = TRUE) ### parallel calculation system.time(colMeans <- parSapply(myCluster, d3, mean, na.rm = TRUE)) print(colMeans) system.time(colMeans <- sapply(d3, mean, na.rm = TRUE)) print(colMeans) ### here still too high price for core management, so (not parallel) sapply ### is faster than the parallel parSapply stopCluster(myCluster) ### stop the cluster ### -------------------------------------------------- ### some additional illustrations rm(list = ls()) myCluster <- makeCluster(4) nbig <- 1e8 set.seed(20170316) d3big <- data.frame(x = rnorm(nbig), y = rnorm(nbig), z = rnorm(nbig)) system.time(parSapply(myCluster, d3big, mean, na.rm = TRUE)) system.time(sapply(d3big, mean, na.rm = TRUE)) system.time(colMeans <- parLapply(myCluster, d3big, mean, na.rm = TRUE)) print(colMeans) system.time(colMeans <- lapply(d3big, mean, na.rm = TRUE)) print(colMeans) system.time(colMeans <- parApply(myCluster, d3big, 2, mean)) ## user system elapsed ## 4.096 0.606 5.843 print(colMeans) system.time(colMeans <- apply(d3big, 2, mean)) ## user system elapsed ## 2.402 0.323 2.725 print(colMeans) stopCluster(myCluster) ### ### In illustrations above, practically no advantage of parallel calculation was seen. ### Remember principles and wait till Tutorial xx (Computationally efficient simulations, ### cluster computation) to see really significant improvement when parallel calculation ### used as compared to non-parallel versions. ###