This report shows the time and memory used to run derfinder
for single base resolution differential expression analysis. It also shows the same information for going from BAM files to getting ready to run DESeq
(Anders and Huber, 2010) by using samtools
(SAMtools, 2016) to convert to SAM format and HTSeq
(HTSeq, 2014) to make the count tables. Furthermore, this process was compared to using the summarizeOverlaps()
function from the GenomicRanges
(Lawrence, Huber, Pagès, Aboyoun, et al., 2013) package as well as using the coverageToExon()
function included in the derfinder
package [requires the output from the fullCov step].
## Extract information from Gmail
## Copy the client_secrets file from ../../efficiency_analytics into the
## current working directory (presumably needed by the script below -- confirm).
system('cp ../../efficiency_analytics/client_secrets .')
## Run analyze_efficiency.py for the "Cluster/derSoftware" folder and write its
## output to timing-derSoftware.txt, the file read further down with read.table().
## NOTE(review): the exit status returned by system() is ignored in both calls,
## so a failed run can leave a stale or missing timing file -- confirm whether
## that is intended.
system('python ../../efficiency_analytics/analyze_efficiency.py --email fellgernon@gmail.com --folder "Cluster/derSoftware" --outfile timing-derSoftware.txt')
## Load libraries
library("ggplot2")
library("knitr")
## Setup
## Experiments with timing information (matched against the job ids below)
exps <- c('brainspan', 'simulation', 'hippo', 'snyder', 'stem')
## Read data and process it
all <- read.table('timing-derSoftware.txt', header = TRUE, stringsAsFactors = FALSE)
all <- all[!grepl('brainspan.*run3', all$jobid), ] # remove older info
## The step is the part of the job id before the first "-"; everything up to
## the last "th" in that prefix is then replaced by "TopHat".
## vapply() always returns a character vector, even when there are no jobs
## (sapply() would return an empty list in that case).
job_prefix <- vapply(strsplit(all$jobid, "-"), function(x) x[1], character(1))
all$step <- gsub('.*th', 'TopHat', job_prefix)
## Memory in GB: values reported with unit "M" are divided by 1024, all other
## rows are kept as they are. %in% never returns NA, so a missing unit cannot
## make the subscripted assignment fail.
all$memG <- all$memory
in_mb <- all$memunit %in% "M"
all$memG[in_mb] <- all$memG[in_mb] / 1024
## Chromosome: the "chr..." suffix of the job id, NA when the id has none
all$chr <- gsub('.*chr', 'chr', all$jobid)
all$chr[ !grepl('chr', all$chr) ] <- NA
## Experiment info
## A job belongs to the experiment whose name appears in its (lower-cased) job
## id; when several names match, the one listed last in exps wins. Jobs
## matching none of them stay NA.
jobid_lower <- tolower(all$jobid)
all$experiment <- NA
for (exp in exps) {
  mentions_exp <- grepl(exp, jobid_lower)
  all$experiment[mentions_exp] <- exp
}
## TopHat and bigwig steps, as well as the makeBai-Sim job, are assigned to
## the simulation experiment regardless of the match above
is_sim_step <- all$step %in% c('TopHat', 'bigwig')
all$experiment[is_sim_step] <- 'simulation'
all$experiment[all$jobid == 'makeBai-Sim'] <- 'simulation'
## Cores info
## Number of cores used by each job, hard-coded by step, experiment and
## chromosome. Steps without an entry used a single core.
step_cores <- c(fullCov = 10L, regMat = 5L, TopHat = 4L)
derA_cores <- c(simulation = 1L, hippo = 2L, snyder = 4L, stem = 8L)
brainspan_chr_cores <- c(chrY = 2L, chr1 = 40L, chr2 = 32L, chr3 = 27L, chr19 = 29L)

## Look up `key` in a named integer vector: an NA key gives NA and a key
## without an entry gives `default`.
lookup_cores <- function(key, table, default) {
  if (is.na(key)) {
    return(NA_integer_)
  }
  if (key %in% names(table)) table[[key]] else default
}

all$cores <- mapply(function(chr, exp, step) {
  if (is.na(step)) {
    ## A missing step used to abort with "missing value where TRUE/FALSE needed"
    NA_integer_
  } else if (step == 'derA') {
    if (identical(exp, 'brainspan')) {
      ## brainspan derA jobs used 20 cores except on the chromosomes listed
      lookup_cores(chr, brainspan_chr_cores, 20L)
    } else {
      ## A missing or unknown experiment gives NA instead of an error or an
      ## implicit NULL (which would turn the whole column into a list)
      lookup_cores(exp, derA_cores, NA_integer_)
    }
  } else if (step == 'summOv') {
    lookup_cores(exp, c(hippo = 24L), 10L)
  } else {
    lookup_cores(step, step_cores, 1L)
  }
}, all$chr, all$experiment, all$step)
## Crude single-core equivalents: total core-hours and memory per core
all$timeByCore <- all$walltime * all$cores
all$memByCore <- all$memG / all$cores
## Add software labels
## Software behind each step; steps that are not listed were run with derfinder
software_by_step <- c(
  toSam = 'HTSeq',
  htseq = 'HTSeq',
  summOv = 'GenomicRanges',
  TopHat = 'TopHat',
  makeBai = 'misc',
  regVsDERs = 'misc',
  PNAS = 'misc',
  summInfo = 'misc',
  derR = 'regionReport'
)
software_label <- unname(software_by_step[all$step])
## Only fill in the default for known steps so that a missing step stays NA
software_label[is.na(software_label) & !is.na(all$step)] <- 'derfinder'
all$software <- factor(software_label)
## Experiment and cores groups info
## Experiments become a factor ordered as in exps
all$experiment <- factor(all$experiment, levels = exps)
## Jobs that used 20 or more cores are pooled into a single '20+' group
cores_label <- ifelse(all$cores >= 20, '20+', all$cores)
all$coresGroups <- factor(cores_label, levels = c(1, 2, 4, 5, 8, 10, '20+'))
## Types of analysis
## Analysis type of each step; steps that are not listed are labelled 'misc'
analysis_by_step <- c(
  derMod = 'Single-base DER',
  derA = 'Single-base DER',
  derM = 'Single-base DER',
  toSam = 'Exon count',
  htseq = 'Exon count',
  summOv = 'Exon count',
  covToEx = 'Exon count',
  regMat = 'Expressed-region DER',
  fullCov = 'Load data',
  derR = 'HTML report'
)
analysis_label <- unname(analysis_by_step[all$step])
## Only fill in the default for known steps so that a missing step stays NA
analysis_label[is.na(analysis_label) & !is.na(all$step)] <- 'misc'
all$analysis <- factor(analysis_label)
## Show only information for the data sets described in this website
in_website <- all$experiment %in% c('hippo', 'snyder')
all <- all[in_website, ]
The following plots show the wall time and memory used by each job while taking into account the number of cores used by each job. Note that doing so is a crude approximation of how much time and memory each job would have needed had it run on a single node.
Points are colored by which analysis type they belong to. Note that the loading data step is required for the single base-level and expressed-regions DER approaches as well as exon counting (with derfinder).
## Walltime and memory adjusted by number of cores (it's an approximation)
## Layers shared by both versions of the plot; only the x aesthetic and its
## label differ between the raw and the log2 version
shared_layers <- list(
  geom_point(size = 3),
  facet_grid(~ experiment),
  ylab("Memory (GB) divided by the number of cores"),
  scale_colour_brewer(palette="Dark2"),
  theme_bw(base_size = 18),
  theme(legend.position=c(.5, .75), legend.box = 'horizontal')
)
## Raw scale
ggplot(all, aes(x=timeByCore, y=memByCore, colour=analysis, shape=software)) +
  shared_layers +
  xlab("Wall time (hrs) multiplied by the number of cores")
## log2 scale on the x axis
ggplot(all, aes(x=log2(timeByCore), y=memByCore, colour=analysis, shape=software)) +
  shared_layers +
  xlab("Wall time (hrs) multiplied by the number of cores (log2)")
## For supp text
## Same comparison on log2 scales for both axes, leaving out the TopHat and
## regionReport jobs and the 'misc' analysis type
supp_jobs <- subset(all, !software %in% c('TopHat', 'regionReport') & analysis != 'misc')
time <- ggplot(supp_jobs, aes(x=log2(timeByCore), y=log2(memByCore), colour=analysis, shape=software)) +
  geom_point(size = 3) +
  facet_grid(~ experiment) +
  xlab("Wall time (hrs) multiplied by the number of cores (log2)") +
  ylab("GB memory divided by number of cores (log2)") +
  scale_colour_brewer(palette="Set1") +
  theme_bw(base_size = 18) +
  theme(legend.position=c(.55, .15), legend.box = 'horizontal')
## Explicit print(): a bare `time` is only drawn when auto-printing is active,
## so under source() or inside a function the PDF below would be empty
print(time)
pdf(file = 'time.pdf', width = 10)
print(time)
dev.off()
## quartz_off_screen
## 2
#system('open time.pdf')
## Summarize the resources used by a set of jobs.
##
## df: data.frame with columns memByCore, walltime and memG, plus cores
##     (read when peakCores = FALSE) or peakCores (read when peakCores = TRUE).
## sumTime: if TRUE report the sum of df$walltime, otherwise its maximum.
## peakCores: if TRUE report the maximum of df$peakCores, otherwise the sum
##     of df$cores.
##
## Returns a named vector with elements memByCore (maximum), walltime, memG
## (maximum) and peakCores.
getInfo <- function(df, sumTime = FALSE, peakCores = FALSE) {
  ## Both flags are scalars, so plain if/else is used instead of ifelse();
  ## the result gets its own name rather than overwriting the peakCores flag
  walltime <- if (sumTime) sum(df$walltime) else max(df$walltime)
  peak <- if (peakCores) max(df$peakCores) else sum(df$cores)
  c(
    memByCore = max(df$memByCore),
    walltime = walltime,
    memG = max(df$memG),
    peakCores = peak
  )
}
## Analyses to summarize and the analysis types (values of all$analysis) that
## each of them requires
analysisInfo <- list(
  'Single-base DER' = c('Load data', 'Single-base DER'),
  'Expressed-region DER' = c('Load data', 'Expressed-region DER'),
  'HTML report' = 'HTML report',
  'Exon count - derfinder' = 'Load data'
)
## Replace the analysis types by the row indices of the matching jobs in `all`
analysisInfo <- lapply(analysisInfo, function(x) { which(all$analysis %in% x)})
## Exon counting with derfinder also uses the covToEx jobs. The entry is
## addressed by name so that reordering the list above cannot silently attach
## these jobs to another analysis.
analysisInfo[['Exon count - derfinder']] <- c(
  analysisInfo[['Exon count - derfinder']],
  which(all$step == 'covToEx')
)
## The other two exon counting approaches are defined directly by their steps
analysisInfo[['Exon count - HTSeq']] <- which(all$step %in% c('toSam', 'htseq'))
analysisInfo[['Exon count - GenomicRanges']] <- which(all$step == 'summOv')
## Summarize the information for each step of each analysis
## One row per analysis, experiment and step, holding the getInfo() summary of
## the jobs of that step. local() keeps the loop variables out of the workspace.
analysisSummary <- local({
  step_rows <- list()
  for (analysis_name in names(analysisInfo)) {
    analysis_jobs <- all[analysisInfo[[analysis_name]], ]
    for (exp_name in exps) {
      exp_jobs <- subset(analysis_jobs, experiment == exp_name)
      ## No jobs for this experiment means no steps, hence no rows
      for (step_name in unique(exp_jobs$step)) {
        step_jobs <- exp_jobs[exp_jobs$step == step_name, ]
        step_row <- as.data.frame(t(getInfo(step_jobs)))
        step_row$step <- step_name
        step_row$experiment <- exp_name
        step_row$analysis <- analysis_name
        step_rows[[length(step_rows) + 1L]] <- step_row
      }
    }
  }
  do.call(rbind, step_rows)
})
The table below shows, per analysis, the maximum memory used by a job and the maximum wall time for each step. This assumes that all jobs for a given step ran simultaneously; for example, that all jobs running derfinder::analyzeChr()
were running at the same time. Note that some analyses rely on the same steps, like loading the data (fullCov). This table can be useful to find the peak number of cores (the sum of cores for all jobs running simultaneously) for a given analysis step.
## Per-step summary rendered as a markdown table
kable(
  x = analysisSummary,
  digits = c(2, 4, 2),
  format = 'markdown'
)
memByCore | walltime | memG | peakCores | step | experiment | analysis |
---|---|---|---|---|---|---|
1.32 | 0.0492 | 1.32 | 1 | derM | hippo | Single-base DER |
3.90 | 0.9697 | 7.80 | 48 | derA | hippo | Single-base DER |
3.25 | 0.0222 | 3.25 | 1 | derMod | hippo | Single-base DER |
1.29 | 0.1967 | 12.91 | 10 | fullCov | hippo | Single-base DER |
4.39 | 1.2494 | 4.39 | 1 | derM | snyder | Single-base DER |
5.14 | 2.3453 | 20.55 | 96 | derA | snyder | Single-base DER |
7.02 | 0.0558 | 7.02 | 2 | derMod | snyder | Single-base DER |
2.71 | 1.2539 | 27.10 | 10 | fullCov | snyder | Single-base DER |
2.07 | 0.2442 | 10.33 | 5 | regMat | hippo | Expressed-region DER |
1.29 | 0.1967 | 12.91 | 10 | fullCov | hippo | Expressed-region DER |
5.32 | 1.1131 | 26.62 | 5 | regMat | snyder | Expressed-region DER |
2.71 | 1.2539 | 27.10 | 10 | fullCov | snyder | Expressed-region DER |
36.46 | 0.8094 | 36.46 | 1 | derR | hippo | HTML report |
37.20 | 0.4836 | 37.20 | 1 | derR | snyder | HTML report |
1.29 | 0.1967 | 12.91 | 10 | fullCov | hippo | Exon count - derfinder |
11.16 | 0.6286 | 11.16 | 2 | covToEx | hippo | Exon count - derfinder |
2.71 | 1.2539 | 27.10 | 10 | fullCov | snyder | Exon count - derfinder |
16.20 | 0.7375 | 16.20 | 2 | covToEx | snyder | Exon count - derfinder |
0.38 | 0.5672 | 0.38 | 31 | htseq | hippo | Exon count - HTSeq |
1.73 | 3.7153 | 1.73 | 1 | toSam | hippo | Exon count - HTSeq |
0.38 | 7.8933 | 0.38 | 20 | htseq | snyder | Exon count - HTSeq |
1.44 | 42.0253 | 1.44 | 1 | toSam | snyder | Exon count - HTSeq |
1.80 | 0.2967 | 43.24 | 24 | summOv | hippo | Exon count - GenomicRanges |
6.32 | 2.6850 | 63.24 | 10 | summOv | snyder | Exon count - GenomicRanges |
## Summarize the information for each analysis
## One row per analysis and experiment: the per-step summaries are combined
## with getInfo(), adding up the wall times and taking the maximum of the
## peak cores. local() keeps the loop variables out of the workspace.
peaks <- local({
  analysis_rows <- list()
  for (analysis_name in names(analysisInfo)) {
    for (exp_name in exps) {
      is_current <- analysisSummary$analysis == analysis_name & analysisSummary$experiment == exp_name
      step_summary <- analysisSummary[is_current, ]
      if (nrow(step_summary) == 0) {
        next
      }
      analysis_row <- as.data.frame(t(getInfo(step_summary, sumTime = TRUE, peakCores = TRUE)))
      analysis_row$experiment <- exp_name
      analysis_row$analysis <- analysis_name
      analysis_rows[[length(analysis_rows) + 1L]] <- analysis_row
    }
  }
  do.call(rbind, analysis_rows)
})
## Keep a copy of the final summary on disk
save(peaks, file = 'peaks.Rdata')
We can further summarize the resources used by each analysis by identifying the maximum memory used in the steps required for a particular analysis and the total wall time for running all the steps when all the jobs of a particular step are running simultaneously. This gives us the total actual wall time to run a specific analysis and the maximum memory required.
The table below shows the final summary. Note that in some analyses, the peak memory is from the fullCov step. We did not focus on reducing the memory load of this step as we sacrificed memory for speed. We know that much lower memory limits can be achieved using 1 core instead of the 10 cores used.
## Final per-analysis summary rendered as a markdown table
kable(
  x = peaks,
  digits = c(2, 3, 2),
  format = 'markdown'
)
memByCore | walltime | memG | peakCores | experiment | analysis |
---|---|---|---|---|---|
3.90 | 1.238 | 12.91 | 48 | hippo | Single-base DER |
7.02 | 4.904 | 27.10 | 96 | snyder | Single-base DER |
2.07 | 0.441 | 12.91 | 10 | hippo | Expressed-region DER |
5.32 | 2.367 | 27.10 | 10 | snyder | Expressed-region DER |
36.46 | 0.809 | 36.46 | 1 | hippo | HTML report |
37.20 | 0.484 | 37.20 | 1 | snyder | HTML report |
11.16 | 0.825 | 12.91 | 10 | hippo | Exon count - derfinder |
16.20 | 1.991 | 27.10 | 10 | snyder | Exon count - derfinder |
1.73 | 4.283 | 1.73 | 31 | hippo | Exon count - HTSeq |
1.44 | 49.919 | 1.44 | 20 | snyder | Exon count - HTSeq |
1.80 | 0.297 | 43.24 | 24 | hippo | Exon count - GenomicRanges |
6.32 | 2.685 | 63.24 | 10 | snyder | Exon count - GenomicRanges |
Regarding the high memory load for the HTML report, this could be significantly lowered by only loading the required coverage data used for the plots instead of the full output from the fullCov step. That is, using the which argument from fullCoverage()
to create a much smaller fullCov object, which would also reduce the memory used when plotting.
Note: since these analyses were done, we have found other ways to run derfinder::regionMatrix()
that require less memory. In particular, if you have BigWig files (as those generated by Rail-RNA
(Nellore, Collado-Torres, Jaffe, Alquicira-Hernández, et al., 2015)), we recommend using railMatrix()
.
The previous table can also be used to compare the sum of the time and peak memory used by the different steps to obtain the exon count table with the following software options.
derfinder
: includes resources used for reading coverage data in R
and then creating a feature count matrix. We did so for
HTSeq
: includes resources used for generating sorted SAM files and then running HTSeq.
summOv
: resources used for running GenomicRanges::summarizeOverlaps()
directly on the BAM files.

The following table shows the details of the resources used by the different jobs. It shows the experiment (experiment), the analysis step (step), wall time used (shown in hours, walltime), number of cores used (cores), memory in GB used (memG), software used (software), analysis for which the step is used (analysis), and the job name (jobid). Furthermore, it shows two simple approximations:
timeByCore
: wall time (in hours) multiplied by the number of cores.
memByCore
: memory (in GB) divided by the number of cores.
These are the following analysis steps:
derR
: generating the HTML report with regionReport.
regMat
: running regionMatrix().
summOv
: running GenomicRanges::summarizeOverlaps()
to generate exon count table.
covToEx
: running derfinder::coverageToExon()
for UCSC hg19 knownGene or GRCh37 p11 Ensembl annotation table.
library("DT")
## Print whole table
## Columns shown in the interactive table, in display order
shown_columns <- c("experiment", "step", "walltime", "cores", "memG", "timeByCore", "memByCore", "software", "analysis", "jobid")
d <- all[, shown_columns]
job_table <- datatable(d, options = list(pagingType='full_numbers', pageLength=50, scrollX='100%'))
## Round columns 3 and 5 to 7 of d (walltime, memG, timeByCore, memByCore)
formatRound(job_table, columns = c(3, 5:7), digits = 3)
Table made using DT
(Xie, 2015).
Date the report was generated.
## [1] "2016-03-21 15:49:35 EDT"
Wallclock time spent generating the report.
## Time difference of 5.694 secs
R
session information.
## Session info -----------------------------------------------------------------------------------------------------------
## setting value
## version R version 3.2.2 (2015-08-14)
## system x86_64, darwin13.4.0
## ui X11
## language (EN)
## collate en_US.UTF-8
## tz America/New_York
## date 2016-03-21
## Packages ---------------------------------------------------------------------------------------------------------------
## package * version date source
## bibtex 0.4.0 2014-12-31 CRAN (R 3.2.0)
## bitops 1.0-6 2013-08-17 CRAN (R 3.2.0)
## colorspace 1.2-6 2015-03-11 CRAN (R 3.2.0)
## curl 0.9.6 2016-02-17 CRAN (R 3.2.3)
## devtools 1.10.0 2016-01-23 CRAN (R 3.2.3)
## digest 0.6.9 2016-01-08 CRAN (R 3.2.3)
## DT * 0.1 2015-06-09 CRAN (R 3.2.0)
## evaluate 0.8 2015-09-18 CRAN (R 3.2.0)
## formatR 1.2.1 2015-09-18 CRAN (R 3.2.0)
## ggplot2 * 2.0.0 2015-12-18 CRAN (R 3.2.3)
## gtable 0.1.2 2012-12-05 CRAN (R 3.2.0)
## highr 0.5.1 2015-09-18 CRAN (R 3.2.0)
## htmltools 0.3 2015-12-29 CRAN (R 3.2.3)
## htmlwidgets 0.5 2015-06-21 CRAN (R 3.2.1)
## httr 1.1.0 2016-01-28 CRAN (R 3.2.3)
## jsonlite 0.9.19 2015-11-28 CRAN (R 3.2.2)
## knitcitations * 1.0.7 2015-10-28 CRAN (R 3.2.0)
## knitr * 1.12.3 2016-01-22 CRAN (R 3.2.3)
## labeling 0.3 2014-08-23 CRAN (R 3.2.0)
## lubridate 1.5.0 2015-12-03 CRAN (R 3.2.3)
## magrittr 1.5 2014-11-22 CRAN (R 3.2.0)
## memoise 1.0.0 2016-01-29 CRAN (R 3.2.3)
## munsell 0.4.3 2016-02-13 CRAN (R 3.2.3)
## plyr 1.8.3 2015-06-12 CRAN (R 3.2.1)
## R6 2.1.2 2016-01-26 CRAN (R 3.2.3)
## RColorBrewer 1.1-2 2014-12-07 CRAN (R 3.2.0)
## Rcpp 0.12.3 2016-01-10 CRAN (R 3.2.3)
## RCurl 1.95-4.7 2015-06-30 CRAN (R 3.2.1)
## RefManageR 0.10.6 2016-02-15 CRAN (R 3.2.3)
## reshape2 1.4.1 2014-12-06 CRAN (R 3.2.0)
## RJSONIO 1.3-0 2014-07-28 CRAN (R 3.2.0)
## rmarkdown * 0.9.5 2016-02-22 CRAN (R 3.2.3)
## scales 0.3.0 2015-08-25 CRAN (R 3.2.0)
## stringi 1.0-1 2015-10-22 CRAN (R 3.2.0)
## stringr 1.0.0 2015-04-30 CRAN (R 3.2.0)
## XML 3.98-1.3 2015-06-30 CRAN (R 3.2.0)
## yaml 2.1.13 2014-06-12 CRAN (R 3.2.0)
This report was generated using rmarkdown
(Allaire, Cheng, Xie, McPherson, et al., 2016) with knitr
(Xie, 2014) running behind the scenes. Timing information extracted from the SGE reports using efficiency analytics
(Frazee, 2014). Figures and citations were made using ggplot2
(Wickham, 2009) and knitcitations
(Boettiger, 2015) respectively.
Citation file: timing.bib
[1] HTSeq: Analysing high-throughput sequencing data with Python — HTSeq 0.6.1p2 documentation. http://www-huber.embl.de/users/anders/HTSeq/doc/overview.html. 2014. URL: http://www-huber.embl.de/users/anders/HTSeq/doc/overview.html.
[2] SAMtools. http://samtools.sourceforge.net/. 2016. URL: http://samtools.sourceforge.net/.
[3] J. Allaire, J. Cheng, Y. Xie, J. McPherson, et al. rmarkdown: Dynamic Documents for R. R package version 0.9.5. 2016. URL: http://CRAN.R-project.org/package=rmarkdown.
[4] S. Anders and W. Huber. “Differential expression analysis for sequence count data”. In: Genome Biology 11 (2010), p. R106. DOI: 10.1186/gb-2010-11-10-r106. URL: http://genomebiology.com/2010/11/10/R106/.
[5] C. Boettiger. knitcitations: Citations for ‘Knitr’ Markdown Files. R package version 1.0.7. 2015. URL: http://CRAN.R-project.org/package=knitcitations.
[6] A. Frazee. Efficiency analysis of Sun Grid Engine batch jobs. 2014. URL: http://dx.doi.org/10.6084/m9.figshare.878000.
[7] M. Lawrence, W. Huber, H. Pagès, P. Aboyoun, et al. “Software for Computing and Annotating Genomic Ranges”. In: PLoS Computational Biology 9 (8 2013). DOI: 10.1371/journal.pcbi.1003118. URL: http://www.ploscompbiol.org/article/info%3Adoi%2F10.1371%2Fjournal.pcbi.1003118.
[8] A. Nellore, L. Collado-Torres, A. E. Jaffe, J. Alquicira-Hernández, et al. “Rail-RNA: Scalable analysis of RNA-seq splicing and coverage”. In: bioRxiv (2015).
[9] H. Wickham. ggplot2: Elegant Graphics for Data Analysis. Springer-Verlag New York, 2009. ISBN: 978-0-387-98140-6. URL: http://had.co.nz/ggplot2/book.
[10] Y. Xie. DT: A Wrapper of the JavaScript Library ‘DataTables’. R package version 0.1. 2015. URL: http://CRAN.R-project.org/package=DT.
[11] Y. Xie. “knitr: A Comprehensive Tool for Reproducible Research in R”. In: Implementing Reproducible Computational Research. Ed. by V. Stodden, F. Leisch and R. D. Peng. ISBN 978-1466561595. Chapman and Hall/CRC, 2014. URL: http://www.crcpress.com/product/isbn/9781466561595.