Count at gene level

This report defines the gene sets for the simulation data. The first set uses all the exons from the transcripts used in the simulation, while the second drops one transcript for the genes that have two of them.

Define gene sets

The following code defines the two gene sets.

## Load packages
library('TxDb.Hsapiens.UCSC.hg19.knownGene')
## Loading required package: GenomicFeatures
## Loading required package: BiocGenerics
## Loading required package: methods
## Loading required package: parallel
## 
## Attaching package: 'BiocGenerics'
## 
## The following objects are masked from 'package:parallel':
## 
##     clusterApply, clusterApplyLB, clusterCall, clusterEvalQ,
##     clusterExport, clusterMap, parApply, parCapply, parLapply,
##     parLapplyLB, parRapply, parSapply, parSapplyLB
## 
## The following object is masked from 'package:stats':
## 
##     xtabs
## 
## The following objects are masked from 'package:base':
## 
##     anyDuplicated, append, as.data.frame, as.vector, cbind,
##     colnames, do.call, duplicated, eval, evalq, Filter, Find, get,
##     intersect, is.unsorted, lapply, Map, mapply, match, mget,
##     order, paste, pmax, pmax.int, pmin, pmin.int, Position, rank,
##     rbind, Reduce, rep.int, rownames, sapply, setdiff, sort,
##     table, tapply, union, unique, unlist, unsplit
## 
## Loading required package: S4Vectors
## Loading required package: stats4
## Loading required package: IRanges
## Loading required package: GenomeInfoDb
## Loading required package: GenomicRanges
## Loading required package: AnnotationDbi
## Loading required package: Biobase
## Welcome to Bioconductor
## 
##     Vignettes contain introductory material; view with
##     'browseVignettes()'. To cite Bioconductor, see
##     'citation("Biobase")', and for packages 'citation("pkgname")'.
## 
## 
## Attaching package: 'AnnotationDbi'
## 
## The following object is masked from 'package:GenomeInfoDb':
## 
##     species
library('GenomicRanges')
library('derfinder')
library('Rsamtools')
## Loading required package: XVector
## Loading required package: Biostrings
library('GenomicAlignments')

## Load data
load('../simulation_info.Rdata')

## Find transcripts
txdb <- keepSeqlevels(TxDb.Hsapiens.UCSC.hg19.knownGene, 'chr22')

## Exons by gene
ex <- exonsBy(txdb, by = 'gene')

## Define gene sets: complete and incomplete
gene_comp <- ex[unique(chosen$gene_id)]

## Transcripts by gene
tx <- transcriptsBy(txdb, 'gene')

## For each gene with 2 transcripts, choose only 1 to use
tx_inc <- tx[unique(chosen$gene_id)]
set.seed(20150330)
for(i in which(sapply(tx_inc, length) == 2)) {
    tx_inc[[i]] <- tx_inc[[i]][sample(1:2, 1)]
}

## Check that it's all genes with 1 isoform
table(sapply(tx_inc, length))
## 
##  1 
## 60
## Get tx ids
tx_ids <- sapply(tx_inc, function(x) { mcols(x)$tx_id })

## Define incomplete gene set
gene_inc <- exonsBy(txdb, by = 'tx')[as.character(tx_ids)]
names(gene_inc) <- names(tx_inc)

## Define a second incomplete gene set where 20% of the transcripts are 
## dropped at random, regardless of the simulation scenario
tx_rand <- tx[unique(chosen$gene_id)]
tx_rand_list <- unlist(tx_rand)
set.seed(20150406)
tx_rand_chosen <- tx_rand_list[sample(seq_len(length(tx_rand_list)), size = round(length(tx_rand_list) * 0.8))]
gene_rand <- split(tx_rand_chosen, names(tx_rand_chosen))

## Save info
save(tx_inc, gene_comp, gene_inc, gene_rand, file = 'gene_sets.Rdata')

Count

The following code finds the BAM files and produces the gene count matrices using summarizeOverlaps(). For more information, check RNA-Seq workflow: gene-level exploratory analysis and differential expression.

## Make bamFileList
files <- files <- rawFiles(datadir ='/dcl01/lieber/ajaffe/derRuns/derSoftware/simulation/thout',
    samplepatt = "sample", fileterm = "accepted_hits.bam")
bai <- paste0(files, ".bai")
bList <- BamFileList(files, bai)

## Compute the overlaps
message(paste(Sys.time(), "summarizeOverlaps: Running summarizeOverlaps() -- complete"))
## 2015-04-07 03:02:13 summarizeOverlaps: Running summarizeOverlaps() -- complete
summOv_comp <- summarizeOverlaps(gene_comp, bList, mode = "Union",
    singleEnd = FALSE, ignore.strand = TRUE, fragments = TRUE)

message(paste(Sys.time(), "summarizeOverlaps: Running summarizeOverlaps() -- incomplete"))
## 2015-04-07 03:02:48 summarizeOverlaps: Running summarizeOverlaps() -- incomplete
summOv_inc <- summarizeOverlaps(gene_inc, bList, mode = "Union",
    singleEnd = FALSE, ignore.strand = TRUE, fragments = TRUE)
    
message(paste(Sys.time(), "summarizeOverlaps: Running summarizeOverlaps() -- incomplete random"))
## 2015-04-07 03:03:26 summarizeOverlaps: Running summarizeOverlaps() -- incomplete random
summOv_rand <- summarizeOverlaps(gene_rand, bList, mode = "Union",
    singleEnd = FALSE, ignore.strand = TRUE, fragments = TRUE)
    
## Save the results
save(summOv_comp, file = 'summOv_comp.Rdata')
save(summOv_inc, file = 'summOv_inc.Rdata')
save(summOv_rand, file = 'summOv_rand.Rdata')

Reproducibility

## Reproducibility info
Sys.time()
## [1] "2015-04-07 03:04:00 EDT"
proc.time()
##    user  system elapsed 
## 665.083  16.400 206.142
options(width = 120)
devtools::session_info()
## Session info -----------------------------------------------------------------------------------------------------------
##  setting  value                                      
##  version  R version 3.1.1 Patched (2014-10-16 r66782)
##  system   x86_64, linux-gnu                          
##  ui       X11                                        
##  language (EN)                                       
##  collate  en_US.UTF-8                                
##  tz       
## Packages ---------------------------------------------------------------------------------------------------------------
##  package                           * version  date       source                                      
##  acepack                           * 1.3-3.3  2013-05-03 CRAN (R 3.1.1)                              
##  AnnotationDbi                       1.28.1   2014-10-29 Bioconductor                                
##  base64enc                         * 0.1-2    2014-06-26 CRAN (R 3.1.0)                              
##  BatchJobs                         * 1.5      2014-10-30 CRAN (R 3.1.1)                              
##  BBmisc                            * 1.9      2015-02-03 CRAN (R 3.1.1)                              
##  Biobase                             2.26.0   2014-10-15 Bioconductor                                
##  BiocGenerics                        0.12.1   2014-11-15 Bioconductor                                
##  BiocParallel                      * 1.0.3    2015-02-09 Bioconductor                                
##  biomaRt                           * 2.22.0   2014-10-15 Bioconductor                                
##  Biostrings                          2.34.1   2014-12-13 Bioconductor                                
##  bitops                            * 1.0-6    2013-08-17 CRAN (R 3.1.0)                              
##  brew                              * 1.0-6    2011-04-13 CRAN (R 3.1.0)                              
##  bumphunter                        * 1.6.0    2014-10-15 Bioconductor                                
##  checkmate                         * 1.5.1    2014-12-14 CRAN (R 3.1.1)                              
##  cluster                           * 1.15.3   2014-09-04 CRAN (R 3.1.1)                              
##  codetools                         * 0.2-9    2014-08-21 CRAN (R 3.1.1)                              
##  colorspace                        * 1.2-6    2015-03-11 CRAN (R 3.1.1)                              
##  DBI                               * 0.3.1    2014-09-24 CRAN (R 3.1.1)                              
##  derfinder                           1.0.10   2015-03-14 Bioconductor                                
##  derfinderHelper                   * 1.0.4    2014-11-05 Github (lcolladotor/derfinderHelper@27bcfe6)
##  devtools                          * 1.7.0    2015-01-17 CRAN (R 3.1.1)                              
##  digest                            * 0.6.8    2014-12-31 CRAN (R 3.1.1)                              
##  doRNG                             * 1.6      2014-03-07 CRAN (R 3.1.0)                              
##  evaluate                          * 0.5.5    2014-04-29 CRAN (R 3.1.0)                              
##  fail                              * 1.2      2013-09-19 CRAN (R 3.1.0)                              
##  foreach                           * 1.4.2    2014-04-11 CRAN (R 3.1.0)                              
##  foreign                           * 0.8-61   2014-03-28 CRAN (R 3.1.1)                              
##  formatR                           * 1.0      2014-08-25 CRAN (R 3.1.1)                              
##  Formula                           * 1.2-0    2015-01-20 CRAN (R 3.1.1)                              
##  GenomeInfoDb                        1.2.4    2014-12-20 Bioconductor                                
##  GenomicAlignments                   1.2.2    2015-03-04 Bioconductor                                
##  GenomicFeatures                     1.18.3   2014-12-17 Bioconductor                                
##  GenomicFiles                      * 1.2.1    2015-02-08 Bioconductor                                
##  GenomicRanges                       1.18.4   2015-01-08 Bioconductor                                
##  ggplot2                           * 1.0.0    2014-05-21 CRAN (R 3.1.0)                              
##  gtable                            * 0.1.2    2012-12-05 CRAN (R 3.1.0)                              
##  Hmisc                             * 3.15-0   2015-02-16 CRAN (R 3.1.1)                              
##  htmltools                         * 0.2.6    2014-09-08 CRAN (R 3.1.1)                              
##  IRanges                             2.0.1    2014-12-13 Bioconductor                                
##  iterators                         * 1.0.7    2014-04-11 CRAN (R 3.1.0)                              
##  knitr                             * 1.9      2015-01-20 CRAN (R 3.1.1)                              
##  knitrBootstrap                    * 1.0.0    2014-11-19 Github (jimhester/knitrBootstrap@76c41f0)   
##  lattice                           * 0.20-29  2014-04-04 CRAN (R 3.1.1)                              
##  latticeExtra                      * 0.6-26   2013-08-15 CRAN (R 3.1.0)                              
##  locfit                            * 1.5-9.1  2013-04-20 CRAN (R 3.1.0)                              
##  markdown                          * 0.7.4    2014-08-24 CRAN (R 3.1.1)                              
##  MASS                              * 7.3-35   2014-09-30 CRAN (R 3.1.1)                              
##  Matrix                            * 1.1-4    2014-06-15 CRAN (R 3.1.1)                              
##  matrixStats                       * 0.14.0   2015-02-14 CRAN (R 3.1.1)                              
##  munsell                           * 0.4.2    2013-07-11 CRAN (R 3.1.0)                              
##  nnet                              * 7.3-8    2014-03-28 CRAN (R 3.1.1)                              
##  pkgmaker                          * 0.22     2014-05-14 CRAN (R 3.1.0)                              
##  plyr                              * 1.8.1    2014-02-26 CRAN (R 3.1.0)                              
##  proto                             * 0.3-10   2012-12-22 CRAN (R 3.1.0)                              
##  qvalue                            * 1.43.0   2015-03-06 Bioconductor                                
##  RColorBrewer                      * 1.1-2    2014-12-07 CRAN (R 3.1.1)                              
##  Rcpp                              * 0.11.5   2015-03-06 CRAN (R 3.1.1)                              
##  RCurl                             * 1.95-4.5 2014-12-06 CRAN (R 3.1.1)                              
##  registry                          * 0.2      2012-01-24 CRAN (R 3.1.0)                              
##  reshape2                          * 1.4.1    2014-12-06 CRAN (R 3.1.1)                              
##  rmarkdown                         * 0.5.1    2015-01-26 CRAN (R 3.1.1)                              
##  rngtools                          * 1.2.4    2014-03-06 CRAN (R 3.1.0)                              
##  rpart                             * 4.1-8    2014-03-28 CRAN (R 3.1.1)                              
##  Rsamtools                           1.18.3   2015-03-04 Bioconductor                                
##  RSQLite                           * 1.0.0    2014-10-25 CRAN (R 3.1.1)                              
##  rstudioapi                        * 0.2      2014-12-31 CRAN (R 3.1.1)                              
##  rtracklayer                       * 1.26.2   2014-11-12 Bioconductor                                
##  S4Vectors                           0.4.0    2014-10-15 Bioconductor                                
##  scales                            * 0.2.4    2014-04-22 CRAN (R 3.1.0)                              
##  sendmailR                         * 1.2-1    2014-09-21 CRAN (R 3.1.1)                              
##  stringr                           * 0.6.2    2012-12-06 CRAN (R 3.1.0)                              
##  survival                          * 2.37-7   2014-01-22 CRAN (R 3.1.1)                              
##  TxDb.Hsapiens.UCSC.hg19.knownGene   3.0.0    2014-09-29 Bioconductor                                
##  XML                               * 3.98-1.1 2013-06-20 CRAN (R 3.1.0)                              
##  xtable                            * 1.7-4    2014-09-12 CRAN (R 3.1.1)                              
##  XVector                             0.6.0    2014-10-15 Bioconductor                                
##  yaml                              * 2.1.13   2014-06-12 CRAN (R 3.1.1)                              
##  zlibbioc                          * 1.12.0   2014-10-15 Bioconductor