This report defines the gene sets for the simulation data. The first set uses all the exons from the transcripts used in the simulation, while the second drops one transcript for the genes that have two of them.
The following code defines the two gene sets.
## Load packages
library('TxDb.Hsapiens.UCSC.hg19.knownGene')
library('GenomicRanges')
library('derfinder')
library('Rsamtools')
library('GenomicAlignments')
## Load data
load('../simulation_info.Rdata')
## Find transcripts
txdb <- keepSeqlevels(TxDb.Hsapiens.UCSC.hg19.knownGene, 'chr22')
## Exons by gene
ex <- exonsBy(txdb, by = 'gene')
## Define gene sets: complete and incomplete
gene_comp <- ex[unique(chosen$gene_id)]
## Transcripts by gene
tx <- transcriptsBy(txdb, 'gene')
## For each gene with 2 transcripts, choose only 1 to use
tx_inc <- tx[unique(chosen$gene_id)]
set.seed(20150330)
for(i in which(sapply(tx_inc, length) == 2)) {
tx_inc[[i]] <- tx_inc[[i]][sample(1:2, 1)]
}
## Check that it's all genes with 1 isoform
table(sapply(tx_inc, length))
##
## 1
## 60
## Get tx ids
tx_ids <- sapply(tx_inc, function(x) { mcols(x)$tx_id })
## Define incomplete gene set
gene_inc <- exonsBy(txdb, by = 'tx')[as.character(tx_ids)]
names(gene_inc) <- names(tx_inc)
## Define a second incomplete gene set where 20% of the transcripts are
## dropped at random, regardless of the simulation scenario
tx_rand <- tx[unique(chosen$gene_id)]
tx_rand_list <- unlist(tx_rand)
set.seed(20150406)
tx_rand_chosen <- tx_rand_list[sample(seq_len(length(tx_rand_list)), size = round(length(tx_rand_list) * 0.8))]
gene_rand <- split(tx_rand_chosen, names(tx_rand_chosen))
## Save info
save(tx_inc, gene_comp, gene_inc, gene_rand, file = 'gene_sets.Rdata')
The following code finds the BAM files and produces the gene count matrices using summarizeOverlaps()
. For more information, check RNA-Seq workflow: gene-level exploratory analysis and differential expression.
## Make bamFileList
files <- files <- rawFiles(datadir ='/dcl01/lieber/ajaffe/derRuns/derSoftware/simulation/thout',
samplepatt = "sample", fileterm = "accepted_hits.bam")
bai <- paste0(files, ".bai")
bList <- BamFileList(files, bai)
## Compute the overlaps
message(paste(Sys.time(), "summarizeOverlaps: Running summarizeOverlaps() -- complete"))
summOv_comp <- summarizeOverlaps(gene_comp, bList, mode = "Union",
singleEnd = FALSE, ignore.strand = TRUE, fragments = TRUE)
message(paste(Sys.time(), "summarizeOverlaps: Running summarizeOverlaps() -- incomplete"))
summOv_inc <- summarizeOverlaps(gene_inc, bList, mode = "Union",
singleEnd = FALSE, ignore.strand = TRUE, fragments = TRUE)
message(paste(Sys.time(), "summarizeOverlaps: Running summarizeOverlaps() -- incomplete random"))
summOv_rand <- summarizeOverlaps(gene_rand, bList, mode = "Union",
singleEnd = FALSE, ignore.strand = TRUE, fragments = TRUE)
## Save the results
save(summOv_comp, file = 'summOv_comp.Rdata')
save(summOv_inc, file = 'summOv_inc.Rdata')
save(summOv_rand, file = 'summOv_rand.Rdata')
## Reproducibility info
Sys.time()
## [1] "2015-04-07 03:04:00 EDT"
proc.time()
## user system elapsed
## 665.083 16.400 206.142
options(width = 120)
devtools::session_info()
## setting value
## version R version 3.1.1 Patched (2014-10-16 r66782)
## system x86_64, linux-gnu
## ui X11
## language (EN)
## collate en_US.UTF-8
## tz
## package * version date source
## acepack * 1.3-3.3 2013-05-03 CRAN (R 3.1.1)
## AnnotationDbi 1.28.1 2014-10-29 Bioconductor
## base64enc * 0.1-2 2014-06-26 CRAN (R 3.1.0)
## BatchJobs * 1.5 2014-10-30 CRAN (R 3.1.1)
## BBmisc * 1.9 2015-02-03 CRAN (R 3.1.1)
## Biobase 2.26.0 2014-10-15 Bioconductor
## BiocGenerics 0.12.1 2014-11-15 Bioconductor
## BiocParallel * 1.0.3 2015-02-09 Bioconductor
## biomaRt * 2.22.0 2014-10-15 Bioconductor
## Biostrings 2.34.1 2014-12-13 Bioconductor
## bitops * 1.0-6 2013-08-17 CRAN (R 3.1.0)
## brew * 1.0-6 2011-04-13 CRAN (R 3.1.0)
## bumphunter * 1.6.0 2014-10-15 Bioconductor
## checkmate * 1.5.1 2014-12-14 CRAN (R 3.1.1)
## cluster * 1.15.3 2014-09-04 CRAN (R 3.1.1)
## codetools * 0.2-9 2014-08-21 CRAN (R 3.1.1)
## colorspace * 1.2-6 2015-03-11 CRAN (R 3.1.1)
## DBI * 0.3.1 2014-09-24 CRAN (R 3.1.1)
## derfinder 1.0.10 2015-03-14 Bioconductor
## derfinderHelper * 1.0.4 2014-11-05 Github (lcolladotor/derfinderHelper@27bcfe6)
## devtools * 1.7.0 2015-01-17 CRAN (R 3.1.1)
## digest * 0.6.8 2014-12-31 CRAN (R 3.1.1)
## doRNG * 1.6 2014-03-07 CRAN (R 3.1.0)
## evaluate * 0.5.5 2014-04-29 CRAN (R 3.1.0)
## fail * 1.2 2013-09-19 CRAN (R 3.1.0)
## foreach * 1.4.2 2014-04-11 CRAN (R 3.1.0)
## foreign * 0.8-61 2014-03-28 CRAN (R 3.1.1)
## formatR * 1.0 2014-08-25 CRAN (R 3.1.1)
## Formula * 1.2-0 2015-01-20 CRAN (R 3.1.1)
## GenomeInfoDb 1.2.4 2014-12-20 Bioconductor
## GenomicAlignments 1.2.2 2015-03-04 Bioconductor
## GenomicFeatures 1.18.3 2014-12-17 Bioconductor
## GenomicFiles * 1.2.1 2015-02-08 Bioconductor
## GenomicRanges 1.18.4 2015-01-08 Bioconductor
## ggplot2 * 1.0.0 2014-05-21 CRAN (R 3.1.0)
## gtable * 0.1.2 2012-12-05 CRAN (R 3.1.0)
## Hmisc * 3.15-0 2015-02-16 CRAN (R 3.1.1)
## htmltools * 0.2.6 2014-09-08 CRAN (R 3.1.1)
## IRanges 2.0.1 2014-12-13 Bioconductor
## iterators * 1.0.7 2014-04-11 CRAN (R 3.1.0)
## knitr * 1.9 2015-01-20 CRAN (R 3.1.1)
## knitrBootstrap * 1.0.0 2014-11-19 Github (jimhester/knitrBootstrap@76c41f0)
## lattice * 0.20-29 2014-04-04 CRAN (R 3.1.1)
## latticeExtra * 0.6-26 2013-08-15 CRAN (R 3.1.0)
## locfit * 1.5-9.1 2013-04-20 CRAN (R 3.1.0)
## markdown * 0.7.4 2014-08-24 CRAN (R 3.1.1)
## MASS * 7.3-35 2014-09-30 CRAN (R 3.1.1)
## Matrix * 1.1-4 2014-06-15 CRAN (R 3.1.1)
## matrixStats * 0.14.0 2015-02-14 CRAN (R 3.1.1)
## munsell * 0.4.2 2013-07-11 CRAN (R 3.1.0)
## nnet * 7.3-8 2014-03-28 CRAN (R 3.1.1)
## pkgmaker * 0.22 2014-05-14 CRAN (R 3.1.0)
## plyr * 1.8.1 2014-02-26 CRAN (R 3.1.0)
## proto * 0.3-10 2012-12-22 CRAN (R 3.1.0)
## qvalue * 1.43.0 2015-03-06 Bioconductor
## RColorBrewer * 1.1-2 2014-12-07 CRAN (R 3.1.1)
## Rcpp * 0.11.5 2015-03-06 CRAN (R 3.1.1)
## RCurl * 1.95-4.5 2014-12-06 CRAN (R 3.1.1)
## registry * 0.2 2012-01-24 CRAN (R 3.1.0)
## reshape2 * 1.4.1 2014-12-06 CRAN (R 3.1.1)
## rmarkdown * 0.5.1 2015-01-26 CRAN (R 3.1.1)
## rngtools * 1.2.4 2014-03-06 CRAN (R 3.1.0)
## rpart * 4.1-8 2014-03-28 CRAN (R 3.1.1)
## Rsamtools 1.18.3 2015-03-04 Bioconductor
## RSQLite * 1.0.0 2014-10-25 CRAN (R 3.1.1)
## rstudioapi * 0.2 2014-12-31 CRAN (R 3.1.1)
## rtracklayer * 1.26.2 2014-11-12 Bioconductor
## S4Vectors 0.4.0 2014-10-15 Bioconductor
## scales * 0.2.4 2014-04-22 CRAN (R 3.1.0)
## sendmailR * 1.2-1 2014-09-21 CRAN (R 3.1.1)
## stringr * 0.6.2 2012-12-06 CRAN (R 3.1.0)
## survival * 2.37-7 2014-01-22 CRAN (R 3.1.1)
## TxDb.Hsapiens.UCSC.hg19.knownGene 3.0.0 2014-09-29 Bioconductor
## XML * 3.98-1.1 2013-06-20 CRAN (R 3.1.0)
## xtable * 1.7-4 2014-09-12 CRAN (R 3.1.1)
## XVector 0.6.0 2014-10-15 Bioconductor
## yaml * 2.1.13 2014-06-12 CRAN (R 3.1.1)
## zlibbioc * 1.12.0 2014-10-15 Bioconductor