Simulate reads using polyester.
Using chr22, a total of 60 genes were selected with reads simulated for 3 groups each with 10 biological replicates. Reads are simulated with a 40x coverage in mind and taking into account the transcript length. Fold changes are set to 1/2 when a group has low expression and 2 when a group has high expression. Reads are 100bp long and come from paired-end reads with mean fragment lengths of 250bp (25bp sd) and a uniform error rate of 0.005. The size
is set to 1/3 of the mean in the negative binomial model.
24 of the genes have only one transcript and are setup in the following way.
36 genes have 2 transcripts and are setup in the following way.
library('TxDb.Hsapiens.UCSC.hg19.knownGene')
## Loading required package: GenomicFeatures
## Loading required package: BiocGenerics
## Loading required package: parallel
##
## Attaching package: 'BiocGenerics'
##
## The following objects are masked from 'package:parallel':
##
## clusterApply, clusterApplyLB, clusterCall, clusterEvalQ,
## clusterExport, clusterMap, parApply, parCapply, parLapply,
## parLapplyLB, parRapply, parSapply, parSapplyLB
##
## The following object is masked from 'package:stats':
##
## xtabs
##
## The following objects are masked from 'package:base':
##
## anyDuplicated, append, as.data.frame, as.vector, cbind,
## colnames, do.call, duplicated, eval, evalq, Filter, Find, get,
## intersect, is.unsorted, lapply, Map, mapply, match, mget,
## order, paste, pmax, pmax.int, pmin, pmin.int, Position, rank,
## rbind, Reduce, rep.int, rownames, sapply, setdiff, sort,
## table, tapply, union, unique, unlist, unsplit
##
## Loading required package: S4Vectors
## Loading required package: stats4
## Loading required package: IRanges
## Loading required package: GenomeInfoDb
## Loading required package: GenomicRanges
## Loading required package: AnnotationDbi
## Loading required package: Biobase
## Welcome to Bioconductor
##
## Vignettes contain introductory material; view with
## 'browseVignettes()'. To cite Bioconductor, see
## 'citation("Biobase")', and for packages 'citation("pkgname")'.
##
##
## Attaching package: 'AnnotationDbi'
##
## The following object is masked from 'package:GenomeInfoDb':
##
## species
library('org.Hs.eg.db')
## Loading required package: DBI
library('polyester')
library('Biostrings')
## Loading required package: XVector
library('devtools')
## Fold changes
foldChange <- list(high = 2, low = 1/2)
## Find transcripts
txdb <- keepSeqlevels(TxDb.Hsapiens.UCSC.hg19.knownGene, 'chr22')
tx <- transcriptsBy(txdb, 'gene')
txs <- sapply(tx, length)
## Fasta file to use
fasta_file <- system.file('extdata', 'chr22.fa', package='polyester')
fasta <- readDNAStringSet(fasta_file)
## Find 24 genes with only 1 transcript, 36 with 2
n1 <- 24
n2 <- 36
chosen <- data.frame(tx_idx = c(which(txs == 1)[1:n1], rep(which(txs == 2), each = 2)), tx_n = rep(c(1, 2), c(n1, 109 * 2)), tx_i = c(rep(1, n1), rep(1:2, 109)))
## Add gene id, refseq info
chosen$gene_id <- names(tx)[chosen$tx_idx]
chosen$ucsckg_id <- mapply(function(i, tx_idx) {
tx[[tx_idx]]$tx_name[i]
}, chosen$tx_i, chosen$tx_idx)
ref <- select(org.Hs.eg.db, keys = chosen$gene_id, columns = 'REFSEQ', keytype = 'ENTREZID')
chosen$fasta_i <- mapply(function(i, gene_id) {
reftx <- ref$REFSEQ[ref$ENTREZID == gene_id]
if(all(sapply(reftx, is.na))) return(NA)
fasta_i <- unlist(sapply(reftx, grep, x = names(fasta)))
fasta_i[i]
}, chosen$tx_i, chosen$gene_id)
## Drop down to 30 genes with 2 txs where annotation map worked
toKeep <- tapply(chosen$fasta_i[chosen$tx_n == 2], chosen$gene_id[chosen$tx_n == 2], function(x) {
!any(sapply(x, is.na))
})
## Final selection
chosen <- subset(chosen, gene_id %in% c(chosen$gene_id[1:n1], names(which(toKeep)[1:36])))
writeXStringSet(fasta[chosen$fasta_i], 'chr22_chosen.fa')
#txinfo <- select(txdb, keys = tx[[6]]$tx_name, columns = columns(txdb), keytype = 'TXNAME')
#txinfo$EXONEND - txinfo$EXONSTART + 1
## Select txs to be DE
chosen$DE <- FALSE
de <- sample(1:n1, n1/2)
chosen$DE[de] <- TRUE
x <- sample(1:n2, n2 * 2 / 3) ## DE with only 1 tx
y <- sample(x, n2 / 3) ## DE all txs in gene
de.all <- unlist(sapply((y - 1) * 2, function(x) {x + n1 + 1:2}, simplify = FALSE))
chosen$DE[ de.all ] <- TRUE
z <- x[!x %in% y] ## DE only 1 tx in gene
de.one <- sapply((z - 1) * 2, function(x) {x + sample(n1 + 1:2, 1)} )
chosen$DE[ de.one ] <- TRUE
## DE type
chosen$group1 <- 'normal'
chosen$group1[ de[1:4] ] <- rep(c('low', 'high'), 2)
chosen$group1[ de.all[1:8] ] <- rep(rep(c('low', 'high'), each = 2), 2)
chosen$group1[ de.one[1:4] ] <- rep(c('low', 'high'), 2)
chosen$group2 <- 'normal'
chosen$group2[ de[5:8] ] <- rep(c('low', 'high'), 2)
chosen$group2[ de.all[9:16] ] <- rep(rep(c('low', 'high'), each = 2), 2)
chosen$group2[ de.one[5:8] ] <- rep(c('low', 'high'), 2)
chosen$group3 <- 'normal'
chosen$group3[ de[9:12] ] <- rep(c('low', 'high'), 2)
chosen$group3[ de.all[17:24] ] <- rep(rep(c('low', 'high'), each = 2), 2)
chosen$group3[ de.one[9:12] ] <- rep(c('low', 'high'), 2)
## Determine reads per transcript (40x coverage, 100bp reads)
chosen$width <- width(fasta[chosen$fasta_i])
chosen$readspertx <- round(chosen$width / 100 * 40)
## Base means
chosen$mean1 <- round(chosen$readspertx * ifelse(chosen$group1 == 'normal', 1, ifelse(chosen$group1 == 'high', foldChange$high, foldChange$low)))
chosen$mean2 <- round(chosen$readspertx * ifelse(chosen$group2 == 'normal', 1, ifelse(chosen$group2 == 'high', foldChange$high, foldChange$low)))
chosen$mean3 <- round(chosen$readspertx * ifelse(chosen$group3 == 'normal', 1, ifelse(chosen$group3 == 'high', foldChange$high, foldChange$low)))
## Generate count matrix
readmat <- matrix(NA, nrow = nrow(chosen), ncol = 3 * 10)
for(i in seq_len(30)) {
if (i <= 10) {
means <- chosen$mean1
} else if (i <= 20) {
means <- chosen$mean2
} else {
means <- chosen$mean3
}
readmat[, i] <- mapply(polyester:::NB, means, means / 3)
}
## Classify txs
chosen$case <- sapply(chosen$gene_id, function(gene) {
s <- subset(chosen, gene_id == gene)
if( nrow(s) == 2) {
res <- ifelse(all(s$DE), 'bothDE', ifelse(any(s$DE), 'oneDE', 'noneDE'))
} else {
res<- ifelse(s$DE, 'singleDE', 'singleNotDE')
}
return(res)
})
## Save parameters
save(chosen, readmat, foldChange, file = 'simulation_info.Rdata')
## Run simulation
outdir <- 'simulated_reads'
simulate_experiment_countmat(fasta = 'chr22_chosen.fa', readmat = readmat, outdir = outdir, fraglen = 250, fragsd = 25, readlen = 100, error_rate = 0.005, paired = TRUE, seed = '20141202')
## gzip fasta files
for(i in seq_len(30)) {
for(j in 1:2) {
system(paste('gzip', file.path(outdir, paste0("sample_", sprintf('%02d', i), "_", j, ".fasta"))))
}
}
## Generated pairs info file for running Tophat
{
sink(file.path(outdir, "paired.txt"))
for(i in seq_len(30)) {
cat(paste0("sample_", sprintf('%02d', i), "_1.fasta.gz\tsample_", sprintf('%02d', i), "_2.fasta.gz\tsample", i, "\n"))
}
sink()
}
## Reproducibility info
Sys.time()
## [1] "2014-12-04 13:45:28 EST"
proc.time()
## user system elapsed
## 148.180 12.774 163.927
session_info()
## Session info--------------------------------------------------------------
## setting value
## version R Under development (unstable) (2014-11-01 r66923)
## system x86_64, darwin10.8.0
## ui X11
## language (EN)
## collate en_US.UTF-8
## tz America/New_York
## Packages------------------------------------------------------------------
## package * version date source
## AnnotationDbi * 1.29.10 2014-11-22 Bioconductor
## base64enc 0.1.2 2014-06-26 CRAN (R 3.2.0)
## BatchJobs 1.4 2014-09-24 CRAN (R 3.2.0)
## BBmisc 1.7 2014-06-21 CRAN (R 3.2.0)
## Biobase * 2.27.0 2014-10-14 Bioconductor
## BiocGenerics * 0.13.2 2014-11-18 Bioconductor
## BiocParallel 1.1.9 2014-11-24 Bioconductor
## biomaRt 2.23.5 2014-11-22 Bioconductor
## Biostrings * 2.35.5 2014-11-24 Bioconductor
## bitops 1.0.6 2013-08-17 CRAN (R 3.2.0)
## brew 1.0.6 2011-04-13 CRAN (R 3.2.0)
## checkmate 1.5.0 2014-10-19 CRAN (R 3.2.0)
## codetools 0.2.9 2014-08-21 CRAN (R 3.2.0)
## DBI * 0.3.1 2014-09-24 CRAN (R 3.2.0)
## devtools * 1.6.1 2014-10-07 CRAN (R 3.2.0)
## digest 0.6.4 2013-12-03 CRAN (R 3.2.0)
## evaluate 0.5.5 2014-04-29 CRAN (R 3.2.0)
## fail 1.2 2013-09-19 CRAN (R 3.2.0)
## foreach 1.4.2 2014-04-11 CRAN (R 3.2.0)
## formatR 1.0 2014-08-25 CRAN (R 3.2.0)
## GenomeInfoDb * 1.3.7 2014-11-15 Bioconductor
## GenomicAlignments 1.3.12 2014-11-27 Bioconductor
## GenomicFeatures * 1.19.6 2014-11-04 Bioconductor
## GenomicRanges * 1.19.18 2014-12-01 Bioconductor
## htmltools 0.2.6 2014-09-08 CRAN (R 3.2.0)
## IRanges * 2.1.24 2014-12-01 Bioconductor
## iterators 1.0.7 2014-04-11 CRAN (R 3.2.0)
## knitr 1.7 2014-10-13 CRAN (R 3.2.0)
## org.Hs.eg.db * 3.0.0 2014-09-26 Bioconductor
## polyester * 1.1.0 2014-10-14 Bioconductor
## RCurl 1.95.4.3 2014-07-29 CRAN (R 3.2.0)
## rmarkdown * 0.3.3 2014-09-17 CRAN (R 3.2.0)
## Rsamtools 1.19.11 2014-11-26 Bioconductor
## RSQLite * 1.0.0 2014-10-25 CRAN (R 3.2.0)
## rstudioapi 0.1 2014-03-27 CRAN (R 3.2.0)
## rtracklayer 1.27.6 2014-11-26 Bioconductor
## S4Vectors * 0.5.11 2014-11-24 Bioconductor
## sendmailR 1.2.1 2014-09-21 CRAN (R 3.2.0)
## stringr 0.6.2 2012-12-06 CRAN (R 3.2.0)
## TxDb.Hsapiens.UCSC.hg19.knownGene * 3.0.0 2014-09-26 Bioconductor
## XML 3.98.1.1 2013-06-20 CRAN (R 3.2.0)
## XVector * 0.7.3 2014-11-24 Bioconductor
## yaml 2.1.13 2014-06-12 CRAN (R 3.2.0)
## zlibbioc 1.13.0 2014-10-14 Bioconductor