Simulate reads using polyester.
This code uses the genes from chr17 from the UCSC hg19 known gene database to simulate reads for a two group comparison with 5 samples per group. The number of reads corresponds to what you would expect from libraries with 40 million reads used to sequence the whole human genome. We identify all the transcripts in chr17 and set 1/6 of them to be high, 1/6 of them to be low and the rest the same between the two groups. The differences are 2x and the reads per transcript take into account the transcript length. Reads are 100bp long and come from paired-end reads with mean fragment lengths of 250bp (25bp sd) and a uniform error rate of 0.005. The size
is set to 1/3 of the mean in the negative binomial model.
library('TxDb.Hsapiens.UCSC.hg19.knownGene')
## Loading required package: GenomicFeatures
## Loading required package: BiocGenerics
## Loading required package: methods
## Loading required package: parallel
##
## Attaching package: 'BiocGenerics'
##
## The following objects are masked from 'package:parallel':
##
## clusterApply, clusterApplyLB, clusterCall, clusterEvalQ,
## clusterExport, clusterMap, parApply, parCapply, parLapply,
## parLapplyLB, parRapply, parSapply, parSapplyLB
##
## The following objects are masked from 'package:stats':
##
## IQR, mad, xtabs
##
## The following objects are masked from 'package:base':
##
## anyDuplicated, append, as.data.frame, as.vector, cbind,
## colnames, do.call, duplicated, eval, evalq, Filter, Find, get,
## grep, grepl, intersect, is.unsorted, lapply, lengths, Map,
## mapply, match, mget, order, paste, pmax, pmax.int, pmin,
## pmin.int, Position, rank, rbind, Reduce, rownames, sapply,
## setdiff, sort, table, tapply, union, unique, unlist, unsplit
##
## Loading required package: S4Vectors
## Loading required package: stats4
##
## Attaching package: 'S4Vectors'
##
## The following object is masked from 'package:base':
##
## expand.grid
##
## Loading required package: IRanges
## Loading required package: GenomeInfoDb
## Loading required package: GenomicRanges
## Loading required package: AnnotationDbi
## Loading required package: Biobase
## Welcome to Bioconductor
##
## Vignettes contain introductory material; view with
## 'browseVignettes()'. To cite Bioconductor, see
## 'citation("Biobase")', and for packages 'citation("pkgname")'.
## Warning in .recacheSubclasses(def@className, def,
## doSubclasses, env): undefined subclass "GIntervalTree" of class
## "GenomicRangesORGenomicRangesList"; definition not updated
## Warning in .recacheSubclasses(def@className, def, doSubclasses, env):
## undefined subclass "GIntervalTree" of class "RangedDataORGenomicRanges";
## definition not updated
library('polyester')
library('Biostrings')
## Loading required package: XVector
library('devtools')
## Fold changes
foldChange <- list(high = 2, low = 1/2)
## Find transcripts
txdb <- keepSeqlevels(TxDb.Hsapiens.UCSC.hg19.knownGene, 'chr17')
tx <- transcriptsBy(txdb, 'gene')
## Transcripts per gene
txs <- sapply(tx, length)
barplot(table(txs), main = 'Number of transcripts per gene in chr17 based on hg19')
## Download info for chr17
if(!file.exists('knownGeneTxMrna.txt.gz')) download.file('http://hgdownload.cse.ucsc.edu/goldenPath/hg19/database/knownGeneTxMrna.txt.gz', 'knownGeneTxMrna.txt.gz')
mrna <- read.table('knownGeneTxMrna.txt.gz', header = FALSE, sep = '\t', quote = '', stringsAsFactors = FALSE, col.names = c('ucsckg', 'sequence'))
## Use all genes from chr17
chosen <- data.frame(
tx_idx = rep(seq_len(length(txs)), txs),
tx_n = rep(txs, txs),
tx_i = unlist(sapply(txs, seq_len), use.names = FALSE)
)
## Add gene id
chosen$gene_id <- names(tx)[chosen$tx_idx]
chosen$ucsckg_id <- unlist(lapply(tx, function(x) x$tx_name), use.names = FALSE)
m <- match(chosen$ucsckg_id, mrna$ucsckg)
fasta <- DNAStringSet(mrna$sequence[m])
names(fasta) <- mrna$ucsckg[m]
## Write fasta file
writeXStringSet(fasta, 'chr17_chosen.fa')
## For a library size of 40 million reads, find out how many would be from chr17
## Based on chr length it would be about 2.6% of the 40 million reads:
data(hg19Ideogram, package = 'biovizBase')
seqlengths(hg19Ideogram)['chr17'] / sum(as.numeric(seqlengths(hg19Ideogram)[paste0('chr', c(1:22, 'X', 'Y'))])) * 100
## chr17
## 2.622858
## Based on the total length of mRNA it's about 5.1%. This assumes that all
## base-pairs contained in mRNAs are equally expressed
sum(width(fasta)) / sum(nchar(mrna$sequence)) * 100
## [1] 4.973116
## We'll use the percent based on the total mRNA, meaning that the total number
## of reads would be about 2 million
reads_chr17 <- round(sum(width(fasta)) / sum(nchar(mrna$sequence)) * 40e6)
reads_chr17
## [1] 1989247
## Now we can determine the number of reads per transcript based on a library
## size of 40 million reads where each read is 100 bp long.
chosen$width <- width(fasta)
chosen$readspertx <- round(reads_chr17 * 100 / sum(chosen$width) * chosen$width / 100)
## Missed just 18 reads due to rounding
sum(chosen$readspertx) - reads_chr17
## [1] 22
## Choose which transcripts are DE in each of the 3 replicate experiments
## Group 1 will be the baseline, group 2 will have the DE signal
n_de <- round(nrow(chosen) / 6)
status <- rep(c('high', 'low', 'normal'), c(n_de, n_de, nrow(chosen) - 2 * n_de))
set.seed(20151119)
chosen$rep1 <- sample(status)
chosen$rep2 <- sample(status)
chosen$rep3 <- sample(status)
## Identify the base means for group 2 in each of the 3 replicates
chosen$meanR1 <- round(chosen$readspertx * ifelse(chosen$rep1 == 'normal', 1, ifelse(chosen$rep1 == 'high', foldChange$high, foldChange$low)) )
chosen$meanR2 <- round(chosen$readspertx * ifelse(chosen$rep2 == 'normal', 1, ifelse(chosen$rep2 == 'high', foldChange$high, foldChange$low)) )
chosen$meanR3 <- round(chosen$readspertx * ifelse(chosen$rep3 == 'normal', 1, ifelse(chosen$rep3 == 'high', foldChange$high, foldChange$low)) )
## Generate the count matrix
readmat <- matrix(NA, nrow = nrow(chosen), ncol = 3 * 10)
for(i in seq_len(30)) {
if (i %in% c(1:5, 11:15, 21:25)) {
means <- chosen$readspertx
} else if (i %in% 6:10) {
means <- chosen$meanR1
} else if (i %in% 16:20) {
means <- chosen$meanR2
} else {
means <- chosen$meanR3
}
readmat[, i] <- mapply(polyester:::NB, means, means / 3)
}
colnames(readmat) <- paste0(rep(paste0('sample', 1:10, 'G', rep(1:2, each = 5)), 3), 'R', rep(1:3, each = 10))
rownames(readmat) <- chosen$ucsckg_id
## Classify txs by DE status
class_de <- function(gene, col) {
s <- subset(chosen, gene_id == gene)
status <- s[[col]]
if(nrow(s) > 1) {
res <- ifelse(all(status == 'normal'), 'noneDE', ifelse(all(status %in% c('high', 'low')), 'allDE', 'someDE'))
} else {
res <- ifelse(status %in% c('high', 'low'), 'singleDE', 'singleNotDE')
}
return(res)
}
chosen$case1 <- sapply(chosen$gene_id, class_de, col = 'rep1')
chosen$case2 <- sapply(chosen$gene_id, class_de, col = 'rep2')
chosen$case3 <- sapply(chosen$gene_id, class_de, col = 'rep3')
## At the transcript level, how does the DE status look like?
sapply(paste0('case', 1:3), function(x) { table(chosen[[x]])})
## case1 case2 case3
## allDE 84 101 71
## noneDE 649 646 596
## singleDE 193 157 174
## singleNotDE 321 357 340
## someDE 2653 2639 2719
## What about the gene level?
sapply(paste0('case', 1:3), function(x) { table(chosen[[x]][chosen$tx_i == 1])})
## case1 case2 case3
## allDE 38 43 33
## noneDE 227 226 213
## singleDE 193 157 174
## singleNotDE 321 357 340
## someDE 559 555 578
## Save parameters
save(chosen, readmat, foldChange, file = 'simulation_info.Rdata')
## Run simulation
outdir <- file.path('..', 'simulated_reads')
simulate_experiment_countmat(fasta = 'chr17_chosen.fa', readmat = readmat, outdir = outdir, fraglen = 250, fragsd = 25, readlen = 100, error_rate = 0.005, paired = TRUE, seed = '20151119')
## gzip fasta files
for(i in seq_len(30)) {
for(j in 1:2) {
system(paste('gzip', file.path(outdir, paste0("sample_", sprintf('%02d', i), "_", j, ".fasta"))))
}
}
## Generated pairs info file for running TopHat or HISAT
{
sink(file.path(outdir, "paired.txt"))
for(i in seq_len(30)) {
cat(paste0("sample_", sprintf('%02d', i), "_1.fasta.gz\tsample_", sprintf('%02d', i), "_2.fasta.gz\t", colnames(readmat)[i], "\n"))
}
sink()
}
## Reproducibility info
Sys.time()
## [1] "2015-11-19 16:13:58 EST"
proc.time()
## user system elapsed
## 4552.767 96.418 4798.872
session_info()
## Session info --------------------------------------------------------------
## setting value
## version R Under development (unstable) (2015-08-31 r69229)
## system x86_64, linux-gnu
## ui X11
## language (EN)
## collate en_US.UTF-8
## tz <NA>
## date 2015-11-19
## Packages ------------------------------------------------------------------
## package * version date
## AnnotationDbi * 1.33.1 2015-10-30
## Biobase * 2.31.0 2015-10-20
## BiocGenerics * 0.17.1 2015-11-07
## BiocParallel 1.5.0 2015-11-03
## BiocStyle * 1.9.2 2015-11-03
## biomaRt 2.27.0 2015-10-20
## Biostrings * 2.39.1 2015-11-19
## bitops 1.0-6 2013-08-17
## DBI 0.3.1 2014-09-24
## devtools * 1.9.1 2015-09-11
## digest 0.6.8 2014-12-31
## evaluate 0.8 2015-09-18
## formatR 1.2.1 2015-09-18
## futile.logger 1.4.1 2015-04-20
## futile.options 1.0.0 2010-04-06
## GenomeInfoDb * 1.7.3 2015-11-03
## GenomicAlignments 1.7.3 2015-11-05
## GenomicFeatures * 1.23.8 2015-11-06
## GenomicRanges * 1.23.3 2015-11-07
## htmltools 0.2.6 2014-09-08
## IRanges * 2.5.5 2015-11-03
## knitr 1.11 2015-08-14
## lambda.r 1.1.7 2015-03-20
## limma 3.27.5 2015-11-17
## logspline 2.1.8 2015-07-03
## magrittr 1.5 2014-11-22
## memoise 0.2.1 2014-04-22
## polyester * 1.7.1 2015-11-19
## RCurl 1.95-4.7 2015-06-30
## rmarkdown 0.8.1 2015-10-10
## Rsamtools 1.23.1 2015-11-03
## RSQLite 1.0.0 2014-10-25
## rtracklayer 1.31.1 2015-10-23
## S4Vectors * 0.9.9 2015-11-19
## stringi 1.0-1 2015-10-22
## stringr 1.0.0 2015-04-30
## SummarizedExperiment 1.1.2 2015-11-19
## TxDb.Hsapiens.UCSC.hg19.knownGene * 3.2.2 2015-10-19
## XML 3.98-1.3 2015-06-30
## XVector * 0.11.0 2015-10-20
## yaml 2.1.13 2014-06-12
## zlibbioc 1.17.0 2015-10-20
## source
## Bioconductor
## Bioconductor
## Bioconductor
## Bioconductor
## Bioconductor
## Bioconductor
## Bioconductor
## CRAN (R 3.2.0)
## CRAN (R 3.2.0)
## CRAN (R 3.3.0)
## CRAN (R 3.2.0)
## CRAN (R 3.3.0)
## CRAN (R 3.3.0)
## CRAN (R 3.3.0)
## CRAN (R 3.2.0)
## Bioconductor
## Bioconductor
## Bioconductor
## Bioconductor
## CRAN (R 3.2.0)
## Bioconductor
## CRAN (R 3.3.0)
## CRAN (R 3.3.0)
## Bioconductor
## CRAN (R 3.3.0)
## CRAN (R 3.3.0)
## CRAN (R 3.2.0)
## Github (lcolladotor/polyester@a07d933)
## CRAN (R 3.3.0)
## CRAN (R 3.3.0)
## Bioconductor
## CRAN (R 3.2.0)
## Bioconductor
## Bioconductor
## CRAN (R 3.3.0)
## CRAN (R 3.3.0)
## Bioconductor
## Bioconductor
## CRAN (R 3.3.0)
## Bioconductor
## CRAN (R 3.3.0)
## Bioconductor