Simulate reads

0.2 Code

library('TxDb.Hsapiens.UCSC.hg19.knownGene')

## Loading required package: GenomicFeatures
## Loading required package: BiocGenerics
## Loading required package: methods
## Loading required package: parallel
## 
## Attaching package: 'BiocGenerics'
## 
## The following objects are masked from 'package:parallel':
## 
##     clusterApply, clusterApplyLB, clusterCall, clusterEvalQ,
##     clusterExport, clusterMap, parApply, parCapply, parLapply,
##     parLapplyLB, parRapply, parSapply, parSapplyLB
## 
## The following objects are masked from 'package:stats':
## 
##     IQR, mad, xtabs
## 
## The following objects are masked from 'package:base':
## 
##     anyDuplicated, append, as.data.frame, as.vector, cbind,
##     colnames, do.call, duplicated, eval, evalq, Filter, Find, get,
##     grep, grepl, intersect, is.unsorted, lapply, lengths, Map,
##     mapply, match, mget, order, paste, pmax, pmax.int, pmin,
##     pmin.int, Position, rank, rbind, Reduce, rownames, sapply,
##     setdiff, sort, table, tapply, union, unique, unlist, unsplit
## 
## Loading required package: S4Vectors
## Loading required package: stats4
## 
## Attaching package: 'S4Vectors'
## 
## The following object is masked from 'package:base':
## 
##     expand.grid
## 
## Loading required package: IRanges
## Loading required package: GenomeInfoDb
## Loading required package: GenomicRanges
## Loading required package: AnnotationDbi
## Loading required package: Biobase
## Welcome to Bioconductor
## 
##     Vignettes contain introductory material; view with
##     'browseVignettes()'. To cite Bioconductor, see
##     'citation("Biobase")', and for packages 'citation("pkgname")'.

## Warning in .recacheSubclasses(def@className, def,
## doSubclasses, env): undefined subclass "GIntervalTree" of class
## "GenomicRangesORGenomicRangesList"; definition not updated

## Warning in .recacheSubclasses(def@className, def, doSubclasses, env):
## undefined subclass "GIntervalTree" of class "RangedDataORGenomicRanges";
## definition not updated

library('polyester')
library('Biostrings')

## Loading required package: XVector

library('devtools')

## Fold changes
foldChange <- list(high = 2, low = 1/2)

## Find transcripts
txdb <- keepSeqlevels(TxDb.Hsapiens.UCSC.hg19.knownGene, 'chr17')
tx <- transcriptsBy(txdb, 'gene')

## Transcripts per gene
txs <- sapply(tx, length)
barplot(table(txs), main = 'Number of transcripts per gene in chr17 based on hg19')

## Download info for chr17
if(!file.exists('knownGeneTxMrna.txt.gz')) download.file('http://hgdownload.cse.ucsc.edu/goldenPath/hg19/database/knownGeneTxMrna.txt.gz', 'knownGeneTxMrna.txt.gz')
mrna <- read.table('knownGeneTxMrna.txt.gz', header = FALSE, sep = '\t', quote = '', stringsAsFactors = FALSE, col.names = c('ucsckg', 'sequence'))

## Use all genes from chr17
chosen <- data.frame(
    tx_idx = rep(seq_len(length(txs)), txs),
    tx_n = rep(txs, txs),
    tx_i = unlist(sapply(txs, seq_len), use.names = FALSE)
)

## Add gene id
chosen$gene_id <- names(tx)[chosen$tx_idx]
chosen$ucsckg_id <- unlist(lapply(tx, function(x) x$tx_name), use.names = FALSE)

m <- match(chosen$ucsckg_id, mrna$ucsckg)
fasta <- DNAStringSet(mrna$sequence[m])
names(fasta) <- mrna$ucsckg[m]

## Write fasta file
writeXStringSet(fasta, 'chr17_chosen.fa')



## For a library size of 40 million reads, find out how many would be from chr17
## Based on chr length it would be about 2.6% of the 40 million reads:
data(hg19Ideogram, package = 'biovizBase')
seqlengths(hg19Ideogram)['chr17'] / sum(as.numeric(seqlengths(hg19Ideogram)[paste0('chr', c(1:22, 'X', 'Y'))])) * 100

##    chr17 
## 2.622858

## Based on the total length of mRNA it's about 5.1%. This assumes that all
## base-pairs contained in mRNAs are equally expressed
sum(width(fasta)) / sum(nchar(mrna$sequence)) * 100

## [1] 4.973116

## We'll use the percent based on the total mRNA, meaning that the total number
## of reads would be about 2 million
reads_chr17 <- round(sum(width(fasta)) / sum(nchar(mrna$sequence)) * 40e6)
reads_chr17

## [1] 1989247

## Now we can determine the number of reads per transcript based on a library 
## size of 40 million reads where each read is 100 bp long.
chosen$width <- width(fasta)
chosen$readspertx <- round(reads_chr17 * 100 / sum(chosen$width) * chosen$width / 100)
## Missed just 18 reads due to rounding
sum(chosen$readspertx) - reads_chr17

## [1] 22

## Choose which transcripts are DE in each of the 3 replicate experiments
## Group 1 will be the baseline, group 2 will have the DE signal
n_de <- round(nrow(chosen) / 6)
status <- rep(c('high', 'low', 'normal'), c(n_de, n_de, nrow(chosen) - 2 * n_de))
set.seed(20151119)
chosen$rep1 <- sample(status)
chosen$rep2 <- sample(status)
chosen$rep3 <- sample(status)

## Identify the base means for group 2 in each of the 3 replicates
chosen$meanR1 <- round(chosen$readspertx * ifelse(chosen$rep1 == 'normal', 1, ifelse(chosen$rep1 == 'high', foldChange$high, foldChange$low)) )
chosen$meanR2 <- round(chosen$readspertx * ifelse(chosen$rep2 == 'normal', 1, ifelse(chosen$rep2 == 'high', foldChange$high, foldChange$low)) )
chosen$meanR3 <- round(chosen$readspertx * ifelse(chosen$rep3 == 'normal', 1, ifelse(chosen$rep3 == 'high', foldChange$high, foldChange$low)) )

## Generate the count matrix
readmat <- matrix(NA, nrow = nrow(chosen), ncol = 3 * 10)
for(i in seq_len(30)) {
    if (i %in% c(1:5, 11:15, 21:25)) {
        means <- chosen$readspertx
    } else if (i %in% 6:10) {
        means <- chosen$meanR1    
    } else if (i %in% 16:20) {
        means <- chosen$meanR2    
    } else {
        means <- chosen$meanR3
    }
    readmat[, i] <- mapply(polyester:::NB, means, means / 3)
}
colnames(readmat) <- paste0(rep(paste0('sample', 1:10, 'G', rep(1:2, each = 5)), 3), 'R', rep(1:3, each = 10))
rownames(readmat) <- chosen$ucsckg_id

## Classify txs by DE status
class_de <- function(gene, col) {
    s <- subset(chosen, gene_id == gene)
    status <- s[[col]]
    if(nrow(s) > 1) {
        res <- ifelse(all(status == 'normal'), 'noneDE', ifelse(all(status %in% c('high', 'low')), 'allDE', 'someDE'))
    } else {
        res <- ifelse(status %in% c('high', 'low'), 'singleDE', 'singleNotDE') 
    }
    return(res)
}
chosen$case1 <- sapply(chosen$gene_id, class_de, col = 'rep1')
chosen$case2 <- sapply(chosen$gene_id, class_de, col = 'rep2')
chosen$case3 <- sapply(chosen$gene_id, class_de, col = 'rep3')

## At the transcript level, how does the DE status look like?
sapply(paste0('case', 1:3), function(x) { table(chosen[[x]])})

##             case1 case2 case3
## allDE          84   101    71
## noneDE        649   646   596
## singleDE      193   157   174
## singleNotDE   321   357   340
## someDE       2653  2639  2719

## What about the gene level?
sapply(paste0('case', 1:3), function(x) { table(chosen[[x]][chosen$tx_i == 1])})

##             case1 case2 case3
## allDE          38    43    33
## noneDE        227   226   213
## singleDE      193   157   174
## singleNotDE   321   357   340
## someDE        559   555   578

## Save parameters
save(chosen, readmat, foldChange, file = 'simulation_info.Rdata')

## Run simulation
outdir <- file.path('..', 'simulated_reads')
simulate_experiment_countmat(fasta = 'chr17_chosen.fa', readmat = readmat, outdir = outdir, fraglen = 250, fragsd = 25, readlen = 100, error_rate = 0.005, paired = TRUE, seed = '20151119')

## gzip fasta files
for(i in seq_len(30)) {
    for(j in 1:2) {
        system(paste('gzip', file.path(outdir, paste0("sample_", sprintf('%02d', i), "_", j, ".fasta"))))
    }
}

## Generated pairs info file for running TopHat or HISAT
{
sink(file.path(outdir, "paired.txt"))
for(i in seq_len(30)) {
    cat(paste0("sample_", sprintf('%02d', i), "_1.fasta.gz\tsample_", sprintf('%02d', i), "_2.fasta.gz\t", colnames(readmat)[i], "\n"))
}
sink()
}

## Reproducibility info
Sys.time()

## [1] "2015-11-19 16:13:58 EST"

proc.time()

##     user   system  elapsed 
## 4552.767   96.418 4798.872

session_info()

## Session info --------------------------------------------------------------

##  setting  value                                             
##  version  R Under development (unstable) (2015-08-31 r69229)
##  system   x86_64, linux-gnu                                 
##  ui       X11                                               
##  language (EN)                                              
##  collate  en_US.UTF-8                                       
##  tz       <NA>                                              
##  date     2015-11-19

## Packages ------------------------------------------------------------------

##  package                           * version  date      
##  AnnotationDbi                     * 1.33.1   2015-10-30
##  Biobase                           * 2.31.0   2015-10-20
##  BiocGenerics                      * 0.17.1   2015-11-07
##  BiocParallel                        1.5.0    2015-11-03
##  BiocStyle                         * 1.9.2    2015-11-03
##  biomaRt                             2.27.0   2015-10-20
##  Biostrings                        * 2.39.1   2015-11-19
##  bitops                              1.0-6    2013-08-17
##  DBI                                 0.3.1    2014-09-24
##  devtools                          * 1.9.1    2015-09-11
##  digest                              0.6.8    2014-12-31
##  evaluate                            0.8      2015-09-18
##  formatR                             1.2.1    2015-09-18
##  futile.logger                       1.4.1    2015-04-20
##  futile.options                      1.0.0    2010-04-06
##  GenomeInfoDb                      * 1.7.3    2015-11-03
##  GenomicAlignments                   1.7.3    2015-11-05
##  GenomicFeatures                   * 1.23.8   2015-11-06
##  GenomicRanges                     * 1.23.3   2015-11-07
##  htmltools                           0.2.6    2014-09-08
##  IRanges                           * 2.5.5    2015-11-03
##  knitr                               1.11     2015-08-14
##  lambda.r                            1.1.7    2015-03-20
##  limma                               3.27.5   2015-11-17
##  logspline                           2.1.8    2015-07-03
##  magrittr                            1.5      2014-11-22
##  memoise                             0.2.1    2014-04-22
##  polyester                         * 1.7.1    2015-11-19
##  RCurl                               1.95-4.7 2015-06-30
##  rmarkdown                           0.8.1    2015-10-10
##  Rsamtools                           1.23.1   2015-11-03
##  RSQLite                             1.0.0    2014-10-25
##  rtracklayer                         1.31.1   2015-10-23
##  S4Vectors                         * 0.9.9    2015-11-19
##  stringi                             1.0-1    2015-10-22
##  stringr                             1.0.0    2015-04-30
##  SummarizedExperiment                1.1.2    2015-11-19
##  TxDb.Hsapiens.UCSC.hg19.knownGene * 3.2.2    2015-10-19
##  XML                                 3.98-1.3 2015-06-30
##  XVector                           * 0.11.0   2015-10-20
##  yaml                                2.1.13   2014-06-12
##  zlibbioc                            1.17.0   2015-10-20
##  source                                
##  Bioconductor                          
##  Bioconductor                          
##  Bioconductor                          
##  Bioconductor                          
##  Bioconductor                          
##  Bioconductor                          
##  Bioconductor                          
##  CRAN (R 3.2.0)                        
##  CRAN (R 3.2.0)                        
##  CRAN (R 3.3.0)                        
##  CRAN (R 3.2.0)                        
##  CRAN (R 3.3.0)                        
##  CRAN (R 3.3.0)                        
##  CRAN (R 3.3.0)                        
##  CRAN (R 3.2.0)                        
##  Bioconductor                          
##  Bioconductor                          
##  Bioconductor                          
##  Bioconductor                          
##  CRAN (R 3.2.0)                        
##  Bioconductor                          
##  CRAN (R 3.3.0)                        
##  CRAN (R 3.3.0)                        
##  Bioconductor                          
##  CRAN (R 3.3.0)                        
##  CRAN (R 3.3.0)                        
##  CRAN (R 3.2.0)                        
##  Github (lcolladotor/polyester@a07d933)
##  CRAN (R 3.3.0)                        
##  CRAN (R 3.3.0)                        
##  Bioconductor                          
##  CRAN (R 3.2.0)                        
##  Bioconductor                          
##  Bioconductor                          
##  CRAN (R 3.3.0)                        
##  CRAN (R 3.3.0)                        
##  Bioconductor                          
##  Bioconductor                          
##  CRAN (R 3.3.0)                        
##  Bioconductor                          
##  CRAN (R 3.3.0)                        
##  Bioconductor

Simulate reads

L Collado-Torres

19 November 2015

Contents

0.1 Setup

0.2 Code