Contents

Simulate reads using polyester.

0.1 Setup

This code uses the genes from chr17 from the UCSC hg19 known gene database to simulate reads for a two group comparison with 5 samples per group. The number of reads corresponds to what you would expect from libraries with 40 million reads used to sequence the whole human genome. We identify all the transcripts in chr17 and set 1/6 of them to be high, 1/6 of them to be low and the rest the same between the two groups. The differences are 2x and the reads per transcript take into account the transcript length. Reads are 100bp long and come from paired-end reads with mean fragment lengths of 250bp (25bp sd) and a uniform error rate of 0.005. The size is set to 1/3 of the mean in the negative binomial model.

0.2 Code

library('TxDb.Hsapiens.UCSC.hg19.knownGene')
## Loading required package: GenomicFeatures
## Loading required package: BiocGenerics
## Loading required package: methods
## Loading required package: parallel
## 
## Attaching package: 'BiocGenerics'
## 
## The following objects are masked from 'package:parallel':
## 
##     clusterApply, clusterApplyLB, clusterCall, clusterEvalQ,
##     clusterExport, clusterMap, parApply, parCapply, parLapply,
##     parLapplyLB, parRapply, parSapply, parSapplyLB
## 
## The following objects are masked from 'package:stats':
## 
##     IQR, mad, xtabs
## 
## The following objects are masked from 'package:base':
## 
##     anyDuplicated, append, as.data.frame, as.vector, cbind,
##     colnames, do.call, duplicated, eval, evalq, Filter, Find, get,
##     grep, grepl, intersect, is.unsorted, lapply, lengths, Map,
##     mapply, match, mget, order, paste, pmax, pmax.int, pmin,
##     pmin.int, Position, rank, rbind, Reduce, rownames, sapply,
##     setdiff, sort, table, tapply, union, unique, unlist, unsplit
## 
## Loading required package: S4Vectors
## Loading required package: stats4
## 
## Attaching package: 'S4Vectors'
## 
## The following object is masked from 'package:base':
## 
##     expand.grid
## 
## Loading required package: IRanges
## Loading required package: GenomeInfoDb
## Loading required package: GenomicRanges
## Loading required package: AnnotationDbi
## Loading required package: Biobase
## Welcome to Bioconductor
## 
##     Vignettes contain introductory material; view with
##     'browseVignettes()'. To cite Bioconductor, see
##     'citation("Biobase")', and for packages 'citation("pkgname")'.
## Warning in .recacheSubclasses(def@className, def,
## doSubclasses, env): undefined subclass "GIntervalTree" of class
## "GenomicRangesORGenomicRangesList"; definition not updated
## Warning in .recacheSubclasses(def@className, def, doSubclasses, env):
## undefined subclass "GIntervalTree" of class "RangedDataORGenomicRanges";
## definition not updated
library('polyester')
library('Biostrings')
## Loading required package: XVector
library('devtools')

## Fold changes
foldChange <- list(high = 2, low = 1/2)

## Find transcripts
txdb <- keepSeqlevels(TxDb.Hsapiens.UCSC.hg19.knownGene, 'chr17')
tx <- transcriptsBy(txdb, 'gene')

## Transcripts per gene
txs <- sapply(tx, length)
barplot(table(txs), main = 'Number of transcripts per gene in chr17 based on hg19')

## Download info for chr17
if(!file.exists('knownGeneTxMrna.txt.gz')) download.file('http://hgdownload.cse.ucsc.edu/goldenPath/hg19/database/knownGeneTxMrna.txt.gz', 'knownGeneTxMrna.txt.gz')
mrna <- read.table('knownGeneTxMrna.txt.gz', header = FALSE, sep = '\t', quote = '', stringsAsFactors = FALSE, col.names = c('ucsckg', 'sequence'))

## Use all genes from chr17
chosen <- data.frame(
    tx_idx = rep(seq_len(length(txs)), txs),
    tx_n = rep(txs, txs),
    tx_i = unlist(sapply(txs, seq_len), use.names = FALSE)
)

## Add gene id
chosen$gene_id <- names(tx)[chosen$tx_idx]
chosen$ucsckg_id <- unlist(lapply(tx, function(x) x$tx_name), use.names = FALSE)

m <- match(chosen$ucsckg_id, mrna$ucsckg)
fasta <- DNAStringSet(mrna$sequence[m])
names(fasta) <- mrna$ucsckg[m]

## Write fasta file
writeXStringSet(fasta, 'chr17_chosen.fa')



## For a library size of 40 million reads, find out how many would be from chr17
## Based on chr length it would be about 2.6% of the 40 million reads:
data(hg19Ideogram, package = 'biovizBase')
seqlengths(hg19Ideogram)['chr17'] / sum(as.numeric(seqlengths(hg19Ideogram)[paste0('chr', c(1:22, 'X', 'Y'))])) * 100
##    chr17 
## 2.622858
## Based on the total length of mRNA it's about 5.1%. This assumes that all
## base-pairs contained in mRNAs are equally expressed
sum(width(fasta)) / sum(nchar(mrna$sequence)) * 100
## [1] 4.973116
## We'll use the percent based on the total mRNA, meaning that the total number
## of reads would be about 2 million
reads_chr17 <- round(sum(width(fasta)) / sum(nchar(mrna$sequence)) * 40e6)
reads_chr17
## [1] 1989247
## Now we can determine the number of reads per transcript based on a library 
## size of 40 million reads where each read is 100 bp long.
chosen$width <- width(fasta)
chosen$readspertx <- round(reads_chr17 * 100 / sum(chosen$width) * chosen$width / 100)
## Missed just 18 reads due to rounding
sum(chosen$readspertx) - reads_chr17
## [1] 22
## Choose which transcripts are DE in each of the 3 replicate experiments
## Group 1 will be the baseline, group 2 will have the DE signal
n_de <- round(nrow(chosen) / 6)
status <- rep(c('high', 'low', 'normal'), c(n_de, n_de, nrow(chosen) - 2 * n_de))
set.seed(20151119)
chosen$rep1 <- sample(status)
chosen$rep2 <- sample(status)
chosen$rep3 <- sample(status)

## Identify the base means for group 2 in each of the 3 replicates
chosen$meanR1 <- round(chosen$readspertx * ifelse(chosen$rep1 == 'normal', 1, ifelse(chosen$rep1 == 'high', foldChange$high, foldChange$low)) )
chosen$meanR2 <- round(chosen$readspertx * ifelse(chosen$rep2 == 'normal', 1, ifelse(chosen$rep2 == 'high', foldChange$high, foldChange$low)) )
chosen$meanR3 <- round(chosen$readspertx * ifelse(chosen$rep3 == 'normal', 1, ifelse(chosen$rep3 == 'high', foldChange$high, foldChange$low)) )

## Generate the count matrix
readmat <- matrix(NA, nrow = nrow(chosen), ncol = 3 * 10)
for(i in seq_len(30)) {
    if (i %in% c(1:5, 11:15, 21:25)) {
        means <- chosen$readspertx
    } else if (i %in% 6:10) {
        means <- chosen$meanR1    
    } else if (i %in% 16:20) {
        means <- chosen$meanR2    
    } else {
        means <- chosen$meanR3
    }
    readmat[, i] <- mapply(polyester:::NB, means, means / 3)
}
colnames(readmat) <- paste0(rep(paste0('sample', 1:10, 'G', rep(1:2, each = 5)), 3), 'R', rep(1:3, each = 10))
rownames(readmat) <- chosen$ucsckg_id

## Classify txs by DE status
class_de <- function(gene, col) {
    s <- subset(chosen, gene_id == gene)
    status <- s[[col]]
    if(nrow(s) > 1) {
        res <- ifelse(all(status == 'normal'), 'noneDE', ifelse(all(status %in% c('high', 'low')), 'allDE', 'someDE'))
    } else {
        res <- ifelse(status %in% c('high', 'low'), 'singleDE', 'singleNotDE') 
    }
    return(res)
}
chosen$case1 <- sapply(chosen$gene_id, class_de, col = 'rep1')
chosen$case2 <- sapply(chosen$gene_id, class_de, col = 'rep2')
chosen$case3 <- sapply(chosen$gene_id, class_de, col = 'rep3')

## At the transcript level, how does the DE status look like?
sapply(paste0('case', 1:3), function(x) { table(chosen[[x]])})
##             case1 case2 case3
## allDE          84   101    71
## noneDE        649   646   596
## singleDE      193   157   174
## singleNotDE   321   357   340
## someDE       2653  2639  2719
## What about the gene level?
sapply(paste0('case', 1:3), function(x) { table(chosen[[x]][chosen$tx_i == 1])})
##             case1 case2 case3
## allDE          38    43    33
## noneDE        227   226   213
## singleDE      193   157   174
## singleNotDE   321   357   340
## someDE        559   555   578
## Save parameters
save(chosen, readmat, foldChange, file = 'simulation_info.Rdata')

## Run simulation
outdir <- file.path('..', 'simulated_reads')
simulate_experiment_countmat(fasta = 'chr17_chosen.fa', readmat = readmat, outdir = outdir, fraglen = 250, fragsd = 25, readlen = 100, error_rate = 0.005, paired = TRUE, seed = '20151119')

## gzip fasta files
for(i in seq_len(30)) {
    for(j in 1:2) {
        system(paste('gzip', file.path(outdir, paste0("sample_", sprintf('%02d', i), "_", j, ".fasta"))))
    }
}

## Generated pairs info file for running TopHat or HISAT
{
sink(file.path(outdir, "paired.txt"))
for(i in seq_len(30)) {
    cat(paste0("sample_", sprintf('%02d', i), "_1.fasta.gz\tsample_", sprintf('%02d', i), "_2.fasta.gz\t", colnames(readmat)[i], "\n"))
}
sink()
}

## Reproducibility info
Sys.time()
## [1] "2015-11-19 16:13:58 EST"
proc.time()
##     user   system  elapsed 
## 4552.767   96.418 4798.872
session_info()
## Session info --------------------------------------------------------------
##  setting  value                                             
##  version  R Under development (unstable) (2015-08-31 r69229)
##  system   x86_64, linux-gnu                                 
##  ui       X11                                               
##  language (EN)                                              
##  collate  en_US.UTF-8                                       
##  tz       <NA>                                              
##  date     2015-11-19
## Packages ------------------------------------------------------------------
##  package                           * version  date      
##  AnnotationDbi                     * 1.33.1   2015-10-30
##  Biobase                           * 2.31.0   2015-10-20
##  BiocGenerics                      * 0.17.1   2015-11-07
##  BiocParallel                        1.5.0    2015-11-03
##  BiocStyle                         * 1.9.2    2015-11-03
##  biomaRt                             2.27.0   2015-10-20
##  Biostrings                        * 2.39.1   2015-11-19
##  bitops                              1.0-6    2013-08-17
##  DBI                                 0.3.1    2014-09-24
##  devtools                          * 1.9.1    2015-09-11
##  digest                              0.6.8    2014-12-31
##  evaluate                            0.8      2015-09-18
##  formatR                             1.2.1    2015-09-18
##  futile.logger                       1.4.1    2015-04-20
##  futile.options                      1.0.0    2010-04-06
##  GenomeInfoDb                      * 1.7.3    2015-11-03
##  GenomicAlignments                   1.7.3    2015-11-05
##  GenomicFeatures                   * 1.23.8   2015-11-06
##  GenomicRanges                     * 1.23.3   2015-11-07
##  htmltools                           0.2.6    2014-09-08
##  IRanges                           * 2.5.5    2015-11-03
##  knitr                               1.11     2015-08-14
##  lambda.r                            1.1.7    2015-03-20
##  limma                               3.27.5   2015-11-17
##  logspline                           2.1.8    2015-07-03
##  magrittr                            1.5      2014-11-22
##  memoise                             0.2.1    2014-04-22
##  polyester                         * 1.7.1    2015-11-19
##  RCurl                               1.95-4.7 2015-06-30
##  rmarkdown                           0.8.1    2015-10-10
##  Rsamtools                           1.23.1   2015-11-03
##  RSQLite                             1.0.0    2014-10-25
##  rtracklayer                         1.31.1   2015-10-23
##  S4Vectors                         * 0.9.9    2015-11-19
##  stringi                             1.0-1    2015-10-22
##  stringr                             1.0.0    2015-04-30
##  SummarizedExperiment                1.1.2    2015-11-19
##  TxDb.Hsapiens.UCSC.hg19.knownGene * 3.2.2    2015-10-19
##  XML                                 3.98-1.3 2015-06-30
##  XVector                           * 0.11.0   2015-10-20
##  yaml                                2.1.13   2014-06-12
##  zlibbioc                            1.17.0   2015-10-20
##  source                                
##  Bioconductor                          
##  Bioconductor                          
##  Bioconductor                          
##  Bioconductor                          
##  Bioconductor                          
##  Bioconductor                          
##  Bioconductor                          
##  CRAN (R 3.2.0)                        
##  CRAN (R 3.2.0)                        
##  CRAN (R 3.3.0)                        
##  CRAN (R 3.2.0)                        
##  CRAN (R 3.3.0)                        
##  CRAN (R 3.3.0)                        
##  CRAN (R 3.3.0)                        
##  CRAN (R 3.2.0)                        
##  Bioconductor                          
##  Bioconductor                          
##  Bioconductor                          
##  Bioconductor                          
##  CRAN (R 3.2.0)                        
##  Bioconductor                          
##  CRAN (R 3.3.0)                        
##  CRAN (R 3.3.0)                        
##  Bioconductor                          
##  CRAN (R 3.3.0)                        
##  CRAN (R 3.3.0)                        
##  CRAN (R 3.2.0)                        
##  Github (lcolladotor/polyester@a07d933)
##  CRAN (R 3.3.0)                        
##  CRAN (R 3.3.0)                        
##  Bioconductor                          
##  CRAN (R 3.2.0)                        
##  Bioconductor                          
##  Bioconductor                          
##  CRAN (R 3.3.0)                        
##  CRAN (R 3.3.0)                        
##  Bioconductor                          
##  Bioconductor                          
##  CRAN (R 3.3.0)                        
##  Bioconductor                          
##  CRAN (R 3.3.0)                        
##  Bioconductor