Simulate reads

Simulate reads using polyester.

Setup

Using chr22, a total of 60 genes were selected with reads simulated for 3 groups each with 10 biological replicates. Reads are simulated with a 40x coverage in mind and taking into account the transcript length. Fold changes are set to 1/2 when a group has low expression and 2 when a group has high expression. Reads are 100bp long and come from paired-end reads with mean fragment lengths of 250bp (25bp sd) and a uniform error rate of 0.005. The size is set to 1/3 of the mean in the negative binomial model.

24 of the genes have only one transcript and are setup in the following way.

  • 4 are differentially expressed in group 1: 2 low, 2 high
  • 4 are differentially expressed in group 2: 2 low, 2 high
  • 4 are differentially expressed in group 3: 2 low, 2 high
  • 12 are not differentially expressed across the 3 groups

36 genes have 2 transcripts and are setup in the following way.

  • 12 of them are differentially expressed in both transcripts.
    • 4 are differentially expressed in group 1: 2 low, 2 high
    • 4 are differentially expressed in group 2: 2 low, 2 high
    • 4 are differentially expressed in group 3: 2 low, 2 high
  • 12 of them are differentially expressed in one transcript.
    • 4 are differentially expressed in group 1: 2 low, 2 high
    • 4 are differentially expressed in group 2: 2 low, 2 high
    • 4 are differentially expressed in group 3: 2 low, 2 high
  • 12 are not differentially expressed across the 3 groups in both transcripts

Code

library('TxDb.Hsapiens.UCSC.hg19.knownGene')
## Loading required package: GenomicFeatures
## Loading required package: BiocGenerics
## Loading required package: parallel
## 
## Attaching package: 'BiocGenerics'
## 
## The following objects are masked from 'package:parallel':
## 
##     clusterApply, clusterApplyLB, clusterCall, clusterEvalQ,
##     clusterExport, clusterMap, parApply, parCapply, parLapply,
##     parLapplyLB, parRapply, parSapply, parSapplyLB
## 
## The following object is masked from 'package:stats':
## 
##     xtabs
## 
## The following objects are masked from 'package:base':
## 
##     anyDuplicated, append, as.data.frame, as.vector, cbind,
##     colnames, do.call, duplicated, eval, evalq, Filter, Find, get,
##     intersect, is.unsorted, lapply, Map, mapply, match, mget,
##     order, paste, pmax, pmax.int, pmin, pmin.int, Position, rank,
##     rbind, Reduce, rep.int, rownames, sapply, setdiff, sort,
##     table, tapply, union, unique, unlist, unsplit
## 
## Loading required package: S4Vectors
## Loading required package: stats4
## Loading required package: IRanges
## Loading required package: GenomeInfoDb
## Loading required package: GenomicRanges
## Loading required package: AnnotationDbi
## Loading required package: Biobase
## Welcome to Bioconductor
## 
##     Vignettes contain introductory material; view with
##     'browseVignettes()'. To cite Bioconductor, see
##     'citation("Biobase")', and for packages 'citation("pkgname")'.
## 
## 
## Attaching package: 'AnnotationDbi'
## 
## The following object is masked from 'package:GenomeInfoDb':
## 
##     species
library('org.Hs.eg.db')
## Loading required package: DBI
library('polyester')
library('Biostrings')
## Loading required package: XVector
library('devtools')

## Fold changes
foldChange <- list(high = 2, low = 1/2)

## Find transcripts
txdb <- keepSeqlevels(TxDb.Hsapiens.UCSC.hg19.knownGene, 'chr22')
tx <- transcriptsBy(txdb, 'gene')
txs <- sapply(tx, length)

## Fasta file to use
fasta_file <- system.file('extdata', 'chr22.fa', package='polyester')
fasta <- readDNAStringSet(fasta_file)

## Find 24 genes with only 1 transcript, 36 with 2
n1 <- 24
n2 <- 36
chosen <- data.frame(tx_idx = c(which(txs == 1)[1:n1], rep(which(txs == 2), each = 2)), tx_n = rep(c(1, 2), c(n1, 109 * 2)), tx_i = c(rep(1, n1), rep(1:2, 109)))

## Add gene id, refseq info
chosen$gene_id <- names(tx)[chosen$tx_idx]
chosen$ucsckg_id <- mapply(function(i, tx_idx) {
    tx[[tx_idx]]$tx_name[i]
}, chosen$tx_i, chosen$tx_idx)

ref <- select(org.Hs.eg.db, keys = chosen$gene_id, columns = 'REFSEQ', keytype = 'ENTREZID')
chosen$fasta_i <- mapply(function(i, gene_id) {
    reftx <- ref$REFSEQ[ref$ENTREZID == gene_id]
    if(all(sapply(reftx, is.na))) return(NA)
    fasta_i <- unlist(sapply(reftx, grep, x = names(fasta)))
    fasta_i[i]
}, chosen$tx_i, chosen$gene_id)

## Drop down to 30 genes with 2 txs where annotation map worked
toKeep <- tapply(chosen$fasta_i[chosen$tx_n == 2], chosen$gene_id[chosen$tx_n == 2], function(x) { 
    !any(sapply(x, is.na))
})

## Final selection
chosen <- subset(chosen, gene_id %in% c(chosen$gene_id[1:n1], names(which(toKeep)[1:36])))

writeXStringSet(fasta[chosen$fasta_i], 'chr22_chosen.fa')


#txinfo <- select(txdb, keys = tx[[6]]$tx_name, columns = columns(txdb), keytype = 'TXNAME')
#txinfo$EXONEND - txinfo$EXONSTART + 1

## Select txs to be DE
chosen$DE <- FALSE
de <- sample(1:n1, n1/2)
chosen$DE[de] <- TRUE
x <- sample(1:n2, n2 * 2 / 3) ## DE with only 1 tx
y <- sample(x, n2 / 3) ## DE all txs in gene
de.all <- unlist(sapply((y - 1) * 2, function(x) {x + n1 + 1:2}, simplify = FALSE))
chosen$DE[ de.all ] <- TRUE
z <- x[!x %in% y] ## DE only 1 tx in gene
de.one <- sapply((z - 1) * 2, function(x) {x + sample(n1 + 1:2, 1)} )
chosen$DE[ de.one ] <- TRUE

## DE type
chosen$group1 <- 'normal'
chosen$group1[ de[1:4] ] <- rep(c('low', 'high'), 2)
chosen$group1[ de.all[1:8] ] <- rep(rep(c('low', 'high'), each = 2), 2)
chosen$group1[ de.one[1:4] ] <- rep(c('low', 'high'), 2)

chosen$group2 <- 'normal'
chosen$group2[ de[5:8] ] <- rep(c('low', 'high'), 2)
chosen$group2[ de.all[9:16] ] <- rep(rep(c('low', 'high'), each = 2), 2)
chosen$group2[ de.one[5:8] ] <- rep(c('low', 'high'), 2)

chosen$group3 <- 'normal'
chosen$group3[ de[9:12] ] <- rep(c('low', 'high'), 2)
chosen$group3[ de.all[17:24] ] <- rep(rep(c('low', 'high'), each = 2), 2)
chosen$group3[ de.one[9:12] ] <- rep(c('low', 'high'), 2)


## Determine reads per transcript (40x coverage, 100bp reads)
chosen$width <- width(fasta[chosen$fasta_i])
chosen$readspertx <- round(chosen$width / 100 * 40)

## Base means
chosen$mean1 <- round(chosen$readspertx * ifelse(chosen$group1 == 'normal', 1, ifelse(chosen$group1 == 'high', foldChange$high, foldChange$low)))
chosen$mean2 <- round(chosen$readspertx * ifelse(chosen$group2 == 'normal', 1, ifelse(chosen$group2 == 'high', foldChange$high, foldChange$low)))
chosen$mean3 <- round(chosen$readspertx * ifelse(chosen$group3 == 'normal', 1, ifelse(chosen$group3 == 'high', foldChange$high, foldChange$low)))

## Generate count matrix
readmat <- matrix(NA, nrow = nrow(chosen), ncol = 3 * 10)
for(i in seq_len(30)) {
    if (i <= 10) {
        means <- chosen$mean1
    } else if (i <= 20) {
        means <- chosen$mean2    
    } else {
        means <- chosen$mean3
    }
    readmat[, i] <- mapply(polyester:::NB, means, means / 3)
}

## Classify txs
chosen$case <- sapply(chosen$gene_id, function(gene) {
    s <- subset(chosen, gene_id == gene)
    if( nrow(s) == 2) {
        res <- ifelse(all(s$DE), 'bothDE', ifelse(any(s$DE), 'oneDE', 'noneDE'))
    } else {
        res<- ifelse(s$DE, 'singleDE', 'singleNotDE')    
    }
    return(res)
})


## Save parameters
save(chosen, readmat, foldChange, file = 'simulation_info.Rdata')

## Run simulation
outdir <- 'simulated_reads'
simulate_experiment_countmat(fasta = 'chr22_chosen.fa', readmat = readmat, outdir = outdir, fraglen = 250, fragsd = 25, readlen = 100, error_rate = 0.005, paired = TRUE, seed = '20141202')

## gzip fasta files
for(i in seq_len(30)) {
    for(j in 1:2) {
        system(paste('gzip', file.path(outdir, paste0("sample_", sprintf('%02d', i), "_", j, ".fasta"))))
    }
}

## Generated pairs info file for running Tophat
{
sink(file.path(outdir, "paired.txt"))
for(i in seq_len(30)) {
    cat(paste0("sample_", sprintf('%02d', i), "_1.fasta.gz\tsample_", sprintf('%02d', i), "_2.fasta.gz\tsample", i, "\n"))
}
sink()
}

## Reproducibility info
Sys.time()
## [1] "2014-12-04 13:45:28 EST"
proc.time()
##    user  system elapsed 
## 148.180  12.774 163.927
session_info()
## Session info--------------------------------------------------------------
##  setting  value                                             
##  version  R Under development (unstable) (2014-11-01 r66923)
##  system   x86_64, darwin10.8.0                              
##  ui       X11                                               
##  language (EN)                                              
##  collate  en_US.UTF-8                                       
##  tz       America/New_York
## Packages------------------------------------------------------------------
##  package                           * version  date       source        
##  AnnotationDbi                     * 1.29.10  2014-11-22 Bioconductor  
##  base64enc                           0.1.2    2014-06-26 CRAN (R 3.2.0)
##  BatchJobs                           1.4      2014-09-24 CRAN (R 3.2.0)
##  BBmisc                              1.7      2014-06-21 CRAN (R 3.2.0)
##  Biobase                           * 2.27.0   2014-10-14 Bioconductor  
##  BiocGenerics                      * 0.13.2   2014-11-18 Bioconductor  
##  BiocParallel                        1.1.9    2014-11-24 Bioconductor  
##  biomaRt                             2.23.5   2014-11-22 Bioconductor  
##  Biostrings                        * 2.35.5   2014-11-24 Bioconductor  
##  bitops                              1.0.6    2013-08-17 CRAN (R 3.2.0)
##  brew                                1.0.6    2011-04-13 CRAN (R 3.2.0)
##  checkmate                           1.5.0    2014-10-19 CRAN (R 3.2.0)
##  codetools                           0.2.9    2014-08-21 CRAN (R 3.2.0)
##  DBI                               * 0.3.1    2014-09-24 CRAN (R 3.2.0)
##  devtools                          * 1.6.1    2014-10-07 CRAN (R 3.2.0)
##  digest                              0.6.4    2013-12-03 CRAN (R 3.2.0)
##  evaluate                            0.5.5    2014-04-29 CRAN (R 3.2.0)
##  fail                                1.2      2013-09-19 CRAN (R 3.2.0)
##  foreach                             1.4.2    2014-04-11 CRAN (R 3.2.0)
##  formatR                             1.0      2014-08-25 CRAN (R 3.2.0)
##  GenomeInfoDb                      * 1.3.7    2014-11-15 Bioconductor  
##  GenomicAlignments                   1.3.12   2014-11-27 Bioconductor  
##  GenomicFeatures                   * 1.19.6   2014-11-04 Bioconductor  
##  GenomicRanges                     * 1.19.18  2014-12-01 Bioconductor  
##  htmltools                           0.2.6    2014-09-08 CRAN (R 3.2.0)
##  IRanges                           * 2.1.24   2014-12-01 Bioconductor  
##  iterators                           1.0.7    2014-04-11 CRAN (R 3.2.0)
##  knitr                               1.7      2014-10-13 CRAN (R 3.2.0)
##  org.Hs.eg.db                      * 3.0.0    2014-09-26 Bioconductor  
##  polyester                         * 1.1.0    2014-10-14 Bioconductor  
##  RCurl                               1.95.4.3 2014-07-29 CRAN (R 3.2.0)
##  rmarkdown                         * 0.3.3    2014-09-17 CRAN (R 3.2.0)
##  Rsamtools                           1.19.11  2014-11-26 Bioconductor  
##  RSQLite                           * 1.0.0    2014-10-25 CRAN (R 3.2.0)
##  rstudioapi                          0.1      2014-03-27 CRAN (R 3.2.0)
##  rtracklayer                         1.27.6   2014-11-26 Bioconductor  
##  S4Vectors                         * 0.5.11   2014-11-24 Bioconductor  
##  sendmailR                           1.2.1    2014-09-21 CRAN (R 3.2.0)
##  stringr                             0.6.2    2012-12-06 CRAN (R 3.2.0)
##  TxDb.Hsapiens.UCSC.hg19.knownGene * 3.0.0    2014-09-26 Bioconductor  
##  XML                                 3.98.1.1 2013-06-20 CRAN (R 3.2.0)
##  XVector                           * 0.7.3    2014-11-24 Bioconductor  
##  yaml                                2.1.13   2014-06-12 CRAN (R 3.2.0)
##  zlibbioc                            1.13.0   2014-10-14 Bioconductor