All exon counting methods

This document is based on counts.based.html and expands the ideas there to using 5 different exon sets. These are exons from:

  • the genomic state object produced by derfinder: genomicState
  • exons(): txdb
  • exonsBy(by = 'gene'): byGene
  • disjointExons(): disjoint
  • featureCounts() as included in the Rsubread package: featureCounts.

Not all exonic sets are disjoint and we wanted to know how they would perform using the simulated data.

library('edgeR')
## Loading required package: limma
library('DESeq2')
## Loading required package: S4Vectors
## Loading required package: stats4
## Loading required package: BiocGenerics
## Loading required package: parallel
## 
## Attaching package: 'BiocGenerics'
## 
## The following objects are masked from 'package:parallel':
## 
##     clusterApply, clusterApplyLB, clusterCall, clusterEvalQ,
##     clusterExport, clusterMap, parApply, parCapply, parLapply,
##     parLapplyLB, parRapply, parSapply, parSapplyLB
## 
## The following object is masked from 'package:limma':
## 
##     plotMA
## 
## The following object is masked from 'package:stats':
## 
##     xtabs
## 
## The following objects are masked from 'package:base':
## 
##     anyDuplicated, append, as.data.frame, as.vector, cbind,
##     colnames, do.call, duplicated, eval, evalq, Filter, Find, get,
##     intersect, is.unsorted, lapply, Map, mapply, match, mget,
##     order, paste, pmax, pmax.int, pmin, pmin.int, Position, rank,
##     rbind, Reduce, rep.int, rownames, sapply, setdiff, sort,
##     table, tapply, union, unique, unlist, unsplit
## 
## Loading required package: IRanges
## Loading required package: GenomicRanges
## Loading required package: GenomeInfoDb
## Loading required package: Rcpp
## Loading required package: RcppArmadillo
library('GenomicRanges')
library('Rsubread')
library('derfinder')
## Find out what's changed in derfinder with
## news(Version == "1.1.18", package = "derfinder")
## Load data
load("../coverageToExon/covToEx-ucsc.Rdata")
load("../CoverageInfo/fullCov.Rdata")
load("../derAnalysis/run2-v1.0.10/groupInfo.Rdata")
load('../simulation_info.Rdata')
if(file.exists("../derAnalysis/run2-v1.0.10/colsubset.Rdat")) {
    load("../derAnalysis/run2-v1.0.10/colsubset.Rdata")
} else {
    colsubset <- seq_len(length(groupInfo))
}

## GenomicState object
if(file.exists('/home/epi/ajaffe/Lieber/Projects/RNAseq/derannotator/rdas/GenomicState.Hsapiens.UCSC.hg19.knownGene.rda')) {
    load('/home/epi/ajaffe/Lieber/Projects/RNAseq/derannotator/rdas/GenomicState.Hsapiens.UCSC.hg19.knownGene.rda')
} else if(file.exists('../../GenomicState.Hsapiens.UCSC.hg19.knownGene.rda')) {
    load('../../GenomicState.Hsapiens.UCSC.hg19.knownGene.rda')
} else {
    stop('Missing UCSC hg19 genomic state object')
}

## TxDb setup
library('GenomeInfoDb')
library('TxDb.Hsapiens.UCSC.hg19.knownGene')
## Loading required package: GenomicFeatures
## Loading required package: AnnotationDbi
## Loading required package: Biobase
## Welcome to Bioconductor
## 
##     Vignettes contain introductory material; view with
##     'browseVignettes()'. To cite Bioconductor, see
##     'citation("Biobase")', and for packages 'citation("pkgname")'.
txdb <- keepSeqlevels(TxDb.Hsapiens.UCSC.hg19.knownGene, 'chr22')
txinfo <- select(txdb, keys = chosen$ucsckg_id, columns = columns(txdb), keytype = 'TXNAME')

## Buiild GRangesList with exons grouped by transcript
tx <- split(GRanges(seqnames = txinfo$EXONCHROM, IRanges(start = txinfo$EXONSTART, end = txinfo$EXONEND), strand = txinfo$EXONSTRAND), txinfo$TXNAME)
tx <- tx[match(chosen$ucsckg_id, names(tx))]

## Calculate coverage
calc_cov <- function(exons) {
    print(system.time(reg_cov <- getRegionCoverage(fullCov = fullCov, exons, verbose = FALSE)))
    cov <- t(sapply(reg_cov, function(x) { colSums(x) })) / 100
    round(cov, 0)
}

## DESeq2 analysis
run_deseq <- function(counts, exons, file, groupInfo) {
    nonzero <- sapply(rowSums(counts), function(x) {x > 0})
    
    ## Round matrix and specify design
    dse <- DESeqDataSetFromMatrix(counts[nonzero, ], data.frame(group = groupInfo), ~ group)

    ## Perform DE analysis
    system.time( dse <- DESeq(dse, test = 'LRT', reduced = ~ 1) )

    ## Extract results
    deseq <- exons[nonzero]
    mcols(deseq) <- cbind(mcols(deseq), results(dse))

    ## Which are significant?
    deseq$sig <- deseq$padj < 0.05
    deseq$sig[is.na(deseq$sig)] <- FALSE

    ## Save results
    save(deseq, file = paste0(file, '-DESeq2.Rdata'))
    
    ## End
    return(deseq)
}

## edgeR analysis
run_edger <- function(counts, exons, file, groupInfo) {
    nonzero <- sapply(rowSums(counts), function(x) {x > 0})
    
    ## Determine design matrix
    design <- model.matrix(~ groupInfo)

    ## Perform DE analysis
    d <- DGEList(counts = counts[nonzero, ], group = groupInfo)
    d <- calcNormFactors(d)
    system.time(dw <- estimateGLMRobustDisp(d, design = design, prior.df = 10, maxit = 6))
    fw <- glmFit(dw, design = design, coef = 2:3)
    lrw <- glmLRT(fw, coef = 2:3)

    ## Extract results
    edger <- exons[nonzero]
    mcols(edger) <- cbind(mcols(edger), DataFrame(lrw$table))
    edger$pvalue <-  lrw$table$PValue
    edger$padj <- p.adjust(lrw$table$PValue, 'BH')

    ## Which are significant?
    edger$sig <- edger$padj < 0.05
    edger$sig[is.na(edger$sig)] <- FALSE

    ## Save results
    save(edger, file = paste0(file, '-edgeR.Rdata'))
    
    ## End
    return(edger)
}

Exon analysis

Sets

The following code defines the five exonic sets. It then checks how many exons overlap other exons. If it's zero for all of them, then they are disjoint.

## Genomic state
exons <- GenomicState.Hsapiens.UCSC.hg19.knownGene$fullGenome
exons <- exons[seqnames(exons) == 'chr22']
exons <- list('genomicState' = exons[exons$theRegion == 'exon'])

## TxDb
exons <- c(exons, list(txdb = exons(txdb)))

## TxDb by gene
exons <- c(exons, list(byGene = unlist(exonsBy(txdb, 'gene'))))

## TxDb disjoint
exons <- c(exons, list(disjoint = disjointExons(txdb)))

## feature counts
hg19 <- read.table(system.file("annot", "hg19_RefSeq_exon.txt", package = "Rsubread"), header = TRUE)
hg19 <- subset(hg19, Chr == 'chr22')
exons <- c(exons, list(featureCounts = GRanges(hg19$Chr, IRanges(hg19$Start, hg19$End), strand = hg19$Strand, GeneID = hg19$GeneID)))
rm(hg19)

## Note that in some exon sets, some exons overlap
lapply(exons, function(x) { table(countOverlaps(x) - 1)})
## $genomicState
## 
##    0 
## 5676 
## 
## $txdb
## 
##    0    1    2    3    4    5    7 
## 5096 1105  219   69   21    5    1 
## 
## $byGene
## 
##    0    1    2    3    4    5    7 
## 4284 1130  241   84   14    9    1 
## 
## $disjoint
## 
##    0 
## 5659 
## 
## $featureCounts
## 
##    0    1    2    3 
## 4917  131    4    1

As expected, only the genomicState and disjoint sets are fully disjoint since they were designed to be that way.

Counting

The following code counts the number of reads per exon using tools from derfinder. Alternatively, other software could be used for counting.

## Count using derfinder
counts <- lapply(exons, calc_cov)
##    user  system elapsed 
##  29.183  11.595  40.844 
##    user  system elapsed 
##  25.777  17.335  43.172 
##    user  system elapsed 
##  22.375  14.922  37.548 
##    user  system elapsed 
##  20.869  13.253  34.149 
##    user  system elapsed 
##  19.292  10.671  29.985

DE

The following code performs the differential expression analysis using DESeq2 and edgeR.

## DESeq2
system.time( deseq <- mapply(run_deseq, counts, exons, names(exons), MoreArgs = list(groupInfo = groupInfo)) )
## converting counts to integer mode
## estimating size factors
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
## fitting model and testing
## converting counts to integer mode
## estimating size factors
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
## fitting model and testing
## converting counts to integer mode
## estimating size factors
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
## fitting model and testing
## converting counts to integer mode
## estimating size factors
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
## fitting model and testing
## converting counts to integer mode
## estimating size factors
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
## fitting model and testing
##    user  system elapsed 
##   7.260   0.044   7.351
## edgeR
system.time( edger <- mapply(run_edger, counts, exons, names(exons), MoreArgs = list(groupInfo = groupInfo)) )
##    user  system elapsed 
##  14.039   0.150  14.209

Agreement

The code below compares the results between DESeq2 and edgeR.

agree <- function(deseq, edger) {
    addmargins(table('Significant DE gene -- DESeq2' = mcols(deseq)$sig, 'Significant DE gene -- edgeR' = mcols(edger)$sig))
}
mapply(agree, deseq, edger, SIMPLIFY = FALSE)
## $genomicState
##                              Significant DE gene -- edgeR
## Significant DE gene -- DESeq2 FALSE TRUE Sum
##                         FALSE   109    9 118
##                         TRUE      0  152 152
##                         Sum     109  161 270
## 
## $txdb
##                              Significant DE gene -- edgeR
## Significant DE gene -- DESeq2 FALSE TRUE Sum
##                         FALSE   122   10 132
##                         TRUE      0  177 177
##                         Sum     122  187 309
## 
## $byGene
##                              Significant DE gene -- edgeR
## Significant DE gene -- DESeq2 FALSE TRUE Sum
##                         FALSE   125    8 133
##                         TRUE      3  180 183
##                         Sum     128  188 316
## 
## $disjoint
##                              Significant DE gene -- edgeR
## Significant DE gene -- DESeq2 FALSE TRUE Sum
##                         FALSE   124    5 129
##                         TRUE      0  154 154
##                         Sum     124  159 283
## 
## $featureCounts
##                              Significant DE gene -- edgeR
## Significant DE gene -- DESeq2 FALSE TRUE Sum
##                         FALSE   112    6 118
##                         TRUE      0  145 145
##                         Sum     112  151 263

Overall, DESeq2 and edgeR mostly agree regardless of the exon set. Their best agreement is in the disjoint set with only 5 disagreements (1.8 percent).

Compare

Exonic segments

## Find exons
txdb <- keepSeqlevels(TxDb.Hsapiens.UCSC.hg19.knownGene, 'chr22')
txinfo <- select(txdb, keys = chosen$ucsckg_id, columns = columns(txdb), keytype = 'TXNAME')

## Buiild GRangesList with exons grouped by transcript
tx <- split(GRanges(seqnames = txinfo$EXONCHROM, IRanges(start = txinfo$EXONSTART, end = txinfo$EXONEND), strand = txinfo$EXONSTRAND), txinfo$TXNAME)
tx <- tx[match(chosen$ucsckg_id, names(tx))]

## Gene level: DE if at least one transcript is DE
gene <- data.frame(gene_id = unique(chosen$gene_id))
gene$DE <- sapply(gene$gene_id, function(x) { any(chosen$DE[chosen$gene_id == x])  })
gene$case <- sapply(gene$gene_id, function(x) { unique(chosen$case[chosen$gene_id == x])  })

## Identify exonic segments
segments <- GRangesList(lapply(gene$gene_id, function(x) {
    i <- chosen$ucsckg_id[ chosen$gene_id == x]
    
    ## Find segments
    segs <- disjoin(unlist(tx[i]))
    ov <- findOverlaps(segs, tx[i])
    
    ## Find DE status per segment
    segs$DE <- as.vector(tapply(subjectHits(ov), queryHits(ov), function(y) {
        any(chosen$DE[ chosen$gene_id == x])
    }))
    
    ## Finish
    return(segs)
}))
names(segments) <- gene$gene_id
segs <- unlist(segments)

The following code compares the exonic segments (280 of them) against the resulting exon DE calls from DESeq2 and edgeR (in that order).

count_comp <- function(info, ptype = 'padj', cut = 0.05) {
    if(ptype == 'padj') {
        idx <- info$padj < cut
    } else if (ptype == 'pvalue') {
        idx <- info$pvalue < cut
    } else {
        p <- p.adjust(info$pvalue, ptype)
        idx <- p < cut
    }
    idx[is.na(idx)] <- FALSE

    ## Overlaps at least 1 DE exon
    addmargins(table('DE status' = segs$DE, 'Overlaps DE exon' = countOverlaps(segs, info[idx]) > 0))
}

lapply(deseq, count_comp)
## $genomicState
##          Overlaps DE exon
## DE status FALSE TRUE Sum
##     FALSE   104    7 111
##     TRUE      6  163 169
##     Sum     110  170 280
## 
## $txdb
##          Overlaps DE exon
## DE status FALSE TRUE Sum
##     FALSE   104    7 111
##     TRUE      6  163 169
##     Sum     110  170 280
## 
## $byGene
##          Overlaps DE exon
## DE status FALSE TRUE Sum
##     FALSE   102    9 111
##     TRUE      7  162 169
##     Sum     109  171 280
## 
## $disjoint
##          Overlaps DE exon
## DE status FALSE TRUE Sum
##     FALSE   108    3 111
##     TRUE     22  147 169
##     Sum     130  150 280
## 
## $featureCounts
##          Overlaps DE exon
## DE status FALSE TRUE Sum
##     FALSE   105    6 111
##     TRUE     15  154 169
##     Sum     120  160 280
lapply(edger, count_comp)
## $genomicState
##          Overlaps DE exon
## DE status FALSE TRUE Sum
##     FALSE    94   17 111
##     TRUE      6  163 169
##     Sum     100  180 280
## 
## $txdb
##          Overlaps DE exon
## DE status FALSE TRUE Sum
##     FALSE    94   17 111
##     TRUE      6  163 169
##     Sum     100  180 280
## 
## $byGene
##          Overlaps DE exon
## DE status FALSE TRUE Sum
##     FALSE    97   14 111
##     TRUE      7  162 169
##     Sum     104  176 280
## 
## $disjoint
##          Overlaps DE exon
## DE status FALSE TRUE Sum
##     FALSE   104    7 111
##     TRUE     21  148 169
##     Sum     125  155 280
## 
## $featureCounts
##          Overlaps DE exon
## DE status FALSE TRUE Sum
##     FALSE    99   12 111
##     TRUE     14  155 169
##     Sum     113  167 280

Empirical Power

The empirical power is shown below for DESeq2 and edgeR.

emp_power <- function(info, ptype = 'padj') {
    m <- count_comp(info, ptype)
    round(m[2, 2] / m[2, 3] * 100, 2)
}


sapply(deseq, emp_power)
##  genomicState          txdb        byGene      disjoint featureCounts 
##         96.45         96.45         95.86         86.98         91.12
sapply(edger, emp_power)
##  genomicState          txdb        byGene      disjoint featureCounts 
##         96.45         96.45         95.86         87.57         91.72

Empirical FPR

The empirical False Positive Rate (FPR) is shown below for DESeq2 and edgeR.

emp_fpr <- function(info, ptype = 'padj') {
    m <- count_comp(info, ptype)
    round(m[1, 2] / m[1, 3] * 100, 2)
}

sapply(deseq, emp_fpr)
##  genomicState          txdb        byGene      disjoint featureCounts 
##          6.31          6.31          8.11          2.70          5.41
sapply(edger, emp_fpr)
##  genomicState          txdb        byGene      disjoint featureCounts 
##         15.32         15.32         12.61          6.31         10.81

Empirical FDR

The empirical False Discovery Rate (FDR) is shown below for DESeq2 and edgeR.

emp_fdr <- function(info, ptype = 'padj') {
    m <- count_comp(info, ptype)
    round(m[1, 2] / m[3, 2] * 100, 2)
}

sapply(deseq, emp_fdr)
##  genomicState          txdb        byGene      disjoint featureCounts 
##          4.12          4.12          5.26          2.00          3.75
sapply(edger, emp_fdr)
##  genomicState          txdb        byGene      disjoint featureCounts 
##          9.44          9.44          7.95          4.52          7.19

Overlap

As in counts.based.html we can compare the DERs and exons directly.

load('../derAnalysis/run2-v1.0.10/fullRegions.Rdata')

## Some formatting and subsets
fullRegions$significantFDR <- factor(fullRegions$qvalues < 0.05, levels = c('TRUE', 'FALSE'))
fullRegions$sigFDR <- as.logical(fullRegions$significantFDR)
fullRegs20 <- fullRegions[width(fullRegions) >= 20]

## Overlap table for all 4 cases
ov_table <- function(ders, counts, query = 'der', minov = 0) {
    if(query == 'der') {
        if(minov == 0) {
            res <- addmargins(table('Significant DER (FDR)' = ders$sigFDR, 'Overlaps sig DE exon' = countOverlaps(ders, counts[counts$sig]) > 0))
        } else {
            res <- addmargins(table(ders$sigFDR, countOverlaps(ders, counts[counts$sig], minoverlap = minov) > 0, dnn = c('Significant DER (FDR)', paste0('Overlaps sig DE exon (min ', minov, 'bp)'))))
        }
    } else if (query == 'counts') {
        if(minov == 0) {
            res <- addmargins(table('Significant DE exon' = counts$sig, 'Overlaps sig DER (FWER)' = countOverlaps(counts, ders[ders$sigFDR]) > 0))
        } else {
            res <- addmargins(table(counts$sig[width(counts) >= minov], countOverlaps(counts[width(counts) >= minov], ders[ders$sigFDR], minoverlap = minov) > 0, dnn = c('Significant DE exon', paste0('Overlaps sig DER (FWER, min ', minov, 'bp)'))))
        }
    }
    return(res)
}

## Explore mistmatched cases for DERs vs Exons direction
explore_ov <- function(ders, counts, case = "FALSE-TRUE", minov = 0L) {
    if(case == 'FALSE-TRUE') {
        i <- which(countOverlaps(ders, counts[counts$sig], minoverlap = minov) > 0 & !ders$sigFDR)
    } else if (case == 'TRUE-FALSE') {
        i <- which(!countOverlaps(ders, counts[counts$sig], minoverlap = minov) > 0 & ders$sigFDR)
    } else{
        stop('invalid case')
    }
    if(length(i) == 0) return("No such cases")
    
    if(case == 'FALSE-TRUE') {
        res <- list(
            n_overlaps = table(countOverlaps(ders[i], counts[counts$sig], minoverlap = minov)),
            width_der = summary(width(ders[i])),
            ders_per_exon_table = table(table(subjectHits(findOverlaps(ders[i], counts[counts$sig], minoverlap = minov)))),
            ders_per_exon = sort(table(subjectHits(findOverlaps(ders[i], counts[counts$sig], minoverlap = minov)))),
            i = i
        )
    } else {
        res <- list(
            width_der = summary(width(ders[i])),
            distance_nearest_sum = summary(mcols(distanceToNearest(ders[i], counts, ignore.strand = TRUE))$distance),
            distance_nearest_sig_sum = summary(mcols(distanceToNearest(ders[i], counts[counts$sig], ignore.strand = TRUE))$distance),
            distance_nearest = distanceToNearest(ders[i], counts, ignore.strand = TRUE),
            distance_nearest_sig = distanceToNearest(ders[i], counts[counts$sig], ignore.strand = TRUE),
            i = i
        )
    }
    
    return(res)
}

## Explore mistmatched cases for Exons vs DERs direction
explore_ov_counts <- function(ders, counts, case = "FALSE-TRUE", minov = 0L) {
    counts <- counts[width(counts) >= minov]
    if(case == 'FALSE-TRUE') {
        i <- which(countOverlaps(counts, ders[ders$sigFDR], minoverlap = minov) > 0 & !counts$sig)
    } else if (case == 'TRUE-FALSE') {
        i <- which(!countOverlaps(counts, ders[ders$sigFDR], minoverlap = minov) > 0 & counts$sig)
    } else{
        stop('invalid case')
    }
    if(length(i) == 0) return("No such cases")
    
    if(case == 'FALSE-TRUE') {
        res <- list(
            n_overlaps = table(countOverlaps(counts[i], ders[ders$sigFDR], minoverlap = minov)),
            width_exon = summary(width(counts[i])),
            exons_per_der_table = table(table(subjectHits(findOverlaps(counts[i], ders[ders$sigFDR], minoverlap = minov)))),
            exons_per_der = sort(table(subjectHits(findOverlaps(counts[i], ders[ders$sigFDR], minoverlap = minov)))),
            i = i
        )
    } else {
        res <- list(
            width_exon = summary(width(counts[i])),
            distance_nearest_sum = summary(mcols(distanceToNearest(counts[i], ders, ignore.strand = TRUE))$distance),
             distance_nearest_sig_sum = summary(mcols(distanceToNearest(counts[i], ders[ders$sigFDR], ignore.strand = TRUE))$distance),
            distance_nearest = distanceToNearest(counts[i], ders, ignore.strand = TRUE),
            distance_nearest_sig = distanceToNearest(counts[i], ders[ders$sigFDR], ignore.strand = TRUE),
            i = i
        )
    }
    
    return(res)
}

noNA <- function(x) {
    x[!is.na(x)]
}

Query: DERs

First we use the DERs as the query and check if they overlap a significantly differentially expressed exon. The results are shown for DESeq2 and edgeR using all DERs and then requiring a minimum overlap of 20 bp.

## DESeq2
lapply(deseq, function(x) {
    ov_table(fullRegions, x)
})
## $genomicState
##                      Overlaps sig DE exon
## Significant DER (FDR) FALSE TRUE Sum
##                 FALSE    15  301 316
##                 TRUE      0  153 153
##                 Sum      15  454 469
## 
## $txdb
##                      Overlaps sig DE exon
## Significant DER (FDR) FALSE TRUE Sum
##                 FALSE    15  301 316
##                 TRUE      0  153 153
##                 Sum      15  454 469
## 
## $byGene
##                      Overlaps sig DE exon
## Significant DER (FDR) FALSE TRUE Sum
##                 FALSE    15  301 316
##                 TRUE      0  153 153
##                 Sum      15  454 469
## 
## $disjoint
##                      Overlaps sig DE exon
## Significant DER (FDR) FALSE TRUE Sum
##                 FALSE    38  278 316
##                 TRUE     11  142 153
##                 Sum      49  420 469
## 
## $featureCounts
##                      Overlaps sig DE exon
## Significant DER (FDR) FALSE TRUE Sum
##                 FALSE    21  295 316
##                 TRUE      6  147 153
##                 Sum      27  442 469
lapply(deseq, function(x) {
    ov_table(fullRegs20, x, minov = 20L)
})
## $genomicState
##                      Overlaps sig DE exon (min 20bp)
## Significant DER (FDR) FALSE TRUE Sum
##                 FALSE     1   15  16
##                 TRUE      0  153 153
##                 Sum       1  168 169
## 
## $txdb
##                      Overlaps sig DE exon (min 20bp)
## Significant DER (FDR) FALSE TRUE Sum
##                 FALSE     1   15  16
##                 TRUE      0  153 153
##                 Sum       1  168 169
## 
## $byGene
##                      Overlaps sig DE exon (min 20bp)
## Significant DER (FDR) FALSE TRUE Sum
##                 FALSE     1   15  16
##                 TRUE      0  153 153
##                 Sum       1  168 169
## 
## $disjoint
##                      Overlaps sig DE exon (min 20bp)
## Significant DER (FDR) FALSE TRUE Sum
##                 FALSE     3   13  16
##                 TRUE     11  142 153
##                 Sum      14  155 169
## 
## $featureCounts
##                      Overlaps sig DE exon (min 20bp)
## Significant DER (FDR) FALSE TRUE Sum
##                 FALSE     1   15  16
##                 TRUE      6  147 153
##                 Sum       7  162 169
## edgeR
lapply(edger, function(x) {
    ov_table(fullRegions, x)
})
## $genomicState
##                      Overlaps sig DE exon
## Significant DER (FDR) FALSE TRUE Sum
##                 FALSE    15  301 316
##                 TRUE      0  153 153
##                 Sum      15  454 469
## 
## $txdb
##                      Overlaps sig DE exon
## Significant DER (FDR) FALSE TRUE Sum
##                 FALSE    13  303 316
##                 TRUE      0  153 153
##                 Sum      13  456 469
## 
## $byGene
##                      Overlaps sig DE exon
## Significant DER (FDR) FALSE TRUE Sum
##                 FALSE    19  297 316
##                 TRUE      0  153 153
##                 Sum      19  450 469
## 
## $disjoint
##                      Overlaps sig DE exon
## Significant DER (FDR) FALSE TRUE Sum
##                 FALSE    34  282 316
##                 TRUE     11  142 153
##                 Sum      45  424 469
## 
## $featureCounts
##                      Overlaps sig DE exon
## Significant DER (FDR) FALSE TRUE Sum
##                 FALSE    18  298 316
##                 TRUE      6  147 153
##                 Sum      24  445 469
lapply(edger, function(x) {
    ov_table(fullRegs20, x, minov = 20L)
})
## $genomicState
##                      Overlaps sig DE exon (min 20bp)
## Significant DER (FDR) FALSE TRUE Sum
##                 FALSE     1   15  16
##                 TRUE      0  153 153
##                 Sum       1  168 169
## 
## $txdb
##                      Overlaps sig DE exon (min 20bp)
## Significant DER (FDR) FALSE TRUE Sum
##                 FALSE     1   15  16
##                 TRUE      0  153 153
##                 Sum       1  168 169
## 
## $byGene
##                      Overlaps sig DE exon (min 20bp)
## Significant DER (FDR) FALSE TRUE Sum
##                 FALSE     1   15  16
##                 TRUE      0  153 153
##                 Sum       1  168 169
## 
## $disjoint
##                      Overlaps sig DE exon (min 20bp)
## Significant DER (FDR) FALSE TRUE Sum
##                 FALSE     3   13  16
##                 TRUE     11  142 153
##                 Sum      14  155 169
## 
## $featureCounts
##                      Overlaps sig DE exon (min 20bp)
## Significant DER (FDR) FALSE TRUE Sum
##                 FALSE     1   15  16
##                 TRUE      6  147 153
##                 Sum       7  162 169

Query: exons

Next, we can use the exons as the query and check if they overlap a significant DER. The results are shown for DESeq2 and edgeR using all DERs and then restricting the overlap to minimum 20 bp.

## DESeq2
lapply(deseq, function(x) {
    ov_table(fullRegions, x, 'counts')
})
## $genomicState
##                    Overlaps sig DER (FWER)
## Significant DE exon FALSE TRUE Sum
##               FALSE   118    0 118
##               TRUE     23  129 152
##               Sum     141  129 270
## 
## $txdb
##                    Overlaps sig DER (FWER)
## Significant DE exon FALSE TRUE Sum
##               FALSE   132    0 132
##               TRUE     26  151 177
##               Sum     158  151 309
## 
## $byGene
##                    Overlaps sig DER (FWER)
## Significant DE exon FALSE TRUE Sum
##               FALSE   133    0 133
##               TRUE     27  156 183
##               Sum     160  156 316
## 
## $disjoint
##                    Overlaps sig DER (FWER)
## Significant DE exon FALSE TRUE Sum
##               FALSE   128    1 129
##               TRUE     22  132 154
##               Sum     150  133 283
## 
## $featureCounts
##                    Overlaps sig DER (FWER)
## Significant DE exon FALSE TRUE Sum
##               FALSE   118    0 118
##               TRUE     21  124 145
##               Sum     139  124 263
lapply(deseq, function(x) {
    ov_table(fullRegs20, x, 'counts', minov = 20L)
})
## $genomicState
##                    Overlaps sig DER (FWER, min 20bp)
## Significant DE exon FALSE TRUE Sum
##               FALSE   118    0 118
##               TRUE     22  129 151
##               Sum     140  129 269
## 
## $txdb
##                    Overlaps sig DER (FWER, min 20bp)
## Significant DE exon FALSE TRUE Sum
##               FALSE   132    0 132
##               TRUE     25  151 176
##               Sum     157  151 308
## 
## $byGene
##                    Overlaps sig DER (FWER, min 20bp)
## Significant DE exon FALSE TRUE Sum
##               FALSE   133    0 133
##               TRUE     26  156 182
##               Sum     159  156 315
## 
## $disjoint
##                    Overlaps sig DER (FWER, min 20bp)
## Significant DE exon FALSE TRUE Sum
##               FALSE   127    0 127
##               TRUE     23  129 152
##               Sum     150  129 279
## 
## $featureCounts
##                    Overlaps sig DER (FWER, min 20bp)
## Significant DE exon FALSE TRUE Sum
##               FALSE   117    0 117
##               TRUE     21  124 145
##               Sum     138  124 262
## edgeR
lapply(edger, function(x) {
    ov_table(fullRegions, x, 'counts')
})
## $genomicState
##                    Overlaps sig DER (FWER)
## Significant DE exon FALSE TRUE Sum
##               FALSE   109    0 109
##               TRUE     32  129 161
##               Sum     141  129 270
## 
## $txdb
##                    Overlaps sig DER (FWER)
## Significant DE exon FALSE TRUE Sum
##               FALSE   122    0 122
##               TRUE     36  151 187
##               Sum     158  151 309
## 
## $byGene
##                    Overlaps sig DER (FWER)
## Significant DE exon FALSE TRUE Sum
##               FALSE   128    0 128
##               TRUE     32  156 188
##               Sum     160  156 316
## 
## $disjoint
##                    Overlaps sig DER (FWER)
## Significant DE exon FALSE TRUE Sum
##               FALSE   124    0 124
##               TRUE     26  133 159
##               Sum     150  133 283
## 
## $featureCounts
##                    Overlaps sig DER (FWER)
## Significant DE exon FALSE TRUE Sum
##               FALSE   112    0 112
##               TRUE     27  124 151
##               Sum     139  124 263
lapply(edger, function(x) {
    ov_table(fullRegs20, x, 'counts', minov = 20L)
})
## $genomicState
##                    Overlaps sig DER (FWER, min 20bp)
## Significant DE exon FALSE TRUE Sum
##               FALSE   109    0 109
##               TRUE     31  129 160
##               Sum     140  129 269
## 
## $txdb
##                    Overlaps sig DER (FWER, min 20bp)
## Significant DE exon FALSE TRUE Sum
##               FALSE   122    0 122
##               TRUE     35  151 186
##               Sum     157  151 308
## 
## $byGene
##                    Overlaps sig DER (FWER, min 20bp)
## Significant DE exon FALSE TRUE Sum
##               FALSE   128    0 128
##               TRUE     31  156 187
##               Sum     159  156 315
## 
## $disjoint
##                    Overlaps sig DER (FWER, min 20bp)
## Significant DE exon FALSE TRUE Sum
##               FALSE   122    0 122
##               TRUE     28  129 157
##               Sum     150  129 279
## 
## $featureCounts
##                    Overlaps sig DER (FWER, min 20bp)
## Significant DE exon FALSE TRUE Sum
##               FALSE   112    0 112
##               TRUE     26  124 150
##               Sum     138  124 262

Conclusions

Using DESeq2 with the genomicState or txdb exonic sets resulted in the highest power and reasonable FPR. While the disjoint set analyzed by DESeq2 resulted in the best FPR it had the lowest empirical power.

When using the DERs as the query, the disjoint set has the lowest disagreement for both DESeq2 and edgeR. However, the genomicState, txdb, and byGene sets are the only ones that do not have cases of a significant DER overlapping a non significant DE exon.

Regardless of the exon set, when using the exons as query, there are no cases of a non significantly DE exon overlap a significant DER when requiring a 20 bp overlap.

Reproducibility

## Reproducibility info
Sys.time()
## [1] "2015-04-06 22:32:59 EDT"
proc.time()
##    user  system elapsed 
## 198.395  69.557 274.286
options(width = 120)
devtools::session_info()
## Session info-----------------------------------------------------------------------------------------------------------
##  setting  value                                             
##  version  R Under development (unstable) (2014-11-01 r66923)
##  system   x86_64, darwin10.8.0                              
##  ui       X11                                               
##  language (EN)                                              
##  collate  en_US.UTF-8                                       
##  tz       America/New_York
## Packages---------------------------------------------------------------------------------------------------------------
##  package                           * version     date       source                                   
##  acepack                             1.3.3.3     2013-05-03 CRAN (R 3.2.0)                           
##  annotate                            1.45.4      2015-03-21 Bioconductor                             
##  AnnotationDbi                     * 1.29.21     2015-04-03 Bioconductor                             
##  Biobase                           * 2.27.3      2015-03-27 Bioconductor                             
##  BiocGenerics                      * 0.13.11     2015-04-03 Bioconductor                             
##  BiocParallel                        1.1.21      2015-03-24 Bioconductor                             
##  biomaRt                             2.23.5      2014-11-22 Bioconductor                             
##  Biostrings                          2.35.12     2015-03-26 Bioconductor                             
##  bitops                              1.0.6       2013-08-17 CRAN (R 3.2.0)                           
##  bumphunter                          1.7.6       2015-03-13 Github (lcolladotor/bumphunter@37d10e7)  
##  cluster                             2.0.1       2015-01-31 CRAN (R 3.2.0)                           
##  codetools                           0.2.11      2015-03-10 CRAN (R 3.2.0)                           
##  colorout                          * 1.0.2       2014-11-03 local                                    
##  colorspace                          1.2.6       2015-03-11 CRAN (R 3.2.0)                           
##  DBI                                 0.3.1       2014-09-24 CRAN (R 3.2.0)                           
##  derfinder                         * 1.1.18      2015-04-01 Bioconductor                             
##  derfinderHelper                     1.1.6       2015-03-15 Bioconductor                             
##  DESeq2                            * 1.7.50      2015-04-04 Bioconductor                             
##  devtools                            1.6.1       2014-10-07 CRAN (R 3.2.0)                           
##  digest                              0.6.8       2014-12-31 CRAN (R 3.2.0)                           
##  doRNG                               1.6         2014-03-07 CRAN (R 3.2.0)                           
##  edgeR                             * 3.9.15      2015-04-03 Bioconductor                             
##  evaluate                            0.5.5       2014-04-29 CRAN (R 3.2.0)                           
##  foreach                             1.4.2       2014-04-11 CRAN (R 3.2.0)                           
##  foreign                             0.8.63      2015-02-20 CRAN (R 3.2.0)                           
##  formatR                             1.0         2014-08-25 CRAN (R 3.2.0)                           
##  Formula                             1.2.0       2015-01-20 CRAN (R 3.2.0)                           
##  futile.logger                       1.4         2015-03-21 CRAN (R 3.2.0)                           
##  futile.options                      1.0.0       2010-04-06 CRAN (R 3.2.0)                           
##  genefilter                          1.49.2      2014-10-21 Bioconductor                             
##  geneplotter                         1.45.0      2014-10-14 Bioconductor                             
##  GenomeInfoDb                      * 1.3.16      2015-03-27 Bioconductor                             
##  GenomicAlignments                   1.3.33      2015-04-06 Bioconductor                             
##  GenomicFeatures                   * 1.19.36     2015-03-30 Bioconductor                             
##  GenomicFiles                        1.3.15      2015-04-01 Bioconductor                             
##  GenomicRanges                     * 1.19.52     2015-04-04 Bioconductor                             
##  ggplot2                             1.0.0       2014-05-21 CRAN (R 3.2.0)                           
##  gtable                              0.1.2       2012-12-05 CRAN (R 3.2.0)                           
##  Hmisc                               3.14.5      2014-09-12 CRAN (R 3.2.0)                           
##  htmltools                           0.2.6       2014-09-08 CRAN (R 3.2.0)                           
##  IRanges                           * 2.1.43      2015-03-07 Bioconductor                             
##  iterators                           1.0.7       2014-04-11 CRAN (R 3.2.0)                           
##  knitr                               1.7         2014-10-13 CRAN (R 3.2.0)                           
##  knitrBootstrap                      1.0.0       2014-11-03 Github (jimhester/knitrBootstrap@76c41f0)
##  lambda.r                            1.1.7       2015-03-20 CRAN (R 3.2.0)                           
##  lattice                             0.20.31     2015-03-30 CRAN (R 3.2.0)                           
##  latticeExtra                        0.6.26      2013-08-15 CRAN (R 3.2.0)                           
##  limma                             * 3.23.12     2015-04-05 Bioconductor                             
##  locfit                              1.5.9.1     2013-04-20 CRAN (R 3.2.0)                           
##  markdown                            0.7.4       2014-08-24 CRAN (R 3.2.0)                           
##  MASS                                7.3.40      2015-03-21 CRAN (R 3.2.0)                           
##  Matrix                              1.2.0       2015-04-04 CRAN (R 3.2.0)                           
##  matrixStats                         0.14.0      2015-02-14 CRAN (R 3.2.0)                           
##  munsell                             0.4.2       2013-07-11 CRAN (R 3.2.0)                           
##  nnet                                7.3.9       2015-02-11 CRAN (R 3.2.0)                           
##  pkgmaker                            0.22        2014-05-14 CRAN (R 3.2.0)                           
##  plyr                                1.8.1       2014-02-26 CRAN (R 3.2.0)                           
##  proto                               0.3.10      2012-12-22 CRAN (R 3.2.0)                           
##  qvalue                              1.99.1      2015-04-04 Bioconductor                             
##  RColorBrewer                        1.1.2       2014-12-07 CRAN (R 3.2.0)                           
##  Rcpp                              * 0.11.5      2015-03-06 CRAN (R 3.2.0)                           
##  RcppArmadillo                     * 0.4.650.1.1 2015-02-26 CRAN (R 3.2.0)                           
##  RCurl                               1.95.4.5    2014-12-28 CRAN (R 3.2.0)                           
##  registry                            0.2         2012-01-24 CRAN (R 3.2.0)                           
##  reshape2                            1.4.1       2014-12-06 CRAN (R 3.2.0)                           
##  rmarkdown                           0.3.3       2014-09-17 CRAN (R 3.2.0)                           
##  rngtools                            1.2.4       2014-03-06 CRAN (R 3.2.0)                           
##  rpart                               4.1.9       2015-02-24 CRAN (R 3.2.0)                           
##  Rsamtools                           1.19.49     2015-03-27 Bioconductor                             
##  RSQLite                             1.0.0       2014-10-25 CRAN (R 3.2.0)                           
##  rstudioapi                          0.2         2014-12-31 CRAN (R 3.2.0)                           
##  Rsubread                          * 1.17.2      2015-03-26 Bioconductor                             
##  rtracklayer                         1.27.11     2015-04-01 Bioconductor                             
##  S4Vectors                         * 0.5.22      2015-03-06 Bioconductor                             
##  scales                              0.2.4       2014-04-22 CRAN (R 3.2.0)                           
##  stringr                             0.6.2       2012-12-06 CRAN (R 3.2.0)                           
##  survival                            2.38.1      2015-02-24 CRAN (R 3.2.0)                           
##  TxDb.Hsapiens.UCSC.hg19.knownGene * 3.0.0       2014-09-26 Bioconductor                             
##  XML                                 3.98.1.1    2013-06-20 CRAN (R 3.2.0)                           
##  xtable                              1.7.4       2014-09-12 CRAN (R 3.2.0)                           
##  XVector                           * 0.7.4       2015-02-08 Bioconductor                             
##  yaml                                2.1.13      2014-06-12 CRAN (R 3.2.0)                           
##  zlibbioc                            1.13.3      2015-03-23 Bioconductor