Gene-level evaluation

This document is based on all-exons.html. It uses the gene sets defined by counts-gene.html and runs DESeq2 and edgeR analyses. The results are compared against the exonic segments first. Then, overlap comparisons between the genes and the DERs are performed as in all-exons.html.

library('edgeR')
## Loading required package: limma
library('DESeq2')
## Loading required package: S4Vectors
## Loading required package: stats4
## Loading required package: BiocGenerics
## Loading required package: parallel
## 
## Attaching package: 'BiocGenerics'
## 
## The following objects are masked from 'package:parallel':
## 
##     clusterApply, clusterApplyLB, clusterCall, clusterEvalQ,
##     clusterExport, clusterMap, parApply, parCapply, parLapply,
##     parLapplyLB, parRapply, parSapply, parSapplyLB
## 
## The following object is masked from 'package:limma':
## 
##     plotMA
## 
## The following object is masked from 'package:stats':
## 
##     xtabs
## 
## The following objects are masked from 'package:base':
## 
##     anyDuplicated, append, as.data.frame, as.vector, cbind,
##     colnames, do.call, duplicated, eval, evalq, Filter, Find, get,
##     intersect, is.unsorted, lapply, Map, mapply, match, mget,
##     order, paste, pmax, pmax.int, pmin, pmin.int, Position, rank,
##     rbind, Reduce, rep.int, rownames, sapply, setdiff, sort,
##     table, tapply, union, unique, unlist, unsplit
## 
## Loading required package: IRanges
## Loading required package: GenomicRanges
## Loading required package: GenomeInfoDb
## Loading required package: Rcpp
## Loading required package: RcppArmadillo
library('GenomicRanges')
library('GenomeInfoDb')
library('TxDb.Hsapiens.UCSC.hg19.knownGene')
## Loading required package: GenomicFeatures
## Loading required package: AnnotationDbi
## Loading required package: Biobase
## Welcome to Bioconductor
## 
##     Vignettes contain introductory material; view with
##     'browseVignettes()'. To cite Bioconductor, see
##     'citation("Biobase")', and for packages 'citation("pkgname")'.
## Load data
load("../derAnalysis/run2-v1.0.10/groupInfo.Rdata")
load('../simulation_info.Rdata')
load('../counts-gene/summOv_comp.Rdata')
load('../counts-gene/summOv_inc.Rdata')
load('../counts-gene/summOv_rand.Rdata')
load('../counts-gene/gene_sets.Rdata')
if(file.exists("../derAnalysis/run2-v1.0.10/colsubset.Rdat")) {
    load("../derAnalysis/run2-v1.0.10/colsubset.Rdata")
} else {
    colsubset <- seq_len(length(groupInfo))
}

## DESeq2 analysis
run_deseq <- function(counts, genes, file, groupInfo) {
    nonzero <- sapply(rowSums(counts), function(x) {x > 0})
    
    ## Round matrix and specify design
    dse <- DESeqDataSetFromMatrix(counts[nonzero, ], data.frame(group = groupInfo), ~ group)

    ## Perform DE analysis
    system.time( dse <- DESeq(dse, test = 'LRT', reduced = ~ 1) )

    ## Extract results
    deseq <- genes[nonzero]
    mcols(deseq) <- cbind(mcols(deseq), results(dse))

    ## Which are significant?
    mcols(deseq)$sig <- mcols(deseq)$padj < 0.05
    mcols(deseq)$sig[is.na(mcols(deseq)$sig)] <- FALSE

    ## Save results
    save(deseq, file = paste0(file, '-DESeq2.Rdata'))
    
    ## End
    return(deseq)
}

## edgeR analysis
run_edger <- function(counts, genes, file, groupInfo) {
    nonzero <- sapply(rowSums(counts), function(x) {x > 0})
    
    ## Determine design matrix
    design <- model.matrix(~ groupInfo)

    ## Perform DE analysis
    d <- DGEList(counts = counts[nonzero, ], group = groupInfo)
    d <- calcNormFactors(d)
    system.time(dw <- estimateGLMRobustDisp(d, design = design, prior.df = 10, maxit = 6))
    fw <- glmFit(dw, design = design, coef = 2:3)
    lrw <- glmLRT(fw, coef = 2:3)

    ## Extract results
    edger <- genes[nonzero]
    mcols(edger) <- cbind(mcols(edger), DataFrame(lrw$table))
    mcols(edger)$pvalue <-  lrw$table$PValue
    mcols(edger)$padj <- p.adjust(lrw$table$PValue, 'BH')

    ## Which are significant?
    mcols(edger)$sig <- mcols(edger)$padj < 0.05
    mcols(edger)$sig[is.na(mcols(edger)$sig)] <- FALSE

    ## Save results
    save(edger, file = paste0(file, '-edgeR.Rdata'))
    
    ## End
    return(edger)
}

Gene analysis

Sets

Define the gene sets.

genes <- list('complete' = gene_comp, 'incomplete' = gene_inc,
    'incomplete_random' = gene_rand)

Counting

Load counting data produced by summarizeOverlaps(). Also checks that the names of the genes are in the correct order and identifies the genes that have 0 counts in all samples.

## Count from summarizeOverlaps()
counts <- list('complete' = summOv_comp, 'incomplete' = summOv_inc,
    'incomplete_random' = summOv_rand)
counts <- lapply(counts, function(x) { assay(x)[, paste0('sample', 1:30)]})

## Check names
identical(rownames(counts[[1]]), names(gene_comp))
## [1] TRUE
identical(rownames(counts[[2]]), names(gene_inc))
## [1] TRUE
## Which genes have 0 counts?
lapply(counts, function(x) { 
    rownames(x)[rowSums(x) == 0]
})
## $complete
## [1] "100422998" "100423034"
## 
## $incomplete
## [1] "100422998" "100423034"
## 
## $incomplete_random
## [1] "100422998" "100423034"
## How did those genes look like?
x <- gene_comp[rownames(counts[[1]])[rowSums(counts[[1]]) == 0]]
x
## GRangesList object of length 2:
## $100422998 
## GRanges object with 1 range and 2 metadata columns:
##       seqnames               ranges strand |   exon_id   exon_name
##                         |  
##   [1]    chr22 [28316514, 28316599]      + |    262063        
## 
## $100423034 
## GRanges object with 1 range and 2 metadata columns:
##       seqnames               ranges strand | exon_id exon_name
##   [1]    chr22 [28316513, 28316600]      - |  265043      
## 
## -------
## seqinfo: 1 sequence from hg19 genome
## Do they overlap each other when ignoring the strand?
strand(x) <- RleList('100422998' = Rle(factor("*", levels = c("+", "-", "*")), 1), '100423034' = Rle(factor("*", levels = c("+", "-", "*")), 1))

## They do
table(countOverlaps(x) - 1)
## 
## 1 
## 2

The two genes that have 0 counts are from different strands but overlap each other when we ignore the strand. Given the recommended settings for summarizeOverlaps() and the ambiguity introduced by non-strand specific RNA-seq data, we have to exclude these genes.

DE

The following code performs the gene-level differential expression analysis using DESeq2 and edgeR.

## DESeq2
system.time( deseq <- mapply(run_deseq, counts, genes, names(genes), MoreArgs = list(groupInfo = groupInfo)) )
## estimating size factors
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
## fitting model and testing
## estimating size factors
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
## fitting model and testing
## estimating size factors
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
## fitting model and testing
##    user  system elapsed 
##   2.948   0.035   3.027
## edgeR
system.time( edger <- mapply(run_edger, counts, genes, names(genes), MoreArgs = list(groupInfo = groupInfo)) )
##    user  system elapsed 
##   2.771   0.015   2.815

Agreement

The following code compares the DESeq2 and edgeR results.

agree <- function(deseq, edger) {
    addmargins(table('Significant DE gene -- DESeq2' = mcols(deseq)$sig, 'Significant DE gene -- edgeR' = mcols(edger)$sig))
}
mapply(agree, deseq, edger, SIMPLIFY = FALSE)
## $complete
##                              Significant DE gene -- edgeR
## Significant DE gene -- DESeq2 FALSE TRUE Sum
##                         FALSE    21    2  23
##                         TRUE      0   35  35
##                         Sum      21   37  58
## 
## $incomplete
##                              Significant DE gene -- edgeR
## Significant DE gene -- DESeq2 FALSE TRUE Sum
##                         FALSE    20    1  21
##                         TRUE      0   37  37
##                         Sum      20   38  58
## 
## $incomplete_random
##                              Significant DE gene -- edgeR
## Significant DE gene -- DESeq2 FALSE TRUE Sum
##                         FALSE    18    1  19
##                         TRUE      0   33  33
##                         Sum      18   34  52

There are only 2 and 1 disagreements between DESeq2 and edgeR for the complete and incomplete gene sets respectively.

Compare

Exonic segments

## Find exons
txdb <- keepSeqlevels(TxDb.Hsapiens.UCSC.hg19.knownGene, 'chr22')
txinfo <- select(txdb, keys = chosen$ucsckg_id, columns = columns(txdb), keytype = 'TXNAME')

## Build GRangesList with exons grouped by transcript
tx <- split(GRanges(seqnames = txinfo$EXONCHROM, IRanges(start = txinfo$EXONSTART, end = txinfo$EXONEND), strand = txinfo$EXONSTRAND), txinfo$TXNAME)
tx <- tx[match(chosen$ucsckg_id, names(tx))]

## Gene level: DE if at least one transcript is DE
gene <- data.frame(gene_id = unique(chosen$gene_id))
gene$DE <- sapply(gene$gene_id, function(x) { any(chosen$DE[chosen$gene_id == x])  })
gene$case <- sapply(gene$gene_id, function(x) { unique(chosen$case[chosen$gene_id == x])  })

## Identify exonic segments
segments <- GRangesList(lapply(gene$gene_id, function(x) {
    i <- chosen$ucsckg_id[ chosen$gene_id == x]
    
    ## Find segments
    segs <- disjoin(unlist(tx[i]))
    ov <- findOverlaps(segs, tx[i])
    
    ## Find DE status per segment
    segs$DE <- as.vector(tapply(subjectHits(ov), queryHits(ov), function(y) {
        any(chosen$DE[ chosen$gene_id == x])
    }))
    
    ## Finish
    return(segs)
}))
names(segments) <- gene$gene_id
segs <- unlist(segments)

The following code checks if an exonic segment overlaps a DE gene for DESeq2 and edgeR, first by controlling the FDR and then by controlling the FWER.

count_comp <- function(info, ptype = 'padj', cut = 0.05) {
    if(ptype == 'padj') {
        idx <- mcols(info)$padj < cut
    } else if (ptype == 'pvalue') {
        idx <- mcols(info)$pvalue < cut
    } else {
        p <- p.adjust(mcols(info)$pvalue, ptype)
        idx <- p < cut
    }
    idx[is.na(idx)] <- FALSE

    ## Overlaps at least 1 DE exon
    addmargins(table('DE status' = segs$DE, 'Overlaps DE exon' = countOverlaps(segs, info[idx]) > 0))
}

## Default: adjusting p-values by FDR
lapply(deseq, count_comp)
## $complete
##          Overlaps DE exon
## DE status FALSE TRUE Sum
##     FALSE    90   21 111
##     TRUE      5  164 169
##     Sum      95  185 280
## 
## $incomplete
##          Overlaps DE exon
## DE status FALSE TRUE Sum
##     FALSE    88   23 111
##     TRUE     24  145 169
##     Sum     112  168 280
## 
## $incomplete_random
##          Overlaps DE exon
## DE status FALSE TRUE Sum
##     FALSE    90   21 111
##     TRUE     17  152 169
##     Sum     107  173 280
lapply(edger, count_comp)
## $complete
##          Overlaps DE exon
## DE status FALSE TRUE Sum
##     FALSE    83   28 111
##     TRUE      5  164 169
##     Sum      88  192 280
## 
## $incomplete
##          Overlaps DE exon
## DE status FALSE TRUE Sum
##     FALSE    87   24 111
##     TRUE     24  145 169
##     Sum     111  169 280
## 
## $incomplete_random
##          Overlaps DE exon
## DE status FALSE TRUE Sum
##     FALSE    80   31 111
##     TRUE     17  152 169
##     Sum      97  183 280
## Adjusting p-values by Holm method
lapply(deseq, count_comp, ptype = 'holm')
## $complete
##          Overlaps DE exon
## DE status FALSE TRUE Sum
##     FALSE   111    0 111
##     TRUE      5  164 169
##     Sum     116  164 280
## 
## $incomplete
##          Overlaps DE exon
## DE status FALSE TRUE Sum
##     FALSE   111    0 111
##     TRUE     24  145 169
##     Sum     135  145 280
## 
## $incomplete_random
##          Overlaps DE exon
## DE status FALSE TRUE Sum
##     FALSE   111    0 111
##     TRUE     17  152 169
##     Sum     128  152 280
lapply(edger, count_comp, ptype = 'holm')
## $complete
##          Overlaps DE exon
## DE status FALSE TRUE Sum
##     FALSE   108    3 111
##     TRUE      5  164 169
##     Sum     113  167 280
## 
## $incomplete
##          Overlaps DE exon
## DE status FALSE TRUE Sum
##     FALSE   111    0 111
##     TRUE     24  145 169
##     Sum     135  145 280
## 
## $incomplete_random
##          Overlaps DE exon
## DE status FALSE TRUE Sum
##     FALSE   108    3 111
##     TRUE     17  152 169
##     Sum     125  155 280

Empirical Power

Empirical power for DESeq2 and edgeR. First for controlling the FDR and next for controlling the FWER.

emp_power <- function(info, ptype = 'padj') {
    m <- count_comp(info, ptype)
    round(m[2, 2] / m[2, 3] * 100, 2)
}


sapply(deseq, emp_power)
##          complete        incomplete incomplete_random 
##             97.04             85.80             89.94
sapply(edger, emp_power)
##          complete        incomplete incomplete_random 
##             97.04             85.80             89.94
## Adjusting p-values by Holm method
sapply(deseq, emp_power, ptype = 'holm')
##          complete        incomplete incomplete_random 
##             97.04             85.80             89.94
sapply(edger, emp_power, ptype = 'holm')
##          complete        incomplete incomplete_random 
##             97.04             85.80             89.94

Empirical FPR

Empirical false positive rate (FPR) for DESeq2 and edgeR. First for controlling the FDR and next for controlling the FWER.

emp_fpr <- function(info, ptype = 'padj') {
    m <- count_comp(info, ptype)
    round(m[1, 2] / m[1, 3] * 100, 2)
}

sapply(deseq, emp_fpr)
##          complete        incomplete incomplete_random 
##             18.92             20.72             18.92
sapply(edger, emp_fpr)
##          complete        incomplete incomplete_random 
##             25.23             21.62             27.93
## Adjusting p-values by Holm method
sapply(deseq, emp_fpr, ptype = 'holm')
##          complete        incomplete incomplete_random 
##                 0                 0                 0
sapply(edger, emp_fpr, ptype = 'holm')
##          complete        incomplete incomplete_random 
##               2.7               0.0               2.7

Empirical FDR

The empirical False Discovery Rate (FDR) is shown below for DESeq2 and edgeR. First for controlling the FDR and next for controlling the FWER.

emp_fdr <- function(info, ptype = 'padj') {
    m <- count_comp(info, ptype)
    round(m[1, 2] / m[3, 2] * 100, 2)
}

sapply(deseq, emp_fdr)
##          complete        incomplete incomplete_random 
##             11.35             13.69             12.14
sapply(edger, emp_fdr)
##          complete        incomplete incomplete_random 
##             14.58             14.20             16.94
## Adjusting p-values by Holm method
sapply(deseq, emp_fdr, ptype = 'holm')
##          complete        incomplete incomplete_random 
##                 0                 0                 0
sapply(edger, emp_fdr, ptype = 'holm')
##          complete        incomplete incomplete_random 
##              1.80              0.00              1.94

Overlap

As with previous reports (all-exons.html and counts.based.html) we can compare DERs versus the genes by overlapping them.

load('../derAnalysis/run2-v1.0.10/fullRegions.Rdata')

## Some formatting and subsets
fullRegions$significantFDR <- factor(fullRegions$qvalues < 0.05, levels = c('TRUE', 'FALSE'))
fullRegions$sigFDR <- as.logical(fullRegions$significantFDR)
fullRegs20 <- fullRegions[width(fullRegions) >= 20]

## Overlap table for all 4 cases
ov_table <- function(ders, counts, query = 'der', minov = 0) {
    if(query == 'der') {
        if(minov == 0) {
            res <- addmargins(table('Significant DER (FDR)' = ders$sigFDR, 'Overlaps sig DE gene' = countOverlaps(ders, counts[mcols(counts)$sig]) > 0))
        } else {
            res <- addmargins(table(ders$sigFDR, countOverlaps(ders, counts[mcols(counts)$sig], minoverlap = minov) > 0, dnn = c('Significant DER (FDR)', paste0('Overlaps sig DE gene (min ', minov, 'bp)'))))
        }
    } else if (query == 'counts') {
        if(minov == 0) {
            res <- addmargins(table('Significant DE gene' = mcols(counts)$sig, 'Overlaps sig DER (FWER)' = countOverlaps(counts, ders[ders$sigFDR]) > 0))
        } else {
            res <- addmargins(table(mcols(counts)$sig[sapply(width(counts), sum) >= minov], countOverlaps(counts[sapply(width(counts), sum) >= minov], ders[ders$sigFDR], minoverlap = minov) > 0, dnn = c('Significant DE gene', paste0('Overlaps sig DER (FWER, min ', minov, 'bp)'))))
        }
    }
    return(res)
}

## Explore mistmatched cases for DERs vs genes direction
explore_ov <- function(ders, counts, case = "FALSE-TRUE", minov = 0L) {
    if(case == 'FALSE-TRUE') {
        i <- which(countOverlaps(ders, counts[mcols(counts)$sig], minoverlap = minov) > 0 & !ders$sigFDR)
    } else if (case == 'TRUE-FALSE') {
        i <- which(!countOverlaps(ders, counts[mcols(counts)$sig], minoverlap = minov) > 0 & ders$sigFDR)
    } else{
        stop('invalid case')
    }
    if(length(i) == 0) return("No such cases")
    
    if(case == 'FALSE-TRUE') {
        res <- list(
            n_overlaps = table(countOverlaps(ders[i], counts[mcols(counts)$sig], minoverlap = minov)),
            width_der = summary(width(ders[i])),
            ders_per_gene_table = table(table(subjectHits(findOverlaps(ders[i], counts[mcols(counts)$sig], minoverlap = minov)))),
            ders_per_gene = sort(table(subjectHits(findOverlaps(ders[i], counts[mcols(counts)$sig], minoverlap = minov)))),
            i = i
        )
    } else {
        res <- list(
            width_der = summary(width(ders[i])),
            distance_nearest_sum = summary(mcols(distanceToNearest(ders[i], unlist(counts), ignore.strand = TRUE))$distance),
            distance_nearest_sig_sum = summary(mcols(distanceToNearest(ders[i], unlist(counts[mcols(counts)$sig]), ignore.strand = TRUE))$distance),
            distance_nearest = distanceToNearest(ders[i], unlist(counts), ignore.strand = TRUE),
            distance_nearest_sig = distanceToNearest(ders[i], unlist(counts[mcols(counts)$sig]), ignore.strand = TRUE),
            i = i
        )
    }
    
    return(res)
}

## Explore mistmatched cases for genes vs DERs direction
explore_ov_counts <- function(ders, counts, case = "FALSE-TRUE", minov = 0L) {
    counts <- counts[sapply(width(counts), sum) >= minov]
    if(case == 'FALSE-TRUE') {
        i <- which(countOverlaps(counts, ders[ders$sigFDR], minoverlap = minov) > 0 & !mcols(counts)$sig)
    } else if (case == 'TRUE-FALSE') {
        i <- which(!countOverlaps(counts, ders[ders$sigFDR], minoverlap = minov) > 0 & mcols(counts)$sig)
    } else{
        stop('invalid case')
    }
    if(length(i) == 0) return("No such cases")
    
    if(case == 'FALSE-TRUE') {
        res <- list(
            n_overlaps = table(countOverlaps(counts[i], ders[ders$sigFDR], minoverlap = minov)),
            width_gene = summary(sapply(width(counts[i]), sum)),
            genes_per_der_table = table(table(subjectHits(findOverlaps(counts[i], ders[ders$sigFDR], minoverlap = minov)))),
            genes_per_der = sort(table(subjectHits(findOverlaps(counts[i], ders[ders$sigFDR], minoverlap = minov)))),
            i = i
        )
    } else {
        res <- list(
            width_gene = summary(sapply(width(counts[i]), sum)),
            distance_nearest_sum = summary(mcols(distanceToNearest(unlist(counts[i]), ders, ignore.strand = TRUE))$distance),
             distance_nearest_sig_sum = summary(mcols(distanceToNearest(unlist(counts[i]), ders[ders$sigFDR], ignore.strand = TRUE))$distance),
            distance_nearest = distanceToNearest(unlist(counts[i]), ders, ignore.strand = TRUE),
            distance_nearest_sig = distanceToNearest(unlist(counts[i]), ders[ders$sigFDR], ignore.strand = TRUE),
            i = i
        )
    }
    
    return(res)
}

noNA <- function(x) {
    x[!is.na(x)]
}

Query: DERs

We first use the DERs as the query as shown below for DESeq2 and edgeR using all the DERs and then requiring a minimum overlap of 20 bp.

## DESeq2
lapply(deseq, function(x) {
    ov_table(fullRegions, x)
})
## $complete
##                      Overlaps sig DE gene
## Significant DER (FDR) FALSE TRUE Sum
##                 FALSE    22  294 316
##                 TRUE      2  151 153
##                 Sum      24  445 469
## 
## $incomplete
##                      Overlaps sig DE gene
## Significant DER (FDR) FALSE TRUE Sum
##                 FALSE    44  272 316
##                 TRUE      9  144 153
##                 Sum      53  416 469
## 
## $incomplete_random
##                      Overlaps sig DE gene
## Significant DER (FDR) FALSE TRUE Sum
##                 FALSE    24  292 316
##                 TRUE     11  142 153
##                 Sum      35  434 469
lapply(deseq, function(x) {
    ov_table(fullRegs20, x, minov = 20L)
})
## $complete
##                      Overlaps sig DE gene (min 20bp)
## Significant DER (FDR) FALSE TRUE Sum
##                 FALSE     1   15  16
##                 TRUE      3  150 153
##                 Sum       4  165 169
## 
## $incomplete
##                      Overlaps sig DE gene (min 20bp)
## Significant DER (FDR) FALSE TRUE Sum
##                 FALSE     2   14  16
##                 TRUE      9  144 153
##                 Sum      11  158 169
## 
## $incomplete_random
##                      Overlaps sig DE gene (min 20bp)
## Significant DER (FDR) FALSE TRUE Sum
##                 FALSE     1   15  16
##                 TRUE     12  141 153
##                 Sum      13  156 169
## edgeR
lapply(edger, function(x) {
    ov_table(fullRegions, x)
})
## $complete
##                      Overlaps sig DE gene
## Significant DER (FDR) FALSE TRUE Sum
##                 FALSE    20  296 316
##                 TRUE      2  151 153
##                 Sum      22  447 469
## 
## $incomplete
##                      Overlaps sig DE gene
## Significant DER (FDR) FALSE TRUE Sum
##                 FALSE    44  272 316
##                 TRUE      9  144 153
##                 Sum      53  416 469
## 
## $incomplete_random
##                      Overlaps sig DE gene
## Significant DER (FDR) FALSE TRUE Sum
##                 FALSE    24  292 316
##                 TRUE     11  142 153
##                 Sum      35  434 469
lapply(edger, function(x) {
    ov_table(fullRegs20, x, minov = 20L)
})
## $complete
##                      Overlaps sig DE gene (min 20bp)
## Significant DER (FDR) FALSE TRUE Sum
##                 FALSE     1   15  16
##                 TRUE      3  150 153
##                 Sum       4  165 169
## 
## $incomplete
##                      Overlaps sig DE gene (min 20bp)
## Significant DER (FDR) FALSE TRUE Sum
##                 FALSE     2   14  16
##                 TRUE      9  144 153
##                 Sum      11  158 169
## 
## $incomplete_random
##                      Overlaps sig DE gene (min 20bp)
## Significant DER (FDR) FALSE TRUE Sum
##                 FALSE     1   15  16
##                 TRUE     12  141 153
##                 Sum      13  156 169

The results are identical between DESeq2 and edgeR with surprisingly slightly better agreements in with the incomplete gene set instead of the complete one when requiring a minimum overlap of 20bp.

The common disagreement scenario is when a non significant DER overlaps a significant DE gene.

The following code explores the mismatches (min 20 bp overlap) using DESeq2 when the DERs are the query.

lapply(deseq, function(x) {
    explore_ov(fullRegions, x, minov = 20L)[1:3]
})
## $complete
## $complete$n_overlaps
## 
##  1 
## 16 
## 
## $complete$width_der
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   11.00   21.75   24.50   25.31   30.00   37.00 
## 
## $complete$ders_per_gene_table
## 
## 1 2 6 
## 4 3 1 
## 
## 
## $incomplete
## $incomplete$n_overlaps
## 
##  1 
## 14 
## 
## $incomplete$width_der
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   20.00   23.25   26.00   26.57   30.00   37.00 
## 
## $incomplete$ders_per_gene_table
## 
## 1 2 6 
## 4 2 1 
## 
## 
## $incomplete_random
## $incomplete_random$n_overlaps
## 
##  1 
## 35 
## 
## $incomplete_random$width_der
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   10.00   15.00   18.00   19.74   24.00   37.00 
## 
## $incomplete_random$ders_per_gene_table
## 
##  1  2  5 21 
##  3  3  1  1
lapply(deseq, function(x) {
    explore_ov(fullRegions, x, 'TRUE-FALSE', minov = 20L)[1:3]
})
## $complete
## $complete$width_der
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##      39      76     113      89     114     115 
## 
## $complete$distance_nearest_sum
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##       0       0       0    9754   14630   29260 
## 
## $complete$distance_nearest_sig_sum
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##       0      66     132    9798   14700   29260 
## 
## 
## $incomplete
## $incomplete$width_der
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   39.00   41.00   67.00   80.22   87.00  232.00 
## 
## $incomplete$distance_nearest_sum
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##      19     190    1991    5461    5236   29260 
## 
## $incomplete$distance_nearest_sig_sum
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##      19     190    1991    5461    5236   29260 
## 
## 
## $incomplete_random
## $incomplete_random$width_der
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   25.00   63.00   92.00   95.17  117.80  204.00 
## 
## $incomplete_random$distance_nearest_sum
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##       0    1961    3837    6002    6730   29260 
## 
## $incomplete_random$distance_nearest_sig_sum
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##       0    1961    3837    6002    6730   29260

Query: genes

Next we can use the genes as the query.

## DESeq2
lapply(deseq, function(x) {
    ov_table(fullRegions, x, 'counts')
})
## $complete
##                    Overlaps sig DER (FWER)
## Significant DE gene FALSE TRUE Sum
##               FALSE    22    1  23
##               TRUE      6   29  35
##               Sum      28   30  58
## 
## $incomplete
##                    Overlaps sig DER (FWER)
## Significant DE gene FALSE TRUE Sum
##               FALSE    21    0  21
##               TRUE      7   30  37
##               Sum      28   30  58
## 
## $incomplete_random
##                    Overlaps sig DER (FWER)
## Significant DE gene FALSE TRUE Sum
##               FALSE    18    1  19
##               TRUE      6   27  33
##               Sum      24   28  52
lapply(deseq, function(x) {
    ov_table(fullRegs20, x, 'counts', minov = 20L)
})
## $complete
##                    Overlaps sig DER (FWER, min 20bp)
## Significant DE gene FALSE TRUE Sum
##               FALSE    22    1  23
##               TRUE      6   29  35
##               Sum      28   30  58
## 
## $incomplete
##                    Overlaps sig DER (FWER, min 20bp)
## Significant DE gene FALSE TRUE Sum
##               FALSE    21    0  21
##               TRUE      7   30  37
##               Sum      28   30  58
## 
## $incomplete_random
##                    Overlaps sig DER (FWER, min 20bp)
## Significant DE gene FALSE TRUE Sum
##               FALSE    18    1  19
##               TRUE      6   27  33
##               Sum      24   28  52
## edgeR
lapply(edger, function(x) {
    ov_table(fullRegions, x, 'counts')
})
## $complete
##                    Overlaps sig DER (FWER)
## Significant DE gene FALSE TRUE Sum
##               FALSE    20    1  21
##               TRUE      8   29  37
##               Sum      28   30  58
## 
## $incomplete
##                    Overlaps sig DER (FWER)
## Significant DE gene FALSE TRUE Sum
##               FALSE    20    0  20
##               TRUE      8   30  38
##               Sum      28   30  58
## 
## $incomplete_random
##                    Overlaps sig DER (FWER)
## Significant DE gene FALSE TRUE Sum
##               FALSE    17    1  18
##               TRUE      7   27  34
##               Sum      24   28  52
lapply(edger, function(x) {
    ov_table(fullRegs20, x, 'counts', minov = 20L)
})
## $complete
##                    Overlaps sig DER (FWER, min 20bp)
## Significant DE gene FALSE TRUE Sum
##               FALSE    20    1  21
##               TRUE      8   29  37
##               Sum      28   30  58
## 
## $incomplete
##                    Overlaps sig DER (FWER, min 20bp)
## Significant DE gene FALSE TRUE Sum
##               FALSE    20    0  20
##               TRUE      8   30  38
##               Sum      28   30  58
## 
## $incomplete_random
##                    Overlaps sig DER (FWER, min 20bp)
## Significant DE gene FALSE TRUE Sum
##               FALSE    17    1  18
##               TRUE      7   27  34
##               Sum      24   28  52

As opposed to when DERs were used as the query, this time there is a small disagreement between DESeq2 and edgeR when requiring a minimum 20bp overlap. Overlap, the most common disagreement case against the DERs is when a significant DE gene does not overlap a significant DER.

The following code explores the mismatches (min 20 bp overlap) using DESeq2 when the genes are the query.

## No cases FALSE-TRUE
#lapply(deseq, function(x) {
#    explore_ov_counts(fullRegions, x, minov = 20L)[1:3]
#})

lapply(deseq, function(x) {
    explore_ov_counts(fullRegions, x, 'TRUE-FALSE', minov = 20L)[1:3]
})
## $complete
## $complete$width_gene
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##      66      86    1116    2971    4574   10060 
## 
## $complete$distance_nearest_sum
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##       0    2047    7400  161000   12060 2986000 
## 
## $complete$distance_nearest_sig_sum
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   85170   92840   98110  320100  242300 3512000 
## 
## 
## $incomplete
## $incomplete$width_gene
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##      66      94    1131    1906    3494    4971 
## 
## $incomplete$distance_nearest_sum
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##       0    2047    5131  159300   10950 2986000 
## 
## $incomplete$distance_nearest_sig_sum
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   85170   92840   99560  330500  288000 3512000 
## 
## 
## $incomplete_random
## $incomplete_random$width_gene
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##      66      86   12520   15650   27890   39850 
## 
## $incomplete_random$distance_nearest_sum
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##       0       0       0  594500  587600 2986000 
## 
## $incomplete_random$distance_nearest_sig_sum
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   85170  227300  502400  818500  587700 3512000

Conclusions

DESeq2 and edgeR result in the same empirical power regardless of the gene set used, although both have high empirical FPR (minimum 18.92 for DESeq2 in complete gene set).

In the overlap comparison with a minimum overlap of 20 bp, when DERs are used as the query, the common disagreement scenario is when a non significant DER overlaps a significant DE gene. When genes are used as the query, the most common disagreement case against the DERs is when a significant DE gene does not overlap a significant DER.

Reproducibility

## Reproducibility info
Sys.time()
## [1] "2015-04-07 03:06:12 EDT"
proc.time()
##    user  system elapsed 
##  35.634   0.382  36.387
options(width = 120)
devtools::session_info()
## Session info-----------------------------------------------------------------------------------------------------------
##  setting  value                                             
##  version  R Under development (unstable) (2014-11-01 r66923)
##  system   x86_64, darwin10.8.0                              
##  ui       X11                                               
##  language (EN)                                              
##  collate  en_US.UTF-8                                       
##  tz       America/New_York
## Packages---------------------------------------------------------------------------------------------------------------
##  package                           * version     date       source                                   
##  acepack                             1.3.3.3     2013-05-03 CRAN (R 3.2.0)                           
##  annotate                            1.45.4      2015-03-21 Bioconductor                             
##  AnnotationDbi                     * 1.29.21     2015-04-03 Bioconductor                             
##  Biobase                           * 2.27.3      2015-03-27 Bioconductor                             
##  BiocGenerics                      * 0.13.11     2015-04-03 Bioconductor                             
##  BiocParallel                        1.1.21      2015-03-24 Bioconductor                             
##  biomaRt                             2.23.5      2014-11-22 Bioconductor                             
##  Biostrings                          2.35.12     2015-03-26 Bioconductor                             
##  bitops                              1.0.6       2013-08-17 CRAN (R 3.2.0)                           
##  cluster                             2.0.1       2015-01-31 CRAN (R 3.2.0)                           
##  colorspace                          1.2.6       2015-03-11 CRAN (R 3.2.0)                           
##  DBI                                 0.3.1       2014-09-24 CRAN (R 3.2.0)                           
##  DESeq2                            * 1.7.50      2015-04-04 Bioconductor                             
##  devtools                            1.6.1       2014-10-07 CRAN (R 3.2.0)                           
##  digest                              0.6.8       2014-12-31 CRAN (R 3.2.0)                           
##  edgeR                             * 3.9.15      2015-04-03 Bioconductor                             
##  evaluate                            0.5.5       2014-04-29 CRAN (R 3.2.0)                           
##  foreign                             0.8.63      2015-02-20 CRAN (R 3.2.0)                           
##  formatR                             1.0         2014-08-25 CRAN (R 3.2.0)                           
##  Formula                             1.2.0       2015-01-20 CRAN (R 3.2.0)                           
##  futile.logger                       1.4         2015-03-21 CRAN (R 3.2.0)                           
##  futile.options                      1.0.0       2010-04-06 CRAN (R 3.2.0)                           
##  genefilter                          1.49.2      2014-10-21 Bioconductor                             
##  geneplotter                         1.45.0      2014-10-14 Bioconductor                             
##  GenomeInfoDb                      * 1.3.16      2015-03-27 Bioconductor                             
##  GenomicAlignments                   1.3.33      2015-04-06 Bioconductor                             
##  GenomicFeatures                   * 1.19.36     2015-03-30 Bioconductor                             
##  GenomicRanges                     * 1.19.52     2015-04-04 Bioconductor                             
##  ggplot2                             1.0.0       2014-05-21 CRAN (R 3.2.0)                           
##  gtable                              0.1.2       2012-12-05 CRAN (R 3.2.0)                           
##  Hmisc                               3.14.5      2014-09-12 CRAN (R 3.2.0)                           
##  htmltools                           0.2.6       2014-09-08 CRAN (R 3.2.0)                           
##  IRanges                           * 2.1.43      2015-03-07 Bioconductor                             
##  knitr                               1.7         2014-10-13 CRAN (R 3.2.0)                           
##  knitrBootstrap                      1.0.0       2014-11-03 Github (jimhester/knitrBootstrap@76c41f0)
##  lambda.r                            1.1.7       2015-03-20 CRAN (R 3.2.0)                           
##  lattice                             0.20.31     2015-03-30 CRAN (R 3.2.0)                           
##  latticeExtra                        0.6.26      2013-08-15 CRAN (R 3.2.0)                           
##  limma                             * 3.23.12     2015-04-05 Bioconductor                             
##  locfit                              1.5.9.1     2013-04-20 CRAN (R 3.2.0)                           
##  markdown                            0.7.4       2014-08-24 CRAN (R 3.2.0)                           
##  MASS                                7.3.40      2015-03-21 CRAN (R 3.2.0)                           
##  munsell                             0.4.2       2013-07-11 CRAN (R 3.2.0)                           
##  nnet                                7.3.9       2015-02-11 CRAN (R 3.2.0)                           
##  plyr                                1.8.1       2014-02-26 CRAN (R 3.2.0)                           
##  proto                               0.3.10      2012-12-22 CRAN (R 3.2.0)                           
##  RColorBrewer                        1.1.2       2014-12-07 CRAN (R 3.2.0)                           
##  Rcpp                              * 0.11.5      2015-03-06 CRAN (R 3.2.0)                           
##  RcppArmadillo                     * 0.4.650.1.1 2015-02-26 CRAN (R 3.2.0)                           
##  RCurl                               1.95.4.5    2014-12-28 CRAN (R 3.2.0)                           
##  reshape2                            1.4.1       2014-12-06 CRAN (R 3.2.0)                           
##  rmarkdown                         * 0.3.3       2014-09-17 CRAN (R 3.2.0)                           
##  rpart                               4.1.9       2015-02-24 CRAN (R 3.2.0)                           
##  Rsamtools                           1.19.49     2015-03-27 Bioconductor                             
##  RSQLite                             1.0.0       2014-10-25 CRAN (R 3.2.0)                           
##  rstudioapi                          0.2         2014-12-31 CRAN (R 3.2.0)                           
##  rtracklayer                         1.27.11     2015-04-01 Bioconductor                             
##  S4Vectors                         * 0.5.22      2015-03-06 Bioconductor                             
##  scales                              0.2.4       2014-04-22 CRAN (R 3.2.0)                           
##  stringr                             0.6.2       2012-12-06 CRAN (R 3.2.0)                           
##  survival                            2.38.1      2015-02-24 CRAN (R 3.2.0)                           
##  TxDb.Hsapiens.UCSC.hg19.knownGene * 3.0.0       2014-09-26 Bioconductor                             
##  XML                                 3.98.1.1    2013-06-20 CRAN (R 3.2.0)                           
##  xtable                              1.7.4       2014-09-12 CRAN (R 3.2.0)                           
##  XVector                           * 0.7.4       2015-02-08 Bioconductor                             
##  yaml                                2.1.13      2014-06-12 CRAN (R 3.2.0)                           
##  zlibbioc                            1.13.3      2015-03-23 Bioconductor