Compare vs PNAS at gene level

Gene-level analysis

This section has the code for running edgeR-robust and DESeq2 on the simulation data set using the known genes as features. It is based on compareVsPNAS.html, counts-based.html and counts-gene-eval.html.

This first code chunk loads the necessary data.

## Track time spent on making the report
startTime <- Sys.time()

library('edgeR')

## Loading required package: limma

library('DESeq2')

## Loading required package: S4Vectors
## Loading required package: stats4
## Loading required package: BiocGenerics
## Loading required package: parallel
## 
## Attaching package: 'BiocGenerics'
## 
## The following objects are masked from 'package:parallel':
## 
##     clusterApply, clusterApplyLB, clusterCall, clusterEvalQ,
##     clusterExport, clusterMap, parApply, parCapply, parLapply,
##     parLapplyLB, parRapply, parSapply, parSapplyLB
## 
## The following object is masked from 'package:limma':
## 
##     plotMA
## 
## The following object is masked from 'package:stats':
## 
##     xtabs
## 
## The following objects are masked from 'package:base':
## 
##     anyDuplicated, append, as.data.frame, as.vector, cbind,
##     colnames, do.call, duplicated, eval, evalq, Filter, Find, get,
##     intersect, is.unsorted, lapply, Map, mapply, match, mget,
##     order, paste, pmax, pmax.int, pmin, pmin.int, Position, rank,
##     rbind, Reduce, rep.int, rownames, sapply, setdiff, sort,
##     table, tapply, union, unique, unlist, unsplit
## 
## Loading required package: IRanges
## Loading required package: GenomicRanges
## Loading required package: GenomeInfoDb
## Loading required package: Rcpp
## Loading required package: RcppArmadillo

library('GenomicRanges')
library('TxDb.Hsapiens.UCSC.hg19.knownGene')

## Loading required package: GenomicFeatures
## Loading required package: AnnotationDbi
## Loading required package: Biobase
## Welcome to Bioconductor
## 
##     Vignettes contain introductory material; view with
##     'browseVignettes()'. To cite Bioconductor, see
##     'citation("Biobase")', and for packages 'citation("pkgname")'.

## Load data
load("../counts-gene/summOv.Rdata")
load("../derAnalysis/run3-v1.0.10/groupInfo.Rdata")
load("../derAnalysis/run3-v1.0.10/colsubset.Rdata")

## GenomicState object
if(file.exists('/home/epi/ajaffe/Lieber/Projects/RNAseq/derannotator/rdas/GenomicState.Hsapiens.UCSC.hg19.knownGene.rda')) {
    load('/home/epi/ajaffe/Lieber/Projects/RNAseq/derannotator/rdas/GenomicState.Hsapiens.UCSC.hg19.knownGene.rda')
} else if(file.exists('../../GenomicState.Hsapiens.UCSC.hg19.knownGene.rda')) {
    load('../../GenomicState.Hsapiens.UCSC.hg19.knownGene.rda')
} else {
    stop('Missing UCSC hg19 genomic state object')
}

## Find genes
genes <- exonsBy(TxDb.Hsapiens.UCSC.hg19.knownGene, by = 'gene')

## Round matrix and remove genes with 0s
counts <- assay(summOv)[, colsubset]
nonzero <- sapply(rowSums(counts), function(x) {x > 0})

DESeq2

The following code performs the DESeq2 analysis. Code is based on edgeR_Robust supplementary code. The main change is that it has been modified for the multi-group scenario.

## Round matrix and specify design
dse <- DESeqDataSetFromMatrix(counts[nonzero, ], data.frame(group = groupInfo), ~ group)

## Perform DE analysis
system.time( dse <- DESeq(dse, test = 'LRT', reduced = ~ 1) )

## estimating size factors
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
## fitting model and testing
## -- replacing outliers and refitting for 70 genes
## -- DESeq argument 'minReplicatesForReplace' = 7 
## -- original counts are preserved in counts(dds)
## estimating dispersions
## fitting model and testing

##    user  system elapsed 
##  32.601   0.844  33.560

## Extract results
deseq <- genes[nonzero]
mcols(deseq) <- cbind(mcols(deseq), results(dse))

## Which are significant?
mcols(deseq)$sig <- mcols(deseq)$padj < 0.05
mcols(deseq)$sig[is.na(mcols(deseq)$sig)] <- FALSE

## Save results
save(deseq, file = 'deseq-gene.Rdata')

## Adjust by Holm
deseq_holm <- deseq
mcols(deseq_holm)$sig <- p.adjust(mcols(deseq_holm)$pvalue, 'holm') < 0.05

edgeR-robust

The following code performs the DESeq2 analysis. Code is based on edgeR_Robust supplementary code. The main change is that it has been modified for the multi-group scenario.

## Determine design matrix
design <- model.matrix(~ groupInfo)

## Perform DE analysis
d <- DGEList(counts = counts[nonzero, ], group = groupInfo)
d <- calcNormFactors(d)
system.time(dw <- estimateGLMRobustDisp(d, design = design, prior.df = 10, maxit = 6))

##    user  system elapsed 
##  76.999   4.804  82.360

fw <- glmFit(dw, design = design, coef = 2:3)
lrw <- glmLRT(fw, coef = 2:3)

## Extract results
edger <- genes[nonzero]
mcols(edger) <- cbind(mcols(edger), DataFrame(lrw$table))
mcols(edger)$pvalue <-  lrw$table$PValue
mcols(edger)$padj <- p.adjust(lrw$table$PValue, 'BH')

## Which are significant?
mcols(edger)$sig <- mcols(edger)$padj < 0.05
mcols(edger)$sig[is.na(mcols(edger)$sig)] <- FALSE

## Save results
save(edger, file = 'edger-gene.Rdata')

## Adjust by Holm
edger_holm <- edger
mcols(edger_holm)$sig <- p.adjust(mcols(edger_holm)$pvalue, 'holm') < 0.05

Overlap

## Load data
load('../derAnalysis/run3-v1.0.10/fullRegions.Rdata')

## Some formatting and subsets
names(fullRegions) <- seq_len(length(fullRegions))
fullRegions$sigFWER <- as.logical(fullRegions$significantFWER)
fullRegs20 <- fullRegions[width(fullRegions) >= 20]

## Overlap table for all 4 cases
ov_table <- function(ders, counts, query = 'der', minov = 0) {
    if(query == 'der') {
        if(minov == 0) {
            res <- addmargins(table('Significant DER (FWER)' = ders$sigFWER, 'Overlaps sig DE gene' = countOverlaps(ders, counts[mcols(counts)$sig]) > 0))
        } else {
            res <- addmargins(table(ders$sigFWER, countOverlaps(ders, counts[mcols(counts)$sig], minoverlap = minov) > 0, dnn = c('Significant DER (FWER)', paste0('Overlaps sig DE gene (min ', minov, 'bp)'))))
        }
    } else if (query == 'counts') {
        if(minov == 0) {
            res <- addmargins(table('Significant DE gene' = mcols(counts)$sig, 'Overlaps sig DER (FWER)' = countOverlaps(counts, ders[ders$sigFWER]) > 0))
        } else {
            res <- addmargins(table(mcols(counts)$sig[sapply(width(counts), sum) >= minov], countOverlaps(counts[sapply(width(counts), sum) >= minov], ders[ders$sigFWER], minoverlap = minov) > 0, dnn = c('Significant DE gene', paste0('Overlaps sig DER (FWER, min ', minov, 'bp)'))))
        }
    }
    return(res)
}

## Explore mistmatched cases for DERs vs genes direction
explore_ov <- function(ders, counts, case = "FALSE-TRUE", minov = 0L) {
    if(case == 'FALSE-TRUE') {
        i <- which(countOverlaps(ders, counts[mcols(counts)$sig], minoverlap = minov) > 0 & !ders$sigFWER)
    } else if (case == 'TRUE-FALSE') {
        i <- which(!countOverlaps(ders, counts[mcols(counts)$sig], minoverlap = minov) > 0 & ders$sigFWER)
    } else{
        stop('invalid case')
    }
    if(length(i) == 0) return("No such cases")
    
    if(case == 'FALSE-TRUE') {
        res <- list(
            n_overlaps = table(countOverlaps(ders[i], counts[mcols(counts)$sig], minoverlap = minov)),
            width_der = summary(width(ders[i])),
            ders_per_gene_table = table(table(subjectHits(findOverlaps(ders[i], counts[mcols(counts)$sig], minoverlap = minov)))),
            ders_per_gene = sort(table(subjectHits(findOverlaps(ders[i], counts[mcols(counts)$sig], minoverlap = minov)))),
            i = i
        )
    } else {
        res <- list(
            width_der = summary(width(ders[i])),
            distance_nearest_sum = summary(mcols(distanceToNearest(ders[i], unlist(counts), ignore.strand = TRUE))$distance),
            distance_nearest_sig_sum = summary(mcols(distanceToNearest(ders[i], unlist(counts[mcols(counts)$sig]), ignore.strand = TRUE))$distance),
            distance_nearest = distanceToNearest(ders[i], unlist(counts), ignore.strand = TRUE),
            distance_nearest_sig = distanceToNearest(ders[i], unlist(counts[mcols(counts)$sig]), ignore.strand = TRUE),
            i = i
        )
    }
    
    return(res)
}

## Explore mistmatched cases for genes vs DERs direction
explore_ov_counts <- function(ders, counts, case = "FALSE-TRUE", minov = 0L) {
    counts <- counts[sapply(width(counts), sum) >= minov]
    if(case == 'FALSE-TRUE') {
        i <- which(countOverlaps(counts, ders[ders$sigFWER], minoverlap = minov) > 0 & !mcols(counts)$sig)
    } else if (case == 'TRUE-FALSE') {
        i <- which(!countOverlaps(counts, ders[ders$sigFWER], minoverlap = minov) > 0 & mcols(counts)$sig)
    } else{
        stop('invalid case')
    }
    if(length(i) == 0) return("No such cases")
    
    if(case == 'FALSE-TRUE') {
        res <- list(
            n_overlaps = table(countOverlaps(counts[i], ders[ders$sigFWER], minoverlap = minov)),
            width_gene = summary(sapply(width(counts[i]), sum)),
            genes_per_der_table = table(table(subjectHits(findOverlaps(counts[i], ders[ders$sigFWER], minoverlap = minov)))),
            genes_per_der = sort(table(subjectHits(findOverlaps(counts[i], ders[ders$sigFWER], minoverlap = minov)))),
            i = i
        )
    } else {
        res <- list(
            width_gene = summary(sapply(width(counts[i]), sum)),
            distance_nearest_sum = summary(mcols(distanceToNearest(unlist(counts[i]), ders, ignore.strand = TRUE))$distance),
             distance_nearest_sig_sum = summary(mcols(distanceToNearest(unlist(counts[i]), ders[ders$sigFWER], ignore.strand = TRUE))$distance),
            distance_nearest = distanceToNearest(unlist(counts[i]), ders, ignore.strand = TRUE),
            distance_nearest_sig = distanceToNearest(unlist(counts[i]), ders[ders$sigFWER], ignore.strand = TRUE),
            i = i
        )
    }
    
    return(res)
}

noNA <- function(x) {
    x[!is.na(x)]
}

DESeq2

Query: DERs

We can first compare the results by using the DERs as the query and the genes as the subject. The following output shows the comparison using all DERs and exploring the mismatched cases. Then its repeated using the DERs ≥ 20 bp and a minimum overlap of 20bp.

For the mismatched cases of non-significant DERs overlapping a significant gene, we check:

how many genes each DER overlaps,
the width of the DERs
the frequency table of how many DERs overlap the same gene

For the other mismatched case, we check:

the width of the DERs
distance to nearest gene (regardless of gene size)
distance to nearest significant DE gene (ibidem)

## Overlap between DERs and significant DE genes
ov_table(fullRegions, deseq)

##                       Overlaps sig DE gene
## Significant DER (FWER) FALSE  TRUE   Sum
##                  FALSE 13285 18178 31463
##                  TRUE    285   230   515
##                  Sum   13570 18408 31978

## Explore mismatched cases
#noNA(explore_ov(fullRegions, deseq)[1:3])
#noNA(explore_ov(fullRegions, deseq, 'TRUE-FALSE')[1:3])

## Min 20 bp overlap, using only DERs 20 bp long
ov_table(fullRegs20, deseq, minov = 20L)

##                       Overlaps sig DE gene (min 20bp)
## Significant DER (FWER) FALSE TRUE  Sum
##                  FALSE  1026 1252 2278
##                  TRUE    285  230  515
##                  Sum    1311 1482 2793

## Explore mismatched cases, min 20bp overlap
noNA(explore_ov(fullRegs20, deseq, minov = 20L)[1:3])

## $n_overlaps
## 
##    1    2 
## 1245    7 
## 
## $width_der
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   20.00   23.00   29.00   30.05   36.00   62.00 
## 
## $ders_per_gene_table
## 
##   1   2   3   4   5   6   7   8   9  10  21 
## 528 127  52  24  14  10   3   3   1   2   1

noNA(explore_ov(fullRegs20, deseq, 'TRUE-FALSE', minov = 20L)[1:3])

## $width_der
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   29.00   47.00   59.00   66.76   77.00  182.00 
## 
## $distance_nearest_sum
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##       0       0     933   16980   16990 1385000 
## 
## $distance_nearest_sig_sum
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##      54    5925   23620   83210   65500 2651000

## Holm vs BH
addmargins(table('DESeq2 Holm' = mcols(deseq_holm)$sig, 'DESeq2 BH' = mcols(deseq)$sig))

##            DESeq2 BH
## DESeq2 Holm FALSE  TRUE   Sum
##       FALSE 14042  7463 21505
##       TRUE      0   311   311
##       Sum   14042  7774 21816

## Use Holm and min 20 bp ov
ov_table(fullRegs20, deseq_holm, minov = 20L)

##                       Overlaps sig DE gene (min 20bp)
## Significant DER (FWER) FALSE TRUE  Sum
##                  FALSE  2057  221 2278
##                  TRUE    442   73  515
##                  Sum    2499  294 2793

Most of the DERs are shorter than 20bp (91.27 percent), so we'll focus on the longer ones. The majority of the mismatches are from non significant DERs that overlap a significant gene.

As expected, when controlling the FWER instead of the FDR, most of the DE genes are no longer significant. Using FWER-controlled DE genes, most of the DERs 20bp or longer agree with the genes as not being significantly DE.

Query: genes

We can now repeat the comparison using the genes as the query and the DERs as the subject.

For the mismatched cases of non-significant genes overlapping a significant DER, we check:

how many DERs each gene overlaps,
the width of the genes
the frequency table of how many genes overlap the same DER

For the other mismatched case, we check:

the width of the genes
distance to nearest DER (regardless of DER size)
distance to nearest significant DER (ibidem)

## Overlap between genes and significant DERs
#ov_table(fullRegions, deseq, 'counts')

## Explore mismatched cases
#noNA(explore_ov_counts(fullRegions, deseq)[1:3])
#noNA(explore_ov_counts(fullRegions, deseq, 'TRUE-FALSE')[1:3])

## Overlap between genes and significant DERs, min 20 bp
ov_table(fullRegions, deseq, 'counts', 20L)

##                    Overlaps sig DER (FWER, min 20bp)
## Significant DE gene FALSE  TRUE   Sum
##               FALSE 13940   102 14042
##               TRUE   7598   176  7774
##               Sum   21538   278 21816

## Explore mismatched cases
noNA(explore_ov_counts(fullRegions, deseq, minov = 20L)[1:3])

## $n_overlaps
## 
##  1  2  3  4  6 11 
## 73 21  4  2  1  1 
## 
## $width_gene
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     493    1488    3452    4354    7060   13630 
## 
## $genes_per_der_table
## 
##   1  20 
## 132   1

noNA(explore_ov_counts(fullRegions, deseq, 'TRUE-FALSE', minov = 20L)[1:3])

## $width_gene
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##      54    2386    3770    4722    5941  118100 
## 
## $distance_nearest_sum
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##       0    6176   31160  138800  119100 5384000 
## 
## $distance_nearest_sig_sum
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
##       52   759100  2443000  5405000  6433000 48690000

## Now with Holm
ov_table(fullRegions, deseq_holm, 'counts', 20L)

##                    Overlaps sig DER (FWER, min 20bp)
## Significant DE gene FALSE  TRUE   Sum
##               FALSE 21277   228 21505
##               TRUE    261    50   311
##               Sum   21538   278 21816

From these results, we can see that derfinder is more conservative.

edgeR-robust

Query: DERs

Similar comparison using DERs as query and genes as subject with edgeR-robust results.

## Overlap between DERs and significant DE genes
#ov_table(fullRegions, edger)

## Explore mismatched cases
#noNA(explore_ov(fullRegions, edger)[1:3])
#noNA(explore_ov(fullRegions, edger, 'TRUE-FALSE')[1:3])

## Min 20 bp overlap, using only DERs 20 bp long
ov_table(fullRegs20, edger, minov = 20L)

##                       Overlaps sig DE gene (min 20bp)
## Significant DER (FWER) FALSE TRUE  Sum
##                  FALSE   911 1367 2278
##                  TRUE    227  288  515
##                  Sum    1138 1655 2793

## Explore mismatched cases, min 20bp overlap
noNA(explore_ov(fullRegs20, edger, minov = 20L)[1:3])

## $n_overlaps
## 
##    1    2 
## 1354   13 
## 
## $width_der
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   20.00   23.00   29.00   30.19   36.00   62.00 
## 
## $ders_per_gene_table
## 
##   1   2   3   4   5   6   7   8   9  10  21 
## 563 136  61  27  16  12   4   3   1   2   1

noNA(explore_ov(fullRegs20, edger, 'TRUE-FALSE', minov = 20L)[1:3])

## $width_der
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   32.00   46.00   58.00   64.83   74.00  174.00 
## 
## $distance_nearest_sum
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##       0       0    6121   21320   20470 1385000 
## 
## $distance_nearest_sig_sum
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##      69    7492   24510   74750   43900 2089000

## Holm vs BH
addmargins(table('edgeR Holm' = mcols(edger_holm)$sig, 'edger BH' = mcols(edger)$sig))

##           edger BH
## edgeR Holm FALSE  TRUE   Sum
##      FALSE 13397  7887 21284
##      TRUE      0   532   532
##      Sum   13397  8419 21816

## With Holm, 20bp
ov_table(fullRegs20, edger_holm, minov = 20L)

##                       Overlaps sig DE gene (min 20bp)
## Significant DER (FWER) FALSE TRUE  Sum
##                  FALSE  2067  211 2278
##                  TRUE    442   73  515
##                  Sum    2509  284 2793

The results are fairly similar to those from using DESeq2.

Query: genes

Similar comparison using genes as query and DERs as subject with edgeR-robust results.

## Overlap between genes and significant DERs
#ov_table(fullRegions, edger, 'counts')

## Explore mismatched cases
#noNA(explore_ov_counts(fullRegions, edger)[1:3])
#noNA(explore_ov_counts(fullRegions, edger, 'TRUE-FALSE')[1:3])

## Overlap between genes and significant DERs, min 20 bp
ov_table(fullRegions, edger, 'counts', 20L)

##                    Overlaps sig DER (FWER, min 20bp)
## Significant DE gene FALSE  TRUE   Sum
##               FALSE 13325    72 13397
##               TRUE   8213   206  8419
##               Sum   21538   278 21816

## Explore mismatched cases
noNA(explore_ov_counts(fullRegions, edger, minov = 20L)[1:3])

## $n_overlaps
## 
##  1  2  3  4 
## 57 11  2  2 
## 
## $width_gene
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     493    1579    3758    4619    7095   13630 
## 
## $genes_per_der_table
## 
##  1 19 
## 74  1

noNA(explore_ov_counts(fullRegions, edger, 'TRUE-FALSE', minov = 20L)[1:3])

## $width_gene
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##      46    2282    3631    4514    5667  118100 
## 
## $distance_nearest_sum
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##       0    5977   31080  141900  122200 5384000 
## 
## $distance_nearest_sig_sum
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
##       52   737100  2360000  5379000  6422000 48690000

## With Holm, 20 bp
ov_table(fullRegions, edger_holm, 'counts', 20L)

##                    Overlaps sig DER (FWER, min 20bp)
## Significant DE gene FALSE  TRUE   Sum
##               FALSE 21059   225 21284
##               TRUE    479    53   532
##               Sum   21538   278 21816

overall

While the DERs vs genes results are fairly similar between edgeR-robust and DESeq2, as shown below the number of mismatched cases is high compared to the number of cases both counts-based methods agree. This is also true when controlling the FWER to determine significance.

## edgeR vs DESeq2
addmargins(table('edgeR-robust (FDR)' = mcols(edger)$sig, 'DESeq2 (FDR)' = mcols(deseq)$sig))

##                   DESeq2 (FDR)
## edgeR-robust (FDR) FALSE  TRUE   Sum
##              FALSE 12877   520 13397
##              TRUE   1165  7254  8419
##              Sum   14042  7774 21816

## Control FWER
addmargins(table('edgeR-robust (FWER)' = mcols(edger_holm)$sig, 'DESeq2 (FWER)' = mcols(deseq_holm)$sig))

##                    DESeq2 (FWER)
## edgeR-robust (FWER) FALSE  TRUE   Sum
##               FALSE 21234    50 21284
##               TRUE    271   261   532
##               Sum   21505   311 21816

## Only sig if both edgeR and DEseq2 say it is
both <- deseq
mcols(both)$sig <- mcols(both)$sig & mcols(edger)$sig

## Same, for holm
both_holm <- deseq_holm
mcols(both_holm)$sig <- mcols(both_holm)$sig & mcols(edger_holm)$sig

We can consider an gene to be DE only if both edgeR-robust and DESeq2 find that its significantly DE. The next sections use this information.

Query: DERs

## Overlap between DERs and significant DE genes
#ov_table(fullRegions, both)

## Explore mismatched cases
#noNA(explore_ov(fullRegions, both)[1:3])
#noNA(explore_ov(fullRegions, both, 'TRUE-FALSE')[1:3])

## Min 20 bp overlap, using only DERs 20 bp long
ov_table(fullRegs20, both, minov = 20L)

##                       Overlaps sig DE gene (min 20bp)
## Significant DER (FWER) FALSE TRUE  Sum
##                  FALSE  1063 1215 2278
##                  TRUE    286  229  515
##                  Sum    1349 1444 2793

## Explore mismatched cases, min 20bp overlap
noNA(explore_ov(fullRegs20, both, minov = 20L)[1:3])

## $n_overlaps
## 
##    1    2 
## 1208    7 
## 
## $width_der
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   20.00   23.00   29.00   30.15   36.00   62.00 
## 
## $ders_per_gene_table
## 
##   1   2   3   4   5   6   7   8   9  10  21 
## 513 121  52  23  14   9   3   3   1   2   1

noNA(explore_ov(fullRegs20, both, 'TRUE-FALSE', minov = 20L)[1:3])

## $width_der
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   29.00   47.00   59.50   66.84   77.00  182.00 
## 
## $distance_nearest_sum
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##       0       0     924   16930   16970 1385000 
## 
## $distance_nearest_sig_sum
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##      54    7028   26650   89060   72640 2651000

## Holm vs BH
addmargins(table('Both Holm' = mcols(both_holm)$sig, 'Both BH' = mcols(both)$sig))

##          Both BH
## Both Holm FALSE  TRUE   Sum
##     FALSE 14562  6993 21555
##     TRUE      0   261   261
##     Sum   14562  7254 21816

## Use Holm and min 20 bp ov
ov_table(fullRegs20, both_holm, minov = 20L)

##                       Overlaps sig DE gene (min 20bp)
## Significant DER (FWER) FALSE TRUE  Sum
##                  FALSE  2109  169 2278
##                  TRUE    454   61  515
##                  Sum    2563  230 2793

The trends observed previously are maintained in this comparison with a reduction of cases where the gene is DE. This is expected due to the non-perfect agreement between DESeq2 and edgeR-robust.

library('TxDb.Hsapiens.UCSC.hg19.knownGene')
library('derfinder')

## Find out what's changed in derfinder with
## news(Version == "1.1.17", package = "derfinder")

library('derfinderHelper')
library('derfinderPlot')

## Find out what's changed in derfinderPlot with
## news(Version == "1.1.6", package = "derfinderPlot")

load('../derAnalysis/run3-v1.0.10/models.Rdata')
load('../derAnalysis/run3-v1.0.10/chr22/optionsStats.Rdata')
load("../CoverageInfo/fullCov.Rdata")

def.par <- par()
def.par <- def.par[-which(names(def.par) %in% c('cin', 'cra', 'csi', 'cxy', 'din', 'page'))]

regPlot <- function(region, title) {
    ## Calculate F-stats
    range <- start(region):end(region)
    dat <- fullCov[[as.character(seqnames(region))]][range, colsubset]

    ## Log2 transform
    for(i in seq_len(length(groupInfo))) dat[[i]] <- log2(dat[[i]] + 32) 

    ## Calculate f-stats
    fstats <- as.numeric(fstats.apply(data = dat, mod = models$mod, mod0 = models$mod0))

    ## Find annotation
    annoReg <- annotateRegions(region, GenomicState.Hsapiens.UCSC.hg19.knownGene$fullGenome, verbose = FALSE)
    symbol <- mcols(annoReg$annotationList[[1]])$symbol
    symbol <- as.character(noNA(symbol)[[1]])
    if(length(symbol) > 1) symbol <- symbol[1]
    symbol <- ifelse(is.null(symbol), NA, symbol)
    ## Remove symbol name because it gets chomped on the plot
    mcols(annoReg$annotationList[[1]])$symbol <- NA
    
    par(def.par)

    ## Plot long gene
    plotRegionCoverage(region, getRegionCoverage(fullCov, region, verbose = FALSE), groupInfo, data.frame(name = title, distance = NA, region = symbol), annoReg, verbose = FALSE, ask = FALSE, txdb = TxDb.Hsapiens.UCSC.hg19.knownGene)

    ## Add F-stat track
    par(fig = c(0, 1, 0.065, 0.125), new = TRUE, xaxt = 'n', oma = c(0, 0, 0, 0), mar = c(0, 4.5, 0, 1.1))
    plot(y = fstats, x = range, ylab = 'F-stat', type = 'l', xlab = '', bty = 'n', ylim = c(0, max(fstats[is.finite(fstats)], optionsStats$cutoffFstatUsed) * 1.1), las = 2, yaxt = 'n')
    y.max <- round(max(c(optionsStats$cutoffFstatUsed, fstats[is.finite(fstats)]), na.rm = TRUE), 0)
    axis(2, at = c(0, round(y.max / 2, 0), y.max), c(0, round(y.max / 2, 0), y.max), las = 2, tick = TRUE)
    abline(h = optionsStats$cutoffFstatUsed, col = 'red')
    abline(h = 0, col = 'grey')
}

sortWidth <- function(regions) {
    regions[order(width(regions), decreasing = TRUE)]
}

We can now make plots to explore some DERs for each of the cases.

query_der_plots <- function() {
    sapply(sortWidth(fullRegs20[countOverlaps(fullRegs20, both[mcols(both)$sig], minoverlap = 20L) > 0 & fullRegs20$sigFWER])[1:10], function(reg) {
        regPlot(reg, 'DER query: DE agreement')
    })

    sapply(sortWidth(fullRegs20[countOverlaps(fullRegs20, both[mcols(both)$sig], minoverlap = 20L) == 0 & !fullRegs20$sigFWER])[1:10],  function(reg) {
        regPlot(reg, 'DER query: not DE agreement')
    })

    sapply(sortWidth(fullRegs20[countOverlaps(fullRegs20, both[mcols(both)$sig], minoverlap = 20L) == 0 & fullRegs20$sigFWER])[1:10], function(reg) {
        regPlot(reg, 'DER query: only gene not DE')
    })

    sapply(sortWidth(fullRegs20[countOverlaps(fullRegs20, both[mcols(both)$sig], minoverlap = 20L) > 0 & !fullRegs20$sigFWER])[1:10], function(reg) {
        regPlot(reg, 'DER query: only gene DE')
    })
}
pdf(file = 'query_der_plots_gene.pdf', width = 10, height = 7)
query_der_plots()

## $`535`
## NULL
## 
## $`540`
## NULL
## 
## $`625`
## NULL
## 
## $`594`
## NULL
## 
## $`589`
## NULL
## 
## $`574`
## NULL
## 
## $`610`
## NULL
## 
## $`616`
## NULL
## 
## $`626`
## NULL
## 
## $`533`
## NULL

dev.off()

## pdf 
##   2

query_der_plots()

## $`535`
## NULL
## 
## $`540`
## NULL
## 
## $`625`
## NULL
## 
## $`594`
## NULL
## 
## $`589`
## NULL
## 
## $`574`
## NULL
## 
## $`610`
## NULL
## 
## $`616`
## NULL
## 
## $`626`
## NULL
## 
## $`533`
## NULL

Query: genes

As was shown with either DESeq2 or edgeR-robust results, derfinder is more conservative than the counts-based methods.

## Overlap between genes and significant DERs, min 20 bp
ov_table(fullRegions, both, 'counts', 20L)

##                    Overlaps sig DER (FWER, min 20bp)
## Significant DE gene FALSE  TRUE   Sum
##               FALSE 14459   103 14562
##               TRUE   7079   175  7254
##               Sum   21538   278 21816

## Explore mismatched cases
noNA(explore_ov_counts(fullRegions, both, minov = 20L)[1:3])

## $n_overlaps
## 
##  1  2  3  4  6 11 
## 74 21  4  2  1  1 
## 
## $width_gene
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     493    1507    3495    4369    7055   13630 
## 
## $genes_per_der_table
## 
##   1  20 
## 133   1

noNA(explore_ov_counts(fullRegions, both, 'TRUE-FALSE', minov = 20L)[1:3])

## $width_gene
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##      54    2355    3709    4636    5822  118100 
## 
## $distance_nearest_sum
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##       0    5670   29530  137500  115400 5384000 
## 
## $distance_nearest_sig_sum
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
##       52   756600  2439000  5465000  6498000 48690000

## With Holm, 20 bp
ov_table(fullRegions, both_holm, 'counts', 20L)

##                    Overlaps sig DER (FWER, min 20bp)
## Significant DE gene FALSE  TRUE   Sum
##               FALSE 21321   234 21555
##               TRUE    217    44   261
##               Sum   21538   278 21816

We can now visually explore some genes (maximum 10kb long) for each of the four cases.

max_kb <- function(x, limit = 1e4) {
    x[width(x) <= limit]
}

selectRegions <- function(x) {
    max_kb(sortWidth(unlist(range(x))))
}

query_gene_plots <- function() {
    sapply(selectRegions(both[sapply(width(both), sum) >= 20 & mcols(both)$sig & countOverlaps(both, fullRegions[fullRegions$sigFWER], minoverlap = 20L) > 0])[1:10], function(reg) {
        regPlot(reg, 'Gene query: DE agreement')
    })

    sapply(selectRegions(both[sapply(width(both), sum) >= 20 & !mcols(both)$sig & countOverlaps(both, fullRegions[fullRegions$sigFWER], minoverlap = 20L) == 0])[1:10], function(reg) {
        regPlot(reg, 'Gene query: not DE agreement')
    })

    sapply(selectRegions(both[sapply(width(both), sum) >= 20 & !mcols(both)$sig & countOverlaps(both, fullRegions[fullRegions$sigFWER], minoverlap = 20L) > 0])[1:10], function(reg) {
        regPlot(reg, 'Gene query: only gene not DE')
    })

    sapply(selectRegions(both[sapply(width(both), sum) >= 20 & mcols(both)$sig & countOverlaps(both, fullRegions[fullRegions$sigFWER], minoverlap = 20L) == 0])[1:10], function(reg) {
        regPlot(reg, 'Gene query: only gene DE')
    })
}
pdf(file = 'query_gene_plots.pdf', width = 10, height = 7)
query_gene_plots()

## $`401505`
## NULL
## 
## $`1581`
## NULL
## 
## $`26168`
## NULL
## 
## $`55299`
## NULL
## 
## $`4708`
## NULL
## 
## $`2524`
## NULL
## 
## $`112703`
## NULL
## 
## $`100506746`
## NULL
## 
## $`54989`
## NULL
## 
## $`363`
## NULL

dev.off()

## pdf 
##   2

query_gene_plots()

## $`401505`
## NULL
## 
## $`1581`
## NULL
## 
## $`26168`
## NULL
## 
## $`55299`
## NULL
## 
## $`4708`
## NULL
## 
## $`2524`
## NULL
## 
## $`112703`
## NULL
## 
## $`100506746`
## NULL
## 
## $`54989`
## NULL
## 
## $`363`
## NULL

Reproducibility

Date the report was generated.

## [1] "2015-03-30 21:24:43 EDT"

Wallclock time spent generating the report.

## Time difference of 29.945 mins

R session information.

## Session info-----------------------------------------------------------------------------------------------------------

##  setting  value                                             
##  version  R Under development (unstable) (2014-11-01 r66923)
##  system   x86_64, darwin10.8.0                              
##  ui       X11                                               
##  language (EN)                                              
##  collate  en_US.UTF-8                                       
##  tz       America/New_York

## Packages---------------------------------------------------------------------------------------------------------------

##  package                           * version     date       source                                    
##  acepack                             1.3.3.3     2013-05-03 CRAN (R 3.2.0)                            
##  annotate                            1.45.4      2015-03-21 Bioconductor                              
##  AnnotationDbi                     * 1.29.20     2015-03-19 Bioconductor                              
##  Biobase                           * 2.27.3      2015-03-27 Bioconductor                              
##  BiocGenerics                      * 0.13.10     2015-03-27 Bioconductor                              
##  BiocParallel                        1.1.21      2015-03-24 Bioconductor                              
##  biomaRt                             2.23.5      2014-11-22 Bioconductor                              
##  Biostrings                          2.35.12     2015-03-26 Bioconductor                              
##  biovizBase                          1.15.3      2015-03-30 Bioconductor                              
##  bitops                              1.0.6       2013-08-17 CRAN (R 3.2.0)                            
##  BSgenome                            1.35.20     2015-03-27 Bioconductor                              
##  bumphunter                          1.7.6       2015-03-13 Github (lcolladotor/bumphunter@37d10e7)   
##  cluster                             2.0.1       2015-01-31 CRAN (R 3.2.0)                            
##  codetools                           0.2.11      2015-03-10 CRAN (R 3.2.0)                            
##  colorout                          * 1.0.2       2014-11-03 local                                     
##  colorspace                          1.2.6       2015-03-11 CRAN (R 3.2.0)                            
##  DBI                                 0.3.1       2014-09-24 CRAN (R 3.2.0)                            
##  derfinder                         * 1.1.17      2015-03-14 Github (lcolladotor/derfinder@3532e0c)    
##  derfinderHelper                   * 1.1.6       2015-03-15 Bioconductor                              
##  derfinderPlot                     * 1.1.6       2015-03-14 Github (lcolladotor/derfinderPlot@1319754)
##  DESeq2                            * 1.7.45      2015-03-25 Bioconductor                              
##  devtools                            1.6.1       2014-10-07 CRAN (R 3.2.0)                            
##  dichromat                           2.0.0       2013-01-24 CRAN (R 3.2.0)                            
##  digest                              0.6.8       2014-12-31 CRAN (R 3.2.0)                            
##  doRNG                               1.6         2014-03-07 CRAN (R 3.2.0)                            
##  edgeR                             * 3.9.14      2015-03-27 Bioconductor                              
##  evaluate                            0.5.5       2014-04-29 CRAN (R 3.2.0)                            
##  foreach                             1.4.2       2014-04-11 CRAN (R 3.2.0)                            
##  foreign                             0.8.63      2015-02-20 CRAN (R 3.2.0)                            
##  formatR                             1.0         2014-08-25 CRAN (R 3.2.0)                            
##  Formula                             1.2.0       2015-01-20 CRAN (R 3.2.0)                            
##  futile.logger                       1.4         2015-03-21 CRAN (R 3.2.0)                            
##  futile.options                      1.0.0       2010-04-06 CRAN (R 3.2.0)                            
##  genefilter                          1.49.2      2014-10-21 Bioconductor                              
##  geneplotter                         1.45.0      2014-10-14 Bioconductor                              
##  GenomeInfoDb                      * 1.3.16      2015-03-27 Bioconductor                              
##  GenomicAlignments                   1.3.32      2015-03-18 Bioconductor                              
##  GenomicFeatures                   * 1.19.36     2015-03-30 Bioconductor                              
##  GenomicFiles                        1.3.14      2015-03-07 Bioconductor                              
##  GenomicRanges                     * 1.19.48     2015-03-27 Bioconductor                              
##  GGally                              0.4.8       2014-08-26 CRAN (R 3.2.0)                            
##  ggbio                               1.15.2      2015-03-24 Bioconductor                              
##  ggplot2                             1.0.0       2014-05-21 CRAN (R 3.2.0)                            
##  graph                               1.45.2      2015-03-01 Bioconductor                              
##  gridExtra                           0.9.1       2012-08-09 CRAN (R 3.2.0)                            
##  gtable                              0.1.2       2012-12-05 CRAN (R 3.2.0)                            
##  Hmisc                               3.14.5      2014-09-12 CRAN (R 3.2.0)                            
##  htmltools                           0.2.6       2014-09-08 CRAN (R 3.2.0)                            
##  IRanges                           * 2.1.43      2015-03-07 Bioconductor                              
##  iterators                           1.0.7       2014-04-11 CRAN (R 3.2.0)                            
##  knitr                               1.7         2014-10-13 CRAN (R 3.2.0)                            
##  knitrBootstrap                      1.0.0       2014-11-03 Github (jimhester/knitrBootstrap@76c41f0) 
##  lambda.r                            1.1.7       2015-03-20 CRAN (R 3.2.0)                            
##  lattice                             0.20.30     2015-02-22 CRAN (R 3.2.0)                            
##  latticeExtra                        0.6.26      2013-08-15 CRAN (R 3.2.0)                            
##  limma                             * 3.23.11     2015-03-15 Bioconductor                              
##  locfit                              1.5.9.1     2013-04-20 CRAN (R 3.2.0)                            
##  markdown                            0.7.4       2014-08-24 CRAN (R 3.2.0)                            
##  MASS                                7.3.40      2015-03-21 CRAN (R 3.2.0)                            
##  Matrix                              1.1.5.1     2015-03-23 CRAN (R 3.2.0)                            
##  matrixStats                         0.14.0      2015-02-14 CRAN (R 3.2.0)                            
##  mime                                0.3         2015-03-29 CRAN (R 3.2.0)                            
##  munsell                             0.4.2       2013-07-11 CRAN (R 3.2.0)                            
##  nnet                                7.3.9       2015-02-11 CRAN (R 3.2.0)                            
##  OrganismDbi                         1.9.15      2015-03-30 Bioconductor                              
##  pkgmaker                            0.22        2014-05-14 CRAN (R 3.2.0)                            
##  plyr                                1.8.1       2014-02-26 CRAN (R 3.2.0)                            
##  proto                               0.3.10      2012-12-22 CRAN (R 3.2.0)                            
##  qvalue                              1.99.0      2015-03-30 Bioconductor                              
##  RBGL                                1.43.0      2014-10-14 Bioconductor                              
##  RColorBrewer                        1.1.2       2014-12-07 CRAN (R 3.2.0)                            
##  Rcpp                              * 0.11.5      2015-03-06 CRAN (R 3.2.0)                            
##  RcppArmadillo                     * 0.4.650.1.1 2015-02-26 CRAN (R 3.2.0)                            
##  RCurl                               1.95.4.5    2014-12-28 CRAN (R 3.2.0)                            
##  registry                            0.2         2012-01-24 CRAN (R 3.2.0)                            
##  reshape                             0.8.5       2014-04-23 CRAN (R 3.2.0)                            
##  reshape2                            1.4.1       2014-12-06 CRAN (R 3.2.0)                            
##  rmarkdown                           0.3.3       2014-09-17 CRAN (R 3.2.0)                            
##  rngtools                            1.2.4       2014-03-06 CRAN (R 3.2.0)                            
##  rpart                               4.1.9       2015-02-24 CRAN (R 3.2.0)                            
##  Rsamtools                           1.19.49     2015-03-27 Bioconductor                              
##  RSQLite                             1.0.0       2014-10-25 CRAN (R 3.2.0)                            
##  rstudioapi                          0.2         2014-12-31 CRAN (R 3.2.0)                            
##  rtracklayer                         1.27.10     2015-03-27 Bioconductor                              
##  S4Vectors                         * 0.5.22      2015-03-06 Bioconductor                              
##  scales                              0.2.4       2014-04-22 CRAN (R 3.2.0)                            
##  stringr                             0.6.2       2012-12-06 CRAN (R 3.2.0)                            
##  survival                            2.38.1      2015-02-24 CRAN (R 3.2.0)                            
##  TxDb.Hsapiens.UCSC.hg19.knownGene * 3.0.0       2014-09-26 Bioconductor                              
##  VariantAnnotation                   1.13.46     2015-03-26 Bioconductor                              
##  XML                                 3.98.1.1    2013-06-20 CRAN (R 3.2.0)                            
##  xtable                              1.7.4       2014-09-12 CRAN (R 3.2.0)                            
##  XVector                           * 0.7.4       2015-02-08 Bioconductor                              
##  yaml                                2.1.13      2014-06-12 CRAN (R 3.2.0)                            
##  zlibbioc                            1.13.3      2015-03-23 Bioconductor