Compare vs PNAS at gene level

Gene-level analysis

This section has the code for running edgeR-robust and DESeq2 on the simulation data set using the known genes as features.

This first code chunk loads the necessary data.

## Track time spent on making the report
startTime <- Sys.time()

library('edgeR')

## Loading required package: limma

library('DESeq2')

## Loading required package: S4Vectors

## Loading required package: stats4

## Loading required package: BiocGenerics

## Loading required package: parallel

## 
## Attaching package: 'BiocGenerics'

## The following objects are masked from 'package:parallel':
## 
##     clusterApply, clusterApplyLB, clusterCall, clusterEvalQ,
##     clusterExport, clusterMap, parApply, parCapply, parLapply,
##     parLapplyLB, parRapply, parSapply, parSapplyLB

## The following object is masked from 'package:limma':
## 
##     plotMA

## The following objects are masked from 'package:stats':
## 
##     IQR, mad, xtabs

## The following objects are masked from 'package:base':
## 
##     anyDuplicated, append, as.data.frame, as.vector, cbind,
##     colnames, do.call, duplicated, eval, evalq, Filter, Find, get,
##     grep, grepl, intersect, is.unsorted, lapply, lengths, Map,
##     mapply, match, mget, order, paste, pmax, pmax.int, pmin,
##     pmin.int, Position, rank, rbind, Reduce, rownames, sapply,
##     setdiff, sort, table, tapply, union, unique, unlist, unsplit

## Loading required package: IRanges

## Loading required package: GenomicRanges

## Loading required package: GenomeInfoDb

## Loading required package: SummarizedExperiment

## Loading required package: Biobase

## Welcome to Bioconductor
## 
##     Vignettes contain introductory material; view with
##     'browseVignettes()'. To cite Bioconductor, see
##     'citation("Biobase")', and for packages 'citation("pkgname")'.

## Loading required package: Rcpp

## Loading required package: RcppArmadillo

library('GenomicRanges')
library('TxDb.Hsapiens.UCSC.hg19.knownGene')

## Loading required package: GenomicFeatures

## Loading required package: AnnotationDbi

## Load data
load("../counts-gene/summOv.Rdata")
load("../derAnalysis/run3-v1.0.10/groupInfo.Rdata")
load("../derAnalysis/run3-v1.0.10/colsubset.Rdata")

## GenomicState object
if(file.exists('/home/epi/ajaffe/Lieber/Projects/RNAseq/derannotator/rdas/GenomicState.Hsapiens.UCSC.hg19.knownGene.rda')) {
    load('/home/epi/ajaffe/Lieber/Projects/RNAseq/derannotator/rdas/GenomicState.Hsapiens.UCSC.hg19.knownGene.rda')
} else if(file.exists('../../GenomicState.Hsapiens.UCSC.hg19.knownGene.rda')) {
    load('../../GenomicState.Hsapiens.UCSC.hg19.knownGene.rda')
} else {
    stop('Missing UCSC hg19 genomic state object')
}

## Find genes
genes <- exonsBy(TxDb.Hsapiens.UCSC.hg19.knownGene, by = 'gene')

## Round matrix and remove genes with 0s
counts <- assay(summOv)[, colsubset]
nonzero <- sapply(rowSums(counts), function(x) {x > 0})

DESeq2

The following code performs the DESeq2 analysis. Code is based on edgeR_Robust supplementary code. The main change is that it has been modified for the multi-group scenario.

## Round matrix and specify design
dse <- DESeqDataSetFromMatrix(counts[nonzero, ], data.frame(group = groupInfo), ~ group)

## Perform DE analysis
system.time( dse <- DESeq(dse, test = 'LRT', reduced = ~ 1) )

## estimating size factors

## estimating dispersions

## gene-wise dispersion estimates

## mean-dispersion relationship

## final dispersion estimates

## fitting model and testing

## -- replacing outliers and refitting for 70 genes
## -- DESeq argument 'minReplicatesForReplace' = 7 
## -- original counts are preserved in counts(dds)

## estimating dispersions

## fitting model and testing

##    user  system elapsed 
##  30.693   1.018  31.903

## Extract results
deseq <- genes[nonzero]
mcols(deseq) <- cbind(mcols(deseq), results(dse))

## Which are significant?
mcols(deseq)$sig <- mcols(deseq)$padj < 0.05
mcols(deseq)$sig[is.na(mcols(deseq)$sig)] <- FALSE

## Save results
save(deseq, file = 'deseq-gene.Rdata')

## Adjust by Holm
deseq_holm <- deseq
mcols(deseq_holm)$sig <- p.adjust(mcols(deseq_holm)$pvalue, 'holm') < 0.05

edgeR-robust

The following code performs the DESeq2 analysis. Code is based on edgeR_Robust supplementary code. The main change is that it has been modified for the multi-group scenario.

## Determine design matrix
design <- model.matrix(~ groupInfo)

## Perform DE analysis
d <- DGEList(counts = counts[nonzero, ], group = groupInfo)
d <- calcNormFactors(d)
system.time(dw <- estimateGLMRobustDisp(d, design = design, prior.df = 10, maxit = 6))

##    user  system elapsed 
##  75.526   6.136  82.226

fw <- glmFit(dw, design = design, coef = 2:3)
lrw <- glmLRT(fw, coef = 2:3)

## Extract results
edger <- genes[nonzero]
mcols(edger) <- cbind(mcols(edger), DataFrame(lrw$table))
mcols(edger)$pvalue <-  lrw$table$PValue
mcols(edger)$padj <- p.adjust(lrw$table$PValue, 'BH')

## Which are significant?
mcols(edger)$sig <- mcols(edger)$padj < 0.05
mcols(edger)$sig[is.na(mcols(edger)$sig)] <- FALSE

## Save results
save(edger, file = 'edger-gene.Rdata')

## Adjust by Holm
edger_holm <- edger
mcols(edger_holm)$sig <- p.adjust(mcols(edger_holm)$pvalue, 'holm') < 0.05

Overlap

## Load data
load('../derAnalysis/run3-v1.0.10/fullRegions.Rdata')

## Some formatting and subsets
names(fullRegions) <- seq_len(length(fullRegions))
fullRegions$sigFWER <- as.logical(fullRegions$significantFWER)
fullRegs20 <- fullRegions[width(fullRegions) >= 20]

## Overlap table for all 4 cases
ov_table <- function(ders, counts, query = 'der', minov = 0) {
    if(query == 'der') {
        if(minov == 0) {
            res <- addmargins(table('Significant DER (FWER)' = ders$sigFWER, 'Overlaps sig DE gene' = countOverlaps(ders, counts[mcols(counts)$sig]) > 0))
        } else {
            res <- addmargins(table(ders$sigFWER, countOverlaps(ders, counts[mcols(counts)$sig], minoverlap = minov) > 0, dnn = c('Significant DER (FWER)', paste0('Overlaps sig DE gene (min ', minov, 'bp)'))))
        }
    } else if (query == 'counts') {
        if(minov == 0) {
            res <- addmargins(table('Significant DE gene' = mcols(counts)$sig, 'Overlaps sig DER (FWER)' = countOverlaps(counts, ders[ders$sigFWER]) > 0))
        } else {
            res <- addmargins(table(mcols(counts)$sig[sapply(width(counts), sum) >= minov], countOverlaps(counts[sapply(width(counts), sum) >= minov], ders[ders$sigFWER], minoverlap = minov) > 0, dnn = c('Significant DE gene', paste0('Overlaps sig DER (FWER, min ', minov, 'bp)'))))
        }
    }
    return(res)
}

## Explore mistmatched cases for DERs vs genes direction
explore_ov <- function(ders, counts, case = "FALSE-TRUE", minov = 0L) {
    if(case == 'FALSE-TRUE') {
        i <- which(countOverlaps(ders, counts[mcols(counts)$sig], minoverlap = minov) > 0 & !ders$sigFWER)
    } else if (case == 'TRUE-FALSE') {
        i <- which(!countOverlaps(ders, counts[mcols(counts)$sig], minoverlap = minov) > 0 & ders$sigFWER)
    } else{
        stop('invalid case')
    }
    if(length(i) == 0) return("No such cases")
    
    if(case == 'FALSE-TRUE') {
        res <- list(
            n_overlaps = table(countOverlaps(ders[i], counts[mcols(counts)$sig], minoverlap = minov)),
            width_der = summary(width(ders[i])),
            ders_per_gene_table = table(table(subjectHits(findOverlaps(ders[i], counts[mcols(counts)$sig], minoverlap = minov)))),
            ders_per_gene = sort(table(subjectHits(findOverlaps(ders[i], counts[mcols(counts)$sig], minoverlap = minov)))),
            i = i
        )
    } else {
        res <- list(
            width_der = summary(width(ders[i])),
            distance_nearest_sum = summary(mcols(distanceToNearest(ders[i], unlist(counts), ignore.strand = TRUE))$distance),
            distance_nearest_sig_sum = summary(mcols(distanceToNearest(ders[i], unlist(counts[mcols(counts)$sig]), ignore.strand = TRUE))$distance),
            distance_nearest = distanceToNearest(ders[i], unlist(counts), ignore.strand = TRUE),
            distance_nearest_sig = distanceToNearest(ders[i], unlist(counts[mcols(counts)$sig]), ignore.strand = TRUE),
            i = i
        )
    }
    
    return(res)
}

## Explore mistmatched cases for genes vs DERs direction
explore_ov_counts <- function(ders, counts, case = "FALSE-TRUE", minov = 0L) {
    counts <- counts[sapply(width(counts), sum) >= minov]
    if(case == 'FALSE-TRUE') {
        i <- which(countOverlaps(counts, ders[ders$sigFWER], minoverlap = minov) > 0 & !mcols(counts)$sig)
    } else if (case == 'TRUE-FALSE') {
        i <- which(!countOverlaps(counts, ders[ders$sigFWER], minoverlap = minov) > 0 & mcols(counts)$sig)
    } else{
        stop('invalid case')
    }
    if(length(i) == 0) return("No such cases")
    
    if(case == 'FALSE-TRUE') {
        res <- list(
            n_overlaps = table(countOverlaps(counts[i], ders[ders$sigFWER], minoverlap = minov)),
            width_gene = summary(sapply(width(counts[i]), sum)),
            genes_per_der_table = table(table(subjectHits(findOverlaps(counts[i], ders[ders$sigFWER], minoverlap = minov)))),
            genes_per_der = sort(table(subjectHits(findOverlaps(counts[i], ders[ders$sigFWER], minoverlap = minov)))),
            i = i
        )
    } else {
        res <- list(
            width_gene = summary(sapply(width(counts[i]), sum)),
            distance_nearest_sum = summary(mcols(distanceToNearest(unlist(counts[i]), ders, ignore.strand = TRUE))$distance),
             distance_nearest_sig_sum = summary(mcols(distanceToNearest(unlist(counts[i]), ders[ders$sigFWER], ignore.strand = TRUE))$distance),
            distance_nearest = distanceToNearest(unlist(counts[i]), ders, ignore.strand = TRUE),
            distance_nearest_sig = distanceToNearest(unlist(counts[i]), ders[ders$sigFWER], ignore.strand = TRUE),
            i = i
        )
    }
    
    return(res)
}

noNA <- function(x) {
    x[!is.na(x)]
}

DESeq2

Query: DERs

We can first compare the results by using the DERs as the query and the genes as the subject. The following output shows the comparison using all DERs and exploring the mismatched cases. Then its repeated using the DERs ≥ 20 bp and a minimum overlap of 20bp.

For the mismatched cases of non-significant DERs overlapping a significant gene, we check:

how many genes each DER overlaps,
the width of the DERs
the frequency table of how many DERs overlap the same gene

For the other mismatched case, we check:

the width of the DERs
distance to nearest gene (regardless of gene size)
distance to nearest significant DE gene (ibidem)

## Overlap between DERs and significant DE genes
ov_table(fullRegions, deseq)

##                       Overlaps sig DE gene
## Significant DER (FWER) FALSE  TRUE   Sum
##                  FALSE 13416 18047 31463
##                  TRUE    288   227   515
##                  Sum   13704 18274 31978

## Explore mismatched cases
#noNA(explore_ov(fullRegions, deseq)[1:3])
#noNA(explore_ov(fullRegions, deseq, 'TRUE-FALSE')[1:3])

## Min 20 bp overlap, using only DERs 20 bp long
ov_table(fullRegs20, deseq, minov = 20L)

##                       Overlaps sig DE gene (min 20bp)
## Significant DER (FWER) FALSE TRUE  Sum
##                  FALSE  1030 1248 2278
##                  TRUE    288  227  515
##                  Sum    1318 1475 2793

## Explore mismatched cases, min 20bp overlap
noNA(explore_ov(fullRegs20, deseq, minov = 20L)[1:3])

## $n_overlaps
## 
##    1    2 
## 1241    7 
## 
## $width_der
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   20.00   23.00   29.00   30.07   36.00   62.00 
## 
## $ders_per_gene_table
## 
##   1   2   3   4   5   6   7   8   9  10  21 
## 524 127  52  24  14  10   3   3   1   2   1

noNA(explore_ov(fullRegs20, deseq, 'TRUE-FALSE', minov = 20L)[1:3])

## $width_der
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   29.00   47.00   60.00   67.08   77.00  182.00 
## 
## $distance_nearest_sum
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##       0       0     901   16810   16930 1385000 
## 
## $distance_nearest_sig_sum
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##      54    6172   24420   83570   65550 2651000

## Holm vs BH
addmargins(table('DESeq2 Holm' = mcols(deseq_holm)$sig, 'DESeq2 BH' = mcols(deseq)$sig))

##            DESeq2 BH
## DESeq2 Holm FALSE  TRUE   Sum
##       FALSE 14091  7414 21505
##       TRUE      0   311   311
##       Sum   14091  7725 21816

## Use Holm and min 20 bp ov
ov_table(fullRegs20, deseq_holm, minov = 20L)

##                       Overlaps sig DE gene (min 20bp)
## Significant DER (FWER) FALSE TRUE  Sum
##                  FALSE  2057  221 2278
##                  TRUE    442   73  515
##                  Sum    2499  294 2793

Most of the DERs are shorter than 20bp (91.27 percent), so we'll focus on the longer ones. The majority of the mismatches are from non significant DERs that overlap a significant gene.

As expected, when controlling the FWER instead of the FDR, most of the DE genes are no longer significant. Using FWER-controlled DE genes, most of the DERs 20bp or longer agree with the genes as not being significantly DE.

Query: genes

We can now repeat the comparison using the genes as the query and the DERs as the subject.

For the mismatched cases of non-significant genes overlapping a significant DER, we check:

how many DERs each gene overlaps,
the width of the genes
the frequency table of how many genes overlap the same DER

For the other mismatched case, we check:

the width of the genes
distance to nearest DER (regardless of DER size)
distance to nearest significant DER (ibidem)

## Overlap between genes and significant DERs
#ov_table(fullRegions, deseq, 'counts')

## Explore mismatched cases
#noNA(explore_ov_counts(fullRegions, deseq)[1:3])
#noNA(explore_ov_counts(fullRegions, deseq, 'TRUE-FALSE')[1:3])

## Overlap between genes and significant DERs, min 20 bp
ov_table(fullRegions, deseq, 'counts', 20L)

##                    Overlaps sig DER (FWER, min 20bp)
## Significant DE gene FALSE  TRUE   Sum
##               FALSE 13988   103 14091
##               TRUE   7550   175  7725
##               Sum   21538   278 21816

## Explore mismatched cases
noNA(explore_ov_counts(fullRegions, deseq, minov = 20L)[1:3])

## $n_overlaps
## 
##  1  2  3  4  6 11 
## 73 21  5  2  1  1 
## 
## $width_gene
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     493    1456    3410    4318    7055   13630 
## 
## $genes_per_der_table
## 
##   1  20 
## 135   1

noNA(explore_ov_counts(fullRegions, deseq, 'TRUE-FALSE', minov = 20L)[1:3])

## $width_gene
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##      46    2372    3751    4705    5913  118100 
## 
## $distance_nearest_sum
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##       0    6161   30960  138100  118500 5384000 
## 
## $distance_nearest_sig_sum
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
##       52   757800  2436000  5391000  6405000 48690000

## Now with Holm
ov_table(fullRegions, deseq_holm, 'counts', 20L)

##                    Overlaps sig DER (FWER, min 20bp)
## Significant DE gene FALSE  TRUE   Sum
##               FALSE 21277   228 21505
##               TRUE    261    50   311
##               Sum   21538   278 21816

From these results, we can see that derfinder is more conservative.

edgeR-robust

Query: DERs

Similar comparison using DERs as query and genes as subject with edgeR-robust results.

## Overlap between DERs and significant DE genes
#ov_table(fullRegions, edger)

## Explore mismatched cases
#noNA(explore_ov(fullRegions, edger)[1:3])
#noNA(explore_ov(fullRegions, edger, 'TRUE-FALSE')[1:3])

## Min 20 bp overlap, using only DERs 20 bp long
ov_table(fullRegs20, edger, minov = 20L)

##                       Overlaps sig DE gene (min 20bp)
## Significant DER (FWER) FALSE TRUE  Sum
##                  FALSE   911 1367 2278
##                  TRUE    227  288  515
##                  Sum    1138 1655 2793

## Explore mismatched cases, min 20bp overlap
noNA(explore_ov(fullRegs20, edger, minov = 20L)[1:3])

## $n_overlaps
## 
##    1    2 
## 1354   13 
## 
## $width_der
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   20.00   23.00   29.00   30.19   36.00   62.00 
## 
## $ders_per_gene_table
## 
##   1   2   3   4   5   6   7   8   9  10  21 
## 563 136  61  27  16  12   4   3   1   2   1

noNA(explore_ov(fullRegs20, edger, 'TRUE-FALSE', minov = 20L)[1:3])

## $width_der
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   32.00   46.00   58.00   64.83   74.00  174.00 
## 
## $distance_nearest_sum
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##       0       0    6121   21320   20470 1385000 
## 
## $distance_nearest_sig_sum
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##      69    7492   24510   74750   43900 2089000

## Holm vs BH
addmargins(table('edgeR Holm' = mcols(edger_holm)$sig, 'edger BH' = mcols(edger)$sig))

##           edger BH
## edgeR Holm FALSE  TRUE   Sum
##      FALSE 13397  7887 21284
##      TRUE      0   532   532
##      Sum   13397  8419 21816

## With Holm, 20bp
ov_table(fullRegs20, edger_holm, minov = 20L)

##                       Overlaps sig DE gene (min 20bp)
## Significant DER (FWER) FALSE TRUE  Sum
##                  FALSE  2067  211 2278
##                  TRUE    442   73  515
##                  Sum    2509  284 2793

The results are fairly similar to those from using DESeq2.

Query: genes

Similar comparison using genes as query and DERs as subject with edgeR-robust results.

## Overlap between genes and significant DERs
#ov_table(fullRegions, edger, 'counts')

## Explore mismatched cases
#noNA(explore_ov_counts(fullRegions, edger)[1:3])
#noNA(explore_ov_counts(fullRegions, edger, 'TRUE-FALSE')[1:3])

## Overlap between genes and significant DERs, min 20 bp
ov_table(fullRegions, edger, 'counts', 20L)

##                    Overlaps sig DER (FWER, min 20bp)
## Significant DE gene FALSE  TRUE   Sum
##               FALSE 13325    72 13397
##               TRUE   8213   206  8419
##               Sum   21538   278 21816

## Explore mismatched cases
noNA(explore_ov_counts(fullRegions, edger, minov = 20L)[1:3])

## $n_overlaps
## 
##  1  2  3  4 
## 57 11  2  2 
## 
## $width_gene
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     493    1579    3758    4619    7095   13630 
## 
## $genes_per_der_table
## 
##  1 19 
## 74  1

noNA(explore_ov_counts(fullRegions, edger, 'TRUE-FALSE', minov = 20L)[1:3])

## $width_gene
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##      46    2282    3631    4514    5667  118100 
## 
## $distance_nearest_sum
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##       0    5977   31080  141900  122200 5384000 
## 
## $distance_nearest_sig_sum
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
##       52   737100  2360000  5379000  6422000 48690000

## With Holm, 20 bp
ov_table(fullRegions, edger_holm, 'counts', 20L)

##                    Overlaps sig DER (FWER, min 20bp)
## Significant DE gene FALSE  TRUE   Sum
##               FALSE 21059   225 21284
##               TRUE    479    53   532
##               Sum   21538   278 21816

overall

While the DERs vs genes results are fairly similar between edgeR-robust and DESeq2, as shown below the number of mismatched cases is high compared to the number of cases both counts-based methods agree. This is also true when controlling the FWER to determine significance.

## edgeR vs DESeq2
addmargins(table('edgeR-robust (FDR)' = mcols(edger)$sig, 'DESeq2 (FDR)' = mcols(deseq)$sig))

##                   DESeq2 (FDR)
## edgeR-robust (FDR) FALSE  TRUE   Sum
##              FALSE 12897   500 13397
##              TRUE   1194  7225  8419
##              Sum   14091  7725 21816

## Control FWER
addmargins(table('edgeR-robust (FWER)' = mcols(edger_holm)$sig, 'DESeq2 (FWER)' = mcols(deseq_holm)$sig))

##                    DESeq2 (FWER)
## edgeR-robust (FWER) FALSE  TRUE   Sum
##               FALSE 21234    50 21284
##               TRUE    271   261   532
##               Sum   21505   311 21816

## Only sig if both edgeR and DEseq2 say it is
both <- deseq
mcols(both)$sig <- mcols(both)$sig & mcols(edger)$sig

## Same, for holm
both_holm <- deseq_holm
mcols(both_holm)$sig <- mcols(both_holm)$sig & mcols(edger_holm)$sig

We can consider an gene to be DE only if both edgeR-robust and DESeq2 find that its significantly DE. The next sections use this information.

Query: DERs

## Overlap between DERs and significant DE genes
#ov_table(fullRegions, both)

## Explore mismatched cases
#noNA(explore_ov(fullRegions, both)[1:3])
#noNA(explore_ov(fullRegions, both, 'TRUE-FALSE')[1:3])

## Min 20 bp overlap, using only DERs 20 bp long
ov_table(fullRegs20, both, minov = 20L)

##                       Overlaps sig DE gene (min 20bp)
## Significant DER (FWER) FALSE TRUE  Sum
##                  FALSE  1066 1212 2278
##                  TRUE    289  226  515
##                  Sum    1355 1438 2793

## Explore mismatched cases, min 20bp overlap
noNA(explore_ov(fullRegs20, both, minov = 20L)[1:3])

## $n_overlaps
## 
##    1    2 
## 1205    7 
## 
## $width_der
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   20.00   23.00   29.00   30.16   36.00   62.00 
## 
## $ders_per_gene_table
## 
##   1   2   3   4   5   6   7   8   9  10  21 
## 510 121  52  23  14   9   3   3   1   2   1

noNA(explore_ov(fullRegs20, both, 'TRUE-FALSE', minov = 20L)[1:3])

## $width_der
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   29.00   47.00   60.00   67.16   77.00  182.00 
## 
## $distance_nearest_sum
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##       0       0     887   16750   16900 1385000 
## 
## $distance_nearest_sig_sum
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##      54    7309   27620   89360   73330 2651000

## Holm vs BH
addmargins(table('Both Holm' = mcols(both_holm)$sig, 'Both BH' = mcols(both)$sig))

##          Both BH
## Both Holm FALSE  TRUE   Sum
##     FALSE 14591  6964 21555
##     TRUE      0   261   261
##     Sum   14591  7225 21816

## Use Holm and min 20 bp ov
ov_table(fullRegs20, both_holm, minov = 20L)

##                       Overlaps sig DE gene (min 20bp)
## Significant DER (FWER) FALSE TRUE  Sum
##                  FALSE  2109  169 2278
##                  TRUE    454   61  515
##                  Sum    2563  230 2793

The trends observed previously are maintained in this comparison with a reduction of cases where the gene is DE. This is expected due to the non-perfect agreement between DESeq2 and edgeR-robust.

library('TxDb.Hsapiens.UCSC.hg19.knownGene')
library('derfinder')
library('derfinderHelper')
library('derfinderPlot')
load('../derAnalysis/run3-v1.0.10/models.Rdata')
load('../derAnalysis/run3-v1.0.10/chr22/optionsStats.Rdata')
load("../CoverageInfo/fullCov.Rdata")

def.par <- par()
def.par <- def.par[-which(names(def.par) %in% c('cin', 'cra', 'csi', 'cxy', 'din', 'page'))]

library('RColorBrewer')
fstats.colors <- function(x) {
    brewer.pal(4, 'Dark2')[ifelse(x == 0, 1, ifelse(x == 1, 2, 4))]
}

regPlot <- function(region, title) {
    ## Calculate F-stats
    range <- start(region):end(region)
    dat <- fullCov[[as.character(seqnames(region))]][range, colsubset]

    ## Log2 transform
    for(i in seq_len(length(groupInfo))) dat[[i]] <- log2(dat[[i]] + 32) 

    ## Calculate f-stats
    fstats <- as.numeric(fstats.apply(data = dat, mod = models$mod, mod0 = models$mod0))

    ## Find annotation
    annoReg <- annotateRegions(region, GenomicState.Hsapiens.UCSC.hg19.knownGene$fullGenome, verbose = FALSE)
    symbol <- mcols(annoReg$annotationList[[1]])$symbol
    symbol <- as.character(noNA(symbol)[[1]])
    if(length(symbol) > 1) symbol <- symbol[1]
    symbol <- ifelse(is.null(symbol), NA, symbol)
    ## Remove symbol name because it gets chomped on the plot
    mcols(annoReg$annotationList[[1]])$symbol <- NA
    
    par(def.par)

    ## Plot long gene
    plotRegionCoverage(region, getRegionCoverage(fullCov, region, verbose = FALSE), groupInfo, data.frame(name = title, distance = NA, region = symbol), annoReg, verbose = FALSE, ask = FALSE, txdb = TxDb.Hsapiens.UCSC.hg19.knownGene)

    ## Add F-stat track
    fstats.idx <- fstats > optionsStats$cutoffFstatUsed
    fstats.idx[is.na(fstats.idx)] <- -1
    fstats.r <- Rle(fstats.idx)
    
    info <- lapply(seq_len(nrun(fstats.r)), function(i) {
        x <- c(start(fstats.r)[i], end(fstats.r)[i])
        x <- c(x, rev(x))
        y <- rep(runValue(fstats.r)[i], 4)
        col <- fstats.colors(y[1])
        return(list(x = x, y = y, col = col))
    })


    par(fig = c(0, 1, 0.065, 0.125), new = TRUE, xaxt = 'n', oma = c(0, 0, 0, 0), mar = c(0, 4.5, 0, 1.1))
    plot(1, ylab = '', type = 'n', xlab = '', bty = 'n', ylim = c(-1.2, 10), xlim = c(1, length(range)), las = 3, yaxt = 'n', xaxt = 'n')
    axis(2, at = 0, 'F Status', las = 2, tick = TRUE)
    for(pol in info) {
        polygon(x=pol$x, y=pol$y + rep(c(-0.15, 0.15), each = 2), col = pol$col, border = pol$col)
    }
}

sortWidth <- function(regions) {
    regions[order(width(regions), decreasing = TRUE)]
}

We can now make plots to explore some DERs for each of the cases.

query_der_plots <- function() {
    sapply(sortWidth(fullRegs20[countOverlaps(fullRegs20, both[mcols(both)$sig], minoverlap = 20L) > 0 & fullRegs20$sigFWER])[1:10], function(reg) {
        regPlot(reg, 'DER query: DE agreement')
    })

    sapply(sortWidth(fullRegs20[countOverlaps(fullRegs20, both[mcols(both)$sig], minoverlap = 20L) == 0 & !fullRegs20$sigFWER])[1:10],  function(reg) {
        regPlot(reg, 'DER query: not DE agreement')
    })

    sapply(sortWidth(fullRegs20[countOverlaps(fullRegs20, both[mcols(both)$sig], minoverlap = 20L) == 0 & fullRegs20$sigFWER])[1:10], function(reg) {
        regPlot(reg, 'DER query: only gene not DE')
    })

    sapply(sortWidth(fullRegs20[countOverlaps(fullRegs20, both[mcols(both)$sig], minoverlap = 20L) > 0 & !fullRegs20$sigFWER])[1:10], function(reg) {
        regPlot(reg, 'DER query: only gene DE')
    })
}
pdf(file = 'query_der_plots_gene.pdf', width = 10, height = 7)
query_der_plots()

## $`535`
## NULL
## 
## $`540`
## NULL
## 
## $`625`
## NULL
## 
## $`594`
## NULL
## 
## $`589`
## NULL
## 
## $`574`
## NULL
## 
## $`610`
## NULL
## 
## $`616`
## NULL
## 
## $`626`
## NULL
## 
## $`533`
## NULL

dev.off()

## quartz_off_screen 
##                 2

query_der_plots()

## $`535`
## NULL
## 
## $`540`
## NULL
## 
## $`625`
## NULL
## 
## $`594`
## NULL
## 
## $`589`
## NULL
## 
## $`574`
## NULL
## 
## $`610`
## NULL
## 
## $`616`
## NULL
## 
## $`626`
## NULL
## 
## $`533`
## NULL

Query: genes

As was shown with either DESeq2 or edgeR-robust results, derfinder is more conservative than the counts-based methods.

## Overlap between genes and significant DERs, min 20 bp
ov_table(fullRegions, both, 'counts', 20L)

##                    Overlaps sig DER (FWER, min 20bp)
## Significant DE gene FALSE  TRUE   Sum
##               FALSE 14487   104 14591
##               TRUE   7051   174  7225
##               Sum   21538   278 21816

## Explore mismatched cases
noNA(explore_ov_counts(fullRegions, both, minov = 20L)[1:3])

## $n_overlaps
## 
##  1  2  3  4  6 11 
## 74 21  5  2  1  1 
## 
## $width_gene
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     493    1462    3452    4333    7050   13630 
## 
## $genes_per_der_table
## 
##   1  20 
## 136   1

noNA(explore_ov_counts(fullRegions, both, 'TRUE-FALSE', minov = 20L)[1:3])

## $width_gene
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##      46    2348    3702    4627    5802  118100 
## 
## $distance_nearest_sum
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##       0    5687   29570  137300  115400 5384000 
## 
## $distance_nearest_sig_sum
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
##       52   757600  2441000  5455000  6478000 48690000

## With Holm, 20 bp
ov_table(fullRegions, both_holm, 'counts', 20L)

##                    Overlaps sig DER (FWER, min 20bp)
## Significant DE gene FALSE  TRUE   Sum
##               FALSE 21321   234 21555
##               TRUE    217    44   261
##               Sum   21538   278 21816

We can now visually explore some genes (maximum 10kb long) for each of the four cases.

max_kb <- function(x, limit = 1e4) {
    x[width(x) <= limit]
}

selectRegions <- function(x) {
    max_kb(sortWidth(unlist(range(x))))
}

query_gene_plots <- function() {
    sapply(selectRegions(both[sapply(width(both), sum) >= 20 & mcols(both)$sig & countOverlaps(both, fullRegions[fullRegions$sigFWER], minoverlap = 20L) > 0])[1:10], function(reg) {
        regPlot(reg, 'Gene query: DE agreement')
    })

    sapply(selectRegions(both[sapply(width(both), sum) >= 20 & !mcols(both)$sig & countOverlaps(both, fullRegions[fullRegions$sigFWER], minoverlap = 20L) == 0])[1:10], function(reg) {
        regPlot(reg, 'Gene query: not DE agreement')
    })

    sapply(selectRegions(both[sapply(width(both), sum) >= 20 & !mcols(both)$sig & countOverlaps(both, fullRegions[fullRegions$sigFWER], minoverlap = 20L) > 0])[1:10], function(reg) {
        regPlot(reg, 'Gene query: only gene not DE')
    })

    sapply(selectRegions(both[sapply(width(both), sum) >= 20 & mcols(both)$sig & countOverlaps(both, fullRegions[fullRegions$sigFWER], minoverlap = 20L) == 0])[1:10], function(reg) {
        regPlot(reg, 'Gene query: only gene DE')
    })
}
pdf(file = 'query_gene_plots.pdf', width = 10, height = 7)
query_gene_plots()

## $`401505`
## NULL
## 
## $`1581`
## NULL
## 
## $`26168`
## NULL
## 
## $`55299`
## NULL
## 
## $`4708`
## NULL
## 
## $`2524`
## NULL
## 
## $`112703`
## NULL
## 
## $`100506746`
## NULL
## 
## $`54989`
## NULL
## 
## $`363`
## NULL

dev.off()

## quartz_off_screen 
##                 2

query_gene_plots()

## $`401505`
## NULL
## 
## $`1581`
## NULL
## 
## $`26168`
## NULL
## 
## $`55299`
## NULL
## 
## $`4708`
## NULL
## 
## $`2524`
## NULL
## 
## $`112703`
## NULL
## 
## $`100506746`
## NULL
## 
## $`54989`
## NULL
## 
## $`363`
## NULL

## Make figures for the paper
pdf(file = 'figure4.pdf', width = 10, height = 7)
sapply(selectRegions(both[sapply(width(both), sum) >= 20 & mcols(both)$sig & countOverlaps(both, fullRegions[fullRegions$sigFWER], minoverlap = 20L) > 0])[8], function(reg) {
    regPlot(reg, 'Gene query: DE agreement')
})

## $`2987`
## NULL

dev.off()

## quartz_off_screen 
##                 2

pdf(file = 'figure5.pdf', width = 10, height = 7)
sapply(selectRegions(both[sapply(width(both), sum) >= 20 & !mcols(both)$sig & countOverlaps(both, fullRegions[fullRegions$sigFWER], minoverlap = 20L) == 0])[7], function(reg) {
    regPlot(reg, 'Gene query: not DE agreement')
})

## $`84365`
## NULL

dev.off()

## quartz_off_screen 
##                 2

pdf(file = 'figure6.pdf', width = 10, height = 7)
sapply(selectRegions(both[sapply(width(both), sum) >= 20 & !mcols(both)$sig & countOverlaps(both, fullRegions[fullRegions$sigFWER], minoverlap = 20L) > 0])[7], function(reg) {
    regPlot(reg, 'Gene query: only gene not DE')
})

## $`6122`
## NULL

dev.off()

## quartz_off_screen 
##                 2

pdf(file = 'figure7.pdf', width = 10, height = 7)
sapply(selectRegions(both[sapply(width(both), sum) >= 20 & mcols(both)$sig & countOverlaps(both, fullRegions[fullRegions$sigFWER], minoverlap = 20L) == 0])[2], function(reg) {
    regPlot(reg, 'Gene query: only gene DE')
})

## $`1581`
## NULL

dev.off()

## quartz_off_screen 
##                 2

Reproducibility

Date the report was generated.

## [1] "2016-02-20 15:50:54 EST"

Wallclock time spent generating the report.

## Time difference of 25.974 mins

R session information.

## Session info -----------------------------------------------------------------------------------------------------------

##  setting  value                       
##  version  R version 3.2.2 (2015-08-14)
##  system   x86_64, darwin13.4.0        
##  ui       X11                         
##  language (EN)                        
##  collate  en_US.UTF-8                 
##  tz       America/New_York            
##  date     2016-02-20

## Packages ---------------------------------------------------------------------------------------------------------------

##  package                           * version     date       source                                   
##  acepack                             1.3-3.3     2014-11-24 CRAN (R 3.2.0)                           
##  annotate                            1.48.0      2015-10-14 Bioconductor                             
##  AnnotationDbi                     * 1.32.3      2015-12-24 Bioconductor                             
##  Biobase                           * 2.30.0      2015-10-14 Bioconductor                             
##  BiocGenerics                      * 0.16.1      2015-11-06 Bioconductor                             
##  BiocInstaller                       1.20.1      2015-11-18 Bioconductor                             
##  BiocParallel                        1.4.3       2015-12-16 Bioconductor                             
##  biomaRt                             2.26.1      2015-11-23 Bioconductor                             
##  Biostrings                          2.38.4      2016-02-09 Bioconductor                             
##  biovizBase                          1.18.0      2015-10-14 Bioconductor                             
##  bitops                              1.0-6       2013-08-17 CRAN (R 3.2.0)                           
##  BSgenome                            1.38.0      2015-10-14 Bioconductor                             
##  bumphunter                          1.10.0      2015-10-14 Bioconductor                             
##  cluster                             2.0.3       2015-07-21 CRAN (R 3.2.2)                           
##  codetools                           0.2-14      2015-07-15 CRAN (R 3.2.2)                           
##  colorspace                          1.2-6       2015-03-11 CRAN (R 3.2.0)                           
##  DBI                                 0.3.1       2014-09-24 CRAN (R 3.2.0)                           
##  derfinder                         * 1.5.19      2015-12-15 Bioconductor                             
##  derfinderHelper                   * 1.4.1       2015-11-03 Bioconductor                             
##  derfinderPlot                     * 1.5.4       2016-02-20 Bioconductor                             
##  DESeq2                            * 1.10.1      2015-12-22 Bioconductor                             
##  devtools                            1.10.0      2016-01-23 CRAN (R 3.2.3)                           
##  dichromat                           2.0-0       2013-01-24 CRAN (R 3.2.0)                           
##  digest                              0.6.9       2016-01-08 CRAN (R 3.2.3)                           
##  doRNG                               1.6         2014-03-07 CRAN (R 3.2.0)                           
##  edgeR                             * 3.12.0      2015-10-14 Bioconductor                             
##  evaluate                            0.8         2015-09-18 CRAN (R 3.2.0)                           
##  foreach                             1.4.3       2015-10-13 CRAN (R 3.2.0)                           
##  foreign                             0.8-66      2015-08-19 CRAN (R 3.2.0)                           
##  formatR                             1.2.1       2015-09-18 CRAN (R 3.2.0)                           
##  Formula                             1.2-1       2015-04-07 CRAN (R 3.2.0)                           
##  futile.logger                       1.4.1       2015-04-20 CRAN (R 3.2.0)                           
##  futile.options                      1.0.0       2010-04-06 CRAN (R 3.2.0)                           
##  genefilter                          1.52.1      2016-01-28 Bioconductor                             
##  geneplotter                         1.48.0      2015-10-14 Bioconductor                             
##  GenomeInfoDb                      * 1.6.3       2016-01-26 Bioconductor                             
##  GenomicAlignments                   1.6.3       2016-01-06 Bioconductor                             
##  GenomicFeatures                   * 1.22.13     2016-02-11 Bioconductor                             
##  GenomicFiles                        1.6.2       2015-12-30 Bioconductor                             
##  GenomicRanges                     * 1.22.4      2016-01-30 Bioconductor                             
##  GGally                              1.0.1       2016-01-14 CRAN (R 3.2.3)                           
##  ggbio                               1.18.5      2016-02-13 Bioconductor                             
##  ggplot2                             2.0.0       2015-12-18 CRAN (R 3.2.3)                           
##  graph                               1.48.0      2015-10-14 Bioconductor                             
##  gridExtra                           2.0.0       2015-07-14 CRAN (R 3.2.0)                           
##  gtable                              0.1.2       2012-12-05 CRAN (R 3.2.0)                           
##  Hmisc                               3.17-1      2015-12-18 CRAN (R 3.2.3)                           
##  htmltools                           0.3         2015-12-29 CRAN (R 3.2.3)                           
##  IRanges                           * 2.4.7       2016-02-09 Bioconductor                             
##  iterators                           1.0.8       2015-10-13 CRAN (R 3.2.0)                           
##  knitr                               1.12.3      2016-01-22 CRAN (R 3.2.3)                           
##  knitrBootstrap                      1.0.0       2015-05-19 Github (jimhester/knitrBootstrap@76c41f0)
##  lambda.r                            1.1.7       2015-03-20 CRAN (R 3.2.0)                           
##  lattice                             0.20-33     2015-07-14 CRAN (R 3.2.2)                           
##  latticeExtra                        0.6-28      2016-02-09 CRAN (R 3.2.3)                           
##  limma                             * 3.26.8      2016-02-12 Bioconductor                             
##  locfit                              1.5-9.1     2013-04-20 CRAN (R 3.2.0)                           
##  magrittr                            1.5         2014-11-22 CRAN (R 3.2.0)                           
##  markdown                            0.7.7       2015-04-22 CRAN (R 3.2.0)                           
##  Matrix                              1.2-3       2015-11-28 CRAN (R 3.2.2)                           
##  matrixStats                         0.50.1      2015-12-15 CRAN (R 3.2.3)                           
##  memoise                             1.0.0       2016-01-29 CRAN (R 3.2.3)                           
##  mime                                0.4         2015-09-03 CRAN (R 3.2.0)                           
##  munsell                             0.4.3       2016-02-13 CRAN (R 3.2.3)                           
##  nnet                                7.3-12      2016-02-02 CRAN (R 3.2.3)                           
##  OrganismDbi                         1.12.1      2015-12-23 Bioconductor                             
##  pkgmaker                            0.22        2014-05-14 CRAN (R 3.2.0)                           
##  plyr                                1.8.3       2015-06-12 CRAN (R 3.2.1)                           
##  qvalue                              2.2.2       2016-01-08 Bioconductor                             
##  RBGL                                1.46.0      2015-10-14 Bioconductor                             
##  RColorBrewer                      * 1.1-2       2014-12-07 CRAN (R 3.2.0)                           
##  Rcpp                              * 0.12.3      2016-01-10 CRAN (R 3.2.3)                           
##  RcppArmadillo                     * 0.6.500.4.0 2016-01-27 CRAN (R 3.2.3)                           
##  RCurl                               1.95-4.7    2015-06-30 CRAN (R 3.2.1)                           
##  registry                            0.3         2015-07-08 CRAN (R 3.2.1)                           
##  reshape                             0.8.5       2014-04-23 CRAN (R 3.2.0)                           
##  reshape2                            1.4.1       2014-12-06 CRAN (R 3.2.0)                           
##  rmarkdown                         * 0.9.2       2016-01-01 CRAN (R 3.2.3)                           
##  rngtools                            1.2.4       2014-03-06 CRAN (R 3.2.0)                           
##  rpart                               4.1-10      2015-06-29 CRAN (R 3.2.2)                           
##  Rsamtools                           1.22.0      2015-10-14 Bioconductor                             
##  RSQLite                             1.0.0       2014-10-25 CRAN (R 3.2.0)                           
##  rtracklayer                         1.30.2      2016-02-06 Bioconductor                             
##  S4Vectors                         * 0.8.11      2016-01-29 Bioconductor                             
##  scales                              0.3.0       2015-08-25 CRAN (R 3.2.0)                           
##  stringi                             1.0-1       2015-10-22 CRAN (R 3.2.0)                           
##  stringr                             1.0.0       2015-04-30 CRAN (R 3.2.0)                           
##  SummarizedExperiment              * 1.0.2       2016-01-01 Bioconductor                             
##  survival                            2.38-3      2015-07-02 CRAN (R 3.2.2)                           
##  TxDb.Hsapiens.UCSC.hg19.knownGene * 3.2.2       2015-10-15 Bioconductor                             
##  VariantAnnotation                   1.16.4      2015-12-09 Bioconductor                             
##  XML                                 3.98-1.3    2015-06-30 CRAN (R 3.2.0)                           
##  xtable                              1.8-2       2016-02-05 CRAN (R 3.2.3)                           
##  XVector                             0.10.0      2015-10-14 Bioconductor                             
##  yaml                                2.1.13      2014-06-12 CRAN (R 3.2.0)                           
##  zlibbioc                            1.16.0      2015-10-14 Bioconductor