Here we evaluate the performance of the different pipelines we used to analyze the simulated data. Evaluating the performance is tricky and depends on how you define the reference set. Here we show the results by using all exons and exons that overlap only 1 transcript. The second set removes ambiguity in determining the truth and is thus the most useful. Other possible reference sets can be considered and you can find the results in previous versions of this document. We removed them from the final version to avoid confusion.

1 Setup

First, we load the required libraries.

## Load required libraries
library('GenomicRanges')
library('TxDb.Hsapiens.UCSC.hg19.knownGene')
library('knitr')
library('devtools')
library('ggplot2')

1.1 Load data

Next we load the required data from the simulation setup as well as the statistical results from the different pipelines. We then format the results in a way that we can easily access them later on.

## Load simulation information
load(file.path('..', 'generateReads', 'simulation_info.Rdata'))

## Load transcripts used
txdb <- keepSeqlevels(TxDb.Hsapiens.UCSC.hg19.knownGene, 'chr17')
txinfo <- select(txdb, keys = chosen$ucsckg_id, columns = columns(txdb), keytype = 'TXNAME')

## 'select()' returned 1:many mapping between keys and columns

## Build GRangesList with exons grouped by transcript
tx <- split(GRanges(seqnames = txinfo$EXONCHROM, IRanges(start = txinfo$EXONSTART, end = txinfo$EXONEND), strand = txinfo$EXONSTRAND), txinfo$TXNAME)
tx <- tx[match(chosen$ucsckg_id, names(tx))]


if(!identical(nrow(chosen), length(tx))) {
    message(paste(Sys.time(), 'Subsetting list of transcripts to those that were used when simulating the reads'))
    tx <- tx[names(tx) %in% chosen$gene_id]
    stopifnot(unique(sum(width(tx)) - chosen$width) == 0)
}

## Load stat results
inputs <- c('featureCounts', 'railMatrix', 'regionMatrix', 'ballgown')
reps <- 1:3
softwares <- c('DESeq2', 'edgeR', 'limma')
statuses <- c('complete', 'incomplete')

## Function for loading results from deseq2-edger folder
loadDE <- function(software, file) {
    load(file.path('..', 'deseq2-edger', paste0('stats-', file, '-', software, '.Rdata')))
    if(software == 'DESeq2') {
        DEres <- deseq
    } else if (software == 'edgeR') {
        DEres <- edger
    } else if (software == 'limma') {
        DEres <- limma
    }
    return(DEres)
}

## Function for loading ballgown results
loadBG <- function(level, file) {
    load(file.path('..', 'ballgown', paste0('bgres-', level, '-', file, '.Rdata')))
    return(bgres)
}

## Actually load the stats results
stats <- lapply(inputs, function(input) {
    inputres <- lapply(reps, function(rep) {
        if(input %in% c('featureCounts', 'ballgown')) {
            statusres <- lapply(statuses, function(status) {
                if(input == 'ballgown') {
                    res <- lapply(c('trans', 'exon'), loadBG, file = 
                        paste0('R', rep, ifelse(status == 'complete', '-comp', '-inc')))
                    names(res) <- c('trans', 'exon')
                } else if (input == 'featureCounts') {
                    res <- lapply(softwares, loadDE, file = paste0('featureCounts-R', rep, 
                        ifelse(status == 'complete', '-comp', '-inc')))
                    names(res) <- softwares
                }
                return(res)      
            })
            names(statusres) <- statuses
            return(statusres)
        } else {
            softres <- lapply(softwares, loadDE, file = paste0(input, '-R', rep))
            names(softres) <- softwares
            return(softres)
        }
    })
    names(inputres) <- reps
    return(inputres)
})
names(stats) <- inputs

## Group the stats results into a list without nested levels
stats_GR <- lapply(inputs, function(input) {
    if(input %in% c('featureCounts', 'ballgown')) {
        res <- lapply(stats[[input]], unlist)
    } else {
        res <- unlist(stats[[input]])
    }
    res <- unlist(res)
    return(res)
})
names(stats_GR) <- inputs
stats_GR <- unlist(stats_GR)

1.2 Evaluation functions

The following code defines the functions we will use to evaluate the statistical results.

## count_comp compares the information at hand versus the reference set
count_comp <- function(info, rep = 1, type = 'padj', reference, cut = 0.05) {
    if(type == 'padj') {
        idx <- mcols(info)$padj < cut
    } else if (type == 'qval') {
        idx <- mcols(info)$qval < cut
    } else if (type == 'pbonf') {
        idx <- mcols(info)$pbonf < cut
    }
    
    idx[is.na(idx)] <- FALSE

    ## Overlaps at least 1 DE 'reference' unit
    addmargins(table('DE truth' = mcols(reference)[[paste0('DEr', rep)]],
        'Overlaps DE' = countOverlaps(reference, info[idx]) > 0))
}


## Functions for evaluating empirical power, FPR and FDR, plus summarizing 
## the info
emp_power <- function(m, digits = 1) {
    round(m[2, 2] / m[2, 3] * 100, digits)
}
emp_fpr <- function(m, digits = 1) {
    round(m[1, 2] / m[1, 3] * 100, digits)
}
emp_fdr <- function(m, digits = 1) {
    round(m[1, 2] / m[3, 2] * 100, digits)
}

## Detailed table with the results for each replicate of the simulation
emp <- function(tables) {
    empirical <- data.frame(
        power = sapply(tables, emp_power),
        FPR = sapply(tables, emp_fpr),
        FDR = sapply(tables, emp_fdr)
    )
    empirical$replicate <- as.integer(sapply(strsplit(names(tables), '\\.'), '[[', 2))
    empirical$AnnotationComplete <- NA
    empirical$AnnotationComplete[grepl('\\.complete', names(tables))] <- TRUE
    empirical$AnnotationComplete[grepl('\\.incomplete', names(tables))] <- FALSE
    empirical$Aligner <- 'HISAT'
    empirical$Aligner[grepl('rail', names(tables))] <- 'Rail-RNA'
    empirical$SummaryMethod <- 'StringTie'
    empirical$SummaryMethod[grepl('Matrix', names(tables))] <- 'derfinder'
    empirical$SummaryMethod[grepl('featureCounts', names(tables))] <- 'featureCounts'
    empirical$StatMethod <- 'DESeq2'
    empirical$StatMethod[grepl('edgeR', names(tables))] <- 'edgeR'
    empirical$StatMethod[grepl('limma', names(tables))] <- 'limma'
    empirical$StatMethod[grepl('ballgown.*exon', names(tables))] <- 'ballgown-exon'
    empirical$StatMethod[grepl('ballgown.*trans', names(tables))] <- 'ballgown-trans'
    rownames(empirical) <- NULL
    return(empirical)
}

## This function takes the result from emp() and summarizes it by showing
## the minimum and maximum value per replicate
emp_sum_paste <- function(x) { paste0('(', x[1], '-', x[2], ')') }
emp_sum <- function(empinfo) {
    empinfo$situation <- paste(empinfo$AnnotationComplete, empinfo$Aligner, 
        empinfo$SummaryMethod, empinfo$StatMethod, sep=';')
    empsum <- data.frame(
        'Power' = sapply(tapply(empinfo$power, empinfo$situation, range),
            emp_sum_paste),
        'FPR' = sapply(tapply(empinfo$FPR, empinfo$situation, range),
            emp_sum_paste),
        'FDR' = sapply(tapply(empinfo$FDR, empinfo$situation, range),
            emp_sum_paste)
    )
    empsum$AnnotationComplete <- as.logical(sapply(strsplit(rownames(empsum), ';'), '[[', 1))
    empsum$Aligner <- sapply(strsplit(rownames(empsum), ';'), '[[', 2)
    empsum$SummaryMethod <- sapply(strsplit(rownames(empsum), ';'), '[[', 3)
    empsum$StatMethod <- sapply(strsplit(rownames(empsum), ';'), '[[', 4)
    empsum <- empsum[c(1, 12, 2, 13, 3, 14, 4, 15, 5, 16, 6, 9, 7, 10, 8, 11), ]
    rownames(empsum) <- NULL
    empsum 
}

## Summarize by taking the mean
emp_sum_mean <- function(empinfo) {
    empinfo$situation <- paste(empinfo$AnnotationComplete, empinfo$Aligner, 
        empinfo$SummaryMethod, empinfo$StatMethod, sep=';')
    empsum <- data.frame(
        'Power' = tapply(empinfo$power, empinfo$situation, mean, na.rm = TRUE),
        'FPR' = tapply(empinfo$FPR, empinfo$situation, mean, na.rm = TRUE),
        'FDR' = tapply(empinfo$FDR, empinfo$situation, mean, na.rm = TRUE)
    )
    empsum$AnnotationComplete <- as.logical(sapply(strsplit(rownames(empsum), ';'), '[[', 1))
    empsum$Aligner <- sapply(strsplit(rownames(empsum), ';'), '[[', 2)
    empsum$SummaryMethod <- sapply(strsplit(rownames(empsum), ';'), '[[', 3)
    empsum$StatMethod <- sapply(strsplit(rownames(empsum), ';'), '[[', 4)
    empsum <- empsum[c(1, 12, 2, 13, 3, 14, 4, 15, 5, 16, 6, 9, 7, 10, 8, 11), ]
    rownames(empsum) <- NULL
    empsum 
}

## index_comp is similar to count_comp but it returns the actual indices for
## the false positives, true positives, etc
index_comp <- function(info, rep = 1, type = 'padj', reference, cut = 0.05) {
    if(type == 'padj') {
        idx <- mcols(info)$padj < cut
    } else if (type == 'qval') {
        idx <- mcols(info)$qval < cut
    } else if (type == 'pbonf') {
        idx <- mcols(info)$pbonf < cut
    }
    
    idx[is.na(idx)] <- FALSE

    ## Overlaps at least 1 DE 'reference' unit
    ov <- countOverlaps(reference, info[idx]) > 0
    TP <- mcols(reference)[[paste0('DEr', rep)]] & ov
    TN <- !mcols(reference)[[paste0('DEr', rep)]] & !ov
    FP <- !mcols(reference)[[paste0('DEr', rep)]] & ov
    FN <- mcols(reference)[[paste0('DEr', rep)]] & !ov
    return(list(TruePositive = TP, TrueNegative = TN, FalsePositive = FP, FalseNegative = FN))
}

## case_result uses the information from index_comp to make a nice summary table
case_result <- function(idx, r, reference) { 
    res <- lapply(names(idx), function(i) {
        cases <- mcols(reference)[[paste0('case', r)]][idx[[i]]]
        if(length(cases) == 0) return(NULL)
        data.frame(case = cases, result = i, stringsAsFactors = FALSE)
    })
    res <- do.call(rbind, res)
    addmargins(table("Case" = res$case, "Result" = res$result))
}

## Chose the most frequent case, used for choosing the case in the exons 
## reference sets.exon
mostFreq <- function(cases) {
    if(length(cases) == 1) return(cases)
    names(sort(table(cases), decreasing = TRUE))[1]
}

## Process exons set to create a reference set
processExons <- function(exons_set) {
    ov <- findOverlaps(exons_set, tx)
    ## Find DE status per exon
    sHits <- subjectHits(ov)
    qHits <- queryHits(ov)
    exons_set$DEr1 <- as.vector(tapply(sHits, qHits, function(y) {
        ## Note that we could change this from any() to all() and that
        ## would impact change whether the exon is labeled as DE or not.
        any(chosen$rep1[y] != 'normal')
    }))
    exons_set$DEr2 <- as.vector(tapply(sHits, qHits, function(y) {
        any(chosen$rep2[y] != 'normal')
    }))
    exons_set$DEr3 <- as.vector(tapply(sHits, qHits, function(y) {
        any(chosen$rep3[y] != 'normal')
    }))

    exons_set$cases1 <- as.vector(tapply(sHits, qHits, function(y) {
        chosen$case1[y]
    }))
    exons_set$cases2 <- as.vector(tapply(sHits, qHits, function(y) {
        chosen$case2[y]
    }))
    exons_set$cases3 <- as.vector(tapply(sHits, qHits, function(y) {
        chosen$case3[y]
    }))
    exons_set$case1 <- unname(sapply(exons_set$cases1, mostFreq))
    exons_set$case2 <- unname(sapply(exons_set$cases2, mostFreq))
    exons_set$case3 <- unname(sapply(exons_set$cases3, mostFreq))
    return(exons_set)
}

1.3 Reference sets

We can now proceed to defining our reference sets. They are:

Exon level with DE status given by whether the exon overlaps any transcript with low or high expression. Note that if two transcripts share the same exon, that exon will appear twice in this reference set.
Exons that overlap only 1 transcript with DE status given by the transcript they overlap. These exons are not ambiguous in the truth assignment and is thus the most reliable reference set.

## Transcript level info
#trans <- tx
#mcols(trans)$DEr1 <- chosen$rep1 != 'normal'
#mcols(trans)$DEr2 <- chosen$rep2 != 'normal'
#mcols(trans)$DEr3 <- chosen$rep3 != 'normal'
#mcols(trans)$case1 <- chosen$case1
#mcols(trans)$case2 <- chosen$case2
#mcols(trans)$case3 <- chosen$case3

## Transcript level information with DE status given by the gene level
#trans_case <- trans
#mcols(trans_case)$DEr1 <- chosen$case1 %in% c('allDE', 'singleDE', 'someDE')
#mcols(trans_case)$DEr2 <- chosen$case2 %in% c('allDE', 'singleDE', 'someDE')
#mcols(trans_case)$DEr3 <- chosen$case3 %in% c('allDE', 'singleDE', 'someDE')


## Create exon level reference
exons <- processExons(unlist(tx))

## Create disjoint exon level reference
#exons_disjoin <- processExons(disjoin(unlist(tx)))

## Exons overlapping only 1 transcript
exons_one <- exons[countOverlaps(exons, tx) == 1]

## Disjoint exons overlapping only 1 transcript
#exons_disjoin_one <- exons_disjoin[countOverlaps(exons_disjoin, trans) == 1]


## Explore the number and percent of DE cases
percDE <- function(reference) {
    res <- lapply(1:3, function(r) {
        round(table(mcols(reference)[[paste0('DEr', r)]]) / length(reference) * 100, 2)
    })
    names(res) <- paste0('rep', 1:3)
    unlist(res)
}

## First at the transcript level, which matches exactly the 2/6 of transcripts 
## set to be DE (1/6 low, 1/6 high)
#percDE(trans)

## Next at the transcript level with DE status given by the gene level
## The percent DE is highly increased given that genes with more than 1
## transcript are likely to have at least 1 DE.
#percDE(trans_case)

## At the exon level, the percent with the truth set as DE is similar than a
## the transcript (DE by gene) level and is highly increased.
percDE(exons)

## rep1.FALSE  rep1.TRUE rep2.FALSE  rep2.TRUE rep3.FALSE  rep3.TRUE 
##      24.34      75.66      26.20      73.80      24.56      75.44

## Exons overlapping only 1 transcript. The percent is close to the one used in
## the simulation setup. That is: 2/6 of transcripts set to be DE (1/6 low, 1/6 
## high)
percDE(exons_one)

## rep1.FALSE  rep1.TRUE rep2.FALSE  rep2.TRUE rep3.FALSE  rep3.TRUE 
##      59.64      40.36      68.69      31.31      68.12      31.88

## For each reference set, see how many other units of that set each piece overlaps
table(countOverlaps(exons) - 1)

## 
##    0    1    2    3    4    5    6    7    8    9   10   11   12   13   14 
## 3868 5874 7205 5901 4788 3734 3039 1311 1075  405  424  161  107  162   63 
##   15   16   17   19   20   21   23   24   25   26   27   46   47 
##   70   71   36   64   21    1    1  303  439  189   22    1    3

table(countOverlaps(exons_one) - 1)

## 
##    0 
## 3868

Note how the overlap between the different units of the same reference set is considerable for the complete exon reference set.

Number of units per reference level:

Exon level: 39338
Exon level subsetted to those that overlap only 1 transcript: 3868

2 Evaluation (FDR controlled)

2.1 Evaluate results

The following code actually runs the evaluation functions for the different reference sets.

## Get type and replicate info
types <- ifelse(grepl('ballgown', names(stats_GR)), 'qval', 'padj') ## For FDR 
#types <- 'pbonf' ## For FWER
replicates <- as.integer(sapply(strsplit(names(stats_GR), '\\.'), '[[', 2))


## Evaluate at the exons level
## DE if any of the transcripts it overlaps is DE
tables_exons <- mapply(count_comp, stats_GR, replicates, types,
    SIMPLIFY = FALSE, MoreArgs = list(reference = exons, cut = 0.05))
empirical_exons <- emp(tables_exons)
index_exons <- mapply(index_comp, stats_GR, replicates, types,
    SIMPLIFY = FALSE, MoreArgs = list(reference = exons, cut = 0.05))
case_result_exons <- mapply(case_result, index_exons, replicates,
    SIMPLIFY = FALSE, MoreArgs = list(reference = exons))
    
## Evaluate at the exons level (overlapping only 1 transcript)
## DE if any of the transcripts it overlaps is DE
tables_exons_one <- mapply(count_comp, stats_GR, replicates, types,
    SIMPLIFY = FALSE, MoreArgs = list(reference = exons_one, cut = 0.05))
empirical_exons_one <- emp(tables_exons_one)
index_exons_one <- mapply(index_comp, stats_GR, replicates, types,
    SIMPLIFY = FALSE, MoreArgs = list(reference = exons_one, cut = 0.05))
case_result_exons_one <- mapply(case_result, index_exons_one, replicates,
    SIMPLIFY = FALSE, MoreArgs = list(reference = exons_one))

2.2 Summaries

Below are summaries of showing the minimum and maximum empirical power, false positive rate (FPR) and false discovery rate (FDR) per replicate for each of the analysis pipelines by the different reference sets.

## Exon level
kable(emp_sum(empirical_exons), format = 'html')

Power	FPR	FDR	AnnotationComplete	Aligner	SummaryMethod	StatMethod
(72.2-74)	(5.2-6.7)	(2.3-2.8)	FALSE	HISAT	featureCounts	DESeq2
(73.4-75)	(5.6-6.6)	(2.6-2.9)	TRUE	HISAT	featureCounts	DESeq2
(77.4-79.6)	(10.9-12.6)	(4.4-5)	FALSE	HISAT	featureCounts	edgeR
(78.4-80.6)	(12.2-13.1)	(5-5.1)	TRUE	HISAT	featureCounts	edgeR
(73.2-75.6)	(2.4-3.7)	(1.1-1.6)	FALSE	HISAT	featureCounts	limma
(74.1-76.6)	(2.6-4)	(1.2-1.7)	TRUE	HISAT	featureCounts	limma
(70.9-72.9)	(3.1-4.1)	(1.3-1.8)	FALSE	HISAT	StringTie	ballgown-exon
(72-74.6)	(3.3-4.4)	(1.4-1.9)	TRUE	HISAT	StringTie	ballgown-exon
(47.1-51.9)	(1-1.8)	(0.7-1.2)	FALSE	HISAT	StringTie	ballgown-trans
(47.8-51.6)	(1.2-3.1)	(0.8-1.9)	TRUE	HISAT	StringTie	ballgown-trans
(71.6-73)	(8.3-10.1)	(3.6-4.2)	NA	HISAT	derfinder	DESeq2
(71.6-72.7)	(8.3-9.4)	(3.9-4)	NA	Rail-RNA	derfinder	DESeq2
(72.6-73.4)	(9.9-10.8)	(4.3-4.9)	NA	HISAT	derfinder	edgeR
(72.4-73.2)	(10-10.6)	(4.4-4.7)	NA	Rail-RNA	derfinder	edgeR
(65.5-69.7)	(15-17.8)	(7.1-7.6)	NA	HISAT	derfinder	limma
(65.8-69.8)	(15.8-18.3)	(7.5-7.9)	NA	Rail-RNA	derfinder	limma

## Exon level (overlapping only 1 transcript)
kable(emp_sum(empirical_exons_one), format = 'html')

Power	FPR	FDR	AnnotationComplete	Aligner	SummaryMethod	StatMethod
(70-78.3)	(2.1-3.6)	(5.6-9.8)	FALSE	HISAT	featureCounts	DESeq2
(95.1-95.9)	(2.9-4.8)	(6.3-9.7)	TRUE	HISAT	featureCounts	DESeq2
(71.2-79.2)	(5-8.1)	(12-19.5)	FALSE	HISAT	featureCounts	edgeR
(96.4-97.2)	(6.8-9.9)	(12.7-18.1)	TRUE	HISAT	featureCounts	edgeR
(69-77.6)	(2.5-3.3)	(6-7.7)	FALSE	HISAT	featureCounts	limma
(94.4-95.1)	(3.1-4.5)	(6.5-7.5)	TRUE	HISAT	featureCounts	limma
(68.4-77)	(2.8-3)	(5.5-8.3)	FALSE	HISAT	StringTie	ballgown-exon
(93.7-94.6)	(3.6-4)	(5.9-7.8)	TRUE	HISAT	StringTie	ballgown-exon
(53.2-60)	(0.6-2.2)	(1.4-8.1)	FALSE	HISAT	StringTie	ballgown-trans
(67.2-71.9)	(0.6-1.1)	(1.4-3.2)	TRUE	HISAT	StringTie	ballgown-trans
(94.3-95.5)	(6.8-9)	(12.2-16.1)	NA	HISAT	derfinder	DESeq2
(94.1-95.1)	(6.5-8.3)	(11.1-15.7)	NA	Rail-RNA	derfinder	DESeq2
(95.6-96.1)	(8.4-10.5)	(13.9-18.9)	NA	HISAT	derfinder	edgeR
(94.7-95.9)	(8.3-10.2)	(12.8-18.5)	NA	Rail-RNA	derfinder	edgeR
(93.6-94.2)	(6.4-9.3)	(12.8-16.5)	NA	HISAT	derfinder	limma
(93.7-94.2)	(6.5-9.1)	(12.5-16.1)	NA	Rail-RNA	derfinder	limma

From these summaries we can see that in most cases the minimum and maximum values are nearly the same, with more variability for the HISAT -> StringTie -> ballgown pipeline results. Note that for this pipeline we conducted tests at the transcript level (StatMethod = ballgown-trans) and at the exon level (ballgown-exon). edgeR achieves slightly greater empirical power than DESeq2 at the cost of a higher empirical FPR and FDR.

The sets where we consider exons overlapping only 1 transcript shows a clear difference in power for the methods that rely on annotation between the analyses that used the complete annotation and those that used the incomplete one. The derfinder analyses do have higher FPR and FDR than other pipelines, but achieve nearly the same power as the other methods when the annotation is complete.

2.3 Details

The following tables show the results for all the replicates.

## Exon level
kable(empirical_exons, format = 'html', row.names = TRUE)

	power	FPR	FDR	replicate	AnnotationComplete	Aligner	SummaryMethod	StatMethod
1	75.0	6.6	2.8	1	TRUE	HISAT	featureCounts	DESeq2
2	80.6	13.1	5.0	1	TRUE	HISAT	featureCounts	edgeR
3	74.1	4.0	1.7	1	TRUE	HISAT	featureCounts	limma
4	74.0	6.7	2.8	1	FALSE	HISAT	featureCounts	DESeq2
5	79.6	12.6	4.8	1	FALSE	HISAT	featureCounts	edgeR
6	73.2	3.7	1.6	1	FALSE	HISAT	featureCounts	limma
7	74.0	5.6	2.6	2	TRUE	HISAT	featureCounts	DESeq2
8	80.0	12.2	5.1	2	TRUE	HISAT	featureCounts	edgeR
9	76.6	2.6	1.2	2	TRUE	HISAT	featureCounts	limma
10	72.9	5.2	2.5	2	FALSE	HISAT	featureCounts	DESeq2
11	78.9	11.7	5.0	2	FALSE	HISAT	featureCounts	edgeR
12	75.6	2.4	1.1	2	FALSE	HISAT	featureCounts	limma
13	73.4	6.6	2.9	3	TRUE	HISAT	featureCounts	DESeq2
14	78.4	12.9	5.1	3	TRUE	HISAT	featureCounts	edgeR
15	76.6	3.6	1.5	3	TRUE	HISAT	featureCounts	limma
16	72.2	5.2	2.3	3	FALSE	HISAT	featureCounts	DESeq2
17	77.4	10.9	4.4	3	FALSE	HISAT	featureCounts	edgeR
18	75.1	3.3	1.4	3	FALSE	HISAT	featureCounts	limma
19	72.7	9.4	4.0	1	NA	Rail-RNA	derfinder	DESeq2
20	73.2	10.6	4.5	1	NA	Rail-RNA	derfinder	edgeR
21	69.8	18.3	7.8	1	NA	Rail-RNA	derfinder	limma
22	71.6	8.3	4.0	2	NA	Rail-RNA	derfinder	DESeq2
23	72.4	10.0	4.7	2	NA	Rail-RNA	derfinder	edgeR
24	65.8	15.8	7.9	2	NA	Rail-RNA	derfinder	limma
25	72.3	8.9	3.9	3	NA	Rail-RNA	derfinder	DESeq2
26	73.0	10.3	4.4	3	NA	Rail-RNA	derfinder	edgeR
27	66.8	16.6	7.5	3	NA	Rail-RNA	derfinder	limma
28	73.0	10.1	4.2	1	NA	HISAT	derfinder	DESeq2
29	73.4	10.8	4.5	1	NA	HISAT	derfinder	edgeR
30	69.7	17.8	7.6	1	NA	HISAT	derfinder	limma
31	71.6	8.8	4.2	2	NA	HISAT	derfinder	DESeq2
32	72.6	10.6	4.9	2	NA	HISAT	derfinder	edgeR
33	65.5	15.0	7.5	2	NA	HISAT	derfinder	limma
34	72.3	8.3	3.6	3	NA	HISAT	derfinder	DESeq2
35	72.8	9.9	4.3	3	NA	HISAT	derfinder	edgeR
36	66.9	15.7	7.1	3	NA	HISAT	derfinder	limma
37	50.7	3.1	1.9	1	TRUE	HISAT	StringTie	ballgown-trans
38	72.0	4.4	1.9	1	TRUE	HISAT	StringTie	ballgown-exon
39	47.1	1.5	1.0	1	FALSE	HISAT	StringTie	ballgown-trans
40	70.9	4.1	1.8	1	FALSE	HISAT	StringTie	ballgown-exon
41	51.6	1.2	0.8	2	TRUE	HISAT	StringTie	ballgown-trans
42	74.1	3.7	1.7	2	TRUE	HISAT	StringTie	ballgown-exon
43	51.9	1.8	1.2	2	FALSE	HISAT	StringTie	ballgown-trans
44	72.9	3.5	1.7	2	FALSE	HISAT	StringTie	ballgown-exon
45	47.8	1.5	1.0	3	TRUE	HISAT	StringTie	ballgown-trans
46	74.6	3.3	1.4	3	TRUE	HISAT	StringTie	ballgown-exon
47	50.6	1.0	0.7	3	FALSE	HISAT	StringTie	ballgown-trans
48	72.9	3.1	1.3	3	FALSE	HISAT	StringTie	ballgown-exon

## Exon level (overlapping only 1 transcript)
kable(empirical_exons_one, format = 'html', row.names = TRUE)

	power	FPR	FDR	replicate	AnnotationComplete	Aligner	SummaryMethod	StatMethod
1	95.9	4.5	6.5	1	TRUE	HISAT	featureCounts	DESeq2
2	97.2	9.5	12.7	1	TRUE	HISAT	featureCounts	edgeR
3	95.1	4.5	6.5	1	TRUE	HISAT	featureCounts	limma
4	78.3	3.5	6.2	1	FALSE	HISAT	featureCounts	DESeq2
5	79.2	7.3	12.0	1	FALSE	HISAT	featureCounts	edgeR
6	77.6	3.3	6.0	1	FALSE	HISAT	featureCounts	limma
7	95.1	2.9	6.3	2	TRUE	HISAT	featureCounts	DESeq2
8	96.4	6.8	13.5	2	TRUE	HISAT	featureCounts	edgeR
9	94.8	3.1	6.7	2	TRUE	HISAT	featureCounts	limma
10	77.5	2.1	5.6	2	FALSE	HISAT	featureCounts	DESeq2
11	78.4	5.0	12.3	2	FALSE	HISAT	featureCounts	edgeR
12	77.3	2.5	6.6	2	FALSE	HISAT	featureCounts	limma
13	95.4	4.8	9.7	3	TRUE	HISAT	featureCounts	DESeq2
14	96.4	9.9	18.1	3	TRUE	HISAT	featureCounts	edgeR
15	94.4	3.6	7.5	3	TRUE	HISAT	featureCounts	limma
16	70.0	3.6	9.8	3	FALSE	HISAT	featureCounts	DESeq2
17	71.2	8.1	19.5	3	FALSE	HISAT	featureCounts	edgeR
18	69.0	2.7	7.7	3	FALSE	HISAT	featureCounts	limma
19	95.0	8.0	11.1	1	NA	Rail-RNA	derfinder	DESeq2
20	95.3	9.4	12.8	1	NA	Rail-RNA	derfinder	edgeR
21	94.1	9.1	12.5	1	NA	Rail-RNA	derfinder	limma
22	94.1	6.5	13.1	2	NA	Rail-RNA	derfinder	DESeq2
23	94.7	8.3	16.1	2	NA	Rail-RNA	derfinder	edgeR
24	93.7	6.5	13.2	2	NA	Rail-RNA	derfinder	limma
25	95.1	8.3	15.7	3	NA	Rail-RNA	derfinder	DESeq2
26	95.9	10.2	18.5	3	NA	Rail-RNA	derfinder	edgeR
27	94.2	8.5	16.1	3	NA	Rail-RNA	derfinder	limma
28	95.5	9.0	12.2	1	NA	HISAT	derfinder	DESeq2
29	95.8	10.5	13.9	1	NA	HISAT	derfinder	edgeR
30	94.2	9.3	12.8	1	NA	HISAT	derfinder	limma
31	94.3	6.8	13.7	2	NA	HISAT	derfinder	DESeq2
32	95.6	8.4	16.1	2	NA	HISAT	derfinder	edgeR
33	93.6	6.4	13.0	2	NA	HISAT	derfinder	limma
34	95.3	8.5	16.1	3	NA	HISAT	derfinder	DESeq2
35	96.1	10.5	18.9	3	NA	HISAT	derfinder	edgeR
36	93.8	8.7	16.5	3	NA	HISAT	derfinder	limma
37	70.9	0.7	1.4	1	TRUE	HISAT	StringTie	ballgown-trans
38	94.6	4.0	5.9	1	TRUE	HISAT	StringTie	ballgown-exon
39	60.0	0.6	1.4	1	FALSE	HISAT	StringTie	ballgown-trans
40	77.0	3.0	5.5	1	FALSE	HISAT	StringTie	ballgown-exon
41	71.9	1.1	3.2	2	TRUE	HISAT	StringTie	ballgown-trans
42	94.3	3.6	7.8	2	TRUE	HISAT	StringTie	ballgown-exon
43	58.1	2.0	6.9	2	FALSE	HISAT	StringTie	ballgown-trans
44	77.0	2.8	7.4	2	FALSE	HISAT	StringTie	ballgown-exon
45	67.2	0.6	1.8	3	TRUE	HISAT	StringTie	ballgown-trans
46	93.7	3.6	7.7	3	TRUE	HISAT	StringTie	ballgown-exon
47	53.2	2.2	8.1	3	FALSE	HISAT	StringTie	ballgown-trans
48	68.4	2.9	8.3	3	FALSE	HISAT	StringTie	ballgown-exon

2.4 Specific case

In this section we focus on the specific case of using HISAT -> derfinder -> edgeR from the first replicate because it has the highest empirical FPR from all the derfinder analyses when using the exon reference set (row 20 in the first table).

When using the exon level reference set, only about a tenth [(37 + 642) / 6018 = 11.28%] of the noneDE cases are incorrectly labeled. The numbers decrease in the subset of exons that overlap only 1 transcript: [(0 + 37) / 448 = 8.26%].

## Compare all references for one simulation scenario
case_result_exons[['regionMatrix.1.edgeR']]

##              Result
## Case          FalseNegative FalsePositive TrueNegative TruePositive   Sum
##   allDE                  81             0            0          595   676
##   noneDE                 37           642         5304           35  6018
##   singleDE               35             0            0          885   920
##   singleNotDE             0           149         1127            9  1285
##   someDE               7775           244         2110        20310 30439
##   Sum                  7928          1035         8541        21834 39338

case_result_exons_one[['regionMatrix.1.edgeR']]

##              Result
## Case          FalseNegative FalsePositive TrueNegative TruePositive  Sum
##   allDE                   2             0            0          137  139
##   noneDE                  0            37          411            0  448
##   singleDE               35             0            0          870  905
##   singleNotDE             0           145         1117            0 1262
##   someDE                 28            60          537          489 1114
##   Sum                    65           242         2065         1496 3868

Next we show the same tables but in percent of the total reference set units.

## Now in percent
round(case_result_exons[['regionMatrix.1.edgeR']] /
    case_result_exons[['regionMatrix.1.edgeR']][6, 5] * 100, 2)

##              Result
## Case          FalseNegative FalsePositive TrueNegative TruePositive    Sum
##   allDE                0.21          0.00         0.00         1.51   1.72
##   noneDE               0.09          1.63        13.48         0.09  15.30
##   singleDE             0.09          0.00         0.00         2.25   2.34
##   singleNotDE          0.00          0.38         2.86         0.02   3.27
##   someDE              19.76          0.62         5.36        51.63  77.38
##   Sum                 20.15          2.63        21.71        55.50 100.00

round(case_result_exons_one[['regionMatrix.1.edgeR']] /
    case_result_exons_one[['regionMatrix.1.edgeR']][6, 5] * 100, 2)

##              Result
## Case          FalseNegative FalsePositive TrueNegative TruePositive    Sum
##   allDE                0.05          0.00         0.00         3.54   3.59
##   noneDE               0.00          0.96        10.63         0.00  11.58
##   singleDE             0.90          0.00         0.00        22.49  23.40
##   singleNotDE          0.00          3.75        28.88         0.00  32.63
##   someDE               0.72          1.55        13.88        12.64  28.80
##   Sum                  1.68          6.26        53.39        38.68 100.00

In both the exons reference set, most of the false negatives are from the someDE scenario. Those are ambiguous cases and it’s the main reason why the power is decreased when using this reference set. Note how the percent of total reference units from the someDE case that result in false negatives decreases from 19.76 to 0.72 between the exons set that does not take into account multiplicity and the subset of exons overlapping only 1 transcript.

2.5 At different cutoffs

The following plots shows the observed FDR and observed FPR for controlling the FDR at 0.01, 0.05, 0.1, 0.15 and 0.2.

## Evaluate at the exons level (overlapping only 1 transcript)
## DE if any of the transcripts it overlaps is DE at different cutoffs
cuts <- c(0.01, 0.05, 0.1, 0.15, 0.2)
to_char <- function(x) { x[is.na(x)] <- 'NA'; return(x) }
emp_exons_one_cuts <- do.call(rbind, lapply(cuts, function(cut) {
    res <- emp_sum_mean(emp(mapply(count_comp, stats_GR, replicates, types,
        SIMPLIFY = FALSE, MoreArgs = list(reference = exons_one, cut = cut))))
    res$cut <- cut
    return(res)
}))
assign_cluster <- function(sum_met, stat_met, ann) {
    if(sum_met == 'featureCounts') {
        res <- ifelse(ann, 'fC-complete', 'fC-incomplete')
    } else if(sum_met == 'derfinder') {
        res <- 'derfinder'
    } else if(sum_met == 'StringTie') {
        res <- ifelse(ann, 'ST-complete', 'ST-incomplete')
    }
    return(res)
}
emp_exons_one_cuts$cluster <- with(emp_exons_one_cuts, mapply(assign_cluster, SummaryMethod, StatMethod, AnnotationComplete))

## Use only HISAT-aligned for simplicity
g1 <- ggplot(data = subset(emp_exons_one_cuts, Aligner == 'HISAT'),
    aes(x = FDR, y = Power, shape = StatMethod, color = cluster)) +
    geom_point(size = 3) + geom_line() + ylab('Empirical power') +
    xlab('Observed FDR (in percent)') +
    theme_linedraw(base_size = 16) +
    scale_color_brewer(palette = 'Set1', name = 'Group') +
    scale_shape_discrete(name = 'Statistical\nmethod')
g1

## Use only HISAT-aligned for simplicity
g2 <- ggplot(data = subset(emp_exons_one_cuts, Aligner == 'HISAT'),
    aes(x = FPR, y = Power, shape = StatMethod, color = cluster)) +
    geom_point(size = 3) + geom_line() + ylab('Empirical power') +
    xlab('Observed FPR (in percent)') +
    theme_linedraw(base_size = 16) +
    scale_color_brewer(palette = 'Set1', name = 'Group') +
    scale_shape_discrete(name = 'Statistical\nmethod')
g2

pdf('curve_controlFDR.pdf', width = 9)
g1
g2
dev.off()

## quartz_off_screen 
##                 2

#case_res_exons_one_cuts <- lapply(cuts, function(cut) {
#    i <- mapply(index_comp, stats_GR, replicates, types,
#        SIMPLIFY = FALSE, MoreArgs = list(reference = exons_one, cut = cut))
#    res <- mapply(case_result, i, replicates,
#        SIMPLIFY = FALSE, MoreArgs = list(reference = exons_one))
#    return(res)
#})

The following table shows the actual numbers used in the previous two plots. The empirical power, FPR and FDR are the mean values across the 3 simulation replicates.

## Exon level (overlapping only 1 transcript), showing mean by cutoff
kable(emp_exons_one_cuts, format = 'html', row.names = TRUE)

	Power	FPR	FDR	AnnotationComplete	Aligner	SummaryMethod	StatMethod	cut	cluster
1	73.46667	1.3000000	3.4333333	FALSE	HISAT	featureCounts	DESeq2	0.01	fC-incomplete
2	93.13333	1.8333333	3.6333333	TRUE	HISAT	featureCounts	DESeq2	0.01	fC-complete
3	75.00000	2.7666667	6.5333333	FALSE	HISAT	featureCounts	edgeR	0.01	fC-incomplete
4	95.10000	3.6000000	6.7333333	TRUE	HISAT	featureCounts	edgeR	0.01	fC-complete
5	69.33333	0.9333333	2.5666667	FALSE	HISAT	featureCounts	limma	0.01	fC-incomplete
6	88.33333	1.2666667	2.6333333	TRUE	HISAT	featureCounts	limma	0.01	fC-complete
7	68.26667	0.9000000	2.5333333	FALSE	HISAT	StringTie	ballgown-exon	0.01	ST-incomplete
8	86.80000	1.1666667	2.5666667	TRUE	HISAT	StringTie	ballgown-exon	0.01	ST-complete
9	52.56667	0.1000000	0.4000000	FALSE	HISAT	StringTie	ballgown-trans	0.01	ST-incomplete
10	64.46667	0.0666667	0.1666667	TRUE	HISAT	StringTie	ballgown-trans	0.01	ST-complete
11	92.83333	4.0000000	7.5666667	NA	HISAT	derfinder	DESeq2	0.01	derfinder
12	92.60000	3.6000000	6.9333333	NA	Rail-RNA	derfinder	DESeq2	0.01	derfinder
13	94.03333	4.5666667	8.5000000	NA	HISAT	derfinder	edgeR	0.01	derfinder
14	93.80000	4.2000000	7.9000000	NA	Rail-RNA	derfinder	edgeR	0.01	derfinder
15	87.00000	2.5000000	5.2000000	NA	HISAT	derfinder	limma	0.01	derfinder
16	87.53333	2.6333333	5.3333333	NA	Rail-RNA	derfinder	limma	0.01	derfinder
17	75.26667	3.0666667	7.2000000	FALSE	HISAT	featureCounts	DESeq2	0.05	fC-incomplete
18	95.46667	4.0666667	7.5000000	TRUE	HISAT	featureCounts	DESeq2	0.05	fC-complete
19	76.26667	6.8000000	14.6000000	FALSE	HISAT	featureCounts	edgeR	0.05	fC-incomplete
20	96.66667	8.7333333	14.7666667	TRUE	HISAT	featureCounts	edgeR	0.05	fC-complete
21	74.63333	2.8333333	6.7666667	FALSE	HISAT	featureCounts	limma	0.05	fC-incomplete
22	94.76667	3.7333333	6.9000000	TRUE	HISAT	featureCounts	limma	0.05	fC-complete
23	74.13333	2.9000000	7.0666667	FALSE	HISAT	StringTie	ballgown-exon	0.05	ST-incomplete
24	94.20000	3.7333333	7.1333333	TRUE	HISAT	StringTie	ballgown-exon	0.05	ST-complete
25	57.10000	1.6000000	5.4666667	FALSE	HISAT	StringTie	ballgown-trans	0.05	ST-incomplete
26	70.00000	0.8000000	2.1333333	TRUE	HISAT	StringTie	ballgown-trans	0.05	ST-complete
27	95.03333	8.1000000	14.0000000	NA	HISAT	derfinder	DESeq2	0.05	derfinder
28	94.73333	7.6000000	13.3000000	NA	Rail-RNA	derfinder	DESeq2	0.05	derfinder
29	95.83333	9.8000000	16.3000000	NA	HISAT	derfinder	edgeR	0.05	derfinder
30	95.30000	9.3000000	15.8000000	NA	Rail-RNA	derfinder	edgeR	0.05	derfinder
31	93.86667	8.1333333	14.1000000	NA	HISAT	derfinder	limma	0.05	derfinder
32	94.00000	8.0333333	13.9333333	NA	Rail-RNA	derfinder	limma	0.05	derfinder
33	76.06667	5.7666667	12.5666667	FALSE	HISAT	featureCounts	DESeq2	0.10	fC-incomplete
34	96.36667	7.1333333	12.3000000	TRUE	HISAT	featureCounts	DESeq2	0.10	fC-complete
35	76.86667	10.9333333	21.4000000	FALSE	HISAT	featureCounts	edgeR	0.10	fC-incomplete
36	97.60000	13.9000000	21.3333333	TRUE	HISAT	featureCounts	edgeR	0.10	fC-complete
37	75.70000	5.3000000	11.8333333	FALSE	HISAT	featureCounts	limma	0.10	fC-incomplete
38	96.10000	6.8000000	11.8000000	TRUE	HISAT	featureCounts	limma	0.10	fC-complete
39	75.53333	5.1666667	11.7000000	FALSE	HISAT	StringTie	ballgown-exon	0.10	ST-incomplete
40	95.90000	6.7333333	11.8000000	TRUE	HISAT	StringTie	ballgown-exon	0.10	ST-complete
41	58.16667	2.6000000	8.5333333	FALSE	HISAT	StringTie	ballgown-trans	0.10	ST-incomplete
42	71.20000	2.2666667	5.9666667	TRUE	HISAT	StringTie	ballgown-trans	0.10	ST-complete
43	95.80000	12.1666667	19.5666667	NA	HISAT	derfinder	DESeq2	0.10	derfinder
44	95.63333	11.8000000	19.1666667	NA	Rail-RNA	derfinder	DESeq2	0.10	derfinder
45	96.50000	14.8666667	22.7333333	NA	HISAT	derfinder	edgeR	0.10	derfinder
46	95.96667	14.1666667	22.0000000	NA	Rail-RNA	derfinder	edgeR	0.10	derfinder
47	95.70000	14.1000000	21.8333333	NA	HISAT	derfinder	limma	0.10	derfinder
48	95.53333	14.1333333	22.0333333	NA	Rail-RNA	derfinder	limma	0.10	derfinder
49	76.66667	8.2333333	17.0333333	FALSE	HISAT	featureCounts	DESeq2	0.15	fC-incomplete
50	97.13333	10.2333333	16.6000000	TRUE	HISAT	featureCounts	DESeq2	0.15	fC-complete
51	77.20000	14.8333333	26.7666667	FALSE	HISAT	featureCounts	edgeR	0.15	fC-incomplete
52	97.96667	18.7333333	26.7333333	TRUE	HISAT	featureCounts	edgeR	0.15	fC-complete
53	76.30000	7.9666667	16.7333333	FALSE	HISAT	featureCounts	limma	0.15	fC-incomplete
54	96.86667	9.9000000	16.1333333	TRUE	HISAT	featureCounts	limma	0.15	fC-complete
55	76.16667	8.2333333	17.3333333	FALSE	HISAT	StringTie	ballgown-exon	0.15	ST-incomplete
56	96.73333	10.4000000	17.0666667	TRUE	HISAT	StringTie	ballgown-exon	0.15	ST-complete
57	58.73333	3.9333333	11.9000000	FALSE	HISAT	StringTie	ballgown-trans	0.15	ST-incomplete
58	72.13333	4.1000000	9.6666667	TRUE	HISAT	StringTie	ballgown-trans	0.15	ST-complete
59	96.23333	16.4000000	24.5333333	NA	HISAT	derfinder	DESeq2	0.15	derfinder
60	96.13333	15.6333333	23.7000000	NA	Rail-RNA	derfinder	DESeq2	0.15	derfinder
61	97.06667	19.1333333	27.3666667	NA	HISAT	derfinder	edgeR	0.15	derfinder
62	96.56667	18.4000000	26.7000000	NA	Rail-RNA	derfinder	edgeR	0.15	derfinder
63	96.36667	20.2000000	28.5333333	NA	HISAT	derfinder	limma	0.15	derfinder
64	96.20000	19.9666667	28.3666667	NA	Rail-RNA	derfinder	limma	0.15	derfinder
65	76.96667	11.1000000	21.6333333	FALSE	HISAT	featureCounts	DESeq2	0.20	fC-incomplete
66	97.56667	13.8666667	21.3000000	TRUE	HISAT	featureCounts	DESeq2	0.20	fC-complete
67	77.33333	18.4666667	31.3000000	FALSE	HISAT	featureCounts	edgeR	0.20	fC-incomplete
68	98.23333	23.1333333	30.9666667	TRUE	HISAT	featureCounts	edgeR	0.20	fC-complete
69	76.90000	11.0666667	21.6333333	FALSE	HISAT	featureCounts	limma	0.20	fC-incomplete
70	97.60000	13.6666667	20.9666667	TRUE	HISAT	featureCounts	limma	0.20	fC-complete
71	76.70000	11.4333333	22.3333333	FALSE	HISAT	StringTie	ballgown-exon	0.20	ST-incomplete
72	97.36667	14.3000000	21.8666667	TRUE	HISAT	StringTie	ballgown-exon	0.20	ST-complete
73	59.43333	5.1333333	14.5666667	FALSE	HISAT	StringTie	ballgown-trans	0.20	ST-incomplete
74	73.20000	5.4000000	12.3000000	TRUE	HISAT	StringTie	ballgown-trans	0.20	ST-complete
75	96.86667	20.4000000	28.6333333	NA	HISAT	derfinder	DESeq2	0.20	derfinder
76	96.76667	19.5333333	27.8000000	NA	Rail-RNA	derfinder	DESeq2	0.20	derfinder
77	97.50000	23.6000000	31.5666667	NA	HISAT	derfinder	edgeR	0.20	derfinder
78	97.13333	22.5333333	30.7000000	NA	Rail-RNA	derfinder	edgeR	0.20	derfinder
79	96.96667	25.4333333	33.3000000	NA	HISAT	derfinder	limma	0.20	derfinder
80	96.83333	25.4000000	33.3333333	NA	Rail-RNA	derfinder	limma	0.20	derfinder

Evaluate results

L Collado-Torres

20 July 2016

Contents

1 Setup

1.1 Load data

1.2 Evaluation functions

1.3 Reference sets

2 Evaluation (FDR controlled)

2.1 Evaluate results

2.2 Summaries

2.3 Details

2.4 Specific case

2.5 At different cutoffs

3 Evaluation (FWER controlled)

3.1 Evaluate results

3.2 Summaries

3.3 Details

3.4 At different cutoffs

4 Reproducibility