Evaluate simulation

This report evaluates the simulation results for the regionMatrix() output.

library('GenomicRanges')

## Loading required package: BiocGenerics
## Loading required package: parallel
## 
## Attaching package: 'BiocGenerics'
## 
## The following objects are masked from 'package:parallel':
## 
##     clusterApply, clusterApplyLB, clusterCall, clusterEvalQ,
##     clusterExport, clusterMap, parApply, parCapply, parLapply,
##     parLapplyLB, parRapply, parSapply, parSapplyLB
## 
## The following object is masked from 'package:stats':
## 
##     xtabs
## 
## The following objects are masked from 'package:base':
## 
##     anyDuplicated, append, as.data.frame, as.vector, cbind,
##     colnames, do.call, duplicated, eval, evalq, Filter, Find, get,
##     intersect, is.unsorted, lapply, Map, mapply, match, mget,
##     order, paste, pmax, pmax.int, pmin, pmin.int, Position, rank,
##     rbind, Reduce, rep.int, rownames, sapply, setdiff, sort,
##     table, tapply, union, unique, unlist, unsplit
## 
## Loading required package: S4Vectors
## Loading required package: stats4
## Loading required package: IRanges
## Loading required package: GenomeInfoDb

library('TxDb.Hsapiens.UCSC.hg19.knownGene')

## Loading required package: GenomicFeatures
## Loading required package: AnnotationDbi
## Loading required package: Biobase
## Welcome to Bioconductor
## 
##     Vignettes contain introductory material; view with
##     'browseVignettes()'. To cite Bioconductor, see
##     'citation("Biobase")', and for packages 'citation("pkgname")'.

library('knitr')
library('devtools')
library('derfinder')

## Find out what's changed in derfinder with
## news(Version == "1.1.17", package = "derfinder")

library('derfinderHelper')
library('derfinderPlot')

## Find out what's changed in derfinderPlot with
## news(Version == "1.1.6", package = "derfinderPlot")

library('bumphunter')

## Loading required package: foreach
## foreach: simple, scalable parallel programming from Revolution Analytics
## Use Revolution R for scalability, fault tolerance and more.
## http://www.revolutionanalytics.com
## Loading required package: iterators
## Loading required package: locfit
## locfit 1.5-9.1    2013-03-22

library('limma')

## 
## Attaching package: 'limma'
## 
## The following object is masked from 'package:BiocGenerics':
## 
##     plotMA

library('qvalue')
load('../simulation_info.Rdata')
load('../regionMatrix/regionMat-cut5.Rdata')
load('../derAnalysis/run2-v1.0.10/groupInfo.Rdata')
load('../derAnalysis/run2-v1.0.10/models.Rdata')
load('../derAnalysis/run2-v1.0.10/chr22/optionsStats.Rdata')
load('../CoverageInfo/fullCov.Rdata')

Results

## Find exons
txdb <- keepSeqlevels(TxDb.Hsapiens.UCSC.hg19.knownGene, 'chr22')
txinfo <- select(txdb, keys = chosen$ucsckg_id, columns = columns(txdb), keytype = 'TXNAME')

## Buiild GRangesList with exons grouped by transcript
tx <- split(GRanges(seqnames = txinfo$EXONCHROM, IRanges(start = txinfo$EXONSTART, end = txinfo$EXONEND), strand = txinfo$EXONSTRAND), txinfo$TXNAME)
tx <- tx[match(chosen$ucsckg_id, names(tx))]

## Determine DE status for the regions
getF  <- function(fit, fit0, theData) {
    rss1 = rowSums((fitted(fit)-theData)^2)
    df1 = ncol(fit$coef)
    rss0 = rowSums((fitted(fit0)-theData)^2)
    df0 = ncol(fit0$coef)
    fstat = ((rss0-rss1)/(df1-df0))/(rss1/(ncol(theData)-df1))
    f_pval = pf(fstat, df1-1, ncol(theData)-df1,lower.tail=FALSE)
    fout = cbind(fstat,df1-1,ncol(theData)-df1,f_pval)
    colnames(fout)[2:3] = c("df1","df0")
    fout = data.frame(fout)
    return(fout)
}

# regions
regList <- lapply(regionMat, function(x) x$regions)
fullRegionGR <- unlist(GRangesList(regList))

## Explore widths
# summary(width(fullRegionGR))
#Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
# 4.0   106.0   156.0   448.9   307.0  5685.0 

# coverage matrix
fullRegionMat <- do.call("rbind",
    lapply(regionMat, function(x) x$coverageMatrix))

## log transform
y <- log2(fullRegionMat + 32)
rownames(y) <- NULL

## Run limma
fit <- lmFit(y, models$mod)
fit0 <- lmFit(y, models$mod0)
ff <- getF(fit,fit0, y)
fullRegionGR$sigQval <- factor(qvalue(ff$f_pval)$qvalues < 0.05, levels = c('TRUE', 'FALSE'))

## Find overlaps with DE regions
ctov <- countOverlaps(tx, fullRegionGR)

## Check result with sig regs
qval <-  fullRegionGR[fullRegionGR$sigQval == 'TRUE']
min.ov <- min(min(width(qval)), min(width(tx)))
ctov.qval <- countOverlaps(tx, qval, minoverlap =  min.ov)

## Use appropriate set
ctov.use <- if (identical(ctov > 0, ctov.qval > 0)) ctov else ctov.qval

Transcripts / genes vs DE regions

Overview

Table showing the results between whether the transcript was set to be differentially expressed (DE) and if it overlaps (minimum 1 bp) any candidate region.

addmargins(table('DE status' = chosen$DE, 'Overlaps DE region' = ctov > 0))

##          Overlaps DE region
## DE status TRUE Sum
##     FALSE   48  48
##     TRUE    48  48
##     Sum     96  96

The results are not the same using a minimum overlap of 4 bp between transcripts and candidate regions with a q-value < 0.05. Thus, we will use only the DE regions with q-value < 0.05.

addmargins(table('DE status' = chosen$DE, 'Overlaps DE region (sig q-value)' = ctov.qval > 0))

##          Overlaps DE region (sig q-value)
## DE status FALSE TRUE Sum
##     FALSE    28   20  48
##     TRUE      3   45  48
##     Sum      31   65  96

At a finer level, there is a difference in the number of exons per transcript overlapping all candidate regions vs the regions with q-value < 0.05.

## Verify things are working properly
# table(countOverlaps(tx, fullRegionGR, minoverlap = min.ov) - ctov.qval)

## Difference in overlaps found
table(ctov - ctov.qval)

## 
##  0  1  2  3  4  5  6  7  8  9 16 
## 51 20  4  2  7  3  1  2  2  2  2

By case

We can separate the transcripts by their experiment setup case. That is, whether its from a gene with:

a single transcript
- set to be DE: singleDE
- set not to be DE: singleNotDe
two transcripts
- with both set to be DE: bothDE
- with only one transcript set to be DE: oneDE
- with both set not to be DE: noneDE

Then compare against the results where

success.DE means that the transcript was set to be DE and overlaps an expressed region that is differentially expressed as determined by the limma analysis (true positive)
failed.DE means that the transcript was set to be DE and doesn't overlap an expressed DE region (false negative)
success.Reg means that the transcript was set not to de DE and doesn't overlap an expressed DE region (true negative)
failed.Reg means that the transcript was set not to be DE and does overlap an expressed DE region (false positive)

## Indexes
idx <- list(success = list(de = chosen$DE & ctov.use > 0, reg = !chosen$DE & !ctov.use > 0), failed = list(de = chosen$DE & !ctov.use > 0, reg = !chosen$DE & ctov.use > 0))
idx <- lapply(idx, function(x) { lapply(x, which) })

## Classify results
chosen$result <- 'Success.DE'
chosen$result[ idx$success$reg ] <- 'success.Reg'
chosen$result[ idx$failed$de ] <- 'Failed.DE'
chosen$result[ idx$failed$reg ] <- 'failed.Reg'

## Overall summary
kable(table(chosen$case, chosen$result), format = 'html')

	Failed.DE	failed.Reg	Success.DE	success.Reg
bothDE	0	0	24	0
noneDE	0	8	0	16
oneDE	0	12	12	0
singleDE	3	0	9	0
singleNotDE	0	0	0	12

Failed.DE (false negative)

The 3 Failed.DE cases (false negatives) are mostly short single transcript genes (one exon only) where 2 were set to have low expression on one group, normal on the other two.

## Successful cases
success.de <- tx[ idx$success$de  ]
success.Reg <- tx[ idx$success$reg ]

## What happened with the txs set to be DE that were not picked up?
failed.de <- tx[ idx$failed$de  ]

## They are short transcripts
kable(chosen[idx$failed$de, ], format = 'html')

	tx_idx	tx_n	tx_i	gene_id	ucsckg_id	fasta_i	DE	group1	group2	group3	width	readspertx	mean1	mean2	mean3	case	result
17	20	1	1	100422998	uc021wnn.1	317	TRUE	low	normal	normal	86	34	17	34	34	singleDE	Failed.DE
22	27	1	1	100500833	uc010gvn.2	347	TRUE	normal	normal	high	110	44	44	44	88	singleDE	Failed.DE
24	29	1	1	100500901	uc021wny.1	423	TRUE	normal	normal	low	58	23	23	23	12	singleDE	Failed.DE

However, 4 similar cases with short transcripts were successfully detected. So it's likely that a lower F-stat cutoff would have picked up these false negative cases.

## However there are other short transcripts that were picked up
kable(chosen[ idx$success$de[sum(width(success.de)) <= 110], ], format = 'html')

	tx_idx	tx_n	tx_i	gene_id	ucsckg_id	fasta_i	DE	group1	group2	group3	width	readspertx	mean1	mean2	mean3	case	result
2	2	1	1	100126318	uc021wmk.1	181	TRUE	low	normal	normal	78	31	16	31	31	singleDE	Success.DE
10	12	1	1	100302118	uc021wls.1	127	TRUE	normal	high	normal	78	31	31	62	31	singleDE	Success.DE
11	13	1	1	100302149	uc021wrh.1	796	TRUE	normal	normal	low	66	26	26	26	13	singleDE	Success.DE
23	28	1	1	100500860	uc021wlo.1	115	TRUE	normal	low	normal	88	35	35	18	35	singleDE	Success.DE

More info:

width(failed.de)

## IntegerList of length 3
## [["uc021wnn.1"]] 86
## [["uc010gvn.2"]] 110
## [["uc021wny.1"]] 58

width(tx[ idx$success$de[sum(width(success.de)) <= 110] ])

## IntegerList of length 4
## [["uc021wmk.1"]] 78
## [["uc021wls.1"]] 78
## [["uc021wrh.1"]] 66
## [["uc021wlo.1"]] 88

Plots

gs <- makeGenomicState(txdb, chrs = '22', verbose = FALSE)
annoTrans <- annotateTranscripts(txdb = txdb)

## Warning:   Calling species() on a TxDb object is *deprecated*.
##   Please use organism() instead.

## No annotationPackage supplied. Trying org.Hs.eg.db.
## Loading required package: org.Hs.eg.db
## Loading required package: DBI
## 
## Getting TSS and TSE.
## Getting CSS and CSE.
## Getting exons.
## Annotating genes.

makePlots <- function(reg, gs) {
    ## Prep
    strand(reg) <- '*'
    regCov <- getRegionCoverage(fullCov = fullCov, regions = reg, verbose = FALSE)
    annoReg <- annotateRegions(reg, gs$fullGenome, verbose = FALSE)
    annoNear <- matchGenes(reg, subject = annoTrans)

    ## Actually make the plots with F-stat track
    prev.name <- ''
    def.par <- par()
    def.par <- def.par[-which(names(def.par) %in% c('cin', 'cra', 'csi', 'cxy', 'din', 'page'))]
    for(reg.i in seq_len(length(reg))) {
        
        if(prev.name != names(reg)[reg.i]) {
            par(def.par)
            plot.new()
            text(0.5, 0.5, names(reg)[reg.i], cex = 5)
        }
        prev.name <- names(reg)[reg.i]
        
        
        range <- start(reg[reg.i]):end(reg[reg.i])
        dat <- fullCov$chr22[range, ]
        
        ## Skip plot if there is no coverage data
        if(max(sapply(dat, max)) == 0) {
            par(def.par)
            plot.new()
            text(0.5, 0.5, paste('No data\nReg', reg.i), cex = 5)
            next
        }
        
        ## Log2 transform
        for(i in 1:30) dat[[i]] <- log2(dat[[i]] + 32) 
        
        ## Calculate f-stats
        fstats <- as.numeric(fstats.apply(data = dat, mod = models$mod, mod0 = models$mod0))
    
        ## Make plot
        plotRegionCoverage(reg, regCov, groupInfo, annoNear, annoReg, txdb, reg.i, ask = FALSE, verbose = FALSE)
    
        ## Add F-stat track
        par(fig = c(0, 1, 0.075, 0.125), new = TRUE, xaxt = 'n', oma = c(0, 0, 0, 0), mar = c(0, 4.5, 0, 1.1))
        plot(y = fstats, x = range, ylab = 'F-stat', type = 'l', xlab = '', bty = 'n', ylim = c(0, max(fstats[is.finite(fstats)], optionsStats$cutoffFstatUsed) * 1.1))
        abline(h = optionsStats$cutoffFstatUsed, col = 'red')
    }
}

Coverage plots with F-statistics shown at the bottom for the false negative cases. One plot it shown for each exon that compose these transcripts.

makePlots(unlist(failed.de), gs)

failed.Reg (false positive)

Out of the 20 failed.Reg transcripts (false positives), 12 of them are from the oneDE case. You could then argue that they are really not false positives. However, 8 and 0 transcripts are from the noneDE and singleNotDE cases respectively which would be the truly false positives.

## What happened with those set to be not DE but overlap DE regions?
failed.Reg <- tx[ idx$failed$reg ]

kable(chosen[idx$failed$reg, ], format = 'html')

	tx_idx	tx_n	tx_i	gene_id	ucsckg_id	fasta_i	DE	group1	group2	group3	width	readspertx	mean1	mean2	mean3	case	result
26	6	2	2	100130717	uc011agh.3	17	FALSE	normal	normal	normal	650	260	260	260	260	oneDE	failed.Reg
37	56	2	1	10738	uc010gwn.3	467	FALSE	normal	normal	normal	1488	595	595	595	595	oneDE	failed.Reg
47	80	2	1	128977	uc002zpi.3	84	FALSE	normal	normal	normal	1558	623	623	623	623	oneDE	failed.Reg
49	85	2	1	129138	uc003auc.3	576	FALSE	normal	normal	normal	2149	860	860	860	860	oneDE	failed.Reg
95	196	2	1	25776	uc003awb.4	613	FALSE	normal	normal	normal	1287	515	515	515	515	noneDE	failed.Reg
96	196	2	2	25776	uc003awc.3	614	FALSE	normal	normal	normal	1147	459	459	459	459	noneDE	failed.Reg
98	206	2	2	25817	uc003bio.4	836	FALSE	normal	normal	normal	2652	1061	1061	1061	1061	oneDE	failed.Reg
121	266	2	1	339669	uc003aqe.3	538	FALSE	normal	normal	normal	1049	420	420	420	420	oneDE	failed.Reg
125	272	2	1	3761	uc003avs.1	606	FALSE	normal	normal	normal	1913	765	765	765	765	oneDE	failed.Reg
132	283	2	2	3976	uc011aks.2	379	FALSE	normal	normal	normal	3987	1595	1595	1595	1595	oneDE	failed.Reg
139	307	2	1	4248	uc003axv.4	653	FALSE	normal	normal	normal	4987	1995	1995	1995	1995	noneDE	failed.Reg
140	307	2	2	4248	uc010gxy.3	652	FALSE	normal	normal	normal	5102	2041	2041	2041	2041	noneDE	failed.Reg
150	321	2	2	4689	uc003apz.4	533	FALSE	normal	normal	normal	1646	658	658	658	658	oneDE	failed.Reg
167	391	2	1	646023	uc002zzz.2	257	FALSE	normal	normal	normal	1769	708	708	708	708	noneDE	failed.Reg
168	391	2	2	646023	uc003aad.1	256	FALSE	normal	normal	normal	2568	1027	1027	1027	1027	noneDE	failed.Reg
175	400	2	1	6523	uc003amc.3	457	FALSE	normal	normal	normal	5061	2024	2024	2024	2024	oneDE	failed.Reg
190	417	2	2	6948	uc003air.2	407	FALSE	normal	normal	normal	2006	802	802	802	802	oneDE	failed.Reg
192	421	2	2	7122	uc010grr.2	91	FALSE	normal	normal	normal	1720	688	688	688	688	oneDE	failed.Reg
215	473	2	1	83874	uc003ahk.4	383	FALSE	normal	normal	normal	2038	815	815	815	815	noneDE	failed.Reg
216	473	2	2	83874	uc010gvu.3	382	FALSE	normal	normal	normal	2017	807	807	807	807	noneDE	failed.Reg

Plots

Coverage plots with F-statistics shown at the bottom for the false positive cases. One plot it shown for each exon that compose these transcripts. For the 12 transcripts from the oneDE case, it can be seen how at least one plot contains a DE region overlapping an exon set to be DE. .

Some complex situations where there are exons on both strands can be observed.

makePlots(unlist(failed.Reg), gs)

## .

Other strand

In some simulations, we found what seemed to be false positive transcripts but turned out to overlap DE regions in sections where there are exons on both the positive and negative strands and at least one of the exons was set to be DE.

## Most of the truly false positive transcripts don't overlap other transcripts
inter <- intersect(idx$failed$reg, c(which(chosen$case == 'noneDE'), which(chosen$case == 'singleNotDE')))
table(countOverlaps(tx[inter], tx[-inter]))

## 
## 0 
## 8

## They are not short
width(tx[inter])

## IntegerList of length 8
## [["uc003awb.4"]] 98 140 116 106 119 692
## [["uc003awc.3"]] 98 116 106 119 692
## [["uc003axv.4"]] 238 4848
## [["uc010gxy.3"]] 4971
## [["uc002zzz.2"]] 126 171 128 149 173 654 1167
## [["uc003aad.1"]] 126 171 128 319 118 149 274 482
## [["uc003ahk.4"]] 294 100 108 107 115 66 190 155 862
## [["uc010gvu.3"]] 294 121 108 107 115 66 190 155 862

## To explore regions with derfinderReport
sort(subjectHits(findOverlaps(tx[inter], fullRegionGR)))

##  [1]  43  44  45  46  47  48  48  49  50  51  51  52  52  53  53  82  82
## [18]  83  83  84  84  85  85  86  86  87  87  88  88  89  89  90  90 183
## [35] 183 184 185 185 186 186 187 187 188 188 204 205 205

## DE regions
# fullRegionGR[subjectHits(findOverlaps(tx[i], fullRegionGR))]

As it can be seen below, 8 apparent false positive transcripts from the noneDE case overlap (when strand is not taken into account) genes where at least one of two transcripts was set to be DE.

kable(chosen[ subjectHits(findOverlaps(tx[inter], tx[-inter], ignore.strand = TRUE)), ], format = 'html')

tx_idx	tx_n	tx_i	gene_id	ucsckg_id	fasta_i	DE	group1	group2	group3	width	readspertx	mean1	mean2	mean3	case	result

Gene level

For each gene, if at least one transcript is set to be DE then we consider the gene to be DE. Then, we check if the gene overlaps at least one DE region.

gene <- data.frame(gene_id = unique(chosen$gene_id))
gene$DE <- sapply(gene$gene_id, function(x) { any(chosen$DE[chosen$gene_id == x])  })
gene$case <- sapply(gene$gene_id, function(x) { unique(chosen$case[chosen$gene_id == x])  })
gene$overlaps <- sapply(gene$gene_id, function(x) { sum(ctov.use[ chosen$ucsckg_id[chosen$gene_id == x] ]) })
gene$overlap <- gene$overlaps > 0

## Results between DE status and overlapping at least 1 DE regrion at the gene level
addmargins(table('DE status' = gene$DE, 'Overlaps DE region' = gene$overlap))

##          Overlaps DE region
## DE status FALSE TRUE Sum
##     FALSE    20    4  24
##     TRUE      3   33  36
##     Sum      23   37  60

Conclusions

The results from the simulation are promising as most transcripts were correctly classified as differentially expressed or not by in the expressed-regions analysis.

The majority of the false negative cases involved short single transcript genes with one group having low expression relative to the other two. These cases could potentially be mitigated by lowering the F-statistic threshold used in the derfinder analysis.

In some simulations there are some apparent false positives which are due to transcripts on one strand set not to be DE overlapping transcripts from the other strand set to be DE. This situation could be solved with strand-specific RNA-seq data and running derfinder for each strand separately.

Extra

Minimum number of reads per transcript as well as per sample.

## Distribution of the minimum number of reads per transcript
summary(apply(readmat, 1, min))

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     1.0   213.2   453.5   625.5   783.0  2747.0

## Distribution of the minimum number of reads per sample
summary(apply(readmat, 2, min))

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   1.000   7.000   9.000   9.067  11.000  19.000

The minimum number of reads per transcript for a given sample is 1.

Exonic segments

Next we can evaluate the simulation by classifying the exonic segments as whether they should be DE or not. Then, we can find if the DE regions overlap such segments and viceversa. We would expect that the DE regions with a q-value < 0.05 would only overlap segments that were set to be DE.

segments <- GRangesList(lapply(gene$gene_id, function(x) {
    i <- chosen$ucsckg_id[ chosen$gene_id == x]
    
    ## Find segments
    segs <- disjoin(unlist(tx[i]))
    ov <- findOverlaps(segs, tx[i])
    
    ## Find DE status per segment
    segs$DE <- as.vector(tapply(subjectHits(ov), queryHits(ov), function(y) {
        any(chosen$DE[ chosen$gene_id == x])
    }))
    
    ## Finish
    return(segs)
}))
names(segments) <- gene$gene_id
segs <- unlist(segments)

Segments vs DE regions

We can check the if the exonic segments overlap one or more DE region similarly to what we did earlier at the transcript and gene level. The results change depending on whether only the DE regions with significant q-value or all of the DE regions are used.

addmargins(table('DE status' = segs$DE, 'Overlaps DE region (sig q-value)' = countOverlaps(segs, qval) > 0))

##          Overlaps DE region (sig q-value)
## DE status FALSE TRUE Sum
##     FALSE   104    7 111
##     TRUE     11  158 169
##     Sum     115  165 280

addmargins(table('DE status' = segs$DE, 'Overlaps DE region' = countOverlaps(segs, fullRegionGR) > 0))

##          Overlaps DE region
## DE status FALSE TRUE Sum
##     FALSE     0  111 111
##     TRUE      1  168 169
##     Sum       1  279 280

Using the DE regions with significant q-values, there are 11 false negative cases. From the exploration shown below, half of them seem short. Most of the false negative segments correspond to genes from the oneDE scenario. Thus revealing that the complexity of that scenario makes it challenging to identify significant DE regions.

## Explore false negative segments using DE regions with sig q-value
seg.fn <- which(segs$DE & !countOverlaps(segs, qval) > 0)

## Around half of these segments are short
summary(width(segs[seg.fn]))

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    58.0    98.0   179.0   295.6   335.5   853.0

chosen[chosen$gene_id == '6523', ]

##     tx_idx tx_n tx_i gene_id  ucsckg_id fasta_i    DE group1 group2 group3
## 175    400    2    1    6523 uc003amc.3     457 FALSE normal normal normal
## 176    400    2    2    6523 uc011alz.2     458  TRUE normal normal   high
##     width readspertx mean1 mean2 mean3  case     result
## 175  5061       2024  2024  2024  2024 oneDE failed.Reg
## 176  4779       1912  1912  1912  3824 oneDE Success.DE

## 11 of the 37 segments are from gene with id 6523
tail(sort(table(names(segs[seg.fn]))))

## 
##     25817      3761      3976      6523      7494 100130717 
##         1         1         1         1         1         2

## Cases of the genes with at least one FN segment
table(tapply(subset(chosen, gene_id %in% names(seg.fn))$case, subset(chosen, gene_id %in% names(seg.fn))$gene_id, unique))

## 
##   bothDE    oneDE singleDE 
##        1        6        3

## Type of gene where the segments come from. Mostly oneDE genes
table(sapply(names(segs[seg.fn]), function(x) { unique(chosen$case[chosen$gene_id == x]) }))

## 
##   bothDE    oneDE singleDE 
##        1        7        3

Plots

Coverage plots with F-statistics shown at the bottom for the false negative exonic segments grouped by their gene.

makePlots(segs[seg.fn], gs)

DE regions vs segments

## Do DE regions overlap segments that are set to be DE?
reg.ov <- findOverlaps(fullRegionGR, segs)
fullRegionGR$overlap <- sapply(seq_len(length(fullRegionGR)), function(x) {
    y <- which(queryHits(reg.ov) == x)
    if(length(y) == 0) return(NA)
    any(segs$DE[ subjectHits(reg.ov)[y] ])
})

We can check how many segments each DE region overlaps. Ideally they should all overlap at least one segment, but there are some cases where this could not happen (2 in this case). Possibly because of small mismatches between the transcripts and the actual mRNA used in the simulation. Alternatively, alignment problems could explain such cases.

## Do DE regions overlap at least one segment?
table(countOverlaps(fullRegionGR, segs))

## 
##   0   1   2   3   4 
##   2 223  19   2   3

## Widths of DE regions not overlapping any segment
width(fullRegionGR[is.na(fullRegionGR$overlap)])

## [1] 143 125

## Do these DE regions have a significant q-value?
table(fullRegionGR$sigQval[is.na(fullRegionGR$overlap)])

## 
##  TRUE FALSE 
##     1     1

Minimum 10bp

We can repeat the same exploration but now requiring at least a 10bp overlap.

## Do DE regions overlap segments that are set to be DE?
reg.ov10 <- findOverlaps(fullRegionGR, segs, minoverlap = 10)
fullRegionGR$overlap10 <- sapply(seq_len(length(fullRegionGR)), function(x) {
    y <- which(queryHits(reg.ov10) == x)
    if(length(y) == 0) return(NA)
    any(segs$DE[ subjectHits(reg.ov10)[y] ])
})

## How many DE regions are smaller than 10bp?
table(width(fullRegionGR) < 10)

## 
## FALSE  TRUE 
##   248     1

## How many exonic segments are smaller than 10bp?
table(width(segs) < 10)

## 
## FALSE  TRUE 
##   279     1

## Do DE regions (min 10bp long) overlap at least one segment?
table(countOverlaps(fullRegionGR[width(fullRegionGR) >= 10], segs, minoverlap = 10))

## 
##   0   1   2   3   4 
##   2 223  18   2   3

## Widths of DE regions not overlapping any segment
width(fullRegionGR[is.na(fullRegionGR$overlap10) & width(fullRegionGR) >= 10])

## [1] 143 125

## Do these DE regions have a significant q-value?
table(fullRegionGR$sigQval[is.na(fullRegionGR$overlap10) & width(fullRegionGR) >= 10])

## 
##  TRUE FALSE 
##     1     1

Minimum 20bp

And similarly with a minimum overlap of 20bp.

## Do DE regions overlap segments that are set to be DE?
reg.ov20 <- findOverlaps(fullRegionGR, segs, minoverlap = 20)
fullRegionGR$overlap20 <- sapply(seq_len(length(fullRegionGR)), function(x) {
    y <- which(queryHits(reg.ov20) == x)
    if(length(y) == 0) return(NA)
    any(segs$DE[ subjectHits(reg.ov20)[y] ])
})

## How many DE regions are smaller than 20bp?
table(width(fullRegionGR) < 20)

## 
## FALSE  TRUE 
##   248     1

## How many exonic segments are smaller than 20bp?
table(width(segs) < 20)

## 
## FALSE  TRUE 
##   277     3

## Do DE regions (min 20bp long) overlap at least one segment?
table(countOverlaps(fullRegionGR[width(fullRegionGR) >= 20], segs, minoverlap = 20))

## 
##   0   1   2   3   4 
##   2 224  17   2   3

## Widths of DE regions not overlapping any segment
width(fullRegionGR[is.na(fullRegionGR$overlap20) & width(fullRegionGR) >= 20])

## [1] 143 125

## Do these DE regions have a significant q-value?
table(fullRegionGR$sigQval[is.na(fullRegionGR$overlap20) & width(fullRegionGR) >= 20])

## 
##  TRUE FALSE 
##     1     1

DE regions correctness

However, the main result is whether the DE regions overlap segments expected to be DE. Note that for this comparison, DE regions are unstranded and could potentially overlap two segments from different strands where only one of them was set to be DE.

## Check by whether the DE region has a q-value < 0.05
addmargins(table('Overlaps a DE segment' = fullRegionGR$overlap, 'q-value < 0.05' = fullRegionGR$sigQval))

##                      q-value < 0.05
## Overlaps a DE segment TRUE FALSE Sum
##                 FALSE    6    95 101
##                 TRUE   137     9 146
##                 Sum    143   104 247

Regardless of whether the DE region p-value is significant, we see that 40.89 percent of the DE regions overlapping at least one segment, incorrectly overlap a segment set not to be DE.

Minimum 10bp

Out of the 249 DE regions, only 248 are at least 10bp long. They are compared against 279 exonic segments at least 10bp long out of the total 280. Only 2 DE regions 10bp or longer do not overlap any exonic segment regardless of its DE status.

## Check by whether the DE region has a q-value < 0.05
addmargins(table('Overlaps a DE segment' = fullRegionGR$overlap10, 'q-value < 0.05' = fullRegionGR$sigQval))

##                      q-value < 0.05
## Overlaps a DE segment TRUE FALSE Sum
##                 FALSE    6    95 101
##                 TRUE   136     9 145
##                 Sum    142   104 246

Regardless of whether the DE region p-value is significant, we see that 41.06 percent of the DE regions overlapping at least one segment (min overlap 10bp), incorrectly overlap a segment set not to be DE.

Minimum 20bp

Out of the 249 DE regions, only 248 are at least 20bp long. They are compared against 277 exonic segments at least 20bp long out of the total 280. Only 2 DE regions 20bp or longer do not overlap any exonic segment regardless of its DE status.

## Check by whether the DE region has a q-value < 0.05
addmargins(table('Overlaps a DE segment' = fullRegionGR$overlap20, 'q-value < 0.05' = fullRegionGR$sigQval))

##                      q-value < 0.05
## Overlaps a DE segment TRUE FALSE Sum
##                 FALSE    6    95 101
##                 TRUE   136     9 145
##                 Sum    142   104 246

Regardless of whether the DE region p-value is significant, we see that 41.06 percent of the DE regions overlapping at least one segment (min overlap 20bp), incorrectly overlap a segment set not to be DE.

Conclusions

The observed FDR is lower than 0.05, which is what we would expect.

Reproducibility

## Reproducibility info
Sys.time()

## [1] "2015-03-31 11:58:59 EDT"

proc.time()

##    user  system elapsed 
## 415.143   3.806 439.700

options(width = 120)
session_info()

## Session info-----------------------------------------------------------------------------------------------------------

##  setting  value                                             
##  version  R Under development (unstable) (2014-11-01 r66923)
##  system   x86_64, darwin10.8.0                              
##  ui       X11                                               
##  language (EN)                                              
##  collate  en_US.UTF-8                                       
##  tz       America/New_York

## Packages---------------------------------------------------------------------------------------------------------------

##  package                           * version  date       source                                    
##  acepack                             1.3.3.3  2013-05-03 CRAN (R 3.2.0)                            
##  AnnotationDbi                     * 1.29.20  2015-03-19 Bioconductor                              
##  Biobase                           * 2.27.3   2015-03-27 Bioconductor                              
##  BiocGenerics                      * 0.13.10  2015-03-27 Bioconductor                              
##  BiocParallel                        1.1.21   2015-03-24 Bioconductor                              
##  biomaRt                             2.23.5   2014-11-22 Bioconductor                              
##  Biostrings                          2.35.12  2015-03-26 Bioconductor                              
##  biovizBase                          1.15.3   2015-03-30 Bioconductor                              
##  bitops                              1.0.6    2013-08-17 CRAN (R 3.2.0)                            
##  BSgenome                            1.35.20  2015-03-27 Bioconductor                              
##  bumphunter                        * 1.7.6    2015-03-13 Github (lcolladotor/bumphunter@37d10e7)   
##  cluster                             2.0.1    2015-01-31 CRAN (R 3.2.0)                            
##  codetools                           0.2.11   2015-03-10 CRAN (R 3.2.0)                            
##  colorout                          * 1.0.2    2014-11-03 local                                     
##  colorspace                          1.2.6    2015-03-11 CRAN (R 3.2.0)                            
##  DBI                               * 0.3.1    2014-09-24 CRAN (R 3.2.0)                            
##  derfinder                         * 1.1.17   2015-03-14 Github (lcolladotor/derfinder@3532e0c)    
##  derfinderHelper                   * 1.1.6    2015-03-15 Bioconductor                              
##  derfinderPlot                     * 1.1.6    2015-03-14 Github (lcolladotor/derfinderPlot@1319754)
##  devtools                          * 1.6.1    2014-10-07 CRAN (R 3.2.0)                            
##  dichromat                           2.0.0    2013-01-24 CRAN (R 3.2.0)                            
##  digest                              0.6.8    2014-12-31 CRAN (R 3.2.0)                            
##  doRNG                               1.6      2014-03-07 CRAN (R 3.2.0)                            
##  evaluate                            0.5.5    2014-04-29 CRAN (R 3.2.0)                            
##  foreach                           * 1.4.2    2014-04-11 CRAN (R 3.2.0)                            
##  foreign                             0.8.63   2015-02-20 CRAN (R 3.2.0)                            
##  formatR                             1.0      2014-08-25 CRAN (R 3.2.0)                            
##  Formula                             1.2.0    2015-01-20 CRAN (R 3.2.0)                            
##  futile.logger                       1.4      2015-03-21 CRAN (R 3.2.0)                            
##  futile.options                      1.0.0    2010-04-06 CRAN (R 3.2.0)                            
##  GenomeInfoDb                      * 1.3.16   2015-03-27 Bioconductor                              
##  GenomicAlignments                   1.3.32   2015-03-18 Bioconductor                              
##  GenomicFeatures                   * 1.19.36  2015-03-30 Bioconductor                              
##  GenomicFiles                        1.3.14   2015-03-07 Bioconductor                              
##  GenomicRanges                     * 1.19.48  2015-03-27 Bioconductor                              
##  GGally                              0.4.8    2014-08-26 CRAN (R 3.2.0)                            
##  ggbio                               1.15.2   2015-03-24 Bioconductor                              
##  ggplot2                             1.0.0    2014-05-21 CRAN (R 3.2.0)                            
##  graph                               1.45.2   2015-03-01 Bioconductor                              
##  gridExtra                           0.9.1    2012-08-09 CRAN (R 3.2.0)                            
##  gtable                              0.1.2    2012-12-05 CRAN (R 3.2.0)                            
##  Hmisc                               3.14.5   2014-09-12 CRAN (R 3.2.0)                            
##  htmltools                           0.2.6    2014-09-08 CRAN (R 3.2.0)                            
##  IRanges                           * 2.1.43   2015-03-07 Bioconductor                              
##  iterators                         * 1.0.7    2014-04-11 CRAN (R 3.2.0)                            
##  knitr                             * 1.7      2014-10-13 CRAN (R 3.2.0)                            
##  knitrBootstrap                      1.0.0    2014-11-03 Github (jimhester/knitrBootstrap@76c41f0) 
##  lambda.r                            1.1.7    2015-03-20 CRAN (R 3.2.0)                            
##  lattice                             0.20.30  2015-02-22 CRAN (R 3.2.0)                            
##  latticeExtra                        0.6.26   2013-08-15 CRAN (R 3.2.0)                            
##  limma                             * 3.23.11  2015-03-15 Bioconductor                              
##  locfit                            * 1.5.9.1  2013-04-20 CRAN (R 3.2.0)                            
##  markdown                            0.7.4    2014-08-24 CRAN (R 3.2.0)                            
##  MASS                                7.3.40   2015-03-21 CRAN (R 3.2.0)                            
##  Matrix                              1.1.5.1  2015-03-23 CRAN (R 3.2.0)                            
##  matrixStats                         0.14.0   2015-02-14 CRAN (R 3.2.0)                            
##  mime                                0.3      2015-03-29 CRAN (R 3.2.0)                            
##  munsell                             0.4.2    2013-07-11 CRAN (R 3.2.0)                            
##  nnet                                7.3.9    2015-02-11 CRAN (R 3.2.0)                            
##  org.Hs.eg.db                      * 3.0.0    2014-09-26 Bioconductor                              
##  OrganismDbi                         1.9.15   2015-03-30 Bioconductor                              
##  pkgmaker                            0.22     2014-05-14 CRAN (R 3.2.0)                            
##  plyr                                1.8.1    2014-02-26 CRAN (R 3.2.0)                            
##  proto                               0.3.10   2012-12-22 CRAN (R 3.2.0)                            
##  qvalue                            * 1.99.0   2015-03-30 Bioconductor                              
##  RBGL                                1.43.0   2014-10-14 Bioconductor                              
##  RColorBrewer                        1.1.2    2014-12-07 CRAN (R 3.2.0)                            
##  Rcpp                                0.11.5   2015-03-06 CRAN (R 3.2.0)                            
##  RCurl                               1.95.4.5 2014-12-28 CRAN (R 3.2.0)                            
##  registry                            0.2      2012-01-24 CRAN (R 3.2.0)                            
##  reshape                             0.8.5    2014-04-23 CRAN (R 3.2.0)                            
##  reshape2                            1.4.1    2014-12-06 CRAN (R 3.2.0)                            
##  rmarkdown                           0.3.3    2014-09-17 CRAN (R 3.2.0)                            
##  rngtools                            1.2.4    2014-03-06 CRAN (R 3.2.0)                            
##  rpart                               4.1.9    2015-02-24 CRAN (R 3.2.0)                            
##  Rsamtools                           1.19.49  2015-03-27 Bioconductor                              
##  RSQLite                           * 1.0.0    2014-10-25 CRAN (R 3.2.0)                            
##  rstudioapi                          0.2      2014-12-31 CRAN (R 3.2.0)                            
##  rtracklayer                         1.27.10  2015-03-27 Bioconductor                              
##  S4Vectors                         * 0.5.22   2015-03-06 Bioconductor                              
##  scales                              0.2.4    2014-04-22 CRAN (R 3.2.0)                            
##  stringr                             0.6.2    2012-12-06 CRAN (R 3.2.0)                            
##  survival                            2.38.1   2015-02-24 CRAN (R 3.2.0)                            
##  TxDb.Hsapiens.UCSC.hg19.knownGene * 3.0.0    2014-09-26 Bioconductor                              
##  VariantAnnotation                   1.13.46  2015-03-26 Bioconductor                              
##  XML                                 3.98.1.1 2013-06-20 CRAN (R 3.2.0)                            
##  xtable                              1.7.4    2014-09-12 CRAN (R 3.2.0)                            
##  XVector                           * 0.7.4    2015-02-08 Bioconductor                              
##  yaml                                2.1.13   2014-06-12 CRAN (R 3.2.0)                            
##  zlibbioc                            1.13.3   2015-03-23 Bioconductor