Compare output from regionMatrix() with DERs

Setup

Libraries

library('GenomicRanges')
## Loading required package: methods
## Loading required package: BiocGenerics
## Loading required package: parallel
## 
## Attaching package: 'BiocGenerics'
## 
## The following objects are masked from 'package:parallel':
## 
##     clusterApply, clusterApplyLB, clusterCall, clusterEvalQ,
##     clusterExport, clusterMap, parApply, parCapply, parLapply,
##     parLapplyLB, parRapply, parSapply, parSapplyLB
## 
## The following object is masked from 'package:stats':
## 
##     xtabs
## 
## The following objects are masked from 'package:base':
## 
##     anyDuplicated, append, as.data.frame, as.vector, cbind,
##     colnames, do.call, duplicated, eval, evalq, Filter, Find, get,
##     intersect, is.unsorted, lapply, Map, mapply, match, mget,
##     order, paste, pmax, pmax.int, pmin, pmin.int, Position, rank,
##     rbind, Reduce, rep.int, rownames, sapply, setdiff, sort,
##     table, tapply, union, unique, unlist, unsplit
## 
## Loading required package: S4Vectors
## Loading required package: stats4
## Loading required package: IRanges
## Loading required package: GenomeInfoDb
library('data.table')
library('rCharts')
library('knitr')
library('ggplot2')
library('devtools')
opts_chunk$set(bootstrap.show.code = FALSE, dev = 'CairoPNG')

Load data

## Check that data was loaded
stopifnot(all(c('fullRegions', 'regionMat', 'analysisPath') %in% ls()))

## Fix DERs
ders <- fullRegions
names(ders) <- NULL

## Fix region matrices
regs <- unlist(GRangesList(lapply(regionMat, '[[', 'regions')))
names(regs) <- NULL

## Assign seqlengths
data(hg19Ideogram, package = 'biovizBase', envir = environment())
seqlengths(ders) <- seqlengths(hg19Ideogram)[names(seqlengths(ders))]
seqlengths(regs) <- seqlengths(hg19Ideogram)[names(seqlengths(regs))]

## Sort
ders <- sort(ders)
regs <- sort(regs)

## Check names match
identical(seqlengths(regs), seqlengths(ders))
## [1] TRUE
## Save the raw data
save(ders, file = file.path(analysisPath, 'dersOriginal.Rdata'))
save(regs, file = file.path(analysisPath, 'regsOriginal.Rdata'))

## Filter out those with less than 3bp
c('# ders under 3bp' = sum(width(ders) < 3), '# regs under 3bp' = sum(width(regs) < 3))
## # ders under 3bp # regs under 3bp 
##           227573            18216
ders <- ders[width(ders) >= 3]
regs <- regs[width(regs) >= 3]

## Save data used
save(ders, file = file.path(analysisPath, 'ders.Rdata'))
save(regs, file = file.path(analysisPath, 'regs.Rdata'))

Construct logical indexes for DERs and regionMatrix regions.

## Construct logical Rle indexes for bases with some region
build_index <- function(gr) {
    res <- lapply(names(seqlengths(gr)), function(chr) {
        chr.len <- seqlengths(gr)[chr]
        ir <- sort(ranges(gr[seqnames(gr) == chr]))
        log <- c(rep(c(FALSE, TRUE), length(ir)), FALSE)
    
        starts <- ends <- rep(NA, length(ir) * 2)
        i <- rep(c(TRUE, FALSE), length(ir))
        starts[i] <- start(ir)
        ends[i] <- end(ir)
    
        starts[!i] <- ends[i] + 1
    
        if(max(ends, na.rm = TRUE) < chr.len) {
            ends[!i] <- c(starts[i] - 1, chr.len)[-1]
        } else {
            ends[!i] <- c(starts[i] - 1, NULL)[-1]
            starts <- starts[- length(starts)]
            log <- log[- length(log)]
        }
   
        if(starts[1] != 1) {
            ends <- c(starts[1] - 1, ends)
            starts <- c(1, starts)
        } else {
            log <- log[-1]
        }
   
        widths <- mapply(function(s, e) { e - s + 1}, starts, ends)
    
        Rle(log, widths)
    })
    names(res) <- names(seqlengths(gr))
    return(res)
}
index.ders <- build_index(ders)
index.regs <- build_index(regs)

Compare

Visually explore

library('epivizr')
mgr <- startEpiviz()
ders_dev <- mgr$addDevice(ders[!as.logical(ders$significantFWER)], "DERs no sig FWER")
ders_sig_dev <- mgr$addDevice(ders[as.logical(ders$significantFWER)], "DERs sig FWER")
regs_dev <- mgr$addDevice(regs, "Region Matrix")

## SOX11
mgr$navigate("chr2", 5810000, 5850000)

## MEX3A
mgr$navigate("chr1", 156040000, 156090000)

## VASH2
mgr$navigate("chr1", 213120000, 213170000)

## TG:
mgr$navigate("chr8", 134040000, 134120000)

## IGF2BP2
mgr$navigate("chr3", 185350000, 185410000)

## FBN3
mgr$navigate("chr19", 8130000, 8180000)

## End
mgr$stopServer()

Basic comparison

Number of regions

## Number of regions
c('ders' = length(ders), 'regs' = length(regs))
##   ders   regs 
## 552101 189113

Summary on width of regions

## Size of regions
c('ders' = summary(width(ders)), 'regs' = summary(width(regs)))
##    ders.Min. ders.1st Qu.  ders.Median    ders.Mean ders.3rd Qu. 
##          3.0          7.0         27.0        104.1        102.0 
##    ders.Max.    regs.Min. regs.1st Qu.  regs.Median    regs.Mean 
##      12620.0          3.0         44.0         99.0        182.8 
## regs.3rd Qu.    regs.Max. 
##        164.0      12630.0

Compare indexes

Base-pairs

Number of base-pairs in each index. Summary first, then overall info for the genome (in number of bases, then in percent of the genome), and finally results in interactive table.

## Merge all the indexes
index.all <- mapply(function(der, reg) {
    both <- der & reg
    only.der <- der & !reg
    only.reg <- !der & reg
    none <- !der & !reg
    
    res <- list('both' = both, 'only.der' = only.der, 'only.reg' = only.reg,
        'none' = none, 'all.der' = der, 'all.reg' = reg)
    return(list(res))
}, index.ders, index.regs)

## Find number of base-pairs in each index
index.num <- data.frame(do.call(rbind, lapply(index.all, function(x) { sapply(x, sum)})))
index.num$chrLen <- seqlengths(ders)
index.num$chr <- rownames(index.num)
rownames(index.num) <- NULL

## Print info
summary(index.num)
##       both            only.der          only.reg           none          
##  Min.   :   5567   Min.   :  15641   Min.   : 11674   Min.   : 47376832  
##  1st Qu.: 760895   1st Qu.: 934355   1st Qu.:312838   1st Qu.: 77403327  
##  Median :1007583   Median :1401115   Median :390288   Median :130877735  
##  Mean   :1037526   Mean   :1356227   Mean   :403074   Mean   :126189732  
##  3rd Qu.:1338974   3rd Qu.:1668691   3rd Qu.:510818   3rd Qu.:158858728  
##  Max.   :2525713   Max.   :3221560   Max.   :954726   Max.   :242548622  
##     all.der           all.reg            chrLen         
##  Min.   :  21208   Min.   :  17241   Min.   : 48129895  
##  1st Qu.:1718933   1st Qu.:1107262   1st Qu.: 80415720  
##  Median :2412340   Median :1386397   Median :134429206  
##  Mean   :2393754   Mean   :1440600   Mean   :128986559  
##  3rd Qu.:3014595   3rd Qu.:1820474   3rd Qu.:162132764  
##  Max.   :5747273   Max.   :3480439   Max.   :249250621  
##      chr           
##  Length:24         
##  Class :character  
##  Mode  :character  
##                    
##                    
## 
## Overall info
overallInfo <- colSums(index.num[, -ncol(index.num)])
overallInfo
##       both   only.der   only.reg       none    all.der    all.reg 
##   24900632   32549456    9673768 3028553556   57450088   34574400 
##     chrLen 
## 3095677412
## Overall info in percent
overallInfo / sum(as.numeric(index.num$chrLen)) * 100
##        both    only.der    only.reg        none     all.der     all.reg 
##   0.8043678   1.0514486   0.3124928  97.8316909   1.8558164   1.1168606 
##      chrLen 
## 100.0000000
d1 <- data.table(data.frame(row = seq_len(nrow(index.num)), index.num, check.names=FALSE))
t1 <- dTable(d1, sPaginationType=  'full_numbers', iDisplayLength=25,
    sScrollX='100%')
t1$print("bases", cdn=TRUE)

Segments per index

Number of segments per index. First summary, then results for genome, and finally an interactive table.

## Find number of segments in each index
index.seg <- data.frame(do.call(rbind, lapply(index.all, function(x) { 
    sapply(x, function(y) {
        sum(runValue(y))
    })
})))
index.seg$chr <- rownames(index.seg)
rownames(index.seg) <- NULL

## Print info
summary(index.seg)
##       both          only.der        only.reg          none      
##  Min.   :   40   Min.   :  316   Min.   :  144   Min.   :  430  
##  1st Qu.: 5998   1st Qu.:14879   1st Qu.: 3920   1st Qu.:15243  
##  Median : 7712   Median :22944   Median : 4920   Median :23125  
##  Mean   : 7923   Mean   :22314   Mean   : 4977   Mean   :22911  
##  3rd Qu.:10482   3rd Qu.:27852   3rd Qu.: 6575   3rd Qu.:29030  
##  Max.   :18326   Max.   :50837   Max.   :11565   Max.   :52339  
##     all.der         all.reg          chr           
##  Min.   :  306   Min.   :  163   Length:24         
##  1st Qu.:15392   1st Qu.: 5904   Class :character  
##  Median :23357   Median : 7562   Mode  :character  
##  Mean   :23004   Mean   : 7880                     
##  3rd Qu.:29185   3rd Qu.:10452                     
##  Max.   :52342   Max.   :18423
## Overall info
colSums(index.seg[, -ncol(index.seg)])
##     both only.der only.reg     none  all.der  all.reg 
##   190143   535541   119450   549875   552101   189113
d2 <- data.table(data.frame(row = seq_len(nrow(index.seg)), index.seg, check.names=FALSE))
t2 <- dTable(d2, sPaginationType=  'full_numbers', iDisplayLength=25,
    sScrollX='100%')
t2$print("segments")

Segments width

Summary of the segment widths for each index. First the overall summary, then the results for each index.

## Get an idea of the width of the segments in each index
index.width <- data.frame(do.call(rbind, lapply(index.all, function(x) {
    tmp <- data.frame(do.call(rbind, lapply(x, function(y) {
        summary(runLength(y)[runValue(y)])
    })), check.names = FALSE)
    tmp$index <- names(x)
    rownames(tmp) <- NULL
    return(tmp)
})), check.names = FALSE)
index.width$chr <- rep(names(seqlengths(ders)), each = 6)
rownames(index.width) <- NULL

## Print info
summary(index.width)
##       Min.          1st Qu.         Median            Mean          
##  Min.   :1.000   Min.   : 5.0   Min.   : 13.00   Min.   :    49.50  
##  1st Qu.:1.000   1st Qu.: 5.0   1st Qu.: 26.00   1st Qu.:    80.73  
##  Median :1.000   Median : 7.0   Median : 47.00   Median :   113.70  
##  Mean   :1.667   Mean   :13.6   Mean   : 52.81   Mean   :  1961.69  
##  3rd Qu.:3.000   3rd Qu.:14.0   3rd Qu.: 76.00   3rd Qu.:   184.12  
##  Max.   :3.000   Max.   :55.0   Max.   :105.00   Max.   :138000.00  
##     3rd Qu.             Max.             index          
##  Min.   :  51.00   Min.   :     660   Length:144        
##  1st Qu.:  93.75   1st Qu.:    4976   Class :character  
##  Median : 113.50   Median :    6829   Mode  :character  
##  Mean   : 221.40   Mean   : 1610003                     
##  3rd Qu.: 164.25   3rd Qu.:    9505                     
##  Max.   :1113.00   Max.   :36250000                     
##      chr           
##  Length:144        
##  Class :character  
##  Mode  :character  
##                    
##                    
## 
d3 <- data.table(data.frame(row = seq_len(nrow(index.width)), index.width, check.names=FALSE))
t3 <- dTable(d3, sPaginationType=  'full_numbers', iDisplayLength=25,
    sScrollX='100%')
t3$print("widths")

Overlaps

Minimum 20 bp

ov20 <- findOverlaps(ders, regs, minoverlap = 20L)

counts <- list()
for(type in c("any", "within", "equal")) {
    ct.ders <- countOverlaps(ders, regs, minoverlap = 20L, type = type)
    plot(log10(table(ct.ders)), main = paste("DERs in regs for type", type))
    ct.regs <- countOverlaps(regs, ders, minoverlap = 20L, type = type)
    plot(log10(table(ct.regs)), main = paste("Regs in DERs for type", type))
    counts <- c(counts, list(table(ct.ders), table(ct.regs)))
}
cts <- as.integer(unique(unlist(lapply(counts, names))))

nOverlap20 <- do.call(rbind, lapply(counts, function(x) { 
    df <- data.frame(nOverlap = cts, freq = x[match(cts, names(x))],
        row.names = seq_len(length(cts)))
    df$observed <- !is.na(df$freq)
    df$freq[is.na(df$freq)] <- 0
    df$cumFreq <- cumsum(df$freq)
    df$cumPerc <- df$cumFreq / max(df$cumFreq) * 100
    return(df)
}))
nOverlap20$type <- factor(rep(c('any', 'within', 'equal'), each = length(cts) * 2), levels = c('any', 'within', 'equal'))
nOverlap20$match <- rep(rep(c('DERs-in-regs', 'regs-in-DERs'), each = length(cts)), 3)
#nOverlap <- nOverlap[complete.cases(nOverlap), ]
rownames(nOverlap20) <- NULL

nOverlap20$alpha <- ifelse(nOverlap20$observed, 1, 1/3)

Summary plots showing cumulative frequency and cumulative percent.

## Make a nice plot
ggplot(data = nOverlap20, aes(x = nOverlap, y = cumFreq, colour = match, alpha = alpha)) + geom_point() + facet_grid( . ~ type )# + geom_smooth(se=FALSE)
## Show cumulative percents
ggplot(data = nOverlap20, aes(x = nOverlap, y = cumPerc, colour = match, linetype = match)) + geom_line(lwd=1) + facet_grid( . ~ type )

Some important numbers: percent of regions with width < 20 bp, base level agreement, region level agreement (min overlap 20 bp).

## Percent with widths < 20L
small <- c('ders' = sum(width(ders) < 20) / length(ders), 'regs' = sum(width(regs) < 20) / length(regs)) * 100
data.frame('under-20' = small, '20-and-above' = 100 - small, check.names = FALSE)
##      under-20 20-and-above
## ders 44.39948     55.60052
## regs 16.79261     83.20739
## Base level agreement
c('regs' = overallInfo['both'] / (overallInfo['both'] + overallInfo['only.reg']) * 100, 'ders' = overallInfo['both'] / (overallInfo['both'] + overallInfo['only.der']) * 100)
## regs.both ders.both 
##  72.02043  43.34307
## Overlap (min 20) agreement
c('regs' = 100 - subset(nOverlap20, match == 'regs-in-DERs' & nOverlap == 0 & type == 'any')$cumPerc, 'ders' = 100 - subset(nOverlap20, match == 'DERs-in-regs' & nOverlap == 0 & type == 'any')$cumPerc)
##     regs     ders 
## 63.52234 21.21297

Minimum 1 bp

ov1 <- findOverlaps(ders, regs, minoverlap = 1L)

counts <- list()
for(type in c("any", "within", "equal")) {
    ct.ders <- countOverlaps(ders, regs, minoverlap = 1L, type = type)
    plot(log10(table(ct.ders)), main = paste("DERs in regs for type", type))
    ct.regs <- countOverlaps(regs, ders, minoverlap = 1L, type = type)
    plot(log10(table(ct.regs)), main = paste("Regs in DERs for type", type))
    counts <- c(counts, list(table(ct.ders), table(ct.regs)))
}
cts <- as.integer(unique(unlist(lapply(counts, names))))

nOverlap1 <- do.call(rbind, lapply(counts, function(x) { 
    df <- data.frame(nOverlap = cts, freq = x[match(cts, names(x))], 
        row.names = seq_len(length(cts)))
    df$observed <- !is.na(df$freq)
    df$freq[is.na(df$freq)] <- 0
    df$cumFreq <- cumsum(df$freq)
    df$cumPerc <- df$cumFreq / max(df$cumFreq) * 100
    return(df)
}))
nOverlap1$type <- factor(rep(c('any', 'within', 'equal'), each = length(cts) * 2), levels = c('any', 'within', 'equal'))
nOverlap1$match <- rep(rep(c('DERs-in-regs', 'regs-in-DERs'), each = length(cts)), 3)
#nOverlap <- nOverlap[complete.cases(nOverlap), ]
rownames(nOverlap1) <- NULL

nOverlap1$alpha <- ifelse(nOverlap1$observed, 1, 1/3)

## Overlap (min 1bp) agreement
c('regs' = 100 - subset(nOverlap1, match == 'regs-in-DERs' & nOverlap == 0 & type == 'any')$cumPerc, 'ders' = 100 - subset(nOverlap1, match == 'DERs-in-regs' & nOverlap == 0 & type == 'any')$cumPerc)
##     regs     ders 
## 77.94652 27.64349

Summary plots showing cumulative frequency and cumulative percent.

## Make a nice plot
ggplot(data = nOverlap1, aes(x = nOverlap, y = cumFreq, colour = match, alpha = alpha)) + geom_point() + facet_grid( . ~ type )# + geom_smooth(se=FALSE)
## Show cumulative percents
ggplot(data = nOverlap1, aes(x = nOverlap, y = cumPerc, colour = match, linetype = match)) + geom_line(lwd=1) + facet_grid( . ~ type )

Save results

save(index.all, index.num, index.seg, index.width, nOverlap20, ov20, nOverlap1, ov1, overallInfo, file = file.path(analysisPath, "comparison-results.Rdata"))

Reproducibility

Analysis path: /dcl01/lieber/ajaffe/derRuns/derSoftware/brainspan/regionMatrix-vs-DERs/cut0.25-vs-run4-v1.0.10

Re-make the report

# Load fullRegions.Rdata and regionMat.Rdata before this step
library('rmarkdown')
library('knitrBootstrap')
render('step7-regMatVsDERs.Rmd')

Date the report was generated.

## [1] "2014-12-01 18:05:44 EST"

R session information.

## Session info-----------------------------------------------------------------------------
##  setting  value                                      
##  version  R version 3.1.1 Patched (2014-10-16 r66782)
##  system   x86_64, linux-gnu                          
##  ui       X11                                        
##  language (EN)                                       
##  collate  en_US.UTF-8                                
##  tz       
## Packages---------------------------------------------------------------------------------
##  package        * version date       source                                   
##  BiocGenerics   * 0.12.1  2014-11-15 Bioconductor                             
##  Cairo            1.5.6   2014-06-26 CRAN (R 3.1.0)                           
##  chron            2.3.45  2014-02-11 CRAN (R 3.1.1)                           
##  colorspace       1.2.4   2013-09-30 CRAN (R 3.1.0)                           
##  data.table     * 1.9.4   2014-10-02 CRAN (R 3.1.1)                           
##  devtools       * 1.6.1   2014-10-07 CRAN (R 3.1.1)                           
##  digest           0.6.4   2013-12-03 CRAN (R 3.1.0)                           
##  evaluate         0.5.5   2014-04-29 CRAN (R 3.1.0)                           
##  formatR          1.0     2014-08-25 CRAN (R 3.1.1)                           
##  GenomeInfoDb   * 1.2.3   2014-11-15 Bioconductor                             
##  GenomicRanges  * 1.18.3  2014-11-20 Bioconductor                             
##  ggplot2        * 1.0.0   2014-05-21 CRAN (R 3.1.0)                           
##  gtable           0.1.2   2012-12-05 CRAN (R 3.1.0)                           
##  htmltools        0.2.6   2014-09-08 CRAN (R 3.1.1)                           
##  IRanges        * 2.0.0   2014-10-15 Bioconductor                             
##  knitr          * 1.8     2014-11-11 CRAN (R 3.1.1)                           
##  knitrBootstrap * 1.0.0   2014-11-19 Github (jimhester/knitrBootstrap@76c41f0)
##  labeling         0.3     2014-08-23 CRAN (R 3.1.1)                           
##  lattice          0.20.29 2014-04-04 CRAN (R 3.1.1)                           
##  markdown         0.7.4   2014-08-24 CRAN (R 3.1.1)                           
##  MASS             7.3.35  2014-09-30 CRAN (R 3.1.1)                           
##  mime             0.2     2014-09-26 CRAN (R 3.1.1)                           
##  munsell          0.4.2   2013-07-11 CRAN (R 3.1.0)                           
##  plyr             1.8.1   2014-02-26 CRAN (R 3.1.0)                           
##  proto            0.3.10  2012-12-22 CRAN (R 3.1.0)                           
##  rCharts        * 0.4.5   2014-11-17 Github (ramnathv/rCharts@929875d)        
##  Rcpp             0.11.3  2014-09-29 CRAN (R 3.1.1)                           
##  reshape2         1.4     2014-04-23 CRAN (R 3.1.0)                           
##  RJSONIO          1.3.0   2014-07-28 CRAN (R 3.1.1)                           
##  rmarkdown      * 0.3.3   2014-09-17 CRAN (R 3.1.1)                           
##  rstudioapi       0.1     2014-03-27 CRAN (R 3.1.1)                           
##  S4Vectors      * 0.4.0   2014-10-15 Bioconductor                             
##  scales           0.2.4   2014-04-22 CRAN (R 3.1.0)                           
##  stringr          0.6.2   2012-12-06 CRAN (R 3.1.0)                           
##  whisker          0.3.2   2013-04-28 CRAN (R 3.1.0)                           
##  XVector          0.6.0   2014-10-15 Bioconductor                             
##  yaml             2.1.13  2014-06-12 CRAN (R 3.1.1)