Compare output from regionMatrix() with DERs

Setup

Libraries

library('GenomicRanges')
## Loading required package: methods
## Loading required package: BiocGenerics
## Loading required package: parallel
## 
## Attaching package: 'BiocGenerics'
## 
## The following objects are masked from 'package:parallel':
## 
##     clusterApply, clusterApplyLB, clusterCall, clusterEvalQ,
##     clusterExport, clusterMap, parApply, parCapply, parLapply,
##     parLapplyLB, parRapply, parSapply, parSapplyLB
## 
## The following object is masked from 'package:stats':
## 
##     xtabs
## 
## The following objects are masked from 'package:base':
## 
##     anyDuplicated, append, as.data.frame, as.vector, cbind,
##     colnames, do.call, duplicated, eval, evalq, Filter, Find, get,
##     intersect, is.unsorted, lapply, Map, mapply, match, mget,
##     order, paste, pmax, pmax.int, pmin, pmin.int, Position, rank,
##     rbind, Reduce, rep.int, rownames, sapply, setdiff, sort,
##     table, tapply, union, unique, unlist, unsplit
## 
## Loading required package: S4Vectors
## Loading required package: stats4
## Loading required package: IRanges
## Loading required package: GenomeInfoDb
library('data.table')
library('rCharts')
library('knitr')
library('ggplot2')
library('devtools')
opts_chunk$set(bootstrap.show.code = FALSE, dev = 'CairoPNG')

Load data

## Check that data was loaded
stopifnot(all(c('fullRegions', 'regionMat', 'analysisPath') %in% ls()))

## Fix DERs
ders <- fullRegions
names(ders) <- NULL

## Fix region matrices
regs <- unlist(GRangesList(lapply(regionMat, '[[', 'regions')))
names(regs) <- NULL

## Assign seqlengths
data(hg19Ideogram, package = 'biovizBase', envir = environment())
seqlengths(ders) <- seqlengths(hg19Ideogram)[names(seqlengths(ders))]
seqlengths(regs) <- seqlengths(hg19Ideogram)[names(seqlengths(regs))]

## Sort
ders <- sort(ders)
regs <- sort(regs)

## Check names match
identical(seqlengths(regs), seqlengths(ders))
## [1] TRUE
## Save the raw data
save(ders, file = file.path(analysisPath, 'dersOriginal.Rdata'))
save(regs, file = file.path(analysisPath, 'regsOriginal.Rdata'))

## Filter out those with less than 3bp
c('# ders under 3bp' = sum(width(ders) < 3), '# regs under 3bp' = sum(width(regs) < 3))
## # ders under 3bp # regs under 3bp 
##           227573            21484
ders <- ders[width(ders) >= 3]
regs <- regs[width(regs) >= 3]

## Save data used
save(ders, file = file.path(analysisPath, 'ders.Rdata'))
save(regs, file = file.path(analysisPath, 'regs.Rdata'))

Construct logical indexes for DERs and regionMatrix regions.

## Construct logical Rle indexes for bases with some region
build_index <- function(gr) {
    res <- lapply(names(seqlengths(gr)), function(chr) {
        chr.len <- seqlengths(gr)[chr]
        ir <- sort(ranges(gr[seqnames(gr) == chr]))
        log <- c(rep(c(FALSE, TRUE), length(ir)), FALSE)
    
        starts <- ends <- rep(NA, length(ir) * 2)
        i <- rep(c(TRUE, FALSE), length(ir))
        starts[i] <- start(ir)
        ends[i] <- end(ir)
    
        starts[!i] <- ends[i] + 1
    
        if(max(ends, na.rm = TRUE) < chr.len) {
            ends[!i] <- c(starts[i] - 1, chr.len)[-1]
        } else {
            ends[!i] <- c(starts[i] - 1, NULL)[-1]
            starts <- starts[- length(starts)]
            log <- log[- length(log)]
        }
   
        if(starts[1] != 1) {
            ends <- c(starts[1] - 1, ends)
            starts <- c(1, starts)
        } else {
            log <- log[-1]
        }
   
        widths <- mapply(function(s, e) { e - s + 1}, starts, ends)
    
        Rle(log, widths)
    })
    names(res) <- names(seqlengths(gr))
    return(res)
}
index.ders <- build_index(ders)
index.regs <- build_index(regs)

## Add info for chrs where there are no regs
miss <- !paste0('chr', c(1:22, 'X', 'Y')) %in% names(index.regs)
names(miss) <- paste0('chr', c(1:22, 'X', 'Y'))
if(any(miss)) {
    miss.add <- lapply(names(miss)[miss], function(x) { 
        Rle(FALSE, seqlengths(hg19Ideogram)[x]) 
    })
    names(miss.add) <- names(miss)[miss]
    index.regs <- c(index.regs, miss.add)
    index.regs <- index.regs[match(names(miss), names(index.regs))]
}

## Add info for chrs where there are no DERs
miss <- !names(index.regs) %in% names(index.ders)
if(any(miss)) {
    miss.add <- lapply(names(index.regs)[miss], function(x) { 
        Rle(FALSE, length(index.regs[[x]])) 
    })
    names(miss.add) <- names(index.regs)[miss]
    index.ders <- c(index.ders, miss.add)
    index.ders <- index.ders[match(names(index.regs), names(index.ders))]
}

Compare

Visually explore

library('epivizr')
mgr <- startEpiviz()
ders_dev <- mgr$addDevice(ders[!as.logical(ders$significantFWER)], "DERs no sig FWER")
ders_sig_dev <- mgr$addDevice(ders[as.logical(ders$significantFWER)], "DERs sig FWER")
regs_dev <- mgr$addDevice(regs, "Region Matrix")

## SOX11
mgr$navigate("chr2", 5810000, 5850000)

## MEX3A
mgr$navigate("chr1", 156040000, 156090000)

## VASH2
mgr$navigate("chr1", 213120000, 213170000)

## TG:
mgr$navigate("chr8", 134040000, 134120000)

## IGF2BP2
mgr$navigate("chr3", 185350000, 185410000)

## FBN3
mgr$navigate("chr19", 8130000, 8180000)

## End
mgr$stopServer()

Basic comparison

Number of regions

## Number of regions
c('ders' = length(ders), 'regs' = length(regs))
##   ders   regs 
## 552101 233159

Summary on width of regions

## Size of regions
c('ders' = summary(width(ders)), 'regs' = summary(width(regs)))
##    ders.Min. ders.1st Qu.  ders.Median    ders.Mean ders.3rd Qu. 
##          3.0          7.0         27.0        104.1        102.0 
##    ders.Max.    regs.Min. regs.1st Qu.  regs.Median    regs.Mean 
##      12620.0          3.0         54.0        108.0        218.1 
## regs.3rd Qu.    regs.Max. 
##        177.0      15150.0

Compare indexes

Base-pairs

Number of base-pairs in each index. Summary first, then overall info for the genome (in number of bases, then in percent of the genome), and finally results in interactive table.

## Merge all the indexes
index.all <- mapply(function(der, reg) {
    both <- der & reg
    only.der <- der & !reg
    only.reg <- !der & reg
    none <- !der & !reg
    
    res <- list('both' = both, 'only.der' = only.der, 'only.reg' = only.reg,
        'none' = none, 'all.der' = der, 'all.reg' = reg)
    return(list(res))
}, index.ders, index.regs)

## Find number of base-pairs in each index
index.num <- data.frame(do.call(rbind, lapply(index.all, function(x) { sapply(x, sum)})))
index.num$chrLen <- seqlengths(ders)
index.num$chr <- rownames(index.num)
rownames(index.num) <- NULL

## Print info
summary(index.num)
##       both            only.der          only.reg            none          
##  Min.   :   8776   Min.   :  12432   Min.   :  33554   Min.   : 47302618  
##  1st Qu.:1066843   1st Qu.: 623248   1st Qu.: 518222   1st Qu.: 77165660  
##  Median :1439537   Median : 960310   Median : 622103   Median :130559524  
##  Mean   :1467604   Mean   : 926149   Mean   : 650851   Mean   :125941954  
##  3rd Qu.:1874161   3rd Qu.:1159444   3rd Qu.: 843508   3rd Qu.:158570338  
##  Max.   :3522803   Max.   :2224470   Max.   :1536102   Max.   :241967246  
##     all.der           all.reg            chrLen         
##  Min.   :  21208   Min.   :  42330   Min.   : 48129895  
##  1st Qu.:1718933   1st Qu.:1622989   1st Qu.: 80415720  
##  Median :2412340   Median :2056512   Median :134429206  
##  Mean   :2393754   Mean   :2118456   Mean   :128986559  
##  3rd Qu.:3014595   3rd Qu.:2733104   3rd Qu.:162132764  
##  Max.   :5747273   Max.   :5058905   Max.   :249250621  
##      chr           
##  Length:24         
##  Class :character  
##  Mode  :character  
##                    
##                    
## 
## Overall info
overallInfo <- colSums(index.num[, -ncol(index.num)])
overallInfo
##       both   only.der   only.reg       none    all.der    all.reg 
##   35222504   22227584   15620431 3022606893   57450088   50842935 
##     chrLen 
## 3095677412
## Overall info in percent
overallInfo / sum(as.numeric(index.num$chrLen)) * 100
##        both    only.der    only.reg        none     all.der     all.reg 
##   1.1377963   0.7180200   0.5045885  97.6395952   1.8558164   1.6423848 
##      chrLen 
## 100.0000000
d1 <- data.table(data.frame(row = seq_len(nrow(index.num)), index.num, check.names=FALSE))
t1 <- dTable(d1, sPaginationType=  'full_numbers', iDisplayLength=25,
    sScrollX='100%')
t1$print("bases", cdn=TRUE)

Segments per index

Number of segments per index. First summary, then results for genome, and finally an interactive table.

## Find number of segments in each index
index.seg <- data.frame(do.call(rbind, lapply(index.all, function(x) { 
    sapply(x, function(y) {
        sum(runValue(y))
    })
})))
index.seg$chr <- rownames(index.seg)
rownames(index.seg) <- NULL

## Print info
summary(index.seg)
##       both          only.der        only.reg          none      
##  Min.   :   46   Min.   :  307   Min.   :  292   Min.   :  565  
##  1st Qu.: 7911   1st Qu.:12862   1st Qu.: 6517   1st Qu.:14758  
##  Median :10458   Median :20018   Median : 8136   Median :22432  
##  Mean   :10414   Mean   :19421   Mean   : 8292   Mean   :22254  
##  3rd Qu.:13196   3rd Qu.:24145   3rd Qu.:10928   3rd Qu.:28206  
##  Max.   :24503   Max.   :43895   Max.   :19503   Max.   :50572  
##     all.der         all.reg          chr           
##  Min.   :  306   Min.   :  304   Length:24         
##  1st Qu.:15392   1st Qu.: 7326   Class :character  
##  Median :23357   Median : 9600   Mode  :character  
##  Mean   :23004   Mean   : 9715                     
##  3rd Qu.:29185   3rd Qu.:12452                     
##  Max.   :52342   Max.   :22836
## Overall info
colSums(index.seg[, -ncol(index.seg)])
##     both only.der only.reg     none  all.der  all.reg 
##   249935   466093   199006   534092   552101   233159
d2 <- data.table(data.frame(row = seq_len(nrow(index.seg)), index.seg, check.names=FALSE))
t2 <- dTable(d2, sPaginationType=  'full_numbers', iDisplayLength=25,
    sScrollX='100%')
t2$print("segments")

Segments width

Summary of the segment widths for each index. First the overall summary, then the results for each index.

## Get an idea of the width of the segments in each index
index.width <- data.frame(do.call(rbind, lapply(index.all, function(x) {
    tmp <- data.frame(do.call(rbind, lapply(x, function(y) {
        summary(runLength(y)[runValue(y)])
    })), check.names = FALSE)
    tmp$index <- names(x)
    rownames(tmp) <- NULL
    return(tmp)
})), check.names = FALSE)
index.width$chr <- rep(names(seqlengths(ders)), each = 6)
rownames(index.width) <- NULL

## Print info
summary(index.width)
##       Min.          1st Qu.          Median            Mean          
##  Min.   :1.000   Min.   : 4.00   Min.   : 12.00   Min.   :    40.50  
##  1st Qu.:1.000   1st Qu.: 5.00   1st Qu.: 24.00   1st Qu.:    78.44  
##  Median :1.000   Median : 7.50   Median : 60.00   Median :   120.85  
##  Mean   :1.674   Mean   :15.07   Mean   : 55.52   Mean   :  1765.97  
##  3rd Qu.:3.000   3rd Qu.:13.00   3rd Qu.: 86.00   3rd Qu.:   218.38  
##  Max.   :3.000   Max.   :63.00   Max.   :121.00   Max.   :105000.00  
##     3rd Qu.             Max.             index          
##  Min.   :  41.00   Min.   :     642   Length:144        
##  1st Qu.:  88.75   1st Qu.:    5098   Class :character  
##  Median : 134.40   Median :    7788   Mode  :character  
##  Mean   : 247.36   Mean   : 1563425                     
##  3rd Qu.: 176.25   3rd Qu.:    9829                     
##  Max.   :1403.00   Max.   :30230000                     
##      chr           
##  Length:144        
##  Class :character  
##  Mode  :character  
##                    
##                    
## 
d3 <- data.table(data.frame(row = seq_len(nrow(index.width)), index.width, check.names=FALSE))
t3 <- dTable(d3, sPaginationType=  'full_numbers', iDisplayLength=25,
    sScrollX='100%')
t3$print("widths")

Overlaps

Minimum 20 bp

ov20 <- findOverlaps(ders, regs, minoverlap = 20L)

counts <- list()
for(type in c("any", "within", "equal")) {
    ct.ders <- countOverlaps(ders, regs, minoverlap = 20L, type = type)
    plot(log10(table(ct.ders)), main = paste("DERs in regs for type", type))
    ct.regs <- countOverlaps(regs, ders, minoverlap = 20L, type = type)
    plot(log10(table(ct.regs)), main = paste("Regs in DERs for type", type))
    counts <- c(counts, list(table(ct.ders), table(ct.regs)))
}
cts <- as.integer(unique(unlist(lapply(counts, names))))

nOverlap20 <- do.call(rbind, lapply(counts, function(x) { 
    df <- data.frame(nOverlap = cts, freq = x[match(cts, names(x))],
        row.names = seq_len(length(cts)))
    df$observed <- !is.na(df$freq)
    df$freq[is.na(df$freq)] <- 0
    df$cumFreq <- cumsum(df$freq)
    df$cumPerc <- df$cumFreq / max(df$cumFreq) * 100
    return(df)
}))
nOverlap20$type <- factor(rep(c('any', 'within', 'equal'), each = length(cts) * 2), levels = c('any', 'within', 'equal'))
nOverlap20$match <- rep(rep(c('DERs-in-regs', 'regs-in-DERs'), each = length(cts)), 3)
#nOverlap <- nOverlap[complete.cases(nOverlap), ]
rownames(nOverlap20) <- NULL

nOverlap20$alpha <- ifelse(nOverlap20$observed, 1, 1/3)

Summary plots showing cumulative frequency and cumulative percent.

## Make a nice plot
ggplot(data = nOverlap20, aes(x = nOverlap, y = cumFreq, colour = match, alpha = alpha)) + geom_point() + facet_grid( . ~ type )# + geom_smooth(se=FALSE)
## Show cumulative percents
ggplot(data = nOverlap20, aes(x = nOverlap, y = cumPerc, colour = match, linetype = match)) + geom_line(lwd=1) + facet_grid( . ~ type )

Some important numbers: percent of regions with width < 20 bp, base level agreement, region level agreement (min overlap 20 bp).

## Percent with widths < 20L
small <- c('ders' = sum(width(ders) < 20) / length(ders), 'regs' = sum(width(regs) < 20) / length(regs)) * 100
data.frame('under-20' = small, '20-and-above' = 100 - small, check.names = FALSE)
##      under-20 20-and-above
## ders 44.39948     55.60052
## regs 14.71742     85.28258
## Base level agreement
c('regs' = overallInfo['both'] / (overallInfo['both'] + overallInfo['only.reg']) * 100, 'ders' = overallInfo['both'] / (overallInfo['both'] + overallInfo['only.der']) * 100)
## regs.both ders.both 
##  69.27709  61.30975
## Overlap (min 20) agreement
c('regs' = 100 - subset(nOverlap20, match == 'regs-in-DERs' & nOverlap == 0 & type == 'any')$cumPerc, 'ders' = 100 - subset(nOverlap20, match == 'DERs-in-regs' & nOverlap == 0 & type == 'any')$cumPerc)
##     regs     ders 
## 62.84038 28.67537

Minimum 1 bp

ov1 <- findOverlaps(ders, regs, minoverlap = 1L)

counts <- list()
for(type in c("any", "within", "equal")) {
    ct.ders <- countOverlaps(ders, regs, minoverlap = 1L, type = type)
    plot(log10(table(ct.ders)), main = paste("DERs in regs for type", type))
    ct.regs <- countOverlaps(regs, ders, minoverlap = 1L, type = type)
    plot(log10(table(ct.regs)), main = paste("Regs in DERs for type", type))
    counts <- c(counts, list(table(ct.ders), table(ct.regs)))
}
cts <- as.integer(unique(unlist(lapply(counts, names))))

nOverlap1 <- do.call(rbind, lapply(counts, function(x) { 
    df <- data.frame(nOverlap = cts, freq = x[match(cts, names(x))], 
        row.names = seq_len(length(cts)))
    df$observed <- !is.na(df$freq)
    df$freq[is.na(df$freq)] <- 0
    df$cumFreq <- cumsum(df$freq)
    df$cumPerc <- df$cumFreq / max(df$cumFreq) * 100
    return(df)
}))
nOverlap1$type <- factor(rep(c('any', 'within', 'equal'), each = length(cts) * 2), levels = c('any', 'within', 'equal'))
nOverlap1$match <- rep(rep(c('DERs-in-regs', 'regs-in-DERs'), each = length(cts)), 3)
#nOverlap <- nOverlap[complete.cases(nOverlap), ]
rownames(nOverlap1) <- NULL

nOverlap1$alpha <- ifelse(nOverlap1$observed, 1, 1/3)

## Overlap (min 1bp) agreement
c('regs' = 100 - subset(nOverlap1, match == 'regs-in-DERs' & nOverlap == 0 & type == 'any')$cumPerc, 'ders' = 100 - subset(nOverlap1, match == 'DERs-in-regs' & nOverlap == 0 & type == 'any')$cumPerc)
##     regs     ders 
## 74.47150 39.29372

Summary plots showing cumulative frequency and cumulative percent.

## Make a nice plot
ggplot(data = nOverlap1, aes(x = nOverlap, y = cumFreq, colour = match, alpha = alpha)) + geom_point() + facet_grid( . ~ type )# + geom_smooth(se=FALSE)
## Show cumulative percents
ggplot(data = nOverlap1, aes(x = nOverlap, y = cumPerc, colour = match, linetype = match)) + geom_line(lwd=1) + facet_grid( . ~ type )

Save results

save(index.all, index.num, index.seg, index.width, nOverlap20, ov20, nOverlap1, ov1, overallInfo, file = file.path(analysisPath, "comparison-results.Rdata"))

Reproducibility

Analysis path: /dcl01/lieber/ajaffe/derRuns/derSoftware/brainspan/regionMatrix-vs-DERs/cut0.1-vs-run4-v1.0.10

Re-make the report

# Load fullRegions.Rdata and regionMat.Rdata before this step
library('rmarkdown')
library('knitrBootstrap')
render('step7-regMatVsDERs.Rmd')

Date the report was generated.

## [1] "2014-12-17 22:37:53 EST"

R session information.

## Session info-----------------------------------------------------------------------------
##  setting  value                                      
##  version  R version 3.1.1 Patched (2014-10-16 r66782)
##  system   x86_64, linux-gnu                          
##  ui       X11                                        
##  language (EN)                                       
##  collate  en_US.UTF-8                                
##  tz       
## Packages---------------------------------------------------------------------------------
##  package        * version date       source                                   
##  BiocGenerics   * 0.12.1  2014-11-15 Bioconductor                             
##  Cairo            1.5.6   2014-06-26 CRAN (R 3.1.0)                           
##  chron            2.3.45  2014-02-11 CRAN (R 3.1.1)                           
##  colorspace       1.2.4   2013-09-30 CRAN (R 3.1.0)                           
##  data.table     * 1.9.4   2014-10-02 CRAN (R 3.1.1)                           
##  devtools       * 1.6.1   2014-10-07 CRAN (R 3.1.1)                           
##  digest           0.6.6   2014-12-10 CRAN (R 3.1.1)                           
##  evaluate         0.5.5   2014-04-29 CRAN (R 3.1.0)                           
##  formatR          1.0     2014-08-25 CRAN (R 3.1.1)                           
##  GenomeInfoDb   * 1.2.3   2014-11-15 Bioconductor                             
##  GenomicRanges  * 1.18.3  2014-11-20 Bioconductor                             
##  ggplot2        * 1.0.0   2014-05-21 CRAN (R 3.1.0)                           
##  gtable           0.1.2   2012-12-05 CRAN (R 3.1.0)                           
##  htmltools        0.2.6   2014-09-08 CRAN (R 3.1.1)                           
##  IRanges        * 2.0.1   2014-12-13 Bioconductor                             
##  knitr          * 1.8     2014-11-11 CRAN (R 3.1.1)                           
##  knitrBootstrap * 1.0.0   2014-11-19 Github (jimhester/knitrBootstrap@76c41f0)
##  labeling         0.3     2014-08-23 CRAN (R 3.1.1)                           
##  lattice          0.20.29 2014-04-04 CRAN (R 3.1.1)                           
##  markdown         0.7.4   2014-08-24 CRAN (R 3.1.1)                           
##  MASS             7.3.35  2014-09-30 CRAN (R 3.1.1)                           
##  mime             0.2     2014-09-26 CRAN (R 3.1.1)                           
##  munsell          0.4.2   2013-07-11 CRAN (R 3.1.0)                           
##  plyr             1.8.1   2014-02-26 CRAN (R 3.1.0)                           
##  proto            0.3.10  2012-12-22 CRAN (R 3.1.0)                           
##  rCharts        * 0.4.5   2014-11-17 Github (ramnathv/rCharts@929875d)        
##  Rcpp             0.11.3  2014-09-29 CRAN (R 3.1.1)                           
##  reshape2         1.4.1   2014-12-06 CRAN (R 3.1.1)                           
##  RJSONIO          1.3.0   2014-07-28 CRAN (R 3.1.1)                           
##  rmarkdown      * 0.3.3   2014-09-17 CRAN (R 3.1.1)                           
##  rstudioapi       0.1     2014-03-27 CRAN (R 3.1.1)                           
##  S4Vectors      * 0.4.0   2014-10-15 Bioconductor                             
##  scales           0.2.4   2014-04-22 CRAN (R 3.1.0)                           
##  stringr          0.6.2   2012-12-06 CRAN (R 3.1.0)                           
##  whisker          0.3.2   2013-04-28 CRAN (R 3.1.0)                           
##  XVector          0.6.0   2014-10-15 Bioconductor                             
##  yaml             2.1.13  2014-06-12 CRAN (R 3.1.1)