## Check that data was loaded
stopifnot(all(c('fullRegions', 'regionMat', 'analysisPath') %in% ls()))
## Fix DERs
ders <- fullRegions
names(ders) <- NULL
## Fix region matrices
regs <- unlist(GRangesList(lapply(regionMat, '[[', 'regions')))
names(regs) <- NULL
## Assign seqlengths
data(hg19Ideogram, package = 'biovizBase', envir = environment())
seqlengths(ders) <- seqlengths(hg19Ideogram)[names(seqlengths(ders))]
seqlengths(regs) <- seqlengths(hg19Ideogram)[names(seqlengths(regs))]
## Sort
ders <- sort(ders)
regs <- sort(regs)
## Check names match
identical(seqlengths(regs), seqlengths(ders))
## [1] TRUE
## Save the raw data
save(ders, file = file.path(analysisPath, 'dersOriginal.Rdata'))
save(regs, file = file.path(analysisPath, 'regsOriginal.Rdata'))
## Filter out those with less than 3bp
c('# ders under 3bp' = sum(width(ders) < 3), '# regs under 3bp' = sum(width(regs) < 3))
## # ders under 3bp # regs under 3bp
## 227573 21484
ders <- ders[width(ders) >= 3]
regs <- regs[width(regs) >= 3]
## Save data used
save(ders, file = file.path(analysisPath, 'ders.Rdata'))
save(regs, file = file.path(analysisPath, 'regs.Rdata'))
Construct logical indexes for DERs and regionMatrix regions.
## Construct logical Rle indexes for bases with some region
build_index <- function(gr) {
res <- lapply(names(seqlengths(gr)), function(chr) {
chr.len <- seqlengths(gr)[chr]
ir <- sort(ranges(gr[seqnames(gr) == chr]))
log <- c(rep(c(FALSE, TRUE), length(ir)), FALSE)
starts <- ends <- rep(NA, length(ir) * 2)
i <- rep(c(TRUE, FALSE), length(ir))
starts[i] <- start(ir)
ends[i] <- end(ir)
starts[!i] <- ends[i] + 1
if(max(ends, na.rm = TRUE) < chr.len) {
ends[!i] <- c(starts[i] - 1, chr.len)[-1]
} else {
ends[!i] <- c(starts[i] - 1, NULL)[-1]
starts <- starts[- length(starts)]
log <- log[- length(log)]
}
if(starts[1] != 1) {
ends <- c(starts[1] - 1, ends)
starts <- c(1, starts)
} else {
log <- log[-1]
}
widths <- mapply(function(s, e) { e - s + 1}, starts, ends)
Rle(log, widths)
})
names(res) <- names(seqlengths(gr))
return(res)
}
index.ders <- build_index(ders)
index.regs <- build_index(regs)
## Add info for chrs where there are no regs
miss <- !paste0('chr', c(1:22, 'X', 'Y')) %in% names(index.regs)
names(miss) <- paste0('chr', c(1:22, 'X', 'Y'))
if(any(miss)) {
miss.add <- lapply(names(miss)[miss], function(x) {
Rle(FALSE, seqlengths(hg19Ideogram)[x])
})
names(miss.add) <- names(miss)[miss]
index.regs <- c(index.regs, miss.add)
index.regs <- index.regs[match(names(miss), names(index.regs))]
}
## Add info for chrs where there are no DERs
miss <- !names(index.regs) %in% names(index.ders)
if(any(miss)) {
miss.add <- lapply(names(index.regs)[miss], function(x) {
Rle(FALSE, length(index.regs[[x]]))
})
names(miss.add) <- names(index.regs)[miss]
index.ders <- c(index.ders, miss.add)
index.ders <- index.ders[match(names(index.regs), names(index.ders))]
}
Number of base-pairs in each index. Summary first, then overall info for the genome (in number of bases, then in percent of the genome), and finally results in interactive table.
## Merge all the indexes
index.all <- mapply(function(der, reg) {
both <- der & reg
only.der <- der & !reg
only.reg <- !der & reg
none <- !der & !reg
res <- list('both' = both, 'only.der' = only.der, 'only.reg' = only.reg,
'none' = none, 'all.der' = der, 'all.reg' = reg)
return(list(res))
}, index.ders, index.regs)
## Find number of base-pairs in each index
index.num <- data.frame(do.call(rbind, lapply(index.all, function(x) { sapply(x, sum)})))
index.num$chrLen <- seqlengths(ders)
index.num$chr <- rownames(index.num)
rownames(index.num) <- NULL
## Print info
summary(index.num)
## both only.der only.reg none
## Min. : 8776 Min. : 12432 Min. : 33554 Min. : 47302618
## 1st Qu.:1066843 1st Qu.: 623248 1st Qu.: 518222 1st Qu.: 77165660
## Median :1439537 Median : 960310 Median : 622103 Median :130559524
## Mean :1467604 Mean : 926149 Mean : 650851 Mean :125941954
## 3rd Qu.:1874161 3rd Qu.:1159444 3rd Qu.: 843508 3rd Qu.:158570338
## Max. :3522803 Max. :2224470 Max. :1536102 Max. :241967246
## all.der all.reg chrLen
## Min. : 21208 Min. : 42330 Min. : 48129895
## 1st Qu.:1718933 1st Qu.:1622989 1st Qu.: 80415720
## Median :2412340 Median :2056512 Median :134429206
## Mean :2393754 Mean :2118456 Mean :128986559
## 3rd Qu.:3014595 3rd Qu.:2733104 3rd Qu.:162132764
## Max. :5747273 Max. :5058905 Max. :249250621
## chr
## Length:24
## Class :character
## Mode :character
##
##
##
## Overall info
overallInfo <- colSums(index.num[, -ncol(index.num)])
overallInfo
Number of segments per index. First summary, then results for genome, and finally an interactive table.
## Find number of segments in each index
index.seg <- data.frame(do.call(rbind, lapply(index.all, function(x) {
sapply(x, function(y) {
sum(runValue(y))
})
})))
index.seg$chr <- rownames(index.seg)
rownames(index.seg) <- NULL
## Print info
summary(index.seg)
## both only.der only.reg none
## Min. : 46 Min. : 307 Min. : 292 Min. : 565
## 1st Qu.: 7911 1st Qu.:12862 1st Qu.: 6517 1st Qu.:14758
## Median :10458 Median :20018 Median : 8136 Median :22432
## Mean :10414 Mean :19421 Mean : 8292 Mean :22254
## 3rd Qu.:13196 3rd Qu.:24145 3rd Qu.:10928 3rd Qu.:28206
## Max. :24503 Max. :43895 Max. :19503 Max. :50572
## all.der all.reg chr
## Min. : 306 Min. : 304 Length:24
## 1st Qu.:15392 1st Qu.: 7326 Class :character
## Median :23357 Median : 9600 Mode :character
## Mean :23004 Mean : 9715
## 3rd Qu.:29185 3rd Qu.:12452
## Max. :52342 Max. :22836
## Overall info
colSums(index.seg[, -ncol(index.seg)])
Summary of the segment widths for each index. First the overall summary, then the results for each index.
## Get an idea of the width of the segments in each index
index.width <- data.frame(do.call(rbind, lapply(index.all, function(x) {
tmp <- data.frame(do.call(rbind, lapply(x, function(y) {
summary(runLength(y)[runValue(y)])
})), check.names = FALSE)
tmp$index <- names(x)
rownames(tmp) <- NULL
return(tmp)
})), check.names = FALSE)
index.width$chr <- rep(names(seqlengths(ders)), each = 6)
rownames(index.width) <- NULL
## Print info
summary(index.width)
## Min. 1st Qu. Median Mean
## Min. :1.000 Min. : 4.00 Min. : 12.00 Min. : 40.50
## 1st Qu.:1.000 1st Qu.: 5.00 1st Qu.: 24.00 1st Qu.: 78.44
## Median :1.000 Median : 7.50 Median : 60.00 Median : 120.85
## Mean :1.674 Mean :15.07 Mean : 55.52 Mean : 1765.97
## 3rd Qu.:3.000 3rd Qu.:13.00 3rd Qu.: 86.00 3rd Qu.: 218.38
## Max. :3.000 Max. :63.00 Max. :121.00 Max. :105000.00
## 3rd Qu. Max. index
## Min. : 41.00 Min. : 642 Length:144
## 1st Qu.: 88.75 1st Qu.: 5098 Class :character
## Median : 134.40 Median : 7788 Mode :character
## Mean : 247.36 Mean : 1563425
## 3rd Qu.: 176.25 3rd Qu.: 9829
## Max. :1403.00 Max. :30230000
## chr
## Length:144
## Class :character
## Mode :character
##
##
##
## setting value
## version R version 3.1.1 Patched (2014-10-16 r66782)
## system x86_64, linux-gnu
## ui X11
## language (EN)
## collate en_US.UTF-8
## tz