## Check that data was loaded
stopifnot(all(c('fullRegions', 'regionMat', 'analysisPath') %in% ls()))
## Fix DERs
ders <- fullRegions
names(ders) <- NULL
## Fix region matrices
regs <- unlist(GRangesList(lapply(regionMat, '[[', 'regions')))
names(regs) <- NULL
## Assign seqlengths
data(hg19Ideogram, package = 'biovizBase', envir = environment())
seqlengths(ders) <- seqlengths(hg19Ideogram)[names(seqlengths(ders))]
seqlengths(regs) <- seqlengths(hg19Ideogram)[names(seqlengths(regs))]
## Sort
ders <- sort(ders)
regs <- sort(regs)
## Check names match
identical(seqlengths(regs), seqlengths(ders))
## [1] FALSE
## Save the raw data
save(ders, file = file.path(analysisPath, 'dersOriginal.Rdata'))
save(regs, file = file.path(analysisPath, 'regsOriginal.Rdata'))
## Filter out those with less than 3bp
c('# ders under 3bp' = sum(width(ders) < 3), '# regs under 3bp' = sum(width(regs) < 3))
## # ders under 3bp # regs under 3bp
## 165 2
ders <- ders[width(ders) >= 3]
regs <- regs[width(regs) >= 3]
## Save data used
save(ders, file = file.path(analysisPath, 'ders.Rdata'))
save(regs, file = file.path(analysisPath, 'regs.Rdata'))
Construct logical indexes for DERs and regionMatrix regions.
## Construct logical Rle indexes for bases with some region
build_index <- function(gr) {
res <- lapply(names(seqlengths(gr)), function(chr) {
chr.len <- seqlengths(gr)[chr]
ir <- sort(ranges(gr[seqnames(gr) == chr]))
log <- c(rep(c(FALSE, TRUE), length(ir)), FALSE)
starts <- ends <- rep(NA, length(ir) * 2)
i <- rep(c(TRUE, FALSE), length(ir))
starts[i] <- start(ir)
ends[i] <- end(ir)
starts[!i] <- ends[i] + 1
if(max(ends, na.rm = TRUE) < chr.len) {
ends[!i] <- c(starts[i] - 1, chr.len)[-1]
} else {
ends[!i] <- c(starts[i] - 1, NULL)[-1]
starts <- starts[- length(starts)]
log <- log[- length(log)]
}
if(starts[1] != 1) {
ends <- c(starts[1] - 1, ends)
starts <- c(1, starts)
} else {
log <- log[-1]
}
widths <- mapply(function(s, e) { e - s + 1}, starts, ends)
Rle(log, widths)
})
names(res) <- names(seqlengths(gr))
return(res)
}
index.ders <- build_index(ders)
index.regs <- build_index(regs)
## Add info for chrs where there are no regs
miss <- !paste0('chr', c(1:22, 'X', 'Y')) %in% names(index.regs)
names(miss) <- paste0('chr', c(1:22, 'X', 'Y'))
if(any(miss)) {
miss.add <- lapply(names(miss)[miss], function(x) {
Rle(FALSE, seqlengths(hg19Ideogram)[x])
})
names(miss.add) <- names(miss)[miss]
index.regs <- c(index.regs, miss.add)
index.regs <- index.regs[match(names(miss), names(index.regs))]
}
## Add info for chrs where there are no DERs
miss <- !names(index.regs) %in% names(index.ders)
if(any(miss)) {
miss.add <- lapply(names(index.regs)[miss], function(x) {
Rle(FALSE, length(index.regs[[x]]))
})
names(miss.add) <- names(index.regs)[miss]
index.ders <- c(index.ders, miss.add)
index.ders <- index.ders[match(names(index.regs), names(index.ders))]
}
Number of base-pairs in each index. Summary first, then overall info for the genome (in number of bases, then in percent of the genome), and finally results in interactive table.
## Merge all the indexes
index.all <- mapply(function(der, reg) {
both <- der & reg
only.der <- der & !reg
only.reg <- !der & reg
none <- !der & !reg
res <- list('both' = both, 'only.der' = only.der, 'only.reg' = only.reg,
'none' = none, 'all.der' = der, 'all.reg' = reg)
return(list(res))
}, index.ders, index.regs)
## Find number of base-pairs in each index
index.num <- data.frame(do.call(rbind, lapply(index.all, function(x) { sapply(x, sum)})))
index.num$chrLen <- seqlengths(ders)
index.num$chr <- rownames(index.num)
rownames(index.num) <- NULL
## Print info
summary(index.num)
## both only.der only.reg none
## Min. : 0 Min. :0 Min. : 0 Min. : 48129795
## 1st Qu.: 0 1st Qu.:0 1st Qu.: 0 1st Qu.: 80415720
## Median : 0 Median :0 Median : 0 Median :134429116
## Mean : 2112 Mean :0 Mean : 2687 Mean :128981760
## 3rd Qu.: 0 3rd Qu.:0 3rd Qu.: 85 3rd Qu.:162132764
## Max. :50685 Max. :0 Max. :63897 Max. :249250621
## all.der all.reg chrLen chr
## Min. : 0 Min. : 0 Min. :51304566 Length:24
## 1st Qu.: 0 1st Qu.: 0 1st Qu.:51304566 Class :character
## Median : 0 Median : 0 Median :51304566 Mode :character
## Mean : 2112 Mean : 4798 Mean :51304566
## 3rd Qu.: 0 3rd Qu.: 85 3rd Qu.:51304566
## Max. :50685 Max. :114582 Max. :51304566
## Overall info
overallInfo <- colSums(index.num[, -ncol(index.num)])
overallInfo
Number of segments per index. First summary, then results for genome, and finally an interactive table.
## Find number of segments in each index
index.seg <- data.frame(do.call(rbind, lapply(index.all, function(x) {
sapply(x, function(y) {
sum(runValue(y))
})
})))
index.seg$chr <- rownames(index.seg)
rownames(index.seg) <- NULL
## Print info
summary(index.seg)
## both only.der only.reg none
## Min. : 0.00 Min. :0 Min. : 0.00 Min. : 1.00
## 1st Qu.: 0.00 1st Qu.:0 1st Qu.: 0.00 1st Qu.: 1.00
## Median : 0.00 Median :0 Median : 0.00 Median : 1.00
## Mean : 12.67 Mean :0 Mean : 16.79 Mean : 12.42
## 3rd Qu.: 0.00 3rd Qu.:0 3rd Qu.: 1.00 3rd Qu.: 2.00
## Max. :304.00 Max. :0 Max. :397.00 Max. :269.00
## all.der all.reg chr
## Min. : 0.00 Min. : 0.00 Length:24
## 1st Qu.: 0.00 1st Qu.: 0.00 Class :character
## Median : 0.00 Median : 0.00 Mode :character
## Mean : 12.67 Mean : 11.42
## 3rd Qu.: 0.00 3rd Qu.: 1.00
## Max. :304.00 Max. :268.00
## Overall info
colSums(index.seg[, -ncol(index.seg)])
Summary of the segment widths for each index. First the overall summary, then the results for each index.
## Get an idea of the width of the segments in each index
index.width <- data.frame(do.call(rbind, lapply(index.all, function(x) {
tmp <- data.frame(do.call(rbind, lapply(x, function(y) {
summary(runLength(y)[runValue(y)])
})), check.names = FALSE)
tmp$index <- names(x)
rownames(tmp) <- NULL
return(tmp)
})), check.names = FALSE)
index.width$chr <- rep(names(seqlengths(ders)), each = 6)
rownames(index.width) <- NULL
## Print info
summary(index.width)
## Min. 1st Qu. Median
## Min. : 1 Min. : 6 Min. : 33
## 1st Qu.: 100 1st Qu.: 100 1st Qu.: 100
## Median : 29640000 Median : 46865000 Median : 58355000
## Mean : 65166780 Mean : 66660054 Mean : 68153096
## 3rd Qu.:114350000 3rd Qu.:114350000 3rd Qu.:114350000
## Max. :249300000 Max. :249300000 Max. :249300000
## NA's :104 NA's :104 NA's :104
## Mean 3rd Qu. Max.
## Min. : 80 Min. : 80 Min. : 80
## 1st Qu.: 101 1st Qu.: 101 1st Qu.: 101
## Median : 58355000 Median : 59250000 Median : 59250000
## Mean : 68157810 Mean : 69647284 Mean : 71578531
## 3rd Qu.:114350000 3rd Qu.:114350000 3rd Qu.:114350000
## Max. :249300000 Max. :249300000 Max. :249300000
## NA's :104 NA's :104 NA's :104
## index chr
## Length:144 Length:144
## Class :character Class :character
## Mode :character Mode :character
##
##
##
##
## setting value
## version R version 3.1.1 Patched (2014-10-16 r66782)
## system x86_64, linux-gnu
## ui X11
## language (EN)
## collate en_US.UTF-8
## tz