Summary info

## Options
opt$example <- eval(parse(text=opt$example))
example <- opt$example
names(example) <- NULL


## Required libs
suppressMessages(library("GenomicRanges"))
suppressMessages(library("ggbio"))
suppressMessages(library("TxDb.Hsapiens.UCSC.hg19.knownGene"))
suppressMessages(library("derfinder"))
suppressMessages(library("derfinderPlot"))

Summary information for brainspan data set, analysis run4-v1.0.10 showcasing best clusters 5, 16, 18 which illustrate the complexity induced by alternative transcription, coverage dips, and coverage variability even on long single exon regions respectively.

Number of filtered bases

## Extract data from log files
reads <- system(paste0('grep filterData ', file.path(rootdir, opt$short, 'CoverageInfo', 'logs'), '/fullCov-*.e* | grep -v "that 0 percent"'), intern=TRUE)
filt <- data.frame(original=as.integer(gsub("were | rows", "", regmatches(reads, regexpr("were [0-9]* rows", reads)))), filtered=as.integer(gsub("are | rows", "", regmatches(reads, regexpr("are [0-9]* rows", reads)))))

## How many were filtered?
## What is the percent filtered?
## Percent remaining?
filtered <- colSums(filt)
summ <- c(
    'Filtered' = filtered["original"] - filtered["filtered"],
    'PercentFilt' = (filtered["original"] - filtered["filtered"]) / filtered["original"] * 100,
    'PercentRemaining' = 100 - (filtered["original"] - filtered["filtered"]) / filtered["original"] * 100
)
summ
##         Filtered.original      PercentFilt.original 
##              2.924750e+09              9.447850e+01 
## PercentRemaining.original 
##              5.521503e+00

Number of candidate regions

## Load regions data
load(file.path(rootdir, opt$short, 'derAnalysis', opt$run, 'fullRegions.Rdata'))

## How many candidate regions?
nRegs  <- c('cDERsN' = length(fullRegions))
nRegs
## cDERsN 
## 779674

Number of DE regions

As determined by q-value < 0.10

## How many regions DE? Judged by q-value
## What is the percent of regions DE among the candidate ones?
qval <- c(
    'nDE' = sum(fullRegions$significantQval == TRUE),
    'percentDE' = sum(fullRegions$significantQval == TRUE) / length(fullRegions) * 100
    )
qval
##          nDE    percentDE 
## 218544.00000     28.03018

As determined by FWER adjusted p-value < 0.05

## How many regions DE? Judged by FWER adjusted p-value
## What is the percent of regions DE among the candidate ones?
fwer <- c(
    'nDE' = sum(fullRegions$significantFWER == TRUE),
    'percentDE' = sum(fullRegions$significantFWER == TRUE) / length(fullRegions) * 100
    )
fwer
##          nDE    percentDE 
## 115658.00000     14.83415
## Save results
save(summ, nRegs, qval, fwer, file=file.path(resdir, "summaryResults.Rdata"))

Example regions from each case

## Load full coverage data
load(file.path(rootdir, opt$short, 'CoverageInfo', 'fullCov.Rdata'))

## Load options
load(file.path(rootdir, opt$short, 'derAnalysis', opt$run, 'chr22', 'optionsStats.Rdata'))

## For ggplot
tmp <- fullRegions
names(tmp) <- seq_len(length(tmp))
regions.df <- as.data.frame(tmp)
regions.df$width <- width(tmp)
rm(tmp)

## Select clusters by cluster area
df <- data.frame(area=fullRegions$area, clusterChr=paste0(as.integer(fullRegions$cluster), chr=as.character(seqnames(fullRegions))))
regionClustAreas <- tapply(df$area, df$clusterChr, sum)
bestArea <- sapply(names(head(sort(regionClustAreas, decreasing=TRUE), 70)), function(y) { which(df$clusterChr == y)[[1]]})

## Graphical setup: ideograms 
## Load ideogram info
data(hg19IdeogramCyto, package = "biovizBase")
ideos.set <- as.character(unique(seqnames(fullRegions[bestArea])))
p.ideos <- lapply(ideos.set, function(xx) { 
    plotIdeogram(hg19IdeogramCyto, xx)
})
names(p.ideos) <- ideos.set


## Graphical setup: transcription database
txdb <- TxDb.Hsapiens.UCSC.hg19.knownGene

## Graphical setup: main plotting function
regionClusterPlot <- function(idx, tUse="fwer") {
    ## Chr specific selections
    chr <- as.character(seqnames(fullRegions[idx]))
    p.ideo <- p.ideos[[chr]]
    covInfo <- fullCov[[chr]]
    
    ## Make the plot
    p <- plotCluster(idx, regions=fullRegions, annotation=regions.df, coverageInfo=covInfo, groupInfo=optionsStats$groupInfo, titleUse=tUse, txdb=txdb, p.ideogram=p.ideo)
    print(p)
    
    ## Save .Rdata
    save(p, file=file.path(resdir, paste0("exampleRegion", idx, ".Rdata")) )
    
    ## Save as pdf
    pdf(file=file.path(resdir, paste0("exampleRegion", idx, ".pdf")), width=20, height=10)
    print(p)
    dev.off()
    rm(p.ideo, covInfo)
    
    return(invisible(TRUE)) 
}

## Genome plots
for(idx in opt$example) {
    regionClusterPlot(bestArea[idx], "fwer")
}