derfinder basic results exploration

Project: run4-v1.0.10.

Introduction

This report is meant to help explore the results of the derfinder (Collado-Torres, Frazee, Jaffe, and Leek, 2014) package and was generated using regionReport (Collado-Torres, Jaffe, and Leek, 2014) package. While the report is rich, it is meant to just start the exploration of the results and exemplify some of the code used to do so. You will most likely need a more in-depth analysis for your specific data set.

Most plots were made with using ggplot2 (Wickham, 2009).

Code setup

#### Libraries needed

## Bioconductor
library('IRanges')
library('GenomicRanges')
library('GenomeInfoDb')

if(hg19) {
    library('biovizBase')
    library('TxDb.Hsapiens.UCSC.hg19.knownGene')
}

## CRAN
library('ggplot2')
library('grid')
library('gridExtra')
library('knitr')
library('RColorBrewer')
library('mgcv')
## Loading required package: nlme
## 
## Attaching package: 'nlme'
## 
## The following object is masked from 'package:IRanges':
## 
##     collapse
## 
## This is mgcv 1.8-3. For overview type 'help("mgcv-package")'.
## GitHub
library('derfinder')

## Working behind the scenes
# library('knitcitations')
# library('rmarkdown')
# library('knitrBootstrap')

#### Code setup

## For ggplot
tmp <- fullRegions
names(tmp) <- seq_len(length(tmp))
regions.df <- as.data.frame(tmp)
regions.df$width <- width(tmp)
rm(tmp)
nulls.df <- as.data.frame(fullNullSummary)

## Special subsets: need at least 3 points for a density plot
keepChr <- table(regions.df$seqnames) > 2
regions.df.plot <- subset(regions.df, seqnames %in% names(keepChr[keepChr]))

if(hasSig) {
    ## Keep only those sig
    regions.df.sig <- regions.df[idx.sig, ]
    keepChr <- table(regions.df.sig$seqnames) > 2
    regions.df.sig <- subset(regions.df.sig, seqnames %in% names(keepChr[keepChr]))
    
    if(nrow(regions.df.sig) > 0) {
        ## If there's any sig, keep those with finite areas
        if(hasArea) {
            finite.area.sig <- which(is.finite(regions.df.sig$area))
            
            regions.df.sig.area <- regions.df.sig[finite.area.sig, ]
            keepChr <- table(regions.df.sig.area$seqnames) > 2
            regions.df.sig.area <- subset(regions.df.sig.area, seqnames %in%
                names(keepChr[keepChr]))
            
            ## Save the info
            hasArea <- (nrow(regions.df.sig.area) > 0)
        }
    } else {
        hasSig <- hasArea <- FALSE
    }
}

## Get chr lengths
if(hg19) {
    data(hg19Ideogram, package = 'biovizBase')
    seqlengths(fullRegions) <- seqlengths(hg19Ideogram)[mapSeqlevels(names(seqlengths(fullRegions)),
         'UCSC')]
}

## Find which chrs are present in the data set
chrs <- levels(seqnames(fullRegions))

## Subset the fullCoverage data in case that a subset was used
colsubset <- optionsStats$colsubset
if(!is.null(fullCov) & !is.null(colsubset)) {
    fullCov <- lapply(fullCov, function(x) { x[, colsubset] })
}

## Get region coverage for the top regions
if(nBestRegions > 0) {
    if(packageVersion('derfinder') >= '0.0.60') {
        regionCoverage <- getRegionCoverage(fullCov = fullCov, 
            regions = fullRegions[seq_len(nBestRegions)],
            chrsStyle = chrsStyle, species = species,
            currentStyle = currentStyle, verbose = FALSE)
    } else {
        regionCoverage <- getRegionCoverage(fullCov = fullCov, 
            regions = fullRegions[seq_len(nBestRegions)],
            verbose = FALSE)
    }
    
    save(regionCoverage, file=file.path(workingDir, 'regionCoverage.Rdata'))
}

## Graphical setup: transcription database
if(hg19 & is.null(txdb)) {
    txdb <- TxDb.Hsapiens.UCSC.hg19.knownGene
} else {
    stopifnot(!is.null(txdb))
}

Quality checks

P-values

Theoretically, the p-values should be uniformly distributed between 0 and 1.

p1 <- ggplot(regions.df.plot, aes(x=pvalues, colour=seqnames)) +
    geom_line(stat='density') + xlim(0, 1) +
    labs(title='Density of p-values') + xlab('p-values') +
    scale_colour_discrete(limits=chrs) + theme(legend.title=element_blank())
p1
## Compare the pvalues
summary(fullRegions$pvalues)
##      Min.   1st Qu.    Median      Mean   3rd Qu.      Max. 
## 0.0000002 0.0050660 0.4483000 0.4349000 0.7890000 1.0000000

This is the numerical summary of the distribution of the p-values.

Q-values

summary(fullRegions$qvalues)
##      Min.   1st Qu.    Median      Mean   3rd Qu.      Max. 
## 0.0000217 0.0202600 0.8965000 0.6189000 1.0000000 1.0000000

This is the numerical summary of the distribution of the q-values.

qtable <- lapply(c(1e-04, 0.001, 0.01, 0.025, 0.05, 0.1, 0.2, 0.3, 0.4, 0.5,
    0.6, 0.7, 0.8, 0.9, 1), function(x) {
    data.frame('Cut' = x, 'Count' = sum(fullRegions$qvalues <= x))
})
qtable <- do.call(rbind, qtable)
kable(qtable, format = 'html', align = c('c', 'c'))
Cut Count
0.0001 38272
0.0010 114551
0.0100 183452
0.0250 198310
0.0500 207859
0.1000 218544
0.2000 237317
0.3000 254228
0.4000 271439
0.5000 289422
0.6000 307944
0.7000 328838
0.8000 354604
0.9000 392655
1.0000 779674

This table shows the number of candidate Differentially Expressed Regions (DERs) with q-value less or equal than some commonly used cutoff values.

FWER adjusted P-values

summary(fullRegions$fwer)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  0.0000  0.6260  1.0000  0.7678  1.0000  1.0000

This is the numerical summary of the distribution of the q-values.

fwertable <- lapply(c(1e-04, 0.001, 0.01, 0.025, 0.05, 0.1, 0.2, 0.3, 0.4, 0.5,
    0.6, 0.7, 0.8, 0.9, 1), function(x) {
    data.frame('Cut' = x, 'Count' = sum(fullRegions$fwer <= x))
})
fwertable <- do.call(rbind, fwertable)
kable(fwertable, format = 'html', align = c('c', 'c'))
Cut Count
0.0001 8902
0.0010 12376
0.0100 47888
0.0250 76732
0.0500 117054
0.1000 137577
0.2000 161727
0.3000 174284
0.4000 181682
0.5000 187388
0.6000 192784
0.7000 200076
0.8000 206550
0.9000 215071
1.0000 779674

This table shows the number of candidate Differentially Expressed Regions (DERs) with FWER adjusted p-values less or equal than some commonly used cutoff values.

Region width

xrange <- range(log10(regions.df.plot$width))
p2a <- ggplot(regions.df.plot, aes(x=log10(width), colour=seqnames)) + 
    geom_line(stat='density') + labs(title='Density of region lengths') +
    xlab('Region width (log10)') + scale_colour_discrete(limits=chrs) +
    xlim(xrange) + theme(legend.title=element_blank())
p2b <- ggplot(regions.df.sig, aes(x=log10(width), colour=seqnames)) +
    geom_line(stat='density') +
    labs(title='Density of region lengths (significant only)') +
    xlab('Region width (log10)') + scale_colour_discrete(limits=chrs) +
    xlim(xrange) + theme(legend.title=element_blank())
grid.arrange(p2a, p2b)

This plot shows the density of the region lengths for all regions. The bottom panel is restricted to significant regions (q-value < 0.1)

Region Area

xrange <- range(log10(regions.df.plot$area[finite.area]))
if(inf.area > 0) {
    print(paste('Dropping', inf.area, 'due to Inf values.'))
}
p3a <- ggplot(regions.df[finite.area, ], aes(x=log10(area), colour=seqnames)) +
    geom_line(stat='density') + labs(title='Density of region areas') +
    xlab('Region area (log10)') + scale_colour_discrete(limits=chrs) +
    xlim(xrange) + theme(legend.title=element_blank())
p3b <- ggplot(regions.df.sig.area, aes(x=log10(area), colour=seqnames)) +
    geom_line(stat='density') +
    labs(title='Density of region areas (significant only)') +
    xlab('Region area (log10)') + scale_colour_discrete(limits=chrs) +
    xlim(xrange) + theme(legend.title=element_blank())
grid.arrange(p3a, p3b)

This plot shows the density of the region areas for all regions. The bottom panel is restricted to significant regions (q-value < 0.1)

Null regions: width and area

p4 <- ggplot(nulls.df, aes(x=log10(width), colour=chr)) +
    geom_line(stat='density') + labs(title='Density of null region lengths') +
    xlab('Region width (log10)') + scale_colour_discrete(limits=chrs) +
    theme(legend.title=element_blank())
nulls.inf <- !is.finite(nulls.df$area)
if(sum(nulls.inf) > 0) {
    print(paste('Dropping', sum(nulls.inf), 'due to Inf values.'))
}
p5 <- ggplot(nulls.df[!nulls.inf, ], aes(x=log10(area), colour=chr)) +
    geom_line(stat='density') + labs(title='Density of null region areas') +
    xlab('Region area (log10)') + scale_colour_discrete(limits=chrs) +
    theme(legend.title=element_blank())
grid.arrange(p4, p5)

This plot shows the density of the null region lengths and areas. There were a total of 4032470 null regions.

Mean coverage

xrange <- range(log2(regions.df.plot$meanCoverage))
p6a <- ggplot(regions.df.plot, aes(x=log2(meanCoverage), colour=seqnames)) +
    geom_line(stat='density') + labs(title='Density of region mean coverage') +
    xlab('Region mean coverage (log2)') + scale_colour_discrete(limits=chrs) +
    xlim(xrange) + theme(legend.title=element_blank())
p6b <- ggplot(regions.df.sig, aes(x=log2(meanCoverage), colour=seqnames)) +
    geom_line(stat='density') +
    labs(title='Density of region mean coverage (significant only)') +
    xlab('Region mean coverage (log2)') + scale_colour_discrete(limits=chrs) +
    xlim(xrange) + theme(legend.title=element_blank())
grid.arrange(p6a, p6b)

This plot shows the density of the region mean coverage for all regions. The bottom panel is restricted to significant regions (q-value < 0.1)

Mean coverage vs fold change

The following plots are MA-style plots comparing each group vs the first one. The mean coverage is calculated using only two groups at a time and is weighted according to the number of samples on each group. Note that the mean coverage and fold change as calculated here do not taking into account the library sizes.

These plots are only shown when there are two or more groups. A total of 5 plot(s) were made.

for(j in grep('log2FoldChange', colnames(values(fullRegions)))) {
    ## Identify the groups
    groups <- strsplit(gsub('log2FoldChange', '',
        colnames(values(fullRegions))[j]), 'vs')[[1]]
    
    ## Calculate the mean coverage only using the 2 groups in question
    j.mean <- which(colnames(values(fullRegions)) %in% paste0('mean', groups))
    groups.n <- sapply(groups, function(x) { sum(optionsStats$groupInfo == x) })
    ma.mean.mat <- as.matrix(values(fullRegions)[, j.mean])
    ## Weighted means
    ma.mean <- drop(ma.mean.mat %*% groups.n) / sum(groups.n) +
        optionsStats$scalefac
    ma.fold2 <- drop(log2(ma.mean.mat + optionsStats$scalefac) %*% c(1, -1))
    
    ma <- data.frame(mean=ma.mean, log2FoldChange=ma.fold2)
    ma2 <- ma[is.finite(ma$log2FoldChange), ]
    fold.mean <- data.frame(foldMean=mean(ma2$log2FoldChange, na.rm=TRUE))
    
    p.ma <- ggplot(ma, aes(x=log2(mean), y=log2FoldChange)) +
        geom_point(size=1.5, alpha=1/5) + 
        ylab("Fold Change [log2(x + sf)]\nRed dashed line at mean; blue line is GAM fit: y ~ s(x, bs = 'cs')") +
        xlab(paste('Mean coverage [log2(x + sf)] using only groups', groups[1], 'and',
            groups[2])) + labs(title=paste('MA style plot:', groups[1], 'vs ', 
            groups[2])) + geom_hline(aes(yintercept=foldMean), data=fold.mean, 
            colour='#990000', linetype='dashed') +
        geom_smooth(aes(y=log2FoldChange, x=log2(mean)), data=subset(ma2,
            mean > 0), method = 'gam', formula = y ~ s(x, bs = 'cs'))
    print(p.ma)
}