## Usage:
# qrsh -l mem_free=130G,h_vmem=150G
# module load R/3.3
# mkdir -p logs
# Rscript brainspan_regionLevel.R > logs/brainspan_regionLevel_log.txt 2>&1

###
library(limma)
library(GenomicRanges)
library(derfinder)
library(bumphunter)
library('rtracklayer')
library('EnsDb.Hsapiens.v75')
library('GenomeInfoDb')
library('GenomicFeatures')
library('org.Hs.eg.db')
library('GOstats')

ss = function(x, pattern, slot=1,...) sapply(strsplit(x,pattern,...), function(y) y[slot])
splitit = function(x) split(seq(along=x),x) # splits into list

getF = function(fit, fit0, theData) {
	rss1 = rowSums((fitted(fit)-theData)^2)
	df1 = ncol(fit$coef)
	rss0 = rowSums((fitted(fit0)-theData)^2)
	df0 = ncol(fit0$coef)
	fstat = ((rss0-rss1)/(df1-df0))/(rss1/(ncol(theData)-df1))
	f_pval = pf(fstat, df1-df0, ncol(theData)-df1,lower.tail=FALSE)
	fout = cbind(fstat,df1-df0,ncol(theData)-df1,f_pval)
	colnames(fout)[2:3] = c("df1","df0")
	fout = data.frame(fout)
	return(fout)
}

# load data
load("/dcl01/lieber/ajaffe/derRuns/derSupplement/brainspan/regionMatrix/regionMat-cut0.25.Rdata")
load("/dcl01/lieber/ajaffe/derRuns/derSupplement/brainspan/derAnalysis/run5-v1.5.30/models.Rdata")

## Load single base results
if(!file.exists('rdas/summarized_BrainSpan_DERs.rda')) {
    stop("Run characterize_brainspan_DERs.R first")
} else {
    load("rdas/summarized_BrainSpan_DERs.rda")
}

## Fix models
stopifnot(nrow(models$mod) == nrow(models$mod0))
if(unique(sapply(models, nrow)) == 487) {
    models$mod <- models$mod[-bad_samples, ]
    models$mod0 <- matrix(models$mod0[-bad_samples, ], ncol = 1)
}
stopifnot(nrow(models$mod) == 484)
stopifnot(nrow(pdSpan) == 484)

sigSpan$annotation = ss(sigSpan$annotation, " ")

## add pheno info
pdSpan$fetal = ifelse(pdSpan$Age < 0, "Fetal", "Postnatal")
pdSpan$fetal = factor(pdSpan$fetal,levels=c("Postnatal","Fetal"))
pdSpan$struct = factor(pdSpan$structure_acronym, levels = c("DFC","VFC","MFC",
	"OFC","M1C","S1C", "IPC", "A1C", "STC", "ITC", "V1C", "HIP",
	"AMY", "STR", "MD", "CBC"))
ncx = as.character(pdSpan$struct)
ncx[ncx %in% c("DFC","VFC","MFC",
	"OFC","M1C","S1C", "IPC", "A1C", "STC", "ITC", "V1C")] = "NCX"
pdSpan$NCX = factor(ncx, levels = c("NCX",  "HIP",
	"AMY", "STR", "MD", "CBC"))
pdSpan$Group = with(pdSpan, paste0(NCX, ":", fetal))
pdSpan$Group = factor(pdSpan$Group, levels = 
	paste0(rep(levels(pdSpan$NCX), each=2), ":", 
		rep(c("Fetal","Postnatal"), times=6)))

# regions
regList = lapply(regionMat, function(x) x$regions)
fullRegionGR = unlist(GRangesList(regList))
print('Number of regions and MB covered: cut 0.25')
length(fullRegionGR)
sum(width(fullRegionGR))/1e6

# coverage matrix
fullRegionMat = do.call("rbind",
	lapply(regionMat, function(x) x$coverageMatrix))
## Drop bad samples
if(ncol(fullRegionMat) == 487) fullRegionMat <- fullRegionMat[, -bad_samples]

stopifnot(ncol(fullRegionMat) == 484)

## Explore single base-level DERs widths (signifcant onlhy)
summary(width(sigSpan))

## drop regions shorter than 6 bp
keepIndex=which(width(fullRegionGR) >= 6)
fullRegionGR = fullRegionGR[keepIndex]
fullRegionMat = fullRegionMat[keepIndex,]

print('Number of regions and MB covered: cut 0.25, >= 6bp')
length(fullRegionGR)
sum(width(fullRegionGR))/1e6

##### lower cutoff
load("/dcl01/lieber/ajaffe/derRuns/derSupplement/brainspan/regionMatrix/regionMat-cut0.1.Rdata")
regList1 = lapply(regionMat, function(x) x$regions)
fullRegionGR1 = unlist(GRangesList(regList1))
fullRegionMat1 = do.call("rbind",
	lapply(regionMat, function(x) x$coverageMatrix))
    
print('Number of regions and MB covered: cut 0.10')
length(fullRegionGR1)
sum(width(fullRegionGR1))/1e6

keepIndex1=which(width(fullRegionGR1) >= 6)
fullRegionGR1 = fullRegionGR1[keepIndex1]
fullRegionMat1 = fullRegionMat1[keepIndex1,]
if(ncol(fullRegionMat1) == 487) fullRegionMat1 <- fullRegionMat1[, -bad_samples]
stopifnot(ncol(fullRegionMat1) == 484)

print('Number of regions and MB covered: cut 0.10, >= 6bp')
length(fullRegionGR1)
sum(width(fullRegionGR1))/1e6

## log transform
y = log2(fullRegionMat + 1)
rownames(y) = NULL

## annotate regions based on transcriptome databases
load("/users/ajaffe/GenomicStates/GenomicState.Hsapiens.ensembl.GRCh37.p12.rda")
gs = GenomicState.Hsapiens.ensembl.GRCh37.p12$fullGenome
ensemblAnno = annotateRegions(fullRegionGR,gs)
countTable = ensemblAnno$countTable

mean(countTable$exon > 0 & countTable$intergenic == 0 & 
	countTable$intron== 0) * 100
mean(countTable$exon == 0 & (countTable$intergenic > 0 | 
	countTable$intron > 0)) * 100

## with 0.1 cutoff	
ensemblAnno1 = annotateRegions(fullRegionGR1,gs)
countTable1 = ensemblAnno1$countTable

dir.create('plots', showWarnings = FALSE)
pdf("plots/venn_counts_brainspan_regionLevel.pdf",h=5,w=6)
vennDiagram(vennCounts(countTable > 0)); mtext("Ensembl, cutoff = 0.25, ERs >= 6bp", line=1,cex=2)
vennDiagram(vennCounts(countTable1 > 0)); mtext("Ensembl, cutoff = 0.10, ERs >= 6bp", line=1,cex=2)
dev.off()

mean(countTable1$exon > 0 & countTable1$intergenic == 0 & 
	countTable1$intron== 0) * 100
mean(countTable1$exon == 0 & (countTable1$intergenic > 0 | 
	countTable1$intron > 0)) * 100

## compare to single base
fit = lmFit(y, models$mod)
fit0 = lmFit(y, models$mod0)
ff = getF(fit,fit0, y)

sum(p.adjust(ff$f_pval,"bonf") < 0.05)
mean(p.adjust(ff$f_pval,"bonf") < 0.05) * 100

sigIndex=which(p.adjust(ff$f_pval,"bonf") < 0.05)

xx = ensemblAnno$annotationList[sigIndex]
theGenes = unlist(unlist(xx)$gene)
theGenes = sapply(theGenes, function(x) x[!is.na(x)])
theGenes = theGenes[!is.na(theGenes)]

theSymbols <- unlist(unlist(xx)$symbol)
theSymbols <- sapply(theSymbols, function(x) x[!is.na(x)])
theSymbols <- theSymbols[!is.na(theSymbols)]

print("Number of unique ensembl genes, then unique genes with symbols")
length(unique(theGenes))
length(unique(theSymbols))

an = annotateNearest(fullRegionGR, sigSpan)
table(an$dist == 0)
table(an$dist == 0) / length(an$dist) * 100

print("Number of sig ER-level DERs overlapping sig single base-level DERs")
sum(ff$f_pval[an$dist==0] < 0.05/nrow(y))
print("Percent of sig ER-level DERs overlapping sig single base-level DERs")
sum(ff$f_pval[an$dist==0] < 0.05/nrow(y)) / sum(ff$f_pval < 0.05/nrow(y)) * 100
print("Percent of ER-level DERs overlapping sig single base-level DERs that are significant")
mean(ff$f_pval[an$dist==0] < 0.05/nrow(y)) * 100

print("sig SB-level DERs overlapping ER-level DERs")
an2 = annotateNearest(sigSpan, fullRegionGR)
table(an2$dist == 0)
table(an2$dist == 0) / length(an2$dist) * 100

## Load meanCoverage for sig SB-level DERs
if(!file.exists('rdas/summarized_BrainSpan_DERs_meanCov.rda')) {
    stop("Run characterize_brainspan_DERs.R first")
} else {
    load("rdas/summarized_BrainSpan_DERs_meanCov.rda")
}
stopifnot(length(sigSpan) == nrow(meanCoverage))

## Get mean coverage per sig DB-level DER
mean_meanCov <- rowMeans(meanCoverage)

print("Percent of sig SB-level DERs with mean coverage < 0.25")
table(mean_meanCov < 0.25)
table(mean_meanCov < 0.25) / length(mean_meanCov) * 100

print("Percent of sig SB-level DERs not overlapping ER-level DERs with mean coverage < 0.25")
table(mean_meanCov[an2$dist != 0] < 0.25)
mean(mean_meanCov[an2$dist != 0] < 0.25) * 100

print("Width of sig SB-level DERs not overlapping ER-level DERs, then overlapping them, then all of them")
summary(width(sigSpan[an2$dist != 0]))
summary(width(sigSpan[an2$dist == 0]))
summary(width(sigSpan))


### Run some checks
if(!file.exists('wgEncodeDacMapabilityConsensusExcludable.bed.gz')) {
    system('wget http://hgdownload.cse.ucsc.edu/goldenPath/hg19/encodeDCC/wgEncodeMapability/wgEncodeDacMapabilityConsensusExcludable.bed.gz .')
}
exclude <- import.bed('wgEncodeDacMapabilityConsensusExcludable.bed.gz')

print('Percent of significant ER-level DERs overlapping blacklisted regions')
round(sum(countOverlaps(fullRegionGR[sigIndex], exclude) > 0) / length(sigIndex) * 100, 4)

## Find lincRnas
lincs <- transcripts(EnsDb.Hsapiens.v75, filter = TxbiotypeFilter('lincRNA'))
seqlevelsStyle(lincs) <- 'UCSC'
print('Percent of significant ER-level DERs overlapping lincRNA transcripts')
round(sum(countOverlaps(fullRegionGR[sigIndex], lincs) > 0) / length(sigIndex) * 100, 2)

## Do it for all types
ov_type <- sapply(listTxbiotypes(EnsDb.Hsapiens.v75), function(type) {
    message(paste(Sys.time(), 'processing type', type))
    reg_type <- transcripts(EnsDb.Hsapiens.v75, filter = TxbiotypeFilter(type))
    seqlevelsStyle(reg_type) <- 'UCSC'
    sum(countOverlaps(fullRegionGR[sigIndex], reg_type) > 0) / length(sigIndex) * 100
})
print('Percent of significant ER-level DERs overlapping different transcript types')
sort(ov_type, decreasing = TRUE)
print('Percent of significant ER-level DERs overlapping different transcript types: rounded to 2 digits')
round(sort(ov_type, decreasing = TRUE), 2)


## GO analysis
sql_file <- "/users/ajaffe/Lieber/Projects/RNAseq/Ribozero_Compare/TxDb.Hsapiens.BioMart.ensembl.GRCh37.p12/inst/extdata/TxDb.Hsapiens.BioMart.ensembl.GRCh37.p12.sqlite"
TranscriptDb <- loadDb(sql_file)

## Fix seqlevels
seqlevels(TranscriptDb, force=TRUE) <- c(1:22,"X","Y","MT")
seqlevels(TranscriptDb) <- paste0("chr", c(1:22,"X","Y","M"))
ensGene <- genes(TranscriptDb)

# Modified from /users/ajaffe/Lieber/lieber_functions_aj.R
dogo <- function(names, Universe, goP = 0.01, cond = FALSE, ontology = 'BP'){
    gomap_names <- org.Hs.egREFSEQ2EG
    x <- unlist(mget(as.character(names), gomap_names, ifnotfound = NA))
    x <- x[!is.na(x)]    
    Universe <- unique(c(Universe, unique(x)))

    params <- new("GOHyperGParams", geneIds = unique(x),
                  universeGeneIds = Universe,
                  annotation = 'org.Hs.eg.db',
                  ontology = ontology, pvalueCutoff = goP, conditional = cond,
                  testDirection="over")
    ht <- hyperGTest(params)
    tab <- summary(ht)
    tmp1 <- geneIdsByCategory(ht)
    tmp1 <- tmp1[tab[, 1]]
    tab$IDs <- sapply(tmp1, function(y) paste(names(x)[x %in% y], collapse=";"))
    return(tab)
}


## Define universe: genes with a ER within 5kb
bg_genes <- resize(ensGene, width(ensGene) + 1e4, fix = 'center')
bg_genes <- bg_genes[countOverlaps(bg_genes, fullRegionGR, ignore.strand = TRUE) > 0]
print('Percent of genes included in the background')
round(length(bg_genes) / length(ensGene) * 100, 2)

## Define GO universe
gomap <- org.Hs.egENSEMBL2EG
bg_universe <- unlist(mget(as.character(bg_genes$gene_id), gomap, ifnotfound = NA))
print('Percent of background genes missing')
round(sum(is.na(bg_universe)) / length(bg_universe) * 100, 2)
bg_universe <- bg_universe[!is.na(bg_universe)]

library("TxDb.Hsapiens.UCSC.hg19.knownGene")
genes <- annotateTranscripts(TxDb.Hsapiens.UCSC.hg19.knownGene)
if(!file.exists('rdas/annotation.knownGene_ER.Rdata')) {
    annotation.knownGene <- matchGenes(x = fullRegionGR[sigIndex], subject = genes)
    save(annotation.knownGene, file = 'rdas/annotation.knownGene_ER.Rdata')
} else {
    load('rdas/annotation.knownGene_ER.Rdata')
}

regs_names <- unlist(strsplit(annotation.knownGene$annotation, ' '))
## Clean up
regs_names <- regs_names[!is.na(regs_names)]

if(!file.exists('rdas/go_ER.Rdata')) {
    go <- tryCatch(dogo(regs_names, bg_universe), error = function(e) return(NULL))
    save(go, file = 'rdas/go_ER.Rdata')
} else {
    load('rdas/go_ER.Rdata')
}
print('Top GO results')
options(width = 180)
dim(go)
head(go[, -8], 40)


### subset analysis ###
Index1 = which(pdSpan$fetal == "Fetal" & 
	pdSpan$NCX %in% c("HIP", "STR"))
mod1 = model.matrix(~as.character(pdSpan$NCX[Index1]))
colnames(mod1)[2] = "STR"
fit1 = lmFit(y[,Index1], mod1)
eb1 = ebayes(fit1)
sigIndex1 = order(eb1$p[,2])[1:sum(p.adjust(eb1$p[,2], "bonf") < 0.05)]
length(sigIndex1)


theGenes1 <- annotation.knownGene[sigIndex1, ]
theGenes1$annotation = ss(theGenes1$annotation, " ")

geneList1 = split(theGenes1$name, sign(eb1$t[sigIndex1,2]))
geneList1 = lapply(geneList1, unique)
print('Subset analysis results, top 50 in each direction, then number of unique genes')
lapply(geneList1, head, 50)
length(unique(unlist(geneList1)))

## Reproducibility info
library('devtools')
options(width = 120)
session_info()
Sys.time()
proc.time()