First we load the necessary libraries, along with a set of utility functions.
library(stringr)
library(magrittr)
library(dplyr)
library(SummarizedExperiment)
library(ggplot2)
library(assertthat)
library(RColorBrewer)
library(BSgenome.Hsapiens.UCSC.hg38)
library(doParallel)
ncores <- getOption("mc.cores", default=parallel::detectCores(logical = FALSE))
options(mc.cores=ncores)
registerDoParallel(cores=ncores)
library(BiocParallel)
register(DoparParam())
options(future.globals.maxSize=4 * 1024^3)
library(future)
plan(multicore)
source(file.path(params$basedir, "scripts/utilities.R"))
First we load the consensus peaks called from the reads pooled from all samples.
histone.marks <- c("H3K4me3", "H3K4me2", "H3K27me3") %>% setNames(., .)
peakfiles <- file.path(
params$basedir, "peak_calls", "epic_hg38.analysisSet",
str_c(histone.marks, "_condition.ALL_donor.ALL"),
"peaks_noBL_IDR.narrowPeak") %>% setNames(histone.marks)
assert_that(all(file.exists(peakfiles)))
## [1] TRUE
allpeaks <- bplapply(peakfiles, . %>% read.narrowPeak %>% as("GRanges") %>%
assign_into(seqinfo(.), seqinfo(BSgenome.Hsapiens.UCSC.hg38)[seqlevels(.)]))
We begin by selecting only peaks with an IDR value of 0.2 or less, then converting them to a data frame in preparation for plotting.
idr.threshold <- 0.2
genome.size <- seqlengths(seqinfo(allpeaks[[1]])) %>% as.numeric %>% sum
peaks <- lapply(allpeaks, . %>% .[.$qValue >= -log10(idr.threshold)])
peaktable <- lapply(names(peaks), function(i) cbind(HistoneMark=i, as.data.frame(peaks[[i]]))) %>%
do.call(what=rbind)
Now we examine the distribution of peak widths for each histone mark.
q95 <- quantile(peaktable$width, .95)
p <- ggplot(peaktable) +
aes(x=width) +
geom_histogram(binwidth=200, boundary=0) +
facet_wrap(~HistoneMark, ncol=1) +
ggtitle("Histogram of peak widths")
ggprint(p + labs(subtitle="Full distribution"))
ggprint(p + coord_cartesian(xlim=c(0, q95)) +
labs(subtitle="Bottom 95%"))
ggprint(p + coord_cartesian(xlim=c(0, 10000)) +
labs(subtitle="Truncated at 10000bp"))
ggprint(p + coord_cartesian(xlim=c(0, 5000)) +
labs(subtitle="Truncated at 5000bp"))