Preliminary Setup

First we load the necessary libraries, along with a set of utility functions.

library(stringr)
library(magrittr)
library(dplyr)
library(SummarizedExperiment)
library(ggplot2)
library(assertthat)
library(RColorBrewer)

library(BSgenome.Hsapiens.UCSC.hg38)

library(doParallel)
ncores <- getOption("mc.cores", default=parallel::detectCores(logical = FALSE))
options(mc.cores=ncores)
registerDoParallel(cores=ncores)
library(BiocParallel)
register(DoparParam())

options(future.globals.maxSize=4 * 1024^3)
library(future)
plan(multicore)

source(file.path(params$basedir, "scripts/utilities.R"))

Data Loading and Preprocessing

First we load the consensus peaks called from the reads pooled from all samples.

histone.marks <- c("H3K4me3", "H3K4me2", "H3K27me3") %>% setNames(., .)
peakfiles <- file.path(
    params$basedir, "peak_calls", "epic_hg38.analysisSet",
    str_c(histone.marks, "_condition.ALL_donor.ALL"),
    "peaks_noBL_IDR.narrowPeak") %>% setNames(histone.marks)
assert_that(all(file.exists(peakfiles)))

## [1] TRUE

allpeaks <- bplapply(peakfiles, . %>% read.narrowPeak %>% as("GRanges") %>%
        assign_into(seqinfo(.), seqinfo(BSgenome.Hsapiens.UCSC.hg38)[seqlevels(.)]))

Peak width analysis

We begin by selecting only peaks with an IDR value of 0.2 or less, then converting them to a data frame in preparation for plotting.

idr.threshold <- 0.2
genome.size <- seqlengths(seqinfo(allpeaks[[1]])) %>% as.numeric %>% sum
peaks <- lapply(allpeaks, . %>% .[.$qValue >= -log10(idr.threshold)])
peaktable <- lapply(names(peaks), function(i) cbind(HistoneMark=i, as.data.frame(peaks[[i]]))) %>% 
    do.call(what=rbind)

Now we examine the distribution of peak widths for each histone mark.

q95 <- quantile(peaktable$width, .95)
p <- ggplot(peaktable) + 
    aes(x=width) + 
    geom_histogram(binwidth=200, boundary=0) + 
    facet_wrap(~HistoneMark, ncol=1) + 
    ggtitle("Histogram of peak widths")
ggprint(p + labs(subtitle="Full distribution"))

ggprint(p + coord_cartesian(xlim=c(0, q95)) +
            labs(subtitle="Bottom 95%"))

ggprint(p + coord_cartesian(xlim=c(0, 10000)) + 
            labs(subtitle="Truncated at 10000bp"))

ggprint(p + coord_cartesian(xlim=c(0, 5000)) + 
            labs(subtitle="Truncated at 5000bp"))

Exploration of CD4 ChIP-Seq Peak Sizes

Ryan C. Thompson

March 16, 2017

Preliminary Setup

Data Loading and Preprocessing

Peak width analysis