Preliminary Setup

First we load the necessary libraries, along with a set of utility functions.

library(stringr)
library(magrittr)
library(openxlsx)
library(SummarizedExperiment)
library(plyr)    # llply() is used below; loaded before dplyr to avoid masking its verbs
library(dplyr)
library(edgeR)
library(limma)
library(csaw)
library(sva)
library(ggplot2)
library(scales)
library(GGally)
library(ggalt)
library(ggthemes)
library(splines)
library(reshape2)
library(assertthat)
library(ggfortify)
library(broom)
library(ks)
library(RColorBrewer)
library(qvalue)    # pi0est() is used below

library(BSgenome.Hsapiens.UCSC.hg38)

library(doParallel)
ncores <- getOption("mc.cores", default=parallel::detectCores(logical = FALSE))
options(mc.cores=ncores)
registerDoParallel(cores=ncores)
library(BiocParallel)
register(DoparParam())

options(future.globals.maxSize=4 * 1024^3)
library(future)
plan(multicore)

source(file.path(params$basedir, "scripts/utilities.R"))

# Required in order to use DGEList objects with future
length.DGEList <- function(x) {
    length(unclass(x))
}
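
As a quick check of what this method accomplishes, the toy example below (made up purely for illustration) shows that length() on a minimal DGEList now counts its underlying list components:

# Toy object for illustration only; a minimal DGEList holds just its counts
# and samples components, so length() should return 2 here.
d.toy <- DGEList(matrix(rpois(20, lambda=10), nrow=5, ncol=4))
length(d.toy)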

Data Loading and Preprocessing

First we load the consensus peaks called from the reads pooled from all samples. This consensus peak set is not biased toward or against any sample or condition, and therefore the peak significance is expected to be independent of any differential binding in that peak.

peakfile <- file.path(
    params$basedir, "peak_calls", "epic_hg38.analysisSet",
    str_c(params$histone_mark, "_condition.ALL_donor.ALL"),
    "peaks_noBL_IDR.narrowPeak")
allpeaks <- {
    read.narrowPeak(peakfile) %>% as("GRanges") %>%
        assign_into(seqinfo(.), seqinfo(BSgenome.Hsapiens.UCSC.hg38)[seqlevels(.)])
}

Now we load the ChIP-seq read count data set from an RDS file containing a SummarizedExperiment object and set its column names to the sample names. We then filter out any window with fewer than one count per sample on average (i.e. a total count below the number of samples). This is a very mild filtering criterion, but it often eliminates many windows, greatly easing the computational burden of determining the real filtering threshold later.

sexpfile <-
    file.path(params$basedir, "saved_data",
              with(params, sprintf("csaw-counts-%s-windows-%s-reads-%s.RDS", window_size, fragment_length, histone_mark)))
full.sexp <- readRDS(sexpfile)
colnames(full.sexp) <- colData(full.sexp)$SampleName
sexp <- full.sexp %>% .[rowSums(assay(.)) >= ncol(.),]
# Expected number of window counts per read, due to each extended read overlapping multiple windows.
# NOTE: Assumes windows exactly tile the genome (no overlaps, no gaps).
colData(sexp)$CountDupFactor <- (colData(sexp)$ext - 1) / median(width(rowRanges(sexp))) + 1
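
To make the formula concrete, here is a worked example with made-up numbers (the real values of ext and the median window width come from the data above): a read extended to 150 bp and counted against 100 bp tiling windows overlaps (150 - 1)/100 + 1 = 2.49 windows on average, so each read is expected to contribute about 2.5 window counts.

# Illustration only, with assumed values: ext = 150 bp, window width = 100 bp
(150 - 1) / 100 + 1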

We extract the sample metadata from the SummarizedExperiment's colData. We set all multi-level factors to use a sum-to-zero variant of the treatment-contrast coding, which will make it easier to subtract batch effects later.

sample.table <- colData(sexp) %>%
    as.data.frame %>% autoFactorize %>%
    mutate(days_after_activation=time_point %>% str_extract("\\d+$") %>% as.numeric(),
           time_point=factor(days_after_activation) %>% `levels<-`(sprintf("D%s", levels(.))),
           group=interaction(cell_type, time_point, sep="")) %>%
    autoFactorize %>%
    set_rownames(colnames(sexp))
for (i in names(sample.table)) {
    if (is.factor(sample.table[[i]]) && nlevels(sample.table[[i]]) > 1) {
        sample.table[[i]] %<>% C(code_control_named(levels(.)))
    }
}
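
The coding function code_control_named() is defined in utilities.R, so its exact implementation is not shown here; the sketch below illustrates the general idea under that assumption. With this kind of coding, setting a factor's coefficients to zero corresponds to averaging over its levels rather than selecting the reference level, which is what makes subtracting batch effects straightforward later.

# Sketch only -- code_control_named() may differ in detail (e.g. named dimnames).
# Centering each column of contr.treatment() yields a coding whose intercept is
# the unweighted mean of all levels, while each remaining coefficient is still
# the difference between one level and the reference level.
k <- 3
contr.treatment(k) - 1/k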

Peak and Window Filtering

We first select all peaks with an IDR value of 0.2 or less. This is quite a relaxed threshold, chosen to ensure that even peaks present in only one condition are included.

idr.threshold <- 0.2
genome.size <- seqlengths(seqinfo(allpeaks)) %>% as.numeric %>% sum
peaks <- allpeaks[allpeaks$qValue >= -log10(idr.threshold)]
pct.covered <- width(peaks) %>% sum %>% divide_by(genome.size) %>% multiply_by(100)
mean.pct.reads <- sexp %>% subsetByOverlaps(peaks) %>% assay("counts") %>%
    colSums %>% divide_by(colData(sexp) %$% {totals * CountDupFactor}) %>% multiply_by(100) %>%
    mean
message(sprintf("Selected %i peaks at an IDR threshold of %.3g, with an average width of %.0f nucleotides and covering a total of %.3g%% of the genome, containing on average %.3g%% of reads", length(peaks), idr.threshold, mean(width(peaks)), pct.covered, mean.pct.reads))
## Selected 24278 peaks at an IDR threshold of 0.2, with an average width of 15710 nucleotides and covering a total of 12.4% of the genome, containing on average 24.6% of reads
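
For reference, the filter above treats the qValue field of the narrowPeak file as the IDR value on a -log10 scale, so the corresponding score cutoff is:

-log10(idr.threshold)    # approximately 0.7; peaks scoring at or above this are kept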

Then we select only the windows overlapping the selected peaks.

sexp %<>% subsetByOverlaps(peaks)

Now we create a DGEList from the counts.

# Extract window metadata and collapse list columns into comma-separated strings
all.window.meta <- rowRanges(sexp) %>% as.data.frame %>%
    select(-width, -strand) %>% rename(chr=seqnames)
# Convert list columns to character vectors
all.window.meta[] %<>% lapply(function(x) if (is.list(x)) sapply(x, str_c, collapse=",") else x)
rownames(all.window.meta) <- all.window.meta %$% sprintf("%s:%s-%s", chr, start, end)
dge <- asDGEList(sexp) %>%
    assign_into(.$offset, NULL) %>%
    assign_into(.$genes, all.window.meta) %>%
    set_rownames(rownames(all.window.meta))

Then we select all windows with an average read count of at least 5, converting this count threshold to the aveLogCPM scale at the mean library size.

count.threshold <- 5
filter.threshold <- aveLogCPM(count.threshold, lib.size=mean(dge$samples$lib.size))
filt <- aveLogCPM(dge) >= filter.threshold
dge <- dge[filt,]
message(sprintf("Excluding %i out of %i peak-overlapping windows (%.3g%%) with average count below %s.",
                sum(filt == FALSE), length(filt), 100*(1-mean(filt)), count.threshold))
## Excluding 232852 out of 787010 peak-overlapping windows (29.6%) with average count below 5.
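
As an illustrative sanity check, we can map the log-CPM cutoff back to the count scale at the mean library size; because aveLogCPM() adds a prior count (2 by default), the result should come out close to count.threshold + 2 rather than exactly 5.

# Back-transform the aveLogCPM cutoff to counts at the mean library size
2^filter.threshold * mean(dge$samples$lib.size) / 1e6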

Model Fitting

Now we are ready to fit a model to the data. We begin by building the design matrix, including coefficients for the interaction of cell type and time point (group).

design <- model.matrix(~0 + group, sample.table, strip.prefixes = TRUE)
colnames(design)
## [1] "NaiveD0"   "MemoryD0"  "NaiveD1"   "MemoryD1"  "NaiveD5"   "MemoryD5" 
## [7] "NaiveD14"  "MemoryD14"

Instead of incorporating all the batch effects we think might be important into the design matrix, we let SVA infer the important confounding factors from the data, and add those to the design matrix.

dge %<>% calcNormFactors()
logcpm <- cpm(dge, log=TRUE, prior.count=1)
svobj <- sva(logcpm, design)
## Number of significant surrogate variables is:  4 
## Iteration (out of 5 ):1  2  3  4  5
ggduo.dataXY(sample.table[c("cell_type", "time_point", "donor_id", "totals")], add.numbered.colnames(svobj$sv, "SV"))

svmat <- add.numbered.colnames(svobj$sv, "SV")
design.sv <- cbind(design, svmat)

With the normalization factors already computed above, we now estimate the dispersions and fit the quasi-likelihood GLM using the SV-augmented design matrix.

dge %<>% estimateDisp(design.sv, robust=TRUE)
plotBCV(dge)

fit <- glmQLFit(dge, design.sv, robust=TRUE)
plotQLDisp(fit)

Next, we define the differential modification tests we wish to perform as contrasts. Each contrast is an arithmetic expression in terms of the model coefficients.
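
For example, limma's makeContrasts() turns each such expression into a numeric contrast vector over the model coefficients, and a set of expressions into a multi-column contrast matrix (the coefficient names below are a toy subset of the real design, used only to keep the illustration self-contained):

# Toy coefficient names for illustration only
toy.coefs <- c("NaiveD0", "MemoryD0", "NaiveD1", "MemoryD1")
makeContrasts(contrasts="MemoryD1 - MemoryD0", levels=toy.coefs)
makeContrasts(contrasts=c("NaiveD1 - NaiveD0", "MemoryD1 - MemoryD0"),
              levels=toy.coefs)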

celltypes <- sort(unique(sample.table$cell_type))
all.timepoints <- unique(sample.table$time_point)
nonzero.timepoints <- setdiff(all.timepoints, "D0")

timepoint.anova.tests <- setNames(llply(celltypes, function(ct) {
    setNames(sprintf("%s%s - %sD0", ct, nonzero.timepoints, ct),
             sprintf("%s.D0v%s", ct, nonzero.timepoints))
}), nm=str_c(celltypes, ".AllT"))
timepoint.single.tests <- as.list(unlist(unname(timepoint.anova.tests)))
celltype.singlet.tests <-
    as.list(setNames(sprintf("Memory%s - Naive%s", all.timepoints, all.timepoints),
                     sprintf("NvM.%s", all.timepoints)))
celltype.allt.test <- list(NvM.AllT=unlist(celltype.singlet.tests))
factorial.singlet.tests <-
    as.list(setNames(sprintf("(Memory%s - MemoryD0) - (Naive%s - NaiveD0)",
                             nonzero.timepoints, nonzero.timepoints),
                     sprintf("Fac.%s", nonzero.timepoints)))
factorial.allt.test <- list(Fac.AllT=unlist(factorial.singlet.tests))
mi.vs.nf.test <- list(MD0vND14="MemoryD0 - NaiveD14")
sv.test <- list(SV=colnames(svmat))
alltests <- c(timepoint.anova.tests, timepoint.single.tests,
              celltype.allt.test, celltype.singlet.tests,
              factorial.allt.test, factorial.singlet.tests,
              mi.vs.nf.test,
              sv.test
              )
print(alltests)
## $Naive.AllT
##          Naive.D0vD1          Naive.D0vD5         Naive.D0vD14 
##  "NaiveD1 - NaiveD0"  "NaiveD5 - NaiveD0" "NaiveD14 - NaiveD0" 
## 
## $Memory.AllT
##           Memory.D0vD1           Memory.D0vD5          Memory.D0vD14 
##  "MemoryD1 - MemoryD0"  "MemoryD5 - MemoryD0" "MemoryD14 - MemoryD0" 
## 
## $Naive.D0vD1
## [1] "NaiveD1 - NaiveD0"
## 
## $Naive.D0vD5
## [1] "NaiveD5 - NaiveD0"
## 
## $Naive.D0vD14
## [1] "NaiveD14 - NaiveD0"
## 
## $Memory.D0vD1
## [1] "MemoryD1 - MemoryD0"
## 
## $Memory.D0vD5
## [1] "MemoryD5 - MemoryD0"
## 
## $Memory.D0vD14
## [1] "MemoryD14 - MemoryD0"
## 
## $NvM.AllT
##                 NvM.D1                 NvM.D0                 NvM.D5 
##   "MemoryD1 - NaiveD1"   "MemoryD0 - NaiveD0"   "MemoryD5 - NaiveD5" 
##                NvM.D14 
## "MemoryD14 - NaiveD14" 
## 
## $NvM.D1
## [1] "MemoryD1 - NaiveD1"
## 
## $NvM.D0
## [1] "MemoryD0 - NaiveD0"
## 
## $NvM.D5
## [1] "MemoryD5 - NaiveD5"
## 
## $NvM.D14
## [1] "MemoryD14 - NaiveD14"
## 
## $Fac.AllT
##                                          Fac.D1 
##   "(MemoryD1 - MemoryD0) - (NaiveD1 - NaiveD0)" 
##                                          Fac.D5 
##   "(MemoryD5 - MemoryD0) - (NaiveD5 - NaiveD0)" 
##                                         Fac.D14 
## "(MemoryD14 - MemoryD0) - (NaiveD14 - NaiveD0)" 
## 
## $Fac.D1
## [1] "(MemoryD1 - MemoryD0) - (NaiveD1 - NaiveD0)"
## 
## $Fac.D5
## [1] "(MemoryD5 - MemoryD0) - (NaiveD5 - NaiveD0)"
## 
## $Fac.D14
## [1] "(MemoryD14 - MemoryD0) - (NaiveD14 - NaiveD0)"
## 
## $MD0vND14
## [1] "MemoryD0 - NaiveD14"
## 
## $SV
## [1] "SV1" "SV2" "SV3" "SV4"

We now perform the differential modification tests for each contrast or set of contrasts. For a single contrast, the test is analogous to a t-test. For a multi-contrast test, we test the global null hypothesis that all contrasts are zero, which is analogous to an F-test.

window.results.tables <- bplapply(alltests, function(ct) {
    ctmat <- makeContrasts(contrasts = ct, levels=design.sv) %>% set_colnames(names(ct))
    ctest <- glmQLFTest(fit, contrast = ctmat)
    topTags(ctest, n=Inf, sort.by="none") %>% as.data.frame %>% add.qvalue
})
p <- bplapply(names(window.results.tables), function(testname) {
    pvals <- window.results.tables[[testname]]$PValue
    pi0 <- pi0est(pvals)$pi0
    p <- plotpvals(pvals, ptn = pi0) +
        labs(title=sprintf("P-value histogram for %s", testname),
             subtitle=sprintf("Est. Non-Null Prop.: %g%%", (1 - pi0) * 100))
})
ggprint(p)

Next, because we wish to control the FDR at the level of peaks rather than individual windows, we combine the results for all windows in each peak into a single peak-level p-value.

peak.results.tables <- bplapply(window.results.tables, function(tt) {
    gr <- as(tt, "GRanges")
    olap <- findOverlaps(peaks, gr)
    combineOverlaps(olap, tt) %>%
        filter(!is.na(PValue)) %>%
        arrange(PValue) %>%
        add.qvalue
})
peak.results.tables %<>% lapply(. %>% add.qvalue)

We now visualize the p-value histogram for each test.

p <- bplapply(names(peak.results.tables), function(testname) {
    pvals <- peak.results.tables[[testname]]$PValue
    pi0 <- pi0est(pvals)$pi0
    p <- plotpvals(pvals, ptn = pi0) +
        labs(title=sprintf("P-value histogram for %s", testname),
             subtitle=sprintf("Est. Non-Null Prop.: %g%%", (1 - pi0) * 100))
})
ggprint(p)

x <- sapply(peak.results.tables, . %$% FDR %>% is_weakly_less_than(0.1) %>% sum) %>% cbind(NumSignif=.)
x
##               NumSignif
## Naive.AllT         4463
## Memory.AllT        1968
## Naive.D0vD1         243
## Naive.D0vD5         932
## Naive.D0vD14        661
## Memory.D0vD1          4
## Memory.D0vD5       2168
## Memory.D0vD14        17
## NvM.AllT             93
## NvM.D1               39
## NvM.D0                5
## NvM.D5                6
## NvM.D14               2
## Fac.AllT              4
## Fac.D1                0
## Fac.D5                0
## Fac.D14               0
## MD0vND14            535
## SV                20608
# Keep all peaks significant at 10% FDR, or the top 100 peaks, whichever is more
filtered.peak.results.tables <- lapply(peak.results.tables, . %>% filter(FDR <= 0.1 | seq_len(nrow(.)) <= 100))
dir.create(file.path(params$basedir, "saved_data", "ChIP-seq"), recursive = TRUE, showWarnings = FALSE)
dir.create(file.path(params$basedir, "results", "ChIP-seq"), recursive = TRUE, showWarnings = FALSE)
saveRDS(window.results.tables, file.path(params$basedir, 'saved_data', 'ChIP-seq', sprintf('%s-window-diffmod.RDS', params$histone_mark)))
saveRDS(peak.results.tables, file.path(params$basedir, 'saved_data', 'ChIP-seq', sprintf('%s-peak-diffmod.RDS', params$histone_mark)))
save.image(file.path(params$basedir, 'saved_data', 'ChIP-seq', sprintf('%s-diffmod.rda', params$histone_mark)))
write.xlsx(filtered.peak.results.tables, file.path(params$basedir, "results", "ChIP-seq", sprintf('%s-peak-diffmod.xlsx', params$histone_mark)))