Preliminary Setup

First we load the necessary libraries, along with a set of utility functions.

library(stringr)
library(magrittr)
library(openxlsx)
library(Biobase)
library(SummarizedExperiment)
library(dplyr)
library(edgeR)
library(limma)
library(sva)
library(ggplot2)
library(scales)
library(GGally)
library(ggalt)
library(reshape2)
library(assertthat)
library(ggfortify)
library(broom)
library(toOrdinal)
library(qvalue)
library(codingMatrices)
library(rex)

library(doParallel)
ncores <- getOption("mc.cores", default=parallel::detectCores(logical = FALSE))
registerDoParallel(cores=ncores)
library(BiocParallel)
register(DoparParam())

source(file.path(params$basedir, "scripts/utilities.R"))

Data Loading and Preprocessing

Now we’ll load the RNA-seq data set from an RDS file containing a SummarizedExperiment object, and modify it to use the sample names as column names.

sexpfile <- file.path(params$basedir, "saved_data",
                      sprintf("SummarizedExperiment_rnaseq_%s.RDS", params$dataset))
sexp <- readRDS(sexpfile)
colnames(sexp) <- colData(sexp)$SampleName

We extract the sample metadata from the SummarizedExperiment. We also tell R to use a coding matrix for each factor that puts the intercept at the mean of all factor levels when incorporating it into a design matrix.

sample.table <- colData(sexp) %>%
    as.data.frame %>% autoFactorize %>%
    rename(batch=technical_batch) %>%
    mutate(time_point=factor(days_after_activation) %>% `levels<-`(sprintf("D%s", levels(.))),
           group=interaction(cell_type, time_point, sep=""))
for (i in names(sample.table)) {
    if (is.factor(sample.table[[i]]) && nlevels(sample.table[[i]]) > 1) {
        contrasts(sample.table[[i]]) <- code_control_named(levels(sample.table[[i]]))
    }
}

Next we extract the count matrix from the SummarizedExperiment. This is made more complicated than usual by the fact that half of the samples were sequenced with a different protocol than the other half, and the two protocols produce reads with opposite strand orientations. Hence, we need the sense counts for half of the samples and the antisense counts for the other half. The appropriate strand for each sample is documented in the libType column of the sample metadata, using the library type abbreviations established by Salmon.

For SummarizedExperiments generated using tximport, this step is skipped, since the quantification tool has already been told which strand to use and only provides counts for that strand.

libtype.assayNames <- c(SF="sense.counts", SR="antisense.counts")
if (all(libtype.assayNames %in% assayNames(sexp))) {
    message("Selecting stranded counts for each sample")
    sample.table %<>% mutate(count_type=libtype.assayNames[libType])
    assay(sexp, "unstranded.counts") <- assay(sexp, "counts")
    assay(sexp, "counts") <- lapply(seq_len(nrow(sample.table)), function(i) {
        message("Using ", sample.table[i,]$count_type, " for ", colnames(sexp)[i])
        assay(sexp, sample.table[i,]$count_type %>% as.character)[,i]
    }) %>% do.call(what=cbind)
}

As a sanity check, we make sure that we selected the strand sense with the higher total count for each sample.

if (all(libtype.assayNames %in% assayNames(sexp))) {
    total.counts <- sexp %>% assays %>% sapply(colSums) %>% data.frame %>%
        mutate(SampleName=row.names(.)) %>%
        inner_join(sample.table, by="SampleName")
    total.counts %$% invisible(assert_that(all(counts == pmax(sense.counts, antisense.counts))))
}

Model Setup

Before testing for differential expression, we need to normalize the data, filter low-count genes, perform the log transformation with precision weighting, and finally subtract batch effects using ComBat.

First, we create a DGEList from the counts, copying over all the gene metadata.

## Extract gene metadata and colapse lists
all.gene.meta <- mcols(sexp) %>% ensureAtomicColumns %>% as.data.frame
dge <- DGEList(counts=assay(sexp, "counts"))
dge$genes <- all.gene.meta

Next we take care of the initial scaling normalization for sequencing depth and composition bias. We also discard any genes with all zero counts, since there is no meaningful analysis that can be done with these genes.

## Remove all genes with zero counts in all samples
nonzero <- rowSums(dge$counts) > 0
dge %<>% .[nonzero,]
dge %<>% calcNormFactors

In addition, if there is a length assay, we also use that to derive an offset matrix that corrects for differences in effective gene length between samples.

if ("length" %in% assayNames(sexp)) {
    normMat <- assay(sexp, "length") %>% divide_by(exp(rowMeans(log(.)))) %>%
        .[nonzero,]
    normCounts <- dge$counts/normMat
    lib.offsets <- log(calcNormFactors(normCounts)) + log(colSums(normCounts))
    dge$offset <- t(t(log(normMat)) + lib.offsets)
}

We plot the distribution of average log2 CPM values to verify that our chosen presence threshold is appropriate. The distribution is expected to be bimodal, with a low-abundance peak representing non-expressed genes and a high-abundance peak representing expressed genes. The chosen threshold should separate the two peaks of the bimodal distribution.

a <- aveLogCPMWithOffset(dge)
avelogcpm.presence.threshold <- -1
p <- list(
    Histogram=ggplot(data.frame(logCPM=a)) +
        aes(x=logCPM) +
        geom_histogram(aes(y=100*(..count..)/sum(..count..)), binwidth=0.25, boundary=0) +
        geom_vline(xintercept=avelogcpm.presence.threshold, color="red", linetype="dashed") +
        xlab("Average logCPM") + ylab("Percent of genes in bin") +
        coord_cartesian(xlim=quantile(a, c(0, 0.995)), ylim=c(0,10)) +
        labs(title="Average gene LogCPM distribution",
             subtitle="for genes with at least 1 read") +
        theme(plot.caption = element_text(hjust = 0)),
    ECDF=ggplot(fortify(ecdf(a))) +
        aes(x=x, y=y*100) +
        geom_step() +
        geom_vline(xintercept=avelogcpm.presence.threshold, color="red", linetype="dashed") +
        xlab("Average logCPM") + ylab("Percent of genes with smaller average logCPM") +
        coord_cartesian(xlim=quantile(a, c(0, 0.995))) +
        labs(title="Empirical Cumulative Distribution Function of gene LogCPM values",
             subtitle="for genes with at least 1 read") +
        theme(plot.caption = element_text(hjust = 0)))

ggprint(p)

The red dashed line in each plot indicates the chosen presence threshold. We now subset the DGEList to only those genes above the threshold.

dge %<>% .[a >= avelogcpm.presence.threshold,]

Next, we use the voom method with sample quality weights to prepare the data for model fitting with limma. Note that donor_id is still included in the design matrix even though we subtracted out donor differences using ComBat. This ensures that the model fitting is honest about the number of residual degrees of freedom in the model, with 3 degrees of freedom “spent” modeling the inter-donor effects.

design <- model.matrix(~0 + group + donor_id, sample.table, strip.prefixes = TRUE)
elist.nc <- voomWithQualityWeightsAndOffset(dge, design, plot=TRUE)

Next we use ComBat to perform batch correction, which performs empirical Bayes shrinkage of the batch correction parameters. We use a ComBat’s non-parametric mode due to the poor fit of the parametric assumptions for the variance distribution shown by the QC plot.

design.cb <- model.matrix(~cell_type + donor_id, sample.table, strip.prefixes = TRUE)
elist.cb <- elist.nc
invisible(suppressMessages(ComBat(elist.cb$E, batch=sample.table$batch, mod=design.cb, par.prior=TRUE, prior.plots=TRUE)))
## Found 2 batches
## Adjusting for 4 covariate(s) or covariate level(s)
## Standardizing Data across genes
## Fitting L/S model and finding priors

## Finding parametric adjustments
## Adjusting the Data
elist.cb$E <- ComBat(elist.cb$E, batch=sample.table$batch, mod=design.cb, par.prior=FALSE)
## Found 2 batches
## Adjusting for 4 covariate(s) or covariate level(s)
## Standardizing Data across genes
## Fitting L/S model and finding priors
## Finding nonparametric adjustments
## Adjusting the Data

To make sure that the batch correction worked, we make MDS plots before and after.

dmat.nc <- suppressPlot(plotMDS(elist.nc)$distance.matrix) %>% as.dist
mds.nc<- cmdscale(dmat.nc, k=attr(dmat.nc, "Size") - 1, eig=TRUE)
mds.nc$points %<>% add.numbered.colnames("Dim") %>% data.frame(sample.table, .)
dmat.cb <- suppressPlot(plotMDS(elist.cb)$distance.matrix) %>% as.dist
mds.cb <- cmdscale(dmat.cb, k=attr(dmat.cb, "Size") - 1, eig=TRUE)
mds.cb$points %<>% add.numbered.colnames("Dim") %>% data.frame(sample.table, .)

mdsggbase <- function(df) {
    ggplot(df) +
        aes(x=Dim1, y=Dim2, label=SampleName, color=batch, fill=time_point, shape=cell_type, linetype=donor_id, group=cell_type:donor_id) +
        geom_encircle(aes(group=time_point:cell_type, color=NULL), s_shape=0.75, expand=0.05, color=NA, alpha=0.2) +
        geom_path(color=hcl(c=0, l=45), aes(color=NULL)) +
        geom_point(size=4) +
        scale_shape_manual(values=c(Naive=21, Memory=24)) +
        scale_color_manual(values=col2hcl(c(B1="green", B2="magenta"), l=80)) +
        scale_fill_hue(l=55) +
        scale_linetype_manual(values=c("solid", "dashed", "dotdash", "twodash")) +
        guides(colour = guide_legend(override.aes = list(shape = 21)),
               fill = guide_legend(override.aes = list(shape = 21)),
               shape = guide_legend(override.aes = list(color=hcl(c=0, l=80), fill=hcl(c=0, l=55)))) +
        labs(title="limma voom Principal Coordinates 1 & 2") +
        coord_equal()
}

pbase.nc <- mdsggbase(mds.nc$points)
pbase.cb <- mdsggbase(mds.cb$points)

p <- list(NC.PC12=pbase.nc +
              labs(subtitle="Before Batch Correction"),
          NC.PC23=pbase.nc +
              aes(x=Dim2, y=Dim3) +
              labs(title="limma voom Principal Coordinates 2 & 3",
                   subtitle="Before Batch Correction"),
          CB.PC12=pbase.cb +
              labs(subtitle="After ComBat Batch Correction"),
          CB.PC23=pbase.cb +
              aes(x=Dim2, y=Dim3) +
              labs(title="limma voom Principal Coordinates 2 & 3",
                   subtitle="After ComBat Batch Correction"))
ggprint(p)

Model fitting and differential expression testing

We now start by fitting the linear model for each gene:

fit <- lmFit(elist.cb, design) %>% eBayes(robust=TRUE)

We plot the mean-variance relationship of the data. Since the voom weights should counteract the mean-variance trend, there should me a minimal trend visible in this plot.

p <- ggplot(data_frame(CPM=2^fit$Amean, sigma=fit$sigma)) +
    aes(x=CPM, y=sigma) +
    geom_point(size=0.2) +
    geom_density2d(color=muted("blue", c=100, l=50)) +
    geom_smooth(method="loess", color=muted("red", c=100, l=50)) +
    geom_hline(yintercept = sqrt(fit$s2.prior), color=muted("green", c=100, l=50)) +
    scale_x_continuous(name="Mean Normalized Counts per Million", trans=log10_trans()) +
    scale_y_continuous(name="Standard Deviation", trans=log2_trans()) +
    labs(title="Mean-Variance Trend")
ggprint(p)

Next, we define the differential expression tests we wish to perform as contrasts. Each contrast is an arithmetic expression in terms of the model coefficients.

celltypes <- unique(sample.table$cell_type)
all.timepoints <- unique(sample.table$time_point)
nonzero.timepoints <- setdiff(all.timepoints, "D0")

timepoint.anova.tests <- setNames(llply(celltypes, function(ct) {
    setNames(sprintf("%s%s - %sD0", ct, nonzero.timepoints, ct),
             sprintf("%s.D0v%s", ct, nonzero.timepoints))
}), nm=str_c(celltypes, ".AllT"))
timepoint.single.tests <- as.list(unlist(unname(timepoint.anova.tests)))
celltype.singlet.tests <-
    as.list(setNames(sprintf("Memory%s - Naive%s", all.timepoints, all.timepoints),
                     sprintf("NvM.%s", all.timepoints)))
celltype.allt.test <- list(NvM.AllT=unlist(celltype.singlet.tests))
factorial.singlet.tests <-
    as.list(setNames(sprintf("(Memory%s - MemoryD0) - (Naive%s - NaiveD0)",
                             nonzero.timepoints, nonzero.timepoints),
                     sprintf("Fac.%s", nonzero.timepoints)))
factorial.allt.test <- list(Fac.AllT=unlist(factorial.singlet.tests))
mi.vs.nf.test <- list(MD0vND14="MemoryD0 - NaiveD14")
donor.var.test <- list(InterDonor=sample.table$donor_id %>% levels %>% {sprintf("%s.vs.%s", .[1], .[-1]) %>% set_names(., .)})
alltests <- c(timepoint.anova.tests, timepoint.single.tests,
              celltype.allt.test, celltype.singlet.tests,
              factorial.allt.test, factorial.singlet.tests,
              mi.vs.nf.test, donor.var.test)
print(alltests)
## $Naive.AllT
##          Naive.D0vD1          Naive.D0vD5         Naive.D0vD14 
##  "NaiveD1 - NaiveD0"  "NaiveD5 - NaiveD0" "NaiveD14 - NaiveD0" 
## 
## $Memory.AllT
##           Memory.D0vD1           Memory.D0vD5          Memory.D0vD14 
##  "MemoryD1 - MemoryD0"  "MemoryD5 - MemoryD0" "MemoryD14 - MemoryD0" 
## 
## $Naive.D0vD1
## [1] "NaiveD1 - NaiveD0"
## 
## $Naive.D0vD5
## [1] "NaiveD5 - NaiveD0"
## 
## $Naive.D0vD14
## [1] "NaiveD14 - NaiveD0"
## 
## $Memory.D0vD1
## [1] "MemoryD1 - MemoryD0"
## 
## $Memory.D0vD5
## [1] "MemoryD5 - MemoryD0"
## 
## $Memory.D0vD14
## [1] "MemoryD14 - MemoryD0"
## 
## $NvM.AllT
##                 NvM.D0                 NvM.D1                 NvM.D5 
##   "MemoryD0 - NaiveD0"   "MemoryD1 - NaiveD1"   "MemoryD5 - NaiveD5" 
##                NvM.D14 
## "MemoryD14 - NaiveD14" 
## 
## $NvM.D0
## [1] "MemoryD0 - NaiveD0"
## 
## $NvM.D1
## [1] "MemoryD1 - NaiveD1"
## 
## $NvM.D5
## [1] "MemoryD5 - NaiveD5"
## 
## $NvM.D14
## [1] "MemoryD14 - NaiveD14"
## 
## $Fac.AllT
##                                          Fac.D1 
##   "(MemoryD1 - MemoryD0) - (NaiveD1 - NaiveD0)" 
##                                          Fac.D5 
##   "(MemoryD5 - MemoryD0) - (NaiveD5 - NaiveD0)" 
##                                         Fac.D14 
## "(MemoryD14 - MemoryD0) - (NaiveD14 - NaiveD0)" 
## 
## $Fac.D1
## [1] "(MemoryD1 - MemoryD0) - (NaiveD1 - NaiveD0)"
## 
## $Fac.D5
## [1] "(MemoryD5 - MemoryD0) - (NaiveD5 - NaiveD0)"
## 
## $Fac.D14
## [1] "(MemoryD14 - MemoryD0) - (NaiveD14 - NaiveD0)"
## 
## $MD0vND14
## [1] "MemoryD0 - NaiveD14"
## 
## $InterDonor
##   D4659.vs.D5053   D4659.vs.D5131   D4659.vs.D5291 
## "D4659.vs.D5053" "D4659.vs.D5131" "D4659.vs.D5291"

We now perform the differential expression tests for each contrast or set of contrasts. For a single contrast, this performs a t-test. For a multi-contrast test, an F-test is performed. We also add q-values and Bayesian posterior probability statistics to the output, as alternative measures of significance.

results.tables <- bplapply(alltests, function(ct) {
    ctmat <- makeContrasts(contrasts = ct, levels=design) %>% set_colnames(names(ct))
    cfit <- contrasts.fit(fit, ctmat) %>% eBayes(robust=TRUE)
    tt <- topTable(cfit, n=Inf, sort.by="none")
    # Fix logFC columns names
    if (ncol(ctmat) > 1) {
        bad.logfc.colnames <- make.names(colnames(ctmat))
        good.logfc.colnames <- paste0("logFC.", colnames(ctmat))
        cols.to.fix <- match(bad.logfc.colnames, names(tt))
        names(tt)[cols.to.fix] <- good.logfc.colnames
    }
    tt %<>%
        rename(logCPM=AveExpr,
               PValue=P.Value,
               FDR=adj.P.Val) %>%
        arrange(PValue) %>%
        add.bfdr %>% add.qvalue(pfdr=TRUE)
    annot.cols <- intersect(names(tt), c("ENSEMBL", "ENTREZID", "SYMBOL", "GENENAME"))
    logfc.cols <- names(tt) %>% .[str_detect(.,"^logFC(\\.|$)")]
    teststat.cols <- intersect(names(tt), c("F", "t"))
    bfdr.cols <- intersect(names(tt), c("B", "PP", "BayesFDR"))
    qval.cols <- intersect(names(tt), c("QValue", "LocFDR"))
    selected.cols <- c(annot.cols, "PValue", "FDR", "logCPM", logfc.cols, teststat.cols, bfdr.cols, qval.cols)
    remaining.cols <- setdiff(names(tt), selected.cols)
    tt[c(selected.cols, remaining.cols)]
})

We save the full result tables to an R data file and the tables of significant results only (FDR 10% or less) to an Excel file. (For tests with fewer than 100 significant genes, we just save the top 100 genes.) We also save the entire workspace for later follow-up analysis.

# Take all genes significant at 10% FDR or top 100 genes, whichever is more
filtered.results.tables <- lapply(results.tables, . %>% filter(FDR <= 0.1 | seq_len(nrow(.)) <= 100))
dir.create(file.path(params$basedir, "saved_data", "RNA-seq"), recursive = TRUE, showWarnings = FALSE)
dir.create(file.path(params$basedir, "results", "RNA-seq"), recursive = TRUE, showWarnings = FALSE)
saveRDS(results.tables, file.path(params$basedir, 'saved_data', 'RNA-seq', sprintf('%s-diffexp-tables.RDS', params$dataset)))
save.image(file.path(params$basedir, 'saved_data', 'RNA-seq', sprintf('%s-diffexp.rda', params$dataset)))
write.xlsx(filtered.results.tables, file.path(params$basedir, "results", "RNA-seq",sprintf('%s-diffexp.xlsx', params$dataset)))

To have confidence in the significance measures returned, we should verify that the p-value distributions for each test look reasonable.

p <- bplapply(names(results.tables), function(testname) {
    pvals <- results.tables[[testname]]$PValue
    pi0 <- pi0est(pvals)$pi0
    p <- plotpvals(pvals, ptn = pi0) +
        labs(title=sprintf("P-value histogram for %s", testname),
             subtitle=sprintf("Est. Non-Null Prop.: %g%%", (1 - pi0) * 100))
})
ggprint(p)

We can look at the number of significant genes as an FDR of 10% for the various significance measures, as well as the estimated number of truly differentially expressed genes based on estimates of pi0.

results.tables %>% sapply(function(tt) {
    ndiff <- (1-pi0est(tt$PValue)$pi0) %>% multiply_by(nrow(tt)) %>% floor
    nsig.fdr <- sum(tt$FDR <= 0.1)
    nsig.qval <- sum(tt$QValue <= 0.1)
    if ("BayesFDR" %in% names(tt)) {
        nsig.bfdr <- sum(tt$BayesFDR <= 0.1)
    } else {
        nsig.bfdr <- NA
    }
    c(Pi0Est=ndiff, FDR=nsig.fdr, QValue=nsig.qval, BayesFDR=nsig.bfdr)
}) %>% data.frame
##          Naive.AllT Memory.AllT Naive.D0vD1 Naive.D0vD5 Naive.D0vD14
## Pi0Est         6614        5595        5671        4238         2103
## FDR            2196         972        1616          31          288
## QValue         3380        1486        2427          67          333
## BayesFDR         NA          NA         348          15           70
##          Memory.D0vD1 Memory.D0vD5 Memory.D0vD14 NvM.AllT NvM.D0 NvM.D1
## Pi0Est           2783         2947          1536     7897      0   8254
## FDR               473           19           219     4865      4   5537
## QValue            593           27           247     6677      0   7409
## BayesFDR          137           14           107       NA      3   1480
##          NvM.D5 NvM.D14 Fac.AllT Fac.D1 Fac.D5 Fac.D14 MD0vND14 InterDonor
## Pi0Est        0    5776        0    227      0     805      872      11546
## FDR           0    2212       23     15      0      10      275       5040
## QValue        0    3029        0      0      0       0      298      11322
## BayesFDR      0     466       NA      6      0      12      114         NA

We can see that the results for Day 5 tend to have fewer significant genes than Day 1 or Day 14. This is likely related to the fact that Day 5 is in the low-quality first batch and therefore the Day 5 samples have been substantially down-weighted, as shown by modeling the log of the sample weight as a function of either batch or time point. (By comparison, the weights are not substantially different between cell types or donors.)

c("time_point", "batch", "cell_type", "donor_id") %>% setNames(.,.) %>%
    lapply(. %>% sample.table[.] %>% cbind(Weight=elist.cb$sample.weights) %>%
               lm(log2(Weight) ~ 0 + ., data=.) %>%
               coef %>% {2^.})
## $time_point
##  time_pointD0  time_pointD1  time_pointD5 time_pointD14 
##     0.3804519     2.8483374     0.3706621     2.4896060 
## 
## $batch
##   batchB1   batchB2 
## 0.3755251 2.6629378 
## 
## $cell_type
##  cell_typeNaive cell_typeMemory 
##       1.0604936       0.9429571 
## 
## $donor_id
## donor_idD4659 donor_idD5053 donor_idD5131 donor_idD5291 
##     1.0027199     1.1922900     0.8989688     0.9304518

To get an idea of how the various significance measures compare to each other, we can plot them against each other. We can see that the QValue is smaller than the BH FDR by a constant factor up to a certian lower bound, while the FDR derived from Bayesian posterior probabilities is generally more conservative than either one.

p <- results.tables$Naive.D0vD1 %$% list(
    qplot(FDR, QValue, log="xy") + geom_abline(slope=1, intercept=0, linetype="dashed", alpha=0.5) + coord_fixed(),
    qplot(FDR, BayesFDR, log="xy") + geom_abline(slope=1, intercept=0, linetype="dashed", alpha=0.5) + coord_fixed(),
    qplot(BayesFDR, QValue, log="xy") + geom_abline(slope=1, intercept=0, linetype="dashed", alpha=0.5) + coord_fixed(),
    qplot(LocFDR, 1-PP, log="xy") + geom_abline(slope=1, intercept=0, linetype="dashed", alpha=0.5) + coord_fixed())
ggprint(p)