#!/usr/bin/env Rscript # Script to check reproducibility of fRMA training process library(xlsx) library(frma) library(frmaTools) library(stringr) library(magrittr) library(plyr) library(affy) library(preprocessCore) library(ggplot2) library(proto) library(dplyr) load("consistency.rda") ## Select a random subset of 20 arrays from each tissue (Would rather ## use entire dataset but ENOMEM) set.seed(1986) norm.exprs <- lapply(names(vectors), function(ttype) { stab <- sample.tables[[ttype]] %>% sample_n(20) tsmsg("Reading 20 random arrays for ", ttype) affy <- ReadAffy(filenames=stab$Filename, sampleNames=rownames(stab)) tsmsg("Normalizing with RMA for comparison") eset.rma <- rma(affy) rma.exprs <- eset.rma %>% exprs %>% as.vector tsmsg("Normalizing with 5 trains fRMA vector sets") esets.frma <- lapply(vectors[[ttype]], . %>% frma(affy, input.vecs=.)) frma.exprs <- sapply(esets.frma, . %>% exprs %>% as.vector) data.frame(RMA=rma.exprs, fRMA=frma.exprs) }) %>% setNames(names(vectors)) ## Save because the above takes a while save.image("consistency.rda") dir.create("fRMA_consistency_results", FALSE) ## Compute M/A data for all pairwise comparisons ma.data <- lapply(norm.exprs, function(normexprs) { normexprs %>% names %>% combn(2, simplify=FALSE) %>% ldply(. %>% { col1 <- .[1] col2 <- .[2] x1 <- normexprs[[col1]] x2 <- normexprs[[col2]] data.frame(Comparison = str_c(col1,".vs.",col2), x1=x1, x2=x2, M = x2 - x1, A = (x1+x2)/2) }) }) for (ttype in names(norm.exprs)) { madata <- ma.data[[ttype]] pdf(str_c("fRMA_consistency_results/MA Plots for ", ttype, ".pdf")) ## MA plot for every pair of normalizations ddply(madata, .(Comparison), . %$% { smoothScatter(x=A, y=M, nbin=512, main=Comparison[1], xlab="A", ylab="M") }) dev.off() ## M boxplots & violin plots pdf(str_c("fRMA_consistency_results/M Boxplots for ", ttype, ".pdf"), width=8, height=12) p <- ggplot(madata) + aes(x=Comparison, y=M) + scale_x_discrete(limits = rev(levels(madata$Comparison))) + coord_flip() print(p + geom_boxplot(notch=TRUE, outlier.shape = NA) + ggtitle(str_c("Boxplots of M value distributions for ", ttype))) print(p + geom_violin(scale="width") + ggtitle(str_c("Violin plots of M value distributions for ", ttype))) dev.off() }