#!/usr/bin/env Rscript library(xlsx) library(frmaTools) library(stringr) library(magrittr) library(plyr) library(affy) library(preprocessCore) training.data.dir <- "Training Data" datasets <- data.frame(Dataset=list.files(training.data.dir)) rownames(datasets) <- datasets$Dataset datasets$Tissue <- factor(str_extract(datasets$Dataset, "\\b(PAX|BX)\\b")) tsmsg <- function(...) { message(date(), ": ", ...) } ## Some Scan Dates are marked as identical for multiple batches, which ## is bad. But the dates embedded in the file names for these batches ## are different, so we use those dates instead. parse.date.from.filename <- function(fname) { res1 <- str_match(fname, "^(\\d\\d)(\\d\\d)(\\d\\d)")[,c(4,2,3)] res2 <- str_match(fname, "^20(\\d\\d)_(\\d\\d)_(\\d\\d)")[,-1] res1[is.na(res1)] <- res2[is.na(res1)] colnames(res1) <- c("year", "month", "day") res1[,"year"] %<>% str_c("20", .) as.Date(do.call(ISOdate, data.frame(res1))) } makeVectorsAffyBatch <- function (files, batch.id, background = "rma", normalize = "quantile", normVec = NULL, cdfname = NULL, file.dir = ".", verbose = TRUE) { wd <- getwd() setwd(file.dir) object <- ReadAffy(filenames = files, cdfname = cdfname, verbose = verbose) setwd(wd) if (verbose) message("Data loaded \n") batch.size <- table(batch.id)[1] if (!all(table(batch.id) == batch.size)) stop("Batches must be of the same size.") if (background == "rma") { object <- bg.correct.rma(object) if (verbose) message("Background Corrected \n") gc() } pms <- pm(object) pns <- probeNames(object) pmi <- unlist(pmindex(object)) if (!all(sprintf("%i", pmi) == rownames(pms))) stop("Mismatch between pmindex and rownames of pms") rm(object) gc() if (normalize == "quantile") { if (is.null(normVec)) normVec <- normalize.quantiles.determine.target(pms) pms <- normalize.quantiles.use.target(pms, normVec) names(normVec) <- as.character(pmi) if (verbose) message("Normalized \n") } pms <- log2(pms) gc() N <- 1:dim(pms)[1] S <- split(N, pns) nc <- ncol(pms) nr <- nrow(pms) resids <- matrix(ncol = nc, nrow = nr) probeVec <- vector(length = nr) if (verbose) message("Beginning Probe Effect Calculation ... \n") for (k in 1:length(S)) { fit <- rcModelPLM(pms[S[[k]], , drop = FALSE]) resids[S[[k]], ] <- fit$Residuals probeVec[S[[k]]] <- fit$Estimates[(nc + 1):length(fit$Estimates)] if ((k%%1000) == 0) { message(paste("Finished probeset:", k, "\n")) gc() } } names(probeVec) <- as.character(pmi) if (verbose) message("Probe Effects Calculated \n") gc() tmp <- split(t(resids), batch.id) withinMean <- lapply(tmp, frmaTools:::getProbeMean, batch.size) withinVar <- lapply(tmp, frmaTools:::getProbeVar, batch.size) withinAvgVar <- rowMeans(matrix(unlist(withinVar), ncol = length(withinVar))) btwVar <- apply(matrix(unlist(withinMean), ncol = length(withinMean)), 1, var) rm(tmp) rm(withinMean) rm(withinVar) names(withinAvgVar) <- names(btwVar) <- as.character(pmi) if (verbose) message("Probe Variances Calculated \n") gc() tmp <- split(resids, pns) psetMAD <- unlist(lapply(tmp, frmaTools:::getPsetMAD, nc, batch.id)) names(psetMAD) <- names(tmp) rm(tmp) rm(resids) if (verbose) message("Probe Set SDs Calculated \n") gc() w <- 1/(withinAvgVar + btwVar) w[w == Inf] <- 1 medianSE <- vector(length = length(psetMAD)) if (verbose) message("Beginning Median SE Calculation ... \n") for (k in 1:length(S)) { fit <- frmaTools:::rwaFit2(pms[S[[k]], , drop = FALSE], w[S[[k]]], probeVec[S[[k]]], psetMAD[k]) medianSE[k] <- median(fit$StdErrors) if ((k%%1000) == 0) { message(paste("Finished probeset:", k, "\n")) gc() } } names(medianSE) <- names(psetMAD) if (verbose) message("Median SEs Calculated \n") gc() rm(w) rm(pms) rm(pns) gc() if (is.null(cdfname)) { vers <- "" } else { vers <- as.character(packageVersion(cdfname)) } ## vers <- ifelse(!is.null(cdfname), as.character(packageVersion(cdfname)), ## "") return(list(normVec = normVec, probeVec = probeVec, probeVarWithin = withinAvgVar, probeVarBetween = btwVar, probesetSD = psetMAD, medianSE = medianSE, version = vers)) } ## Error: the following are not valid files: ## Training Data/03 - TGCG ARADNRCANTX PAX Samples/10733.CEL blacklist <- "Training Data/03 - TGCG ARADNRCANTX PAX Samples/10733.CEL" ## This reads in the xlsx file for each of the 7 datasets and combines ## them into one big table of all samples. The Batch column contains ## the partitioning of samples into unique combinations of Dataset, ## Scan Date, and Phenotype. Finally, we split based on Tissue type to ## get one table for biopsies (BX), and one for blood (PAX). sample.tables <- ddply(datasets, .(Dataset), function(df) { df <- df[1,] rownames(df) <- NULL dset.dir <- file.path(training.data.dir, df$Dataset) x <- read.xlsx(list.files(dset.dir, pattern=glob2rx("*.xlsx"), full.names=TRUE)[1], 1) %>% setNames(c("Filename", "Phenotype", "ScanDate")) x$Filename <- as.character(x$Filename) missing.CEL <- !str_detect(x$Filename, "\\.CEL$") x$Filename[missing.CEL] <- str_c(x$Filename[missing.CEL], ".CEL") stopifnot(all(str_detect(x$Filename, "\\.CEL$"))) parsed.date <- parse.date.from.filename(x$Filename) x$ScanDate[!is.na(parsed.date)] <- parsed.date[!is.na(parsed.date)] x %>% cbind(df) %>% transform(Filename=file.path(dset.dir, Filename), Batch=droplevels(Tissue:Dataset:factor(ScanDate):Phenotype)) %>% subset(! Filename %in% blacklist) %>% subset(!duplicated(Filename)) }) %>% split(.$Tissue) %>% lapply(droplevels) ## fRMA requires equal-sized batches, so for each batch size from 3 to ## 15, compute how many batches have at least that many samples. x <- sapply(3:15, function(i) sapply(sample.tables, . %$% Batch %>% table %>% as.vector %>% {sum(. >= i)})) colnames(x) <- 3:15 ## Based on the above and the recommendations in the frmaTools paper, ## I chose 5 as the optimal batch size. This could be optimized ## empirically, though. arrays.per.batch <- 5 ## For each tissue type, compute fRMA vectors. vectors <- lapply(sample.tables, function(stab) { set.seed(1986) tsmsg("Reading full dataset") affy <- ReadAffy(filenames=stab$Filename, sampleNames=rownames(stab)) tsmsg("Getting reference normalization distribution from full dataset") normVec <- normalize.quantiles.determine.target(pm(bg.correct.rma(affy))) rm(affy); gc() tsmsg("Selecting batches") ## Keep only arrays with enough samples big.enough <- stab$Batch %>% table %>% .[.>= arrays.per.batch] %>% names stab <- stab[stab$Batch %in% big.enough,] %>% droplevels ## Sample an equal number of arrays from each batch subtab <- ddply(stab, .(Batch), function(df) { df[sample(seq(nrow(df)), size=arrays.per.batch),] }) tsmsg("Making fRMA vectors") ## Make fRMA vectors, using normVec from full dataset res <- makeVectorsAffyBatch(subtab$Filename, subtab$Batch, normVec=normVec) tsmsg("Finished.") res }) ## The code below here just takes the trained vectors and packages ## them up into installable R packages. makePackageFromVectors <- function (vecs, version, maintainer, species, annotation, packageName, file.dir = ".", output.dir = ".", unlink = TRUE) { platform <- gsub("cdf$", "", annotation) ## type <- match.arg(type, c("AffyBatch", "FeatureSet")) ## if (type == "AffyBatch") ## platform <- gsub("cdf", "", annotation) ## if (type == "FeatureSet") { ## platform <- annotation ## require(oligo) ## } thispkg <- "frmaTools" desc <- packageDescription(thispkg) thispkgVers <- desc$Version symbolValues <- list(ARRAYTYPE = platform, VERSION = version, CREATOR = paste("package", thispkg, "version", thispkgVers), FRMATOOLSVERSION = thispkgVers, MAINTAINER = maintainer, SPECIES = species) createdPkg <- createPackage(packageName, destinationDir = output.dir, originDir = system.file("VectorPkg-template", package = "frmaTools"), symbolValues = symbolValues, unlink = unlink) ## if (type == "AffyBatch") ## vecs <- makeVectorsAffyBatch(files, batch.id, background, ## normalize, normVec, annotation, file.dir, verbose) ## if (type == "FeatureSet") ## vecs <- makeVectorsFeatureSet(files, batch.id, annotation, ## background, normalize, normVec, file.dir, verbose) assign(packageName, vecs) save(list = eval(packageName), file = file.path(createdPkg$pkgdir, "data", paste(packageName, ".rda", sep = "")), compress = TRUE) } annotation <- cleancdfname(affyio:::read.celfile.header(sample.tables[[1]]$Filename[1])$cdfName, FALSE) dir.create("pkgs", FALSE, TRUE, mode="755") for (i in names(vectors)) { vecs <- vectors[[i]] pkgname <- sprintf("DSalomon.%s.%sfrmavecs", i, annotation) message("Making ", pkgname) makePackageFromVectors( vecs, version="0.1", maintainer="Ryan C. Thompson ", species="Homo_sapiens", annotation=annotation, packageName=pkgname, output.dir = "pkgs") } save.image("train-data.rda")