- #!/usr/bin/env Rscript
- # Script to train multiple fRMA vectors in preparation for consistency
- # evaluation
- library(xlsx)
- library(frma)
- library(frmaTools)
- library(stringr)
- library(magrittr)
- library(plyr)
- library(affy)
- library(preprocessCore)
- library(ggplot2)
- library(proto)
- library(dplyr)
## Root directory containing one subdirectory per training dataset.
training.data.dir <- "Training Data"
## One row per dataset, keyed (rownames) by dataset directory name.
datasets <- data.frame(Dataset=list.files(training.data.dir))
rownames(datasets) <- datasets$Dataset
## Tissue type is encoded in the dataset name: "PAX" (blood) or "BX"
## (biopsy). NOTE(review): names containing neither token would yield
## NA here -- confirm all dataset directories follow this convention.
datasets$Tissue <- factor(str_extract(datasets$Dataset, "\\b(PAX|BX)\\b"))
## Emit a timestamped status message: the current date/time, a colon,
## then the caller-supplied message parts.
tsmsg <- function(...) message(date(), ": ", ...)
- ## Some Scan Dates are marked as identical for multiple batches, which
- ## is bad. But the dates embedded in the file names for these batches
- ## are different, so we use those dates instead.
## Parse the scan date embedded in a CEL file name. Two naming schemes
## are recognized:
##   MMDDYY...      -> captures reordered to (year, month, day)
##   20YY_MM_DD...  -> captures already in (year, month, day) order
## Returns a Date vector, NA where the name matches neither scheme.
parse.date.from.filename <- function(fname) {
    ## drop=FALSE keeps the matrix shape even when fname has length 1;
    ## without it a single file name collapses these to plain vectors
    ## and the colnames<- assignment below fails.
    res1 <- str_match(fname, "^(\\d\\d)(\\d\\d)(\\d\\d)")[, c(4,2,3), drop=FALSE]
    res2 <- str_match(fname, "^20(\\d\\d)_(\\d\\d)_(\\d\\d)")[, -1, drop=FALSE]
    ## Fall back to the second scheme wherever the first did not match.
    res1[is.na(res1)] <- res2[is.na(res1)]
    colnames(res1) <- c("year", "month", "day")
    ## Both schemes capture a two-digit year; expand to four digits.
    res1[,"year"] %<>% str_c("20", .)
    as.Date(do.call(ISOdate, data.frame(res1)))
}
## This reads in the xlsx file for each of the 7 datasets and combines
## them into one big table of all samples. The Batch column contains
## the partitioning of samples into unique combinations of Dataset,
## Scan Date, and Phenotype. Finally, we split based on Tissue type to
## get one table for biopsies (BX), and one for blood (PAX).
## NOTE(review): `blacklist` is used below but is not defined anywhere
## in this file -- presumably a character vector of known-bad CEL file
## paths loaded elsewhere; confirm it is in scope before sourcing this.
sample.tables <- ddply(datasets, .(Dataset), function(df) {
    ## ddply hands us the rows for one dataset; keep a single metadata
    ## row (Dataset, Tissue) to cbind onto every sample below.
    df <- df[1,]
    rownames(df) <- NULL
    dset.dir <- file.path(training.data.dir, df$Dataset)
    ## Read sheet 1 of the first .xlsx file in the dataset directory;
    ## columns are assumed to be (file name, phenotype, scan date).
    x <- read.xlsx(list.files(dset.dir, pattern=glob2rx("*.xlsx"), full.names=TRUE)[1], 1) %>%
        setNames(c("Filename", "Phenotype", "ScanDate"))
    x$Filename <- as.character(x$Filename)
    ## Some spreadsheet entries lack the .CEL extension; append it,
    ## then verify every file name now ends in .CEL.
    missing.CEL <- !str_detect(x$Filename, "\\.CEL$")
    x$Filename[missing.CEL] <- str_c(x$Filename[missing.CEL], ".CEL")
    stopifnot(all(str_detect(x$Filename, "\\.CEL$")))
    ## Prefer the date parsed from the file name over the (sometimes
    ## wrong) ScanDate column wherever a parse succeeded.
    parsed.date <- parse.date.from.filename(x$Filename)
    ## NOTE(review): this subassignment coerces the Date values to the
    ## existing type of the ScanDate column read from xlsx -- confirm
    ## the resulting representation is consistent across datasets.
    x$ScanDate[!is.na(parsed.date)] <- parsed.date[!is.na(parsed.date)]
    ## Attach dataset metadata, expand Filename to a full path, define
    ## Batch as the Tissue:Dataset:ScanDate:Phenotype interaction, then
    ## drop blacklisted and duplicated files.
    x %>% cbind(df) %>%
        transform(Filename=file.path(dset.dir, Filename),
                  Batch=droplevels(Tissue:Dataset:factor(ScanDate):Phenotype)) %>%
        subset(! Filename %in% blacklist) %>%
        subset(!duplicated(Filename))
}) %>%
    split(.$Tissue) %>%
    lapply(droplevels)
## fRMA requires equal-sized batches, so for each candidate batch size
## from 3 to 15, count how many batches in each tissue's sample table
## contain at least that many samples.
x <- sapply(3:15, function(min.size) {
    sapply(sample.tables, function(stab) {
        batch.counts <- as.vector(table(stab$Batch))
        sum(batch.counts >= min.size)
    })
})
colnames(x) <- 3:15
## Based on the above and the recommendations in the frmaTools paper,
## I chose 5 as the optimal batch size. This could be optimized
## empirically, though.
arrays.per.batch <- 5
## Train fRMA vectors: for each tissue type, derive a reference
## quantile-normalization target from the FULL dataset, then run 5
## independent training replicates, each fitted on an equal-sized
## random sample of arrays drawn from every sufficiently large batch.
vectors <- lapply(names(sample.tables), function(ttype) {
    stab <- sample.tables[[ttype]]
    tsmsg("Reading full dataset for ", ttype)
    affy <- ReadAffy(filenames=stab$Filename, sampleNames=rownames(stab))
    ## (fixed typo: "normalziation" -> "normalization")
    tsmsg("Getting reference normalization distribution from full dataset for ", ttype)
    ## The normalization target comes from all arrays, not just the
    ## subsets sampled for each training run below.
    normVec <- normalize.quantiles.determine.target(pm(bg.correct.rma(affy)))
    ## The AffyBatch is large; free it before the training runs.
    rm(affy); gc()
    tsmsg("Selecting batches for ", ttype)
    ## Keep only batches with enough samples. This is invariant across
    ## the 5 training runs, so compute it once here rather than inside
    ## the loop; it uses no randomness, so doing it before set.seed
    ## leaves the sampled subsets unchanged.
    big.enough <- stab$Batch %>% table %>% .[. >= arrays.per.batch] %>% names
    stab <- stab[stab$Batch %in% big.enough,] %>% droplevels
    ## Set the random seed for reproducibility.
    set.seed(1986)
    lapply(1:5, function(i) {
        ## Reclaim memory after each run even if it errors out.
        on.exit(gc())
        tsmsg("Starting training run number ", i, " for ", ttype)
        ## Sample an equal number of arrays from each batch, as fRMA
        ## training requires equal-sized batches.
        subtab <- ddply(stab, .(Batch), function(df) {
            df[sample(seq(nrow(df)), size=arrays.per.batch),]
        })
        tsmsg("Making fRMA vectors")
        ## Make fRMA vectors, using normVec from the full dataset.
        res <- makeVectorsAffyBatch(subtab$Filename, subtab$Batch, normVec=normVec)
        tsmsg("Finished training run number ", i, " for ", ttype)
        res
    }) %>% setNames(., str_c("V", seq_along(.)))
}) %>% setNames(names(sample.tables))
## Persist the trained vector sets for the downstream evaluation step,
## and snapshot the whole workspace for debugging/reuse.
saveRDS(vectors, "consistency-vectors.RDS")
save.image("consistency.rda")
## Continues in consistency-evaluate.R