#!/usr/bin/env Rscript

# Script to train multiple fRMA vectors in preparation for consistency
# evaluation

library(xlsx)
library(frma)
library(frmaTools)
library(stringr)
library(magrittr)
library(plyr)
library(affy)
library(preprocessCore)
library(ggplot2)
library(proto)
library(dplyr)

training.data.dir <- "Training Data"
datasets <- data.frame(Dataset=list.files(training.data.dir))
rownames(datasets) <- datasets$Dataset
datasets$Tissue <- factor(str_extract(datasets$Dataset, "\\b(PAX|BX)\\b"))

tsmsg <- function(...) {
  message(date(), ": ", ...)
}

## Some Scan Dates are marked as identical for multiple batches, which
## is bad. But the dates embedded in the file names for these batches
## are different, so we use those dates instead.
parse.date.from.filename <- function(fname) {
    res1 <- str_match(fname, "^(\\d\\d)(\\d\\d)(\\d\\d)")[,c(4,2,3)]
    res2 <- str_match(fname, "^20(\\d\\d)_(\\d\\d)_(\\d\\d)")[,-1]
    res1[is.na(res1)] <- res2[is.na(res1)]
    colnames(res1) <- c("year", "month", "day")
    res1[,"year"] %<>% str_c("20", .)
    as.Date(do.call(ISOdate, data.frame(res1)))
}

## Error: the following are not valid files:
##     Training Data/03 - TGCG ARADNRCANTX PAX Samples/10733.CEL
blacklist <- "Training Data/03 - TGCG ARADNRCANTX PAX Samples/10733.CEL"

## This reads in the xlsx file for each of the 7 datasets and combines
## them into one big table of all samples. The Batch column contains
## the partitioning of samples into unique combinations of Dataset,
## Scan Date, and Phenotype. Finally, we split based on Tissue type to
## get one table for biopsies (BX), and one for blood (PAX).
sample.tables <- ddply(datasets, .(Dataset), function(df) {
    df <- df[1,]
    rownames(df) <- NULL
    dset.dir <- file.path(training.data.dir, df$Dataset)
    x <- read.xlsx(list.files(dset.dir, pattern=glob2rx("*.xlsx"), full.names=TRUE)[1], 1) %>%
        setNames(c("Filename", "Phenotype", "ScanDate"))
    x$Filename <- as.character(x$Filename)
    missing.CEL <- !str_detect(x$Filename, "\\.CEL$")
    x$Filename[missing.CEL] <- str_c(x$Filename[missing.CEL], ".CEL")
    stopifnot(all(str_detect(x$Filename, "\\.CEL$")))
    parsed.date <- parse.date.from.filename(x$Filename)
    x$ScanDate[!is.na(parsed.date)] <- parsed.date[!is.na(parsed.date)]
    x %>% cbind(df) %>%
        transform(Filename=file.path(dset.dir, Filename),
                  Batch=droplevels(Tissue:Dataset:factor(ScanDate):Phenotype)) %>%
                      subset(! Filename %in% blacklist) %>%
                          subset(!duplicated(Filename))
}) %>%
    split(.$Tissue) %>%
        lapply(droplevels)

## fRMA requires equal-sized batches, so for each batch size from 3 to
## 15, compute how many batches have at least that many samples.
x <- sapply(3:15, function(i) sapply(sample.tables, . %$% Batch %>% table %>% as.vector %>% {sum(. >= i)}))
colnames(x) <- 3:15

## Based on the above and the recommendations in the frmaTools paper,
## I chose 5 as the optimal batch size. This could be optimized
## empirically, though.
arrays.per.batch <- 5

vectors <- lapply(names(sample.tables), function(ttype) {
    stab <- sample.tables[[ttype]]
    tsmsg("Reading full dataset for ", ttype)
    affy <- ReadAffy(filenames=stab$Filename, sampleNames=rownames(stab))
    tsmsg("Getting reference normalziation distribution from full dataset for ", ttype)
    normVec <- normalize.quantiles.determine.target(pm(bg.correct.rma(affy)))
    rm(affy); gc()
    ## Set the random seed for reproducibility.
    set.seed(1986)

    lapply(1:5, function(i) {
        on.exit(gc())
        tsmsg("Starting training run number ", i, " for ", ttype)
        tsmsg("Selecting batches for ", ttype)
        ## Keep only batches with enough samples
        big.enough <- stab$Batch %>% table %>% .[.>= arrays.per.batch] %>% names
        stab <- stab[stab$Batch %in% big.enough,] %>% droplevels

        ## Sample an equal number of arrays from each batch
        subtab <- ddply(stab, .(Batch), function(df) {
            df[sample(seq(nrow(df)), size=arrays.per.batch),]
        })

        tsmsg("Making fRMA vectors")
        ## Make fRMA vectors, using normVec from full dataset
        res <- makeVectorsAffyBatch(subtab$Filename, subtab$Batch, normVec=normVec)

        tsmsg("Finished training run number ", i, " for ", ttype)
        res
    }) %>% setNames(., str_c("V", seq_along(.)))
}) %>% setNames(names(sample.tables))

saveRDS(vectors, "consistency-vectors.RDS")
save.image("consistency.rda")
## Continues in consistency-evaluate.R