consistency-train.R 4.5 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113
  1. #!/usr/bin/env Rscript
  2. # Script to train multiple fRMA vectors in preparation for consistency
  3. # evaluation
  4. library(xlsx)
  5. library(frma)
  6. library(frmaTools)
  7. library(stringr)
  8. library(magrittr)
  9. library(plyr)
  10. library(affy)
  11. library(preprocessCore)
  12. library(ggplot2)
  13. library(proto)
  14. library(dplyr)
  15. training.data.dir <- "Training Data"
  16. datasets <- data.frame(Dataset=list.files(training.data.dir))
  17. rownames(datasets) <- datasets$Dataset
  18. datasets$Tissue <- factor(str_extract(datasets$Dataset, "\\b(PAX|BX)\\b"))
  19. tsmsg <- function(...) {
  20. message(date(), ": ", ...)
  21. }
  22. ## Some Scan Dates are marked as identical for multiple batches, which
  23. ## is bad. But the dates embedded in the file names for these batches
  24. ## are different, so we use those dates instead.
  25. parse.date.from.filename <- function(fname) {
  26. res1 <- str_match(fname, "^(\\d\\d)(\\d\\d)(\\d\\d)")[,c(4,2,3)]
  27. res2 <- str_match(fname, "^20(\\d\\d)_(\\d\\d)_(\\d\\d)")[,-1]
  28. res1[is.na(res1)] <- res2[is.na(res1)]
  29. colnames(res1) <- c("year", "month", "day")
  30. res1[,"year"] %<>% str_c("20", .)
  31. as.Date(do.call(ISOdate, data.frame(res1)))
  32. }
  33. ## Error: the following are not valid files:
  34. ## Training Data/03 - TGCG ARADNRCANTX PAX Samples/10733.CEL
  35. blacklist <- "Training Data/03 - TGCG ARADNRCANTX PAX Samples/10733.CEL"
  36. ## This reads in the xlsx file for each of the 7 datasets and combines
  37. ## them into one big table of all samples. The Batch column contains
  38. ## the partitioning of samples into unique combinations of Dataset,
  39. ## Scan Date, and Phenotype. Finally, we split based on Tissue type to
  40. ## get one table for biopsies (BX), and one for blood (PAX).
  41. sample.tables <- ddply(datasets, .(Dataset), function(df) {
  42. df <- df[1,]
  43. rownames(df) <- NULL
  44. dset.dir <- file.path(training.data.dir, df$Dataset)
  45. x <- read.xlsx(list.files(dset.dir, pattern=glob2rx("*.xlsx"), full.names=TRUE)[1], 1) %>%
  46. setNames(c("Filename", "Phenotype", "ScanDate"))
  47. x$Filename <- as.character(x$Filename)
  48. missing.CEL <- !str_detect(x$Filename, "\\.CEL$")
  49. x$Filename[missing.CEL] <- str_c(x$Filename[missing.CEL], ".CEL")
  50. stopifnot(all(str_detect(x$Filename, "\\.CEL$")))
  51. parsed.date <- parse.date.from.filename(x$Filename)
  52. x$ScanDate[!is.na(parsed.date)] <- parsed.date[!is.na(parsed.date)]
  53. x %>% cbind(df) %>%
  54. transform(Filename=file.path(dset.dir, Filename),
  55. Batch=droplevels(Tissue:Dataset:factor(ScanDate):Phenotype)) %>%
  56. subset(! Filename %in% blacklist) %>%
  57. subset(!duplicated(Filename))
  58. }) %>%
  59. split(.$Tissue) %>%
  60. lapply(droplevels)
  61. ## fRMA requires equal-sized batches, so for each batch size from 3 to
  62. ## 15, compute how many batches have at least that many samples.
  63. x <- sapply(3:15, function(i) sapply(sample.tables, . %$% Batch %>% table %>% as.vector %>% {sum(. >= i)}))
  64. colnames(x) <- 3:15
  65. ## Based on the above and the recommendations in the frmaTools paper,
  66. ## I chose 5 as the optimal batch size. This could be optimized
  67. ## empirically, though.
  68. arrays.per.batch <- 5
  69. vectors <- lapply(names(sample.tables), function(ttype) {
  70. stab <- sample.tables[[ttype]]
  71. tsmsg("Reading full dataset for ", ttype)
  72. affy <- ReadAffy(filenames=stab$Filename, sampleNames=rownames(stab))
  73. tsmsg("Getting reference normalziation distribution from full dataset for ", ttype)
  74. normVec <- normalize.quantiles.determine.target(pm(bg.correct.rma(affy)))
  75. rm(affy); gc()
  76. ## Set the random seed for reproducibility.
  77. set.seed(1986)
  78. lapply(1:5, function(i) {
  79. on.exit(gc())
  80. tsmsg("Starting training run number ", i, " for ", ttype)
  81. tsmsg("Selecting batches for ", ttype)
  82. ## Keep only batches with enough samples
  83. big.enough <- stab$Batch %>% table %>% .[.>= arrays.per.batch] %>% names
  84. stab <- stab[stab$Batch %in% big.enough,] %>% droplevels
  85. ## Sample an equal number of arrays from each batch
  86. subtab <- ddply(stab, .(Batch), function(df) {
  87. df[sample(seq(nrow(df)), size=arrays.per.batch),]
  88. })
  89. tsmsg("Making fRMA vectors")
  90. ## Make fRMA vectors, using normVec from full dataset
  91. res <- makeVectorsAffyBatch(subtab$Filename, subtab$Batch, normVec=normVec)
  92. tsmsg("Finished training run number ", i, " for ", ttype)
  93. res
  94. }) %>% setNames(., str_c("V", seq_along(.)))
  95. }) %>% setNames(names(sample.tables))
  96. saveRDS(vectors, "consistency-vectors.RDS")
  97. save.image("consistency.rda")
  98. ## Continues in consistency-evaluate.R