test.R 2.4 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970
  1. #!/usr/bin/env Rscript
  2. library(xlsx)
  3. library(frma)
  4. library(frmaTools)
  5. library(stringr)
  6. library(magrittr)
  7. library(plyr)
  8. library(affy)
  9. library(preprocessCore)
  10. training.data.dir <- "Training Data"
  11. datasets <- data.frame(Dataset=list.files(training.data.dir))
  12. rownames(datasets) <- datasets$Dataset
  13. datasets$Tissue <- factor(str_extract(datasets$Dataset, "\\b(PAX|BX)\\b"))
  14. tsmsg <- function(...) {
  15. message(date(), ": ", ...)
  16. }
  17. parse.date.from.filename <- function(fname) {
  18. res1 <- str_match(fname, "^(\\d\\d)(\\d\\d)(\\d\\d)")[,c(4,2,3)]
  19. res2 <- str_match(fname, "^20(\\d\\d)_(\\d\\d)_(\\d\\d)")[,-1]
  20. res1[is.na(res1)] <- res2[is.na(res1)]
  21. colnames(res1) <- c("year", "month", "day")
  22. res1[,"year"] %<>% str_c("20", .)
  23. as.Date(do.call(ISOdate, data.frame(res1)))
  24. }
  25. ## Error: the following are not valid files:
  26. ## Training Data/03 - TGCG ARADNRCANTX PAX Samples/10733.CEL
  27. blacklist <- "Training Data/03 - TGCG ARADNRCANTX PAX Samples/10733.CEL"
  28. sample.tables <- ddply(datasets, .(Dataset), function(df) {
  29. df <- df[1,]
  30. rownames(df) <- NULL
  31. dset.dir <- file.path(training.data.dir, df$Dataset)
  32. x <- read.xlsx(list.files(dset.dir, pattern=glob2rx("*.xlsx"), full.names=TRUE)[1], 1) %>%
  33. setNames(c("Filename", "Phenotype", "ScanDate"))
  34. x$Filename <- as.character(x$Filename)
  35. missing.CEL <- !str_detect(x$Filename, "\\.CEL$")
  36. x$Filename[missing.CEL] <- str_c(x$Filename[missing.CEL], ".CEL")
  37. stopifnot(all(str_detect(x$Filename, "\\.CEL$")))
  38. parsed.date <- parse.date.from.filename(x$Filename)
  39. x$ScanDate[!is.na(parsed.date)] <- parsed.date[!is.na(parsed.date)]
  40. x %>% cbind(df) %>%
  41. transform(Filename=file.path(dset.dir, Filename),
  42. Batch=droplevels(Tissue:Dataset:factor(ScanDate):Phenotype)) %>%
  43. subset(! Filename %in% blacklist)
  44. }) %>% split(.$Tissue) %>% lapply(droplevels)
  45. annotation <- cleancdfname(affyio:::read.celfile.header(sample.tables[[1]]$Filename[1])$cdfName, FALSE)
  46. esets <- list()
  47. for (i in names(sample.tables)) {
  48. pkgname <- sprintf("DSalomon.%s.%sfrmavecs", i, annotation)
  49. message("Loading ", pkgname)
  50. require(pkgname, character.only=TRUE, quietly=TRUE)
  51. data(list = pkgname)
  52. message("Loading raw data for ", i)
  53. stab <- sample.tables[[i]]
  54. affy <- ReadAffy(filenames=stab$Filename, phenoData=stab)
  55. message("Running fRMA for ", i)
  56. esets[[i]] <- frma(affy, input.vecs=get(pkgname), verbose=TRUE)
  57. rm(affy)
  58. gc()
  59. }