test.R 2.3 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566
  1. #!/usr/bin/env Rscript
  2. library(xlsx)
  3. library(frma)
  4. library(frmaTools)
  5. library(stringr)
  6. library(magrittr)
  7. library(plyr)
  8. library(affy)
  9. library(preprocessCore)
  10. training.data.dir <- "Training Data"
  11. datasets <- data.frame(Dataset=list.files(training.data.dir))
  12. rownames(datasets) <- datasets$Dataset
  13. datasets$Tissue <- factor(str_extract(datasets$Dataset, "\\b(PAX|BX)\\b"))
  14. tsmsg <- function(...) {
  15. message(date(), ": ", ...)
  16. }
  17. parse.date.from.filename <- function(fname) {
  18. res1 <- str_match(fname, "^(\\d\\d)(\\d\\d)(\\d\\d)")[,c(4,2,3)]
  19. res2 <- str_match(fname, "^20(\\d\\d)_(\\d\\d)_(\\d\\d)")[,-1]
  20. res1[is.na(res1)] <- res2[is.na(res1)]
  21. colnames(res1) <- c("year", "month", "day")
  22. res1[,"year"] %<>% str_c("20", .)
  23. as.Date(do.call(ISOdate, data.frame(res1)))
  24. }
  25. sample.tables <- ddply(datasets, .(Dataset), function(df) {
  26. df <- df[1,]
  27. rownames(df) <- NULL
  28. dset.dir <- file.path(training.data.dir, df$Dataset)
  29. x <- read.xlsx(list.files(dset.dir, pattern=glob2rx("*.xlsx"), full.names=TRUE)[1], 1) %>%
  30. setNames(c("Filename", "Phenotype", "ScanDate"))
  31. x$Filename <- as.character(x$Filename)
  32. missing.CEL <- !str_detect(x$Filename, "\\.CEL$")
  33. x$Filename[missing.CEL] <- str_c(x$Filename[missing.CEL], ".CEL")
  34. stopifnot(all(str_detect(x$Filename, "\\.CEL$")))
  35. parsed.date <- parse.date.from.filename(x$Filename)
  36. x$ScanDate[!is.na(parsed.date)] <- parsed.date[!is.na(parsed.date)]
  37. x %>% cbind(df) %>%
  38. transform(Filename=file.path(dset.dir, Filename),
  39. Batch=droplevels(Tissue:Dataset:factor(ScanDate):Phenotype)) %>%
  40. subset(! Filename %in% blacklist)
  41. }) %>% split(.$Tissue) %>% lapply(droplevels)
  42. annotation <- cleancdfname(affyio:::read.celfile.header(sample.tables[[1]]$Filename[1])$cdfName, FALSE)
  43. esets <- list()
  44. for (i in names(sample.tables)) {
  45. pkgname <- sprintf("DSalomon.%s.%sfrmavecs", i, annotation)
  46. message("Loading ", pkgname)
  47. require(pkgname, character.only=TRUE, quietly=TRUE)
  48. data(list = pkgname)
  49. message("Loading raw data for ", i)
  50. stab <- sample.tables[[i]]
  51. affy <- ReadAffy(filenames=stab$Filename, phenoData=stab)
  52. message("Running fRMA for ", i)
  53. esets[[i]] <- frma(affy, input.vecs=get(pkgname), verbose=TRUE)
  54. rm(affy)
  55. gc()
  56. }