train.R 9.6 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267
  1. #!/usr/bin/env Rscript
  2. library(xlsx)
  3. library(frmaTools)
  4. library(stringr)
  5. library(magrittr)
  6. library(plyr)
  7. library(affy)
  8. library(preprocessCore)
  9. training.data.dir <- "Training Data"
  10. datasets <- data.frame(Dataset=list.files(training.data.dir))
  11. rownames(datasets) <- datasets$Dataset
  12. datasets$Tissue <- factor(str_extract(datasets$Dataset, "\\b(PAX|BX)\\b"))
  13. tsmsg <- function(...) {
  14. message(date(), ": ", ...)
  15. }
  16. ## Some Scan Dates are marked as identical for multiple batches, which
  17. ## is bad. But the dates embedded in the file names for these batches
  18. ## are different, so we use those dates instead.
  19. parse.date.from.filename <- function(fname) {
  20. res1 <- str_match(fname, "^(\\d\\d)(\\d\\d)(\\d\\d)")[,c(4,2,3)]
  21. res2 <- str_match(fname, "^20(\\d\\d)_(\\d\\d)_(\\d\\d)")[,-1]
  22. res1[is.na(res1)] <- res2[is.na(res1)]
  23. colnames(res1) <- c("year", "month", "day")
  24. res1[,"year"] %<>% str_c("20", .)
  25. as.Date(do.call(ISOdate, data.frame(res1)))
  26. }
  27. makeVectorsAffyBatch <- function (files, batch.id, background = "rma", normalize = "quantile",
  28. normVec = NULL, cdfname = NULL, file.dir = ".", verbose = TRUE)
  29. {
  30. wd <- getwd()
  31. setwd(file.dir)
  32. object <- ReadAffy(filenames = files, cdfname = cdfname,
  33. verbose = verbose)
  34. setwd(wd)
  35. if (verbose)
  36. message("Data loaded \n")
  37. batch.size <- table(batch.id)[1]
  38. if (!all(table(batch.id) == batch.size))
  39. stop("Batches must be of the same size.")
  40. if (background == "rma") {
  41. object <- bg.correct.rma(object)
  42. if (verbose)
  43. message("Background Corrected \n")
  44. gc()
  45. }
  46. pms <- pm(object)
  47. pns <- probeNames(object)
  48. pmi <- unlist(pmindex(object))
  49. if (!all(sprintf("%i", pmi) == rownames(pms)))
  50. stop("Mismatch between pmindex and rownames of pms")
  51. rm(object)
  52. gc()
  53. if (normalize == "quantile") {
  54. if (is.null(normVec))
  55. normVec <- normalize.quantiles.determine.target(pms)
  56. pms <- normalize.quantiles.use.target(pms, normVec)
  57. names(normVec) <- as.character(pmi)
  58. if (verbose)
  59. message("Normalized \n")
  60. }
  61. pms <- log2(pms)
  62. gc()
  63. N <- 1:dim(pms)[1]
  64. S <- split(N, pns)
  65. nc <- ncol(pms)
  66. nr <- nrow(pms)
  67. resids <- matrix(ncol = nc, nrow = nr)
  68. probeVec <- vector(length = nr)
  69. if (verbose)
  70. message("Beginning Probe Effect Calculation ... \n")
  71. for (k in 1:length(S)) {
  72. fit <- rcModelPLM(pms[S[[k]], , drop = FALSE])
  73. resids[S[[k]], ] <- fit$Residuals
  74. probeVec[S[[k]]] <- fit$Estimates[(nc + 1):length(fit$Estimates)]
  75. if ((k%%1000) == 0) {
  76. message(paste("Finished probeset:", k, "\n"))
  77. gc()
  78. }
  79. }
  80. names(probeVec) <- as.character(pmi)
  81. if (verbose)
  82. message("Probe Effects Calculated \n")
  83. gc()
  84. tmp <- split(t(resids), batch.id)
  85. withinMean <- lapply(tmp, frmaTools:::getProbeMean, batch.size)
  86. withinVar <- lapply(tmp, frmaTools:::getProbeVar, batch.size)
  87. withinAvgVar <- rowMeans(matrix(unlist(withinVar), ncol = length(withinVar)))
  88. btwVar <- apply(matrix(unlist(withinMean), ncol = length(withinMean)),
  89. 1, var)
  90. rm(tmp)
  91. rm(withinMean)
  92. rm(withinVar)
  93. names(withinAvgVar) <- names(btwVar) <- as.character(pmi)
  94. if (verbose)
  95. message("Probe Variances Calculated \n")
  96. gc()
  97. tmp <- split(resids, pns)
  98. psetMAD <- unlist(lapply(tmp, frmaTools:::getPsetMAD, nc, batch.id))
  99. names(psetMAD) <- names(tmp)
  100. rm(tmp)
  101. rm(resids)
  102. if (verbose)
  103. message("Probe Set SDs Calculated \n")
  104. gc()
  105. w <- 1/(withinAvgVar + btwVar)
  106. w[w == Inf] <- 1
  107. medianSE <- vector(length = length(psetMAD))
  108. if (verbose)
  109. message("Beginning Median SE Calculation ... \n")
  110. for (k in 1:length(S)) {
  111. fit <- frmaTools:::rwaFit2(pms[S[[k]], , drop = FALSE], w[S[[k]]],
  112. probeVec[S[[k]]], psetMAD[k])
  113. medianSE[k] <- median(fit$StdErrors)
  114. if ((k%%1000) == 0) {
  115. message(paste("Finished probeset:", k, "\n"))
  116. gc()
  117. }
  118. }
  119. names(medianSE) <- names(psetMAD)
  120. if (verbose)
  121. message("Median SEs Calculated \n")
  122. gc()
  123. rm(w)
  124. rm(pms)
  125. rm(pns)
  126. gc()
  127. if (is.null(cdfname)) {
  128. vers <- ""
  129. } else {
  130. vers <- as.character(packageVersion(cdfname))
  131. }
  132. ## vers <- ifelse(!is.null(cdfname), as.character(packageVersion(cdfname)),
  133. ## "")
  134. return(list(normVec = normVec, probeVec = probeVec, probeVarWithin = withinAvgVar,
  135. probeVarBetween = btwVar, probesetSD = psetMAD, medianSE = medianSE,
  136. version = vers))
  137. }
  138. ## Error: the following are not valid files:
  139. ## Training Data/03 - TGCG ARADNRCANTX PAX Samples/10733.CEL
  140. blacklist <- "Training Data/03 - TGCG ARADNRCANTX PAX Samples/10733.CEL"
  141. ## This reads in the xlsx file for each of the 7 datasets and combines
  142. ## them into one big table of all samples. The Batch column contains
  143. ## the partitioning of samples into unique combinations of Dataset,
  144. ## Scan Date, and Phenotype. Finally, we split based on Tissue type to
  145. ## get one table for biopsies (BX), and one for blood (PAX).
  146. sample.tables <- ddply(datasets, .(Dataset), function(df) {
  147. df <- df[1,]
  148. rownames(df) <- NULL
  149. dset.dir <- file.path(training.data.dir, df$Dataset)
  150. x <- read.xlsx(list.files(dset.dir, pattern=glob2rx("*.xlsx"), full.names=TRUE)[1], 1) %>%
  151. setNames(c("Filename", "Phenotype", "ScanDate"))
  152. x$Filename <- as.character(x$Filename)
  153. missing.CEL <- !str_detect(x$Filename, "\\.CEL$")
  154. x$Filename[missing.CEL] <- str_c(x$Filename[missing.CEL], ".CEL")
  155. stopifnot(all(str_detect(x$Filename, "\\.CEL$")))
  156. parsed.date <- parse.date.from.filename(x$Filename)
  157. x$ScanDate[!is.na(parsed.date)] <- parsed.date[!is.na(parsed.date)]
  158. x %>% cbind(df) %>%
  159. transform(Filename=file.path(dset.dir, Filename),
  160. Batch=droplevels(Tissue:Dataset:factor(ScanDate):Phenotype)) %>%
  161. subset(! Filename %in% blacklist) %>%
  162. subset(!duplicated(Filename))
  163. }) %>%
  164. split(.$Tissue) %>%
  165. lapply(droplevels)
  166. ## fRMA requires equal-sized batches, so for each batch size from 3 to
  167. ## 15, compute how many batches have at least that many samples.
  168. x <- sapply(3:15, function(i) sapply(sample.tables, . %$% Batch %>% table %>% as.vector %>% {sum(. >= i)}))
  169. colnames(x) <- 3:15
  170. ## Based on the above and the recommendations in the frmaTools paper,
  171. ## I chose 5 as the optimal batch size. This could be optimized
  172. ## empirically, though.
  173. arrays.per.batch <- 5
  174. ## For each tissue type, compute fRMA vectors.
  175. vectors <- lapply(sample.tables, function(stab) {
  176. set.seed(1986)
  177. tsmsg("Reading full dataset")
  178. affy <- ReadAffy(filenames=stab$Filename, sampleNames=rownames(stab))
  179. tsmsg("Getting reference normalization distribution from full dataset")
  180. normVec <- normalize.quantiles.determine.target(pm(bg.correct.rma(affy)))
  181. rm(affy); gc()
  182. tsmsg("Selecting batches")
  183. ## Keep only arrays with enough samples
  184. big.enough <- stab$Batch %>% table %>% .[.>= arrays.per.batch] %>% names
  185. stab <- stab[stab$Batch %in% big.enough,] %>% droplevels
  186. ## Sample an equal number of arrays from each batch
  187. subtab <- ddply(stab, .(Batch), function(df) {
  188. df[sample(seq(nrow(df)), size=arrays.per.batch),]
  189. })
  190. tsmsg("Making fRMA vectors")
  191. ## Make fRMA vectors, using normVec from full dataset
  192. res <- makeVectorsAffyBatch(subtab$Filename, subtab$Batch, normVec=normVec)
  193. tsmsg("Finished.")
  194. res
  195. })
  196. ## The code below here just takes the trained vectors and packages
  197. ## them up into installable R packages.
  198. makePackageFromVectors <-
  199. function (vecs, version, maintainer, species, annotation,
  200. packageName, file.dir = ".",
  201. output.dir = ".", unlink = TRUE)
  202. {
  203. platform <- gsub("cdf$", "", annotation)
  204. ## type <- match.arg(type, c("AffyBatch", "FeatureSet"))
  205. ## if (type == "AffyBatch")
  206. ## platform <- gsub("cdf", "", annotation)
  207. ## if (type == "FeatureSet") {
  208. ## platform <- annotation
  209. ## require(oligo)
  210. ## }
  211. thispkg <- "frmaTools"
  212. desc <- packageDescription(thispkg)
  213. thispkgVers <- desc$Version
  214. symbolValues <- list(ARRAYTYPE = platform, VERSION = version,
  215. CREATOR = paste("package", thispkg, "version", thispkgVers),
  216. FRMATOOLSVERSION = thispkgVers, MAINTAINER = maintainer,
  217. SPECIES = species)
  218. createdPkg <- createPackage(packageName, destinationDir = output.dir,
  219. originDir = system.file("VectorPkg-template", package = "frmaTools"),
  220. symbolValues = symbolValues, unlink = unlink)
  221. ## if (type == "AffyBatch")
  222. ## vecs <- makeVectorsAffyBatch(files, batch.id, background,
  223. ## normalize, normVec, annotation, file.dir, verbose)
  224. ## if (type == "FeatureSet")
  225. ## vecs <- makeVectorsFeatureSet(files, batch.id, annotation,
  226. ## background, normalize, normVec, file.dir, verbose)
  227. assign(packageName, vecs)
  228. save(list = eval(packageName), file = file.path(createdPkg$pkgdir,
  229. "data", paste(packageName, ".rda", sep = "")), compress = TRUE)
  230. }
  231. annotation <- cleancdfname(affyio:::read.celfile.header(sample.tables[[1]]$Filename[1])$cdfName, FALSE)
  232. dir.create("pkgs", FALSE, TRUE, mode="755")
  233. for (i in names(vectors)) {
  234. vecs <- vectors[[i]]
  235. pkgname <- sprintf("DSalomon.%s.%sfrmavecs", i, annotation)
  236. message("Making ", pkgname)
  237. makePackageFromVectors(
  238. vecs,
  239. version="0.1",
  240. maintainer="Ryan C. Thompson <rcthomps@scripps.edu>",
  241. species="Homo_sapiens",
  242. annotation=annotation,
  243. packageName=pkgname,
  244. output.dir = "pkgs")
  245. }
  246. save.image("train-data.rda")