edger-pipeline.R 9.1 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247
  1. #!/usr/bin/Rscript
  2. source("common.R")
  3. library(rtracklayer)
  4. library(DiffBind)
  5. library(plyr)
  6. library(doMC)
  7. registerDoMC()
  8. options(cores=multicore:::detectCores())
  9. library(snow)
  10. library(stringr)
  11. library(RColorBrewer)
  12. library(xlsx)
  13. library(edgeR)
  14. tsmsg <- function(...) {
  15. message(date(), ": ", ...)
  16. }
  17. tsmsgf <- function(...) {
  18. tsmsg(sprintf(...))
  19. }
  20. ## Additional args are passed to every call of addDataFrame
  21. write.xlsx.multisheet <- function(data.frames, file, sheetNames=names(data.frames), ...) {
  22. if (is.null(sheetNames)) {
  23. sheetNames <- str_c("Sheet", seq_along(data.frames))
  24. }
  25. ## Ensure correct number of sheetNames
  26. stopifnot(length(sheetNames) == length(data.frames))
  27. ## Fill in missing names if needed
  28. sheetNames[is.na(sheetNames)] <- str_c("Sheet", seq_along(data.frames))[is.na(sheetNames)]
  29. wb <- createWorkbook()
  30. sheets <- llply(sheetNames, function(x) createSheet(wb, sheetName=x))
  31. mlply(cbind(sheet=sheets, x=data.frames), .fun=addDataFrame, .parallel=FALSE, ...)
  32. saveWorkbook(wb, file)
  33. }
  34. renice.self <- function(niceness=19) {
  35. system2("renice", args=c("-n", as.character(niceness), Sys.getpid()))
  36. }
  37. makeNiceCluster <- function(...) {
  38. cl <- makeCluster(...)
  39. clusterCall(cl, renice.self)
  40. cl
  41. }
  42. select.nearest <- function(x, y) {
  43. y[nearest(x,y)]
  44. }
  45. kgxref <- get.ucsc.table("knownGene","kgXref")
  46. get.kgxref <- function(kgID) {
  47. x <- merge(x=DataFrame(kgID=kgID), y=kgxref,
  48. all.x=TRUE, all.y=FALSE)
  49. x[names(x) != "kgID"]
  50. }
  51. read.htseq.counts <- function(f) {
  52. x <- read.table(f, header=FALSE, sep="\t")
  53. names(x) <- c("ID", "count")
  54. ## Discard the last 5 lines
  55. exception.rows <- (nrow(x)-4):nrow(x)
  56. attr(x, "exception_counts") <- x[exception.rows,]
  57. x <- x[-exception.rows,]
  58. x
  59. }
  60. get.transcript.lengths <- function(transcript.ids, exon.lengths) {
  61. exons.by.transcript <- split(exon.lengths, transcript.ids)
  62. sapply(exons.by.transcript, sum)
  63. }
  64. get.gene.lengths <- function(gene.ids, transcript.lengths, method="max") {
  65. if (is.character(method)) {
  66. method <- get(method)
  67. }
  68. transcripts.by.genes <- split(transcript.lengths, gene.ids)
  69. sapply(transcripts.by.genes, method)
  70. }
  71. str_matches <- function(string, pattern) {
  72. !is.na(str_match(string, pattern)[,1])
  73. }
  74. annotate.ensp.in.hsap.or.mmul <- function(ensp.ids) {
  75. ensembl=useMart("ensembl")
  76. datasets <- c("mmulatta_gene_ensembl", "hsapiens_gene_ensembl")
  77. x <- Reduce(rbind, llply(datasets, function(x) {
  78. ensembl <- useDataset(x, mart=ensembl)
  79. getBM(filters="ensembl_peptide_id",
  80. values=unique(ensp.ids),
  81. attributes=c("hgnc_symbol", "wikigene_name", "ensembl_gene_id", "ensembl_transcript_id", "ensembl_peptide_id", "description", "wikigene_description"),
  82. mart=ensembl,
  83. uniqueRows=TRUE)
  84. }))
  85. ## Convert empty string to NA in all columns
  86. x <- data.frame(llply(x, function(column) ifelse(column == "", NA, column)))
  87. ## Unify symbols & descriptions
  88. unified.symbol <- ifelse(is.na(x$hgnc_symbol), as.character(x$wikigene_name), as.character(x$hgnc_symbol))
  89. unified.desc <- ifelse(is.na(x$description), as.character(x$wikigene_description), as.character(x$description))
  90. x <- data.frame(symbol=unified.symbol, x[! names(x) %in% c("hgnc_symbol", "wikigene_name", "description", "wikigene_description")], description=unified.desc)
  91. ## Reorder rows so that more-preferred rows for the same input are on top
  92. x <- x[order(is.na(x$symbol), str_matches(x$symbol, "^LOC\\d+$"), is.na(x$description)),]
  93. ## Keep only the first row for each input
  94. x <- x[!duplicated(x$ensembl_peptide_id),]
  95. data.frame(x, row.names=x$ensembl_peptide_id)
  96. }
  97. ## TODO: Replace by biomart
  98. CE.name.annot <- {
  99. x <- setNames(nm=c("CE_ensembl_peptide_id", "symbol", "description"),
  100. read.table("annotation/CE.name", sep="\t", quote=""))
  101. x$ensembl_peptide_id <- str_replace(x$CE_ensembl_peptide_id, "^CE_", "")
  102. row.names(x) <- x$ensembl_peptide_id
  103. x
  104. }
  105. annotation.gff <- import("cuffmerge_results/merged.gff")
  106. transcripts <- annotation.gff[annotation.gff$type == "transcript", ]
  107. exons <- annotation.gff[annotation.gff$type == "exon", ]
  108. transcript.lengths <- get.transcript.lengths(unlist(exons$Parent), width(exons))
  109. gene.lengths <- get.gene.lengths(transcripts$geneID, transcript.lengths[transcripts$ID])
  110. gene.ref.lists <- sapply(split(transcripts$nearest_ref, transcripts$geneID),
  111. function(x) unique(x[!is.na(x)]))
  112. ## For genes with multiple ref IDs, join them with commas
  113. gene.refs <- sapply(gene.ref.lists, function (x) {
  114. if (length(x) == 0) {
  115. NA
  116. } else {
  117. str_c(x, collapse=",")
  118. }
  119. })
  120. ## For genes with multiple ref IDs, just pick the first one
  121. gene.first.refs <- sapply(gene.ref.lists, function(x) {
  122. if (length(x) == 0) {
  123. NA
  124. } else {
  125. x[[1]]
  126. }
  127. })
  128. ## Use delayed assignment so that the cluster won't be created until it is actually needed
  129. delayedAssign("cl", makeNiceCluster(rep(c("salomon14", "salomon18", "salomon19"), 8)))
  130. mart.raw.annot <- annotate.ensp.in.hsap.or.mmul(na.omit(gene.first.refs))
  131. gene.annot <- data.frame(mart.raw.annot[gene.first.refs,], row.names=names(gene.first.refs))
  132. ## Fill in missing values that are available from the CE.name file
  133. ## provided by the cyno genome paper authors
  134. suppl.gene.annot <- data.frame(CE.name.annot[gene.first.refs,], row.names=names(gene.first.refs))
  135. suppl.ensp <- is.na(gene.annot$ensembl_peptide_id) & !is.na(suppl.gene.annot$ensembl_peptide_id)
  136. suppl.symbols <- is.na(gene.annot$symbol) & !is.na(suppl.gene.annot$symbol)
  137. gene.annot$ensembl_peptide_id <- ifelse(suppl.ensp,
  138. as.character(suppl.gene.annot$ensembl_peptide_id),
  139. as.character(gene.annot$ensembl_peptide_id))
  140. gene.annot$symbol <- ifelse(suppl.symbols,
  141. as.character(suppl.gene.annot$symbol),
  142. as.character(gene.annot$symbol))
  143. ## Replace description whenever we replace the symbol to keep things consistent
  144. gene.annot$description <- ifelse(suppl.symbols,
  145. as.character(suppl.gene.annot$description),
  146. as.character(gene.annot$description))
  147. gene.annot$symbol[is.na(gene.annot$ensembl_peptide_id)] <- "[No annotation]"
  148. gene.annot$symbol[is.na(gene.annot$symbol)] <- "[No symbol for ENSP]"
  149. expdata <- {
  150. x <- read.xlsx("kenyon-cyno-expdata.xls", 1)
  151. x$Animal.ID <- str_trim(x$Animal.ID)
  152. x$Condition <- str_replace_all(x$Condition, "γ", "g")
  153. x$Sample.ID <-
  154. sprintf("%s_%s_%s",
  155. str_replace_all(x$Animal.ID, "[ -]", "_"),
  156. ifelse(x$Condition == "Control", "CTRL", "IFNg"),
  157. x$Passage)
  158. x$Sample.Name <-
  159. sprintf("%s_%s_%s_L%03i",
  160. str_replace_all(x$Animal.ID, "[ -]", "_"),
  161. ifelse(x$Condition == "Control", 1, 2),
  162. x$Index.Seq,
  163. x$Lane)
  164. x$FASTQ.Read1.File <- sprintf("seqprep_results/%s/%s_R1_001.fastq", x$Sample.Name, x$Sample.Name)
  165. x$FASTQ.Read2.File <- sprintf("seqprep_results/%s/%s_R2_001.fastq", x$Sample.Name, x$Sample.Name)
  166. x$BAM.File <- sprintf("tophat_results/%s/accepted_hits.bam", x$Sample.Name)
  167. x$GFF.File <- sprintf("cufflinks_quantification/%s/transcripts.gff", x$Sample.Name)
  168. x$Counts.File <- sprintf("htseq_counts/%s/counts.txt", x$Sample.Name)
  169. mcparallel(write.xlsx(x, "expdata.xlsx"))
  170. rownames(x) <- x$Sample.ID
  171. x
  172. }
  173. counts.vectors <- setNames(llply(expdata$Counts.File, .parallel=TRUE, function(f) {
  174. x <- read.htseq.counts(f)
  175. setNames(x$count, x$ID)
  176. }), expdata$Sample.ID)
  177. ## Put the data into a matrix, making sure we account for the
  178. ## possibility that not all genes are listed in all samples.
  179. all.geneIDs <- sort(unique(unlist(llply(counts.vectors, names))))
  180. counts <- {
  181. x <- matrix(data=0, ncol=length(counts.vectors), nrow=length(all.geneIDs),
  182. dimnames=list(`geneID`=all.geneIDs, `sample`=names(counts.vectors)))
  183. for (i in expdata$Sample.ID) {
  184. cvec <- counts.vectors[[i]]
  185. x[names(cvec),i] <- cbind(cvec)
  186. }
  187. x
  188. }
  189. blocked.design <- model.matrix(~Animal.ID+Passage+Condition, data=expdata)
  190. unblocked.design <- model.matrix(~Condition, data=expdata)
  191. dge <- DGEList(counts=counts,
  192. group=expdata$Condition,
  193. genes=gene.annot[rownames(counts),])
  194. dge <- calcNormFactors(dge)
  195. blocked.dge <- estimateGLMCommonDisp(dge, blocked.design, verbose=TRUE)
  196. blocked.dge <- estimateGLMTrendedDisp(blocked.dge, blocked.design)
  197. blocked.dge <- estimateGLMTagwiseDisp(blocked.dge, blocked.design)
  198. blocked.fit <- glmFit(blocked.dge, blocked.design)
  199. blocked.lrt <- glmLRT(blocked.dge, blocked.fit, coef="ConditionIFNg activated")
  200. blocked.tt <- topTags(blocked.lrt, n=nrow(counts))
  201. unblocked.dge <- estimateGLMCommonDisp(dge, unblocked.design, verbose=TRUE)
  202. unblocked.dge <- estimateGLMTrendedDisp(unblocked.dge, unblocked.design)
  203. unblocked.dge <- estimateGLMTagwiseDisp(unblocked.dge, unblocked.design)
  204. unblocked.fit <- glmFit(unblocked.dge, unblocked.design)
  205. unblocked.lrt <- glmLRT(unblocked.dge, unblocked.fit, coef="ConditionIFNg activated")
  206. unblocked.tt <- topTags(unblocked.lrt, n=nrow(counts))
  207. write.csv(blocked.tt$table, "edgeR-genes-prelim")
  208. write.xlsx.multisheet(list(blocked=blocked.tt$table,
  209. unblocked=unblocked.tt$table),
  210. "edgeR-genes.xlsx")