blockbuster-pipeline.R 10 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217
  1. #!/usr/bin/env Rscript
  2. source("common.R")
  3. library(BSgenome.Hsapiens.UCSC.hg19)
  4. textConnectionFromLines <- function(lines, linesep="\n") {
  5. textConnection(str_c(sprintf("%s%s", lines, linesep), collapse=""))
  6. }
  7. ## Takes a GRangesList
  8. calculate.block.entropy <- function(grl, expr.column="tagExpression") {
  9. groupfac <- factor(rep(names(grl), elementLengths(grl)))
  10. exprs <- as.vector(elementMetadata(unlist(grl))[[expr.column]])
  11. total.exprs <- aggregate(exprs, by=list(groupfac), FUN=sum)$x
  12. qi <- exprs / rep(total.exprs, elementLengths(grl))
  13. qi.times.log <- qi * log2(qi)
  14. results <- -aggregate(qi.times.log, by=list(groupfac), FUN=sum)$x
  15. names(results) <- names(grl)
  16. results
  17. }
  18. ## http://hoffmann.bioinf.uni-leipzig.de/LIFE/blockbuster.html
  19. read.blockbuster.tag.output <- function(bbout) {
  20. x <- readLines(bbout)
  21. cluster.line.numbers <- which(str_sub(x, 1,1) == ">")
  22. cluster.lines <- str_sub(x[cluster.line.numbers], 2)
  23. cluster.table <- read.table(textConnectionFromLines(cluster.lines),
  24. col.names=c("clusterID", "chrom", "clusterStart", "clusterEnd", "strand", "ClusterExpression", "tagCount", "blockCount"),
  25. colClasses=c("character", "factor", "integer", "integer", "factor", "numeric", "integer", "integer"))
  26. tag.lines <- x[-cluster.line.numbers]
  27. cluster.tag.line.counts <- cluster.line.numbers[-1] - cluster.line.numbers[-length(cluster.line.numbers)] - 1
  28. cluster.tag.line.counts <- c(cluster.tag.line.counts, length(tag.lines) - sum(cluster.tag.line.counts))
  29. tag.table <- read.table(textConnectionFromLines(tag.lines),
  30. col.names=c("tagChrom", "tagStart", "tagEnd", "tagID", "tagExpression", "tagStrand", "blockNb"),
  31. colClasses=c("factor", "integer", "integer", "character", "numeric", "factor", "integer"))
  32. tag.table$clusterID <- rep(cluster.table$clusterID, cluster.tag.line.counts)
  33. tag.table$blockID <- sprintf("%s/B%s", tag.table$clusterID, tag.table$blockNb)
  34. tags <- table.to.granges(tag.table, seqnames.column="tagChrom", start.column="tagStart", end.column="tagEnd", strand.column="tagStrand", seqlengths="hg19")
  35. tags <- split(tags, as.vector(elementMetadata(tags)$blockID))
  36. return(tags)
  37. }
  38. read.blockbuster.output <- function(bbout, bbout.tags=NULL) {
  39. x <- readLines(bbout)
  40. cluster.line.numbers <- which(str_sub(x, 1,1) == ">")
  41. cluster.lines <- str_sub(x[cluster.line.numbers], 2)
  42. block.lines <- x[-cluster.line.numbers]
  43. cluster.table <- read.table(textConnectionFromLines(cluster.lines),
  44. col.names=c("clusterID", "chrom", "clusterStart", "clusterEnd", "strand", "ClusterExpression", "tagCount", "blockCount"),
  45. colClasses=c("character", "factor", "integer", "integer", "factor", "numeric", "integer", "integer"))
  46. clusters <- table.to.granges(cluster.table, seqnames.column="chrom", start.column="clusterStart", end.column="clusterEnd", seqlengths="hg19")
  47. block.table <- read.table(textConnectionFromLines(block.lines),
  48. col.names=c("blockNb", "blockChrom", "blockStart", "blockEnd", "blockStrand", "blockExpression", "readCount"),
  49. colClasses=c("integer", "factor", "integer", "integer", "factor", "numeric", "integer"))
  50. block.table$clusterID <- rep(cluster.table$clusterID, cluster.table$blockCount)
  51. block.table$blockID <- sprintf("%s/B%s", block.table$clusterID, block.table$blockNb)
  52. blocks. <- table.to.granges(block.table, seqnames.column="blockChrom", start.column="blockStart", end.column="blockEnd", strand.column="blockStrand", seqlengths="hg19")
  53. ## blocks. <- split(blocks., as.vector(elementMetadata(blocks.)$clusterID))
  54. retval <- list(clusters=clusters, blocks=blocks.)
  55. if (!is.null(bbout.tags)) {
  56. retval$tags <- read.blockbuster.tag.output(bbout.tags)
  57. }
  58. return(retval)
  59. }
  60. blockbuster.path <- "/home/ryan/bin/blockbuster"
  61. run.blockbuster <- function(gr, ...) {
  62. temp.bed.file <- tempfile(fileext=".bed")
  63. temp.bbout.file <- tempfile(fileext=".bbout")
  64. temp.bbout.tag.file <- tempfile(fileext=".tag.bbout")
  65. tryCatch({
  66. export(sort(gr), temp.bed.file, format="BED")
  67. extra.args <- list(...)
  68. bbargs <- c(rbind(sprintf("-%s", names(extra.args)), extra.args), "-print", "1", temp.bed.file)
  69. bbargs.tags <- c(rbind(sprintf("-%s", names(extra.args)), extra.args), "-print", "2", temp.bed.file)
  70. system2(blockbuster.path, args=bbargs,
  71. stdout=temp.bbout.file)
  72. system2(blockbuster.path, args=bbargs.tags,
  73. stdout=temp.bbout.tag.file)
  74. x <- read.blockbuster.output(temp.bbout.file, temp.bbout.tag.file)
  75. x
  76. }, finally={
  77. unlink(c(temp.bed.file, temp.bbout.file, temp.bbout.tag.file))
  78. })
  79. }
  80. ## Load the reads from the bam file
  81. infile <- "./all-results.bam"
  82. read.ranges <- {
  83. x <- readAlignedRanges(infile, include.reads=TRUE)
  84. ## Throw away unneeded columns
  85. elementMetadata(x) <- subset(elementMetadata(x), select=-c(id,qual,flag))
  86. read.multi.map.counts <- table(as.vector(elementMetadata(x)$seq))
  87. elementMetadata(x)$multimap <- Rle(as.vector(read.multi.map.counts[as.vector(elementMetadata(x)$seq)]))
  88. x
  89. }
  90. ## Get needed annotations
  91. ## rRNA & tRNA (from the repeats table)
  92. repeat.table <- get.ucsc.table("rmsk", "rmsk", genome="hg19")
  93. repeat.ranges <- table.to.granges(repeat.table, seqnames.column="genoName", start.column="genoStart", end.column="genoEnd", seqlengths="hg19")
  94. trna.ranges <- repeat.ranges[elementMetadata(repeat.ranges)$repClass == "tRNA"]
  95. rrna.ranges <- repeat.ranges[elementMetadata(repeat.ranges)$repClass == "rRNA"]
  96. ## miRNA & snoRNA
  97. small.rna.table <- subset(get.ucsc.table("wgRna", "wgRna", genome="hg19"), select=-c(thickStart,thickEnd,bin))
  98. small.rna.ranges <- table.to.granges(small.rna.table, seqnames.column="chrom", start.column="chromStart", end.column="chromEnd", seqlengths="hg19")
  99. miRNA.types <- "miRNA"
  100. snoRNA.types <- c("CDBox", "HAcaBox")
  101. miRNA.ranges <- small.rna.ranges[ elementMetadata(small.rna.ranges)$type %in% miRNA.types ]
  102. snoRNA.ranges <- small.rna.ranges[ elementMetadata(small.rna.ranges)$type %in% snoRNA.types ]
  103. ## chrM
  104. chrM.range <- GRanges(seqnames="chrM", IRanges(start=1,end=seqlengths(read.ranges)[["chrM"]]), seqlengths=seqlengths(read.ranges))
  105. ## For each sequence that maps once to an anotated rRNA, tRNA, miRNA,
  106. ## or chrM, remove *all* mappings for that sequence.
  107. forbidden.ranges <-
  108. Reduce(append,
  109. llply(list(rrna.ranges,
  110. trna.ranges,
  111. ## miRNA.ranges,
  112. chrM.range),
  113. function(x) { elementMetadata(x) <- NULL; x }))
  114. generate.null.ranges <- function(y) {
  115. x <- GRanges(seqnames=names(seqlengths(y)), ranges=IRanges(1,0), strand="*", seqlengths=seqlengths(y))
  116. for (i in names(elementMetadata(y))) {
  117. elementMetadata(x)[[i]] <- as(NA, class(as.vector(elementMetadata(y)[[i]])))
  118. }
  119. x
  120. }
  121. select.nearest <- function(x, y) {
  122. y <- append(y, generate.null.ranges(y))
  123. y[nearest(x,y)]
  124. }
  125. annotate.by.granges <- function(peaks, gr, annot.columns) {
  126. for (i in names(elementMetadata(gr))) {
  127. elementMetadata(gr)[[i]] <- as.vector(elementMetadata(gr)[[i]])
  128. }
  129. nearest.ranges <- select.nearest(peaks, gr)
  130. nearest.distance <- distance(peaks, nearest.ranges, ignore.strand=TRUE)
  131. in.ranges <- Rle(nearest.distance == 0)
  132. annot.data <- elementMetadata(nearest.ranges)[annot.columns]
  133. if (!is.null(names(annot.columns))) {
  134. names(annot.data) <- names(annot.columns)
  135. }
  136. DataFrame(overlap=in.ranges, distance=nearest.distance, annot.data)
  137. }
  138. forbidden.seqs <- unique(as.vector(elementMetadata(read.ranges)$seq[read.ranges %in% forbidden.ranges]))
  139. forbidden.indices <- !is.na(findOverlaps(read.ranges, forbidden.ranges, select="first", ignore.strand=TRUE))
  140. forbidden.read.ranges <- read.ranges[forbidden.indices]
  141. read.ranges <- read.ranges[!forbidden.indices]
  142. ## Read the counts table
  143. read.counts <- {
  144. x <- read.csv("./data/R21_R82_read_expr_matrix_no_blanks",
  145. stringsAsFactors=FALSE, sep="\t", header=TRUE, row.names=1)
  146. row.names(x) <- str_trim(row.names(x))
  147. names(x) <- str_replace(names(x), "_collapsed_sorted_with_zeros$", "")
  148. x <- x[row.names(x) %in% elementMetadata(read.ranges)$seq,]
  149. x <- x[order(names(x))]
  150. x
  151. }
  152. ## Create per-sample bed files with score = count / multimap
  153. sample.read.ranges <-
  154. llply(names(read.counts),
  155. function (sample) {
  156. x <- read.ranges
  157. ## Score = sample count / multimap
  158. elementMetadata(x)$score <-
  159. as.vector(read.counts[as.vector(elementMetadata(x)$seq), sample] / elementMetadata(x)$multimap)
  160. ## Eliminate reads with zero score
  161. x <- x[elementMetadata(x)$score > 0]
  162. x
  163. }, .parallel=TRUE)
  164. names(sample.read.ranges) <- names(read.counts)
  165. ## Run blockbuster on each sample
  166. ## x <- sample.read.ranges[[1]][1:500]
  167. ## y <- run.blockbuster(x)
  168. ## z <- calculate.block.entropy(y$tags)
  169. blockbuster.results <- llply(sample.read.ranges, function(x) run.blockbuster(unstranded(x), minBlockHeight=5), .parallel=TRUE)
  170. ## calculate.block.entropy(blockbuster.results[[1]]$tags[1:5])
  171. ## Calculate entropy of blocks
  172. block.entropy <- llply(blockbuster.results, function(x) calculate.block.entropy(x$tags), .parallel=TRUE)
  173. for (i in names(blockbuster.results)) {
  174. elementMetadata(blockbuster.results[[i]]$blocks)$entropy <- block.entropy[[i]][as.character(elementMetadata(blockbuster.results[[i]]$blocks)$blockID)]
  175. }
  176. ## Plot entropy vs block length
  177. x <- blockbuster.results[[1]]$blocks
  178. y <- cbind(as.data.frame(elementMetadata(x)), width=width(x))
  179. ggplot(y, aes(x=width, y=entropy, color=..density..)) + stat_density2d(geom="tile", aes(fill=..density..), contour=FALSE) + scale_fill_gradient(low="blue", high="yellow") + scale_color_gradient(low="blue", high="yellow")
  180. ## Compute nearest miRNA/snoRNA to each block and add annotation
  181. block.annot <- llply(blockbuster.results, function(br) {
  182. annotate.by.granges(br$blocks, small.rna.ranges, c(nearest.ncRNA="name", ncRNA.type="type"))
  183. }, .parallel=TRUE)
  184. for (i in names(blockbuster.results)) {
  185. elementMetadata(blockbuster.results[[i]]$blocks)[names(block.annot[[i]])] <- block.annot[[i]]
  186. }
  187. ## write output
  188. saveRDS(blockbuster.results, "blockbuster_results.RDS")
  189. block.tables <- llply(blockbuster.results, function(br) as(granges.to.dataframe(br$blocks, ignore.strand=TRUE, include.width=TRUE), "data.frame"))
  190. write.xlsx.multisheet(block.tables, "blockbuster_results.xlsx", row.names=FALSE)