123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217 |
- #!/usr/bin/env Rscript
- source("common.R")
- library(BSgenome.Hsapiens.UCSC.hg19)
- textConnectionFromLines <- function(lines, linesep="\n") {
- textConnection(str_c(sprintf("%s%s", lines, linesep), collapse=""))
- }
- ## Takes a GRangesList
- calculate.block.entropy <- function(grl, expr.column="tagExpression") {
- groupfac <- factor(rep(names(grl), elementLengths(grl)))
- exprs <- as.vector(elementMetadata(unlist(grl))[[expr.column]])
- total.exprs <- aggregate(exprs, by=list(groupfac), FUN=sum)$x
- qi <- exprs / rep(total.exprs, elementLengths(grl))
- qi.times.log <- qi * log2(qi)
- results <- -aggregate(qi.times.log, by=list(groupfac), FUN=sum)$x
- names(results) <- names(grl)
- results
- }
- ## http://hoffmann.bioinf.uni-leipzig.de/LIFE/blockbuster.html
- read.blockbuster.tag.output <- function(bbout) {
- x <- readLines(bbout)
- cluster.line.numbers <- which(str_sub(x, 1,1) == ">")
- cluster.lines <- str_sub(x[cluster.line.numbers], 2)
- cluster.table <- read.table(textConnectionFromLines(cluster.lines),
- col.names=c("clusterID", "chrom", "clusterStart", "clusterEnd", "strand", "ClusterExpression", "tagCount", "blockCount"),
- colClasses=c("character", "factor", "integer", "integer", "factor", "numeric", "integer", "integer"))
- tag.lines <- x[-cluster.line.numbers]
- cluster.tag.line.counts <- cluster.line.numbers[-1] - cluster.line.numbers[-length(cluster.line.numbers)] - 1
- cluster.tag.line.counts <- c(cluster.tag.line.counts, length(tag.lines) - sum(cluster.tag.line.counts))
- tag.table <- read.table(textConnectionFromLines(tag.lines),
- col.names=c("tagChrom", "tagStart", "tagEnd", "tagID", "tagExpression", "tagStrand", "blockNb"),
- colClasses=c("factor", "integer", "integer", "character", "numeric", "factor", "integer"))
- tag.table$clusterID <- rep(cluster.table$clusterID, cluster.tag.line.counts)
- tag.table$blockID <- sprintf("%s/B%s", tag.table$clusterID, tag.table$blockNb)
- tags <- table.to.granges(tag.table, seqnames.column="tagChrom", start.column="tagStart", end.column="tagEnd", strand.column="tagStrand", seqlengths="hg19")
- tags <- split(tags, as.vector(elementMetadata(tags)$blockID))
- return(tags)
- }
- read.blockbuster.output <- function(bbout, bbout.tags=NULL) {
- x <- readLines(bbout)
- cluster.line.numbers <- which(str_sub(x, 1,1) == ">")
- cluster.lines <- str_sub(x[cluster.line.numbers], 2)
- block.lines <- x[-cluster.line.numbers]
- cluster.table <- read.table(textConnectionFromLines(cluster.lines),
- col.names=c("clusterID", "chrom", "clusterStart", "clusterEnd", "strand", "ClusterExpression", "tagCount", "blockCount"),
- colClasses=c("character", "factor", "integer", "integer", "factor", "numeric", "integer", "integer"))
- clusters <- table.to.granges(cluster.table, seqnames.column="chrom", start.column="clusterStart", end.column="clusterEnd", seqlengths="hg19")
- block.table <- read.table(textConnectionFromLines(block.lines),
- col.names=c("blockNb", "blockChrom", "blockStart", "blockEnd", "blockStrand", "blockExpression", "readCount"),
- colClasses=c("integer", "factor", "integer", "integer", "factor", "numeric", "integer"))
- block.table$clusterID <- rep(cluster.table$clusterID, cluster.table$blockCount)
- block.table$blockID <- sprintf("%s/B%s", block.table$clusterID, block.table$blockNb)
- blocks. <- table.to.granges(block.table, seqnames.column="blockChrom", start.column="blockStart", end.column="blockEnd", strand.column="blockStrand", seqlengths="hg19")
- ## blocks. <- split(blocks., as.vector(elementMetadata(blocks.)$clusterID))
- retval <- list(clusters=clusters, blocks=blocks.)
- if (!is.null(bbout.tags)) {
- retval$tags <- read.blockbuster.tag.output(bbout.tags)
- }
- return(retval)
- }
- blockbuster.path <- "/home/ryan/bin/blockbuster"
- run.blockbuster <- function(gr, ...) {
- temp.bed.file <- tempfile(fileext=".bed")
- temp.bbout.file <- tempfile(fileext=".bbout")
- temp.bbout.tag.file <- tempfile(fileext=".tag.bbout")
- tryCatch({
- export(sort(gr), temp.bed.file, format="BED")
- extra.args <- list(...)
- bbargs <- c(rbind(sprintf("-%s", names(extra.args)), extra.args), "-print", "1", temp.bed.file)
- bbargs.tags <- c(rbind(sprintf("-%s", names(extra.args)), extra.args), "-print", "2", temp.bed.file)
- system2(blockbuster.path, args=bbargs,
- stdout=temp.bbout.file)
- system2(blockbuster.path, args=bbargs.tags,
- stdout=temp.bbout.tag.file)
- x <- read.blockbuster.output(temp.bbout.file, temp.bbout.tag.file)
- x
- }, finally={
- unlink(c(temp.bed.file, temp.bbout.file, temp.bbout.tag.file))
- })
- }
- ## Load the reads from the bam file
- infile <- "./all-results.bam"
- read.ranges <- {
- x <- readAlignedRanges(infile, include.reads=TRUE)
- ## Throw away unneeded columns
- elementMetadata(x) <- subset(elementMetadata(x), select=-c(id,qual,flag))
- read.multi.map.counts <- table(as.vector(elementMetadata(x)$seq))
- elementMetadata(x)$multimap <- Rle(as.vector(read.multi.map.counts[as.vector(elementMetadata(x)$seq)]))
- x
- }
- ## Get needed annotations
- ## rRNA & tRNA (from the repeats table)
- repeat.table <- get.ucsc.table("rmsk", "rmsk", genome="hg19")
- repeat.ranges <- table.to.granges(repeat.table, seqnames.column="genoName", start.column="genoStart", end.column="genoEnd", seqlengths="hg19")
- trna.ranges <- repeat.ranges[elementMetadata(repeat.ranges)$repClass == "tRNA"]
- rrna.ranges <- repeat.ranges[elementMetadata(repeat.ranges)$repClass == "rRNA"]
- ## miRNA & snoRNA
- small.rna.table <- subset(get.ucsc.table("wgRna", "wgRna", genome="hg19"), select=-c(thickStart,thickEnd,bin))
- small.rna.ranges <- table.to.granges(small.rna.table, seqnames.column="chrom", start.column="chromStart", end.column="chromEnd", seqlengths="hg19")
- miRNA.types <- "miRNA"
- snoRNA.types <- c("CDBox", "HAcaBox")
- miRNA.ranges <- small.rna.ranges[ elementMetadata(small.rna.ranges)$type %in% miRNA.types ]
- snoRNA.ranges <- small.rna.ranges[ elementMetadata(small.rna.ranges)$type %in% snoRNA.types ]
- ## chrM
- chrM.range <- GRanges(seqnames="chrM", IRanges(start=1,end=seqlengths(read.ranges)[["chrM"]]), seqlengths=seqlengths(read.ranges))
- ## For each sequence that maps once to an anotated rRNA, tRNA, miRNA,
- ## or chrM, remove *all* mappings for that sequence.
- forbidden.ranges <-
- Reduce(append,
- llply(list(rrna.ranges,
- trna.ranges,
- ## miRNA.ranges,
- chrM.range),
- function(x) { elementMetadata(x) <- NULL; x }))
- generate.null.ranges <- function(y) {
- x <- GRanges(seqnames=names(seqlengths(y)), ranges=IRanges(1,0), strand="*", seqlengths=seqlengths(y))
- for (i in names(elementMetadata(y))) {
- elementMetadata(x)[[i]] <- as(NA, class(as.vector(elementMetadata(y)[[i]])))
- }
- x
- }
- select.nearest <- function(x, y) {
- y <- append(y, generate.null.ranges(y))
- y[nearest(x,y)]
- }
- annotate.by.granges <- function(peaks, gr, annot.columns) {
- for (i in names(elementMetadata(gr))) {
- elementMetadata(gr)[[i]] <- as.vector(elementMetadata(gr)[[i]])
- }
- nearest.ranges <- select.nearest(peaks, gr)
- nearest.distance <- distance(peaks, nearest.ranges, ignore.strand=TRUE)
- in.ranges <- Rle(nearest.distance == 0)
- annot.data <- elementMetadata(nearest.ranges)[annot.columns]
- if (!is.null(names(annot.columns))) {
- names(annot.data) <- names(annot.columns)
- }
- DataFrame(overlap=in.ranges, distance=nearest.distance, annot.data)
- }
- forbidden.seqs <- unique(as.vector(elementMetadata(read.ranges)$seq[read.ranges %in% forbidden.ranges]))
- forbidden.indices <- !is.na(findOverlaps(read.ranges, forbidden.ranges, select="first", ignore.strand=TRUE))
- forbidden.read.ranges <- read.ranges[forbidden.indices]
- read.ranges <- read.ranges[!forbidden.indices]
- ## Read the counts table
- read.counts <- {
- x <- read.csv("./data/R21_R82_read_expr_matrix_no_blanks",
- stringsAsFactors=FALSE, sep="\t", header=TRUE, row.names=1)
- row.names(x) <- str_trim(row.names(x))
- names(x) <- str_replace(names(x), "_collapsed_sorted_with_zeros$", "")
- x <- x[row.names(x) %in% elementMetadata(read.ranges)$seq,]
- x <- x[order(names(x))]
- x
- }
- ## Create per-sample bed files with score = count / multimap
- sample.read.ranges <-
- llply(names(read.counts),
- function (sample) {
- x <- read.ranges
- ## Score = sample count / multimap
- elementMetadata(x)$score <-
- as.vector(read.counts[as.vector(elementMetadata(x)$seq), sample] / elementMetadata(x)$multimap)
- ## Eliminate reads with zero score
- x <- x[elementMetadata(x)$score > 0]
- x
- }, .parallel=TRUE)
- names(sample.read.ranges) <- names(read.counts)
- ## Run blockbuster on each sample
- ## x <- sample.read.ranges[[1]][1:500]
- ## y <- run.blockbuster(x)
- ## z <- calculate.block.entropy(y$tags)
- blockbuster.results <- llply(sample.read.ranges, function(x) run.blockbuster(unstranded(x), minBlockHeight=5), .parallel=TRUE)
- ## calculate.block.entropy(blockbuster.results[[1]]$tags[1:5])
- ## Calculate entropy of blocks
- block.entropy <- llply(blockbuster.results, function(x) calculate.block.entropy(x$tags), .parallel=TRUE)
- for (i in names(blockbuster.results)) {
- elementMetadata(blockbuster.results[[i]]$blocks)$entropy <- block.entropy[[i]][as.character(elementMetadata(blockbuster.results[[i]]$blocks)$blockID)]
- }
- ## Plot entropy vs block length
- x <- blockbuster.results[[1]]$blocks
- y <- cbind(as.data.frame(elementMetadata(x)), width=width(x))
- ggplot(y, aes(x=width, y=entropy, color=..density..)) + stat_density2d(geom="tile", aes(fill=..density..), contour=FALSE) + scale_fill_gradient(low="blue", high="yellow") + scale_color_gradient(low="blue", high="yellow")
- ## Compute nearest miRNA/snoRNA to each block and add annotation
- block.annot <- llply(blockbuster.results, function(br) {
- annotate.by.granges(br$blocks, small.rna.ranges, c(nearest.ncRNA="name", ncRNA.type="type"))
- }, .parallel=TRUE)
- for (i in names(blockbuster.results)) {
- elementMetadata(blockbuster.results[[i]]$blocks)[names(block.annot[[i]])] <- block.annot[[i]]
- }
- ## write output
- saveRDS(blockbuster.results, "blockbuster_results.RDS")
- block.tables <- llply(blockbuster.results, function(br) as(granges.to.dataframe(br$blocks, ignore.strand=TRUE, include.width=TRUE), "data.frame"))
- write.xlsx.multisheet(block.tables, "blockbuster_results.xlsx", row.names=FALSE)
|