#!/usr/bin/env Rscript source("common.R") library(BSgenome.Hsapiens.UCSC.hg19) textConnectionFromLines <- function(lines, linesep="\n") { textConnection(str_c(sprintf("%s%s", lines, linesep), collapse="")) } ## Takes a GRangesList calculate.block.entropy <- function(grl, expr.column="tagExpression") { groupfac <- factor(rep(names(grl), elementLengths(grl))) exprs <- as.vector(elementMetadata(unlist(grl))[[expr.column]]) total.exprs <- aggregate(exprs, by=list(groupfac), FUN=sum)$x qi <- exprs / rep(total.exprs, elementLengths(grl)) qi.times.log <- qi * log2(qi) results <- -aggregate(qi.times.log, by=list(groupfac), FUN=sum)$x names(results) <- names(grl) results } ## http://hoffmann.bioinf.uni-leipzig.de/LIFE/blockbuster.html read.blockbuster.tag.output <- function(bbout) { x <- readLines(bbout) cluster.line.numbers <- which(str_sub(x, 1,1) == ">") cluster.lines <- str_sub(x[cluster.line.numbers], 2) cluster.table <- read.table(textConnectionFromLines(cluster.lines), col.names=c("clusterID", "chrom", "clusterStart", "clusterEnd", "strand", "ClusterExpression", "tagCount", "blockCount"), colClasses=c("character", "factor", "integer", "integer", "factor", "numeric", "integer", "integer")) tag.lines <- x[-cluster.line.numbers] cluster.tag.line.counts <- cluster.line.numbers[-1] - cluster.line.numbers[-length(cluster.line.numbers)] - 1 cluster.tag.line.counts <- c(cluster.tag.line.counts, length(tag.lines) - sum(cluster.tag.line.counts)) tag.table <- read.table(textConnectionFromLines(tag.lines), col.names=c("tagChrom", "tagStart", "tagEnd", "tagID", "tagExpression", "tagStrand", "blockNb"), colClasses=c("factor", "integer", "integer", "character", "numeric", "factor", "integer")) tag.table$clusterID <- rep(cluster.table$clusterID, cluster.tag.line.counts) tag.table$blockID <- sprintf("%s/B%s", tag.table$clusterID, tag.table$blockNb) tags <- table.to.granges(tag.table, seqnames.column="tagChrom", start.column="tagStart", end.column="tagEnd", strand.column="tagStrand", seqlengths="hg19") tags <- split(tags, as.vector(elementMetadata(tags)$blockID)) return(tags) } read.blockbuster.output <- function(bbout, bbout.tags=NULL) { x <- readLines(bbout) cluster.line.numbers <- which(str_sub(x, 1,1) == ">") cluster.lines <- str_sub(x[cluster.line.numbers], 2) block.lines <- x[-cluster.line.numbers] cluster.table <- read.table(textConnectionFromLines(cluster.lines), col.names=c("clusterID", "chrom", "clusterStart", "clusterEnd", "strand", "ClusterExpression", "tagCount", "blockCount"), colClasses=c("character", "factor", "integer", "integer", "factor", "numeric", "integer", "integer")) clusters <- table.to.granges(cluster.table, seqnames.column="chrom", start.column="clusterStart", end.column="clusterEnd", seqlengths="hg19") block.table <- read.table(textConnectionFromLines(block.lines), col.names=c("blockNb", "blockChrom", "blockStart", "blockEnd", "blockStrand", "blockExpression", "readCount"), colClasses=c("integer", "factor", "integer", "integer", "factor", "numeric", "integer")) block.table$clusterID <- rep(cluster.table$clusterID, cluster.table$blockCount) block.table$blockID <- sprintf("%s/B%s", block.table$clusterID, block.table$blockNb) blocks. <- table.to.granges(block.table, seqnames.column="blockChrom", start.column="blockStart", end.column="blockEnd", strand.column="blockStrand", seqlengths="hg19") ## blocks. <- split(blocks., as.vector(elementMetadata(blocks.)$clusterID)) retval <- list(clusters=clusters, blocks=blocks.) if (!is.null(bbout.tags)) { retval$tags <- read.blockbuster.tag.output(bbout.tags) } return(retval) } blockbuster.path <- "/home/ryan/bin/blockbuster" run.blockbuster <- function(gr, ...) { temp.bed.file <- tempfile(fileext=".bed") temp.bbout.file <- tempfile(fileext=".bbout") temp.bbout.tag.file <- tempfile(fileext=".tag.bbout") tryCatch({ export(sort(gr), temp.bed.file, format="BED") extra.args <- list(...) bbargs <- c(rbind(sprintf("-%s", names(extra.args)), extra.args), "-print", "1", temp.bed.file) bbargs.tags <- c(rbind(sprintf("-%s", names(extra.args)), extra.args), "-print", "2", temp.bed.file) system2(blockbuster.path, args=bbargs, stdout=temp.bbout.file) system2(blockbuster.path, args=bbargs.tags, stdout=temp.bbout.tag.file) x <- read.blockbuster.output(temp.bbout.file, temp.bbout.tag.file) x }, finally={ unlink(c(temp.bed.file, temp.bbout.file, temp.bbout.tag.file)) }) } ## Load the reads from the bam file infile <- "./all-results.bam" read.ranges <- { x <- readAlignedRanges(infile, include.reads=TRUE) ## Throw away unneeded columns elementMetadata(x) <- subset(elementMetadata(x), select=-c(id,qual,flag)) read.multi.map.counts <- table(as.vector(elementMetadata(x)$seq)) elementMetadata(x)$multimap <- Rle(as.vector(read.multi.map.counts[as.vector(elementMetadata(x)$seq)])) x } ## Get needed annotations ## rRNA & tRNA (from the repeats table) repeat.table <- get.ucsc.table("rmsk", "rmsk", genome="hg19") repeat.ranges <- table.to.granges(repeat.table, seqnames.column="genoName", start.column="genoStart", end.column="genoEnd", seqlengths="hg19") trna.ranges <- repeat.ranges[elementMetadata(repeat.ranges)$repClass == "tRNA"] rrna.ranges <- repeat.ranges[elementMetadata(repeat.ranges)$repClass == "rRNA"] ## miRNA & snoRNA small.rna.table <- subset(get.ucsc.table("wgRna", "wgRna", genome="hg19"), select=-c(thickStart,thickEnd,bin)) small.rna.ranges <- table.to.granges(small.rna.table, seqnames.column="chrom", start.column="chromStart", end.column="chromEnd", seqlengths="hg19") miRNA.types <- "miRNA" snoRNA.types <- c("CDBox", "HAcaBox") miRNA.ranges <- small.rna.ranges[ elementMetadata(small.rna.ranges)$type %in% miRNA.types ] snoRNA.ranges <- small.rna.ranges[ elementMetadata(small.rna.ranges)$type %in% snoRNA.types ] ## chrM chrM.range <- GRanges(seqnames="chrM", IRanges(start=1,end=seqlengths(read.ranges)[["chrM"]]), seqlengths=seqlengths(read.ranges)) ## For each sequence that maps once to an anotated rRNA, tRNA, miRNA, ## or chrM, remove *all* mappings for that sequence. forbidden.ranges <- Reduce(append, llply(list(rrna.ranges, trna.ranges, ## miRNA.ranges, chrM.range), function(x) { elementMetadata(x) <- NULL; x })) generate.null.ranges <- function(y) { x <- GRanges(seqnames=names(seqlengths(y)), ranges=IRanges(1,0), strand="*", seqlengths=seqlengths(y)) for (i in names(elementMetadata(y))) { elementMetadata(x)[[i]] <- as(NA, class(as.vector(elementMetadata(y)[[i]]))) } x } select.nearest <- function(x, y) { y <- append(y, generate.null.ranges(y)) y[nearest(x,y)] } annotate.by.granges <- function(peaks, gr, annot.columns) { for (i in names(elementMetadata(gr))) { elementMetadata(gr)[[i]] <- as.vector(elementMetadata(gr)[[i]]) } nearest.ranges <- select.nearest(peaks, gr) nearest.distance <- distance(peaks, nearest.ranges, ignore.strand=TRUE) in.ranges <- Rle(nearest.distance == 0) annot.data <- elementMetadata(nearest.ranges)[annot.columns] if (!is.null(names(annot.columns))) { names(annot.data) <- names(annot.columns) } DataFrame(overlap=in.ranges, distance=nearest.distance, annot.data) } forbidden.seqs <- unique(as.vector(elementMetadata(read.ranges)$seq[read.ranges %in% forbidden.ranges])) forbidden.indices <- !is.na(findOverlaps(read.ranges, forbidden.ranges, select="first", ignore.strand=TRUE)) forbidden.read.ranges <- read.ranges[forbidden.indices] read.ranges <- read.ranges[!forbidden.indices] ## Read the counts table read.counts <- { x <- read.csv("./data/R21_R82_read_expr_matrix_no_blanks", stringsAsFactors=FALSE, sep="\t", header=TRUE, row.names=1) row.names(x) <- str_trim(row.names(x)) names(x) <- str_replace(names(x), "_collapsed_sorted_with_zeros$", "") x <- x[row.names(x) %in% elementMetadata(read.ranges)$seq,] x <- x[order(names(x))] x } ## Create per-sample bed files with score = count / multimap sample.read.ranges <- llply(names(read.counts), function (sample) { x <- read.ranges ## Score = sample count / multimap elementMetadata(x)$score <- as.vector(read.counts[as.vector(elementMetadata(x)$seq), sample] / elementMetadata(x)$multimap) ## Eliminate reads with zero score x <- x[elementMetadata(x)$score > 0] x }, .parallel=TRUE) names(sample.read.ranges) <- names(read.counts) ## Run blockbuster on each sample ## x <- sample.read.ranges[[1]][1:500] ## y <- run.blockbuster(x) ## z <- calculate.block.entropy(y$tags) blockbuster.results <- llply(sample.read.ranges, function(x) run.blockbuster(unstranded(x), minBlockHeight=5), .parallel=TRUE) ## calculate.block.entropy(blockbuster.results[[1]]$tags[1:5]) ## Calculate entropy of blocks block.entropy <- llply(blockbuster.results, function(x) calculate.block.entropy(x$tags), .parallel=TRUE) for (i in names(blockbuster.results)) { elementMetadata(blockbuster.results[[i]]$blocks)$entropy <- block.entropy[[i]][as.character(elementMetadata(blockbuster.results[[i]]$blocks)$blockID)] } ## Plot entropy vs block length x <- blockbuster.results[[1]]$blocks y <- cbind(as.data.frame(elementMetadata(x)), width=width(x)) ggplot(y, aes(x=width, y=entropy, color=..density..)) + stat_density2d(geom="tile", aes(fill=..density..), contour=FALSE) + scale_fill_gradient(low="blue", high="yellow") + scale_color_gradient(low="blue", high="yellow") ## Compute nearest miRNA/snoRNA to each block and add annotation block.annot <- llply(blockbuster.results, function(br) { annotate.by.granges(br$blocks, small.rna.ranges, c(nearest.ncRNA="name", ncRNA.type="type")) }, .parallel=TRUE) for (i in names(blockbuster.results)) { elementMetadata(blockbuster.results[[i]]$blocks)[names(block.annot[[i]])] <- block.annot[[i]] } ## write output saveRDS(blockbuster.results, "blockbuster_results.RDS") block.tables <- llply(blockbuster.results, function(br) as(granges.to.dataframe(br$blocks, ignore.strand=TRUE, include.width=TRUE), "data.frame")) write.xlsx.multisheet(block.tables, "blockbuster_results.xlsx", row.names=FALSE)