#!/usr/bin/env Rscript
default.align.opts <- list(match=1, mismatch=3,
gapOpening=5, gapExtension=2)
parse_arguments <- function() {
suppressMessages({
library(optparse)
library(parallel)
})
option_list <-
list(make_option(c("-c", "--min-call"), type="integer", default=10, metavar="10",
help="Minimum perfect overlap required to call the presence of the subject (paired only). Imperfect overlap will need to be longer, based on specified mismatch and gap penalties."),
make_option(c("-l", "--min-length"), type="integer", default=36, metavar="36",
help="Minimum length allowed after trimming a read. Any reads shorter than this after trimming will be discarded."),
make_option(c("-i", "--interleaved"), action="store_true", default=FALSE,
help="Specify this option if you have paired-end sequences interleaved in a single FASTQ file. The default is to read paired-end sequences from a matched pair of files, and this option is ignored if two fastq files are provided. When you use this option, skip the \"READ2.fastq\" argument."),
make_option(c("-o", "--read1-orientation"), type="character", default="in", metavar="in/out",
help="Orientation of read1. Can be either \"in\" or \"out\" (paired only). Note that Illumina reads are \"in\"."),
make_option(c("-q", "--read2-orientation"), type="character", default="in", metavar="in/out",
help="Orientation of read2. Can be either \"in\" or \"out\" (paired only)"),
make_option(c("-j", "--jobs"), type="integer",
default=parallel:::detectCores(),
metavar=as.character(parallel:::detectCores()),
help="Number of jobs to run in parallel for alignment. This should be autodetected by default."),
make_option(c("-y", "--yield-size"), type="integer",
default=100000,
metavar="100000",
help="Number of reads to process at a time. Setting this higher will read more data into memory at once and result in faster runtime. Setting this lower will require less memory."),
make_option(c("-m", "--match-bonus"), type="double",
default=default.align.opts$match,
metavar=as.character(default.align.opts$match),
help="Score bonus for a matching nucleotide"),
make_option(c("-p", "--mismatch-penalty"), type="double",
default=default.align.opts$mismatch,
metavar=as.character(default.align.opts$mismatch),
help="Score penalty for a mismatched nucleotide (specify as a positive number)"),
make_option(c("-g", "--gap-open-penalty"), type="double",
default=default.align.opts$gapOpening,
metavar=as.character(default.align.opts$gapOpening),
help="Score penalty for opening a gap in the alignment (specifiy as a positive number)"),
make_option(c("-e", "--gap-extension-penalty"), type="double",
default=default.align.opts$match,
metavar=as.character(default.align.opts$gapExtension),
help="Score penalty for extending an alignment gap by two nucleotides (specify as a positive number)"),
make_option(c("-s", "--single-read-mode"), action="store_true", default=FALSE,
help="Tell DeLoxer to run in single-end mode instead of paired-end mode. In this mode, the only a single input fastq file is provided, and only a single output file is created. No classification is performed, only trimming. When you use this option, skip the \"READ2.fastq\" argument, and specify the full file name for OUTPUT_NAME instead of just the base name."))
option_parser <- OptionParser(option_list=option_list,
usage="%prog [options] adapter.fasta READ1.fastq READ2.fastq OUTPUT_NAME")
opt <- parse_args(option_parser, positional_arguments=TRUE)
return(opt)
}
## Call this here to handle --help quickly, before we waste 10 seconds
## loading all the libraries
invisible(parse_arguments())
print.option.list <- function(opt=parse_arguments()) {
args <- opt$args
opts <- opt$options
message("Options:")
foreach (o=opts, n=names(opts)) %do% {
if (n != "help")
message(" ", n, ": ", o)
}
message("Args: ", paste("\"", args, "\"", sep="", collapse=", "))
}
unimplemented <- function() stop("UNIMPLEMENTED")
## Timestampped message
tsmsg <- function(...) {
message("# ", date(), ": ", ...)
}
tsmsg("Starting deloxer and loading required packages")
suppressMessages({
library(ShortRead)
library(optparse)
library(foreach)
library(iterators)
library(itertools)
library(doMC)
registerDoMC()
mcoptions <- list(preschedule=TRUE, set.seed=FALSE)
})
## Merge l1 and l2 by names
merge.lists <- function(l1, l2) {
new.names <- setdiff(names(l2), names(l1))
l1[new.names] <- l2[new.names]
l1
}
## Return an object sans names
strip.names <- function(x) {
names(x) <- NULL
x
}
## Define some missing type coercions
setAs(from="ShortRead", to="DNAStringSet", def=function(from) sread(from))
setAs(from="PhredQuality", to="FastqQuality", def=function(from) FastqQuality(BStringSet(from)))
setAs(from="SolexaQuality", to="SFastqQuality", def=function(from) SFastqQuality(BStringSet(from)))
setAs(from="QualityScaledXStringSet", to="ShortReadQ", def=function(from) {
q <- quality(from)
new.quality.class <- switch(class(q),
SolexaQuality="SFastqQuality",
PhredQuality="FastqQuality",
stop("Unknown quality type: ", class(q)))
q <- as(q, new.quality.class)
ShortReadQ(sread=as(from, "DNAStringSet"),
quality=q,
id=BStringSet(names(from)))
})
## Override the provided method to keep the sequence names
setAs(from="ShortReadQ", to="QualityScaledDNAStringSet",
def=function (from, to = "QualityScaledDNAStringSet", strict = TRUE) {
q <- quality(from)
new.quality.class <- switch(class(q),
SFastqQuality="SolexaQuality",
FastqQuality="PhredQuality",
"XStringQuality")
q <- as(q, new.quality.class)
x <- QualityScaledDNAStringSet(sread(from), q)
names(x) <- as.character(id(from))
x
})
## Define functions for reading fastq into standard Biostrings object
## and writing it back out. The standard functions readFastq and
## writeFastq operate on ShortRead objects. These simply wrap them in
## conversion to/from QualityScaledDNAStringSet.
read.QualityScaledDNAStringSet <- function(filepath, format = "fastq", ...) {
switch(format,
fastq=as(readFastq(filepath, withIds=TRUE, ...), "QualityScaledDNAStringSet"),
## Default
stop("Unknown quality-scaled sequence format: ", format))
}
write.QualityScaledDNAStringSet <- function (x, filepath, append = FALSE, format = "fastq") {
if(length(x) > 0) {
sr <- as(x, "ShortReadQ")
switch(format,
fastq={
if (!append)
unlink(filepath);
writeFastq(object=sr,
file=filepath, mode=ifelse(append, "a", "w"))
},
## Default
stop("Unknown quality-scaled sequence format: ", format))
} else {
## Zero-length sequence; just truncate/touch the file
sink(file=filepath, append=append)
sink()
}
}
discard.short.reads <- function(reads, min.length=1) {
kept.reads <- reads[width(reads) >= min.length]
return(kept.reads)
}
## Takes a set of interleaved reads (or anything else) and
## de-interleaves them
deinterleave.pairs <- function(reads) {
stopifnot(length(reads) %% 2 == 0)
mask <- seq(from=1, to=length(reads), by=2)
return(list(read1=reads[mask], read2=reads[-mask]))
}
.delox.trimmed.ranges <- function(subj, reads, min.length=36,
include.scores=TRUE,
include.deleted.ranges=TRUE,
align.opts=list()) {
align.opts <- merge.lists(align.opts, default.align.opts)
aln <- list(forward=pairwiseAlignment(pattern=reads,
subject=subj,
type="overlap",
substitutionMatrix=nucleotideSubstitutionMatrix(match = align.opts$match, mismatch = -align.opts$mismatch),
gapOpening=-align.opts$gapOpening, gapExtension=-align.opts$gapExtension),
revcomp=pairwiseAlignment(pattern=reads,
subject=reverseComplement(DNAString(subj)),
type="overlap",
substitutionMatrix=nucleotideSubstitutionMatrix(match = align.opts$match, mismatch = -align.opts$mismatch),
gapOpening=-align.opts$gapOpening, gapExtension=-align.opts$gapExtension))
aln.scores <- Map(score, aln)
aln.pat <- Map(pattern, aln)
aln.ranges <- Map(function(x) IRanges(start=start(x), end=end(x)), aln.pat)
aln.threebands <- Map(function (x) threebands(IRanges(start=1, end=width(reads)),
start=start(x), end=end(x)),
aln.ranges)
## For each read, decide whether the forward or reverse alignment
## was better.
revcomp.better <- aln.scores$forward < aln.scores$revcomp
## For each read, take the threebands for the better alignment.
best.threebands <- aln.threebands$forward
for (band in names(best.threebands)) {
best.threebands[[band]][revcomp.better] <- aln.threebands$revcomp[[band]][revcomp.better]
}
## Use the left band if it is longer than either min.length or
## length of right band.
use.right.band <- width(best.threebands$left) < pmin(min.length, width(best.threebands$right))
ranges <- best.threebands$left
ranges[use.right.band] <- best.threebands$right[use.right.band]
## Record which ranges are shorter than min.length
too.short <- width(ranges) < min.length
## ranges[too.short] <- IRanges(start=1,end=0)
## Record what was trimmed off of each read (NOT what was kept!)
trim <- factor(ifelse(use.right.band, "left", "right"), levels=c("right", "left", "all", "none"))
## If it's too short, then we trim "all", i.e. discard the whole
## read.
trim[too.short] <- "all"
## If the read is not shorter after trimming, then nothing was
## actually trimmed.
trim[width(ranges) == width(reads)] <- "none"
emeta <- list()
emeta$trim <- trim
if (include.deleted.ranges) {
deleted.start <- ifelse(too.short, 1,
ifelse(use.right.band,
start(best.threebands$left),
start(best.threebands$middle)))
deleted.end <- ifelse(too.short, width(reads),
ifelse(use.right.band,
end(best.threebands$middle),
end(best.threebands$right)))
emeta$deleted.range <- IRanges(deleted.start, deleted.end)
}
if (include.scores) {
## If requested, take the best score out of each pair of forward
## and reverse scores.
scores <- ifelse(revcomp.better, aln.scores$revcomp, aln.scores$forward)
emeta$score <- scores
}
mcols(ranges) <- DataFrame(emeta)
return(ranges)
}
## Always call delox on the underlying DNAStringSet object when called
## on something more complicated.
suppressMessages({
invisible(setMethod(".delox.trimmed.ranges", signature=c(reads="ShortRead"),
function (subj, reads, min.length, include.scores, include.deleted.ranges, align.opts) {
callGeneric(subj, as(reads, "DNAStringSet"), min.length, include.scores, include.deleted.ranges, align.opts)
}))
invisible(setMethod(".delox.trimmed.ranges", signature=c(reads="QualityScaledDNAStringSet"),
function (subj, reads, min.length, include.scores, include.deleted.ranges, align.opts) {
callGeneric(subj, as(reads, "DNAStringSet"), min.length, include.scores, include.deleted.ranges, align.opts)
}))
invisible(setMethod(".delox.trimmed.ranges", signature=c(reads="QualityScaledXStringSet"),
function (subj, reads, min.length, include.scores, include.deleted.ranges, align.opts) {
callGeneric(subj, as(reads, "XStringSet"), min.length, include.scores, include.deleted.ranges, align.opts)
}))
})
delox.single <- function(subj, reads , min.length=36,
include.scores=TRUE, align.opts=list()) {
tsmsg("Saving read names")
saved.names <- BStringSet(names(reads))
reads <- strip.names(reads)
invisible(gc())
tsmsg("Doing alignments")
nchunks <- min(getDoParWorkers(), ceiling(length(reads)/1000))
deloxed.ranges <- foreach(reads=isplitVector(reads, chunks=nchunks), .combine=c) %dopar% {
.delox.trimmed.ranges(reads=reads, subj=subj, min.length=min.length,
include.scores=include.scores,
include.deleted.ranges=FALSE,
align.opts=align.opts)
}
## maybe.chunkapply(.delox.trimmed.ranges,
## VECTOR.ARGS=list(reads=reads),
## SCALAR.ARGS=list(subj=subj, min.length=min.length,
## include.scores=include.scores,
## include.deleted.ranges=FALSE,
## align.opts=align.opts),
## min.chunk.size=1000,
## MERGE=c)
tsmsg("Trimming reads")
trimmed.reads <- narrow(reads, start(deloxed.ranges), end(deloxed.ranges))
tsmsg("Restoring read names")
names(trimmed.reads) <- as.character(saved.names)
tsmsg("Adding metadata")
emeta <- list()
if (include.scores) {
emeta$score <- mcols(deloxed.ranges)$score
}
if (length(emeta) > 0) {
mcols(trimmed.reads) <- DataFrame(emeta)
}
return(discard.short.reads(trimmed.reads, min.length))
}
delox.paired <- function(subj, read1, read2,
min.call=10, min.length=36,
include.scores=TRUE, align.opts=list()) {
align.opts <- merge.lists(align.opts, default.align.opts)
tsmsg("Checking read counts")
stopifnot(length(read1) == length(read2))
tsmsg("Listing reads")
original.reads <- list(read1=read1,
read2=read2)
rm(read1, read2)
tsmsg("Saving read names")
read.names <- foreach(r=original.reads) %do% BStringSet(names(r))
names(read.names) <- names(original.reads)
original.reads <- Map(strip.names, original.reads)
invisible(gc())
tsmsg("Doing alignments")
deloxed.ranges <- lapply(original.reads, function(x) {
nchunks <- min(getDoParWorkers(), ceiling(length(x)/1000))
foreach(reads=isplitVector(x, chunks=nchunks), .combine=c) %dopar% {
.delox.trimmed.ranges(reads=reads, subj=subj, min.length=min.length,
include.scores=include.scores,
include.deleted.ranges=FALSE,
align.opts=align.opts)
}
## maybe.chunkapply(.delox.trimmed.ranges,
## VECTOR.ARGS=list(reads=strip.names(x)),
## SCALAR.ARGS=list(subj=subj,
## min.length=min.length,
## include.scores=TRUE,
## include.deleted.ranges=TRUE,
## align.opts=align.opts),
## MERGE=c,
## min.chunk.size=1000)
})
tsmsg("Extracting metadata")
delox.meta <- lapply(deloxed.ranges, mcols)
## Decide whether enough was trimmed on the inside (right end) of
## either read to call it a mate-pair.
tsmsg("Calculating inside trim score")
inside.trim.score <- Reduce(pmax,
lapply(delox.meta,
function(x) ifelse(x$trim == "right", x$score, 0)))
## Decide whether enough was trimmed on the outside (left end) of
## either read to call it a non-mate-pair.
tsmsg("Calculating outside trim score")
outside.trim.score <- Reduce(pmax,
lapply(delox.meta,
function(x) ifelse(x$trim == "left", x$score, 0)))
tsmsg("Calling presence of subject")
calls <- list(inside=inside.trim.score >= min.call * align.opts$match,
outside=outside.trim.score >= min.call * align.opts$match)
tsmsg("Categorizing reads")
category <- factor(rep(NA, length(original.reads$read1)), levels=c("mate", "non-mate", "negative", "unpaired", "discard"))
category[calls$inside] <- "mate"
category[calls$outside] <- "non-mate"
## If they're either both true or both false, then it's ambiguous
category[calls$inside == calls$outside] <- "negative"
## All categories should be filled in now
stopifnot(all(!is.na(category)))
too.short <- lapply(deloxed.ranges, function(x) width(x) < min.length)
## If either read in a pair is too short, then its partner is no
## longer paired at all.
one.too.short <- Reduce(`|`, too.short)
category[one.too.short] <- "unpaired"
## If both reads in a pair are too short, then the entire pair is
## discarded. This is highly unlikely, since Cre-Lox should not
## appear in the middle of both sequences.
both.too.short <- Reduce(`&`, too.short)
category[both.too.short] <- "discard"
tsmsg("Trimming reads and restoring read names")
trimmed.reads <- lapply(names(original.reads), function(x) {
trimmed <- narrow(original.reads[[x]],
start=start(deloxed.ranges[[x]]),
end=end(deloxed.ranges[[x]]))
names(trimmed) <- as.character(read.names[[x]])
trimmed
})
names(trimmed.reads) <- names(original.reads)
tsmsg("Assembling metadata")
foreach (r=names(trimmed.reads)) %do% {
emeta <- list()
emeta$category <- category
emeta$category[too.short[[r]]] <- "discard"
if (include.scores) {
emeta$score <- delox.meta[[r]]$score
}
mcols(trimmed.reads[[r]]) <- DataFrame(emeta)
}
return(trimmed.reads)
}
## Wrapper for both single and paired as appropriate
delox <- function(subj, read1, read2=NULL,
min.call=10, min.length=36,
interleaved=FALSE,
read1.orientation=c("in", "out")[1],
read2.orientation=c("in", "out")[1],
align.opts=list()) {
if (is.null(read2)) {
if (interleaved) {
x <- deinterleave.pairs(read1)
read1 <- x$read1
read2 <- x$read2
} else {
tsmsg("Doing single-read delox")
return(delox.single(subj=subj, reads=read1, min.length=min.length, align.opts=align.opts))
}
}
## Make sure both reads are oriented "in" before calling
tsmsg("Ensuring correct read orientation")
if (tolower(read1.orientation) == "out") {
read1 <- reverseComplement(read1)
}
if (!is.null(read2) && tolower(read2.orientation) == "out") {
read2 <- reverseComplement(read2)
}
tsmsg("Doing paired-end delox")
deloxed.reads <- delox.paired(subj, read1, read2,
min.call=min.call, min.length=min.length,
align.opts=align.opts)
## If reads started "out", put them back that way before returning
tsmsg("Restoring original read orientation")
if (tolower(read1.orientation) == "out") {
deloxed.reads$read1 <- reverseComplement(deloxed.reads$read1)
}
if (tolower(read2.orientation) == "out") {
deloxed.reads$read2 <- reverseComplement(deloxed.reads$read2)
}
return(deloxed.reads)
}
## ## Hack to work around a bug in BioConductor that prevents subsetting
## ## of named XStringSet objects. Apparently, since DeLoxer was first
## ## published, the BioConductor devs broke the XStringSet subsetting
## ## code so that it can no longer handle XStringSets with names. The
## ## code below strips the names from the XStringSet, then calls the old
## ## code to subset the nameless object while subsetting the names
## ## separately, then finally puts the names back on and returns the
## ## result.
## old.XStringSet.subset.method <- selectMethod("[", "XStringSet")
## invisible(setMethod("[", signature="XStringSet", definition=function(x, i, j, ..., drop=TRUE) {
## ## Save the names into a seaprate variable
## xnames <- names(x)
## ## Do the old behavior, which works on unnamed objects
## x <- old.XStringSet.subset.method(unname(x), i, j, ..., drop=drop)
## ## Put the names back on and return
## setNames(x, xnames[i])
## }))
save.deloxed.pairs.as.fastq <- function(read1, read2, output.base,
mate.ext="matepaired",
nonmate.ext="paired",
negative.ext="negative",
unpaired.ext="unpaired",
append=FALSE) {
extension <- c(mate=mate.ext,
`non-mate`=nonmate.ext,
negative=negative.ext,
unpaired=unpaired.ext)
## ## Make sure that read1 and read2 are a match for each other
## stopifnot(identical(as.character(mcols(read1)$category),
## as.character(mcols(read2)$category)))
## ## Discard the shorter read on "unpaired"
## read1.shorter <- width(read1) < width(read2)
## mcols(read1)$category[mcols(read1)$category == "unpaired" & read1.shorter] <- NA
## mcols(read2)$category[mcols(read2)$category == "unpaired" & !read1.shorter] <- NA
filename.template <- "%s_read%s.%s.fastq"
for (cat in names(extension)) {
read1.for.category <- read1[mcols(read1)$category == cat]
read1.file.for.category <- sprintf(filename.template, output.base, 1, extension[[cat]])
tsmsg("Writing ", read1.file.for.category)
write.QualityScaledDNAStringSet(read1.for.category,
file=read1.file.for.category,
append=append)
read2.for.category <- read2[mcols(read2)$category == cat]
read2.file.for.category <- sprintf(filename.template, output.base, 2, extension[[cat]])
tsmsg("Writing ", read2.file.for.category)
write.QualityScaledDNAStringSet(read2.for.category,
file=read2.file.for.category,
append=append)
}
return(TRUE)
}
get.category.counts <- function(deloxed.pairs) {
r1cat <- mcols(deloxed.pairs$read1)$category
r2cat <- mcols(deloxed.pairs$read2)$category
x <- table(r1cat)[c("mate", "non-mate", "negative")]
x["r1.single"] <- sum(r1cat == "unpaired")
x["r2.single"] <- sum(r2cat == "unpaired")
x["discard"] <- length(r1cat) - sum(x)
x
}
mcparallel.quiet <- function(expr, ...) {
parallel:::mcparallel(suppressMessages(expr), ...)
}
print.stats <- function(category.counts) {
category.pct <- setNames(sprintf("%.3g%%", category.counts / sum(category.counts) * 100),
names(category.counts))
x <- rbind(Counts=category.counts, Fractions=category.pct)
names(dimnames(x)) <- c("Stat", "Category")
print(x, quote=FALSE, justify="right")
}
main <- function() {
opt <- parse_arguments()
print.option.list(opt)
args <- opt$args
opts <- opt$options
if (!(tolower(opts[["read1-orientation"]]) %in% c("in", "out") &&
tolower(opts[["read2-orientation"]]) %in% c("in", "out") )) {
stop("Valid orientations are \"in\" and \"out\"")
}
align.opts <- list(match = opts[["match-bonus"]],
mismatch = opts[["mismatch-penalty"]],
gapOpening = opts[["gap-open-penalty"]],
gapExtension = opts[["gap-extension-penalty"]])
stopifnot(opts$`min-call` >= 1 &&
opts$`min-length` >= 0 &&
opts$`jobs` >= 0)
## Set jobs if requested
if (opts$jobs > 0) {
options(cores=opts$jobs)
}
tsmsg("Using ", getDoParWorkers(), " cores.")
paired <- !opts[["single-read-mode"]]
interleaved <- opts[["interleaved"]]
if (!paired && interleaved) {
stop("ERROR: You cannot specify both --interleaved and --single-read-mode")
} else if (!paired) {
if (length(args) != 3) {
stop("DeLoxer in single-read mode requires exactly 3 arguments")
}
subject.file <- args[[1]]
read1.file <- args[[2]]
read2.file <- NULL
output.file <- args[[3]]
} else if (interleaved) {
if (length(args) != 3) {
stop("DeLoxer interleaved input mode requires exactly 3 arguments")
}
subject.file <- args[[1]]
read1.file <- args[[2]]
read2.file <- NULL
output.basename <- args[[3]]
} else {
if (length(args) != 4) {
stop("DeLoxer requires exactly 4 arguments")
}
subject.file <- args[[1]]
read1.file <- args[[2]]
read2.file <- args[[3]]
output.basename <- args[[4]]
}
subj <- readDNAStringSet(subject.file, format="fasta", nrec=1)[[1]]
yieldSize <- opts[["yield-size"]]
if (paired) {
tsmsg("Deloxing and classifying paired sequences")
read1.stream <- FastqStreamer(read1.file, n=yieldSize)
read2.stream <- if (!interleaved) FastqStreamer(read2.file, n=yieldSize)
process.chunk <- function(fq1, fq2, append) {
if (length(fq1) < 1)
return(TRUE)
if (interleaved) {
stopifnot(is.null(fq2))
deint <- deinterleave.pairs(fq1)
fq1 <- deint[[1]]
fq2 <- deint[[2]]
} else {
if (length(fq1) != length(fq2))
stop("Both input files must have equal numbers of reads")
}
read1 <- as(fq1, "QualityScaledDNAStringSet")
read2 <- as(fq2, "QualityScaledDNAStringSet")
deloxed.pairs <-
delox(subj, read1, read2,
min.call=opts[["min-call"]],
interleaved=interleaved,
read1.orientation=opts[["read1-orientation"]],
read2.orientation=opts[["read2-orientation"]],
align.opts=align.opts)
save.deloxed.pairs.as.fastq(deloxed.pairs$read1, deloxed.pairs$read2, output.basename, append=append)
ret <- get.category.counts(deloxed.pairs)
return(ret)
}
fq1 <- yield(read1.stream)
fq2 <- if (!interleaved) yield(read2.stream)
if (length(fq1) == 0)
warning("No reads were read from the input file.")
proc <- mcparallel.quiet(process.chunk(fq1, fq2, append=FALSE))
reads.processed <- length(fq1) / ifelse(interleaved, 2, 1)
category.stats <-
category.counts <- NULL
while (length(fq1 <- yield(read1.stream))) {
if (!interleaved)
fq2 <- yield(read2.stream)
prev.result <- mccollect(proc)[[1]]
if (is(prev.result, "try-error")) {
tsmsg("Encountered error in deloxing subprocess:")
stop(attr(prev.result, "condition"))
}
if (is.null(category.counts)) {
category.counts <- prev.result
} else {
category.counts <- category.counts + prev.result
}
tsmsg("Category stats after processing ", reads.processed, " reads:")
## category.pct <- setNames(sprintf("%.3g%%", category.counts / sum(category.counts) * 100),
## names(category.counts))
print.stats(category.counts)
proc <- mcparallel.quiet(process.chunk(fq1, fq2, append=TRUE))
reads.processed <- reads.processed + length(fq1) / ifelse(interleaved, 2, 1)
}
close(read1.stream)
if (!interleaved) close(read2.stream)
prev.result <- mccollect(proc)[[1]]
if (is.null(category.counts)) {
category.counts <- prev.result
} else {
category.counts <- category.counts + prev.result
}
if (is(prev.result, "try-error")) {
tsmsg("Encountered error in deloxing subprocess:")
stop(attr(prev.result, "condition"))
stop("Encountered error in deloxing")
}
tsmsg("Final category stats after processing ", reads.processed, " reads:")
print.stats(category.counts)
} else {
tsmsg("Deloxing single sequences")
read1.stream <- FastqStreamer(read1.file, n=yieldSize)
process.chunk <- function(fq, append) {
if (length(fq) < 1)
return(TRUE)
reads <- as(fq, "QualityScaledDNAStringSet")
deloxed.reads <-
delox(subj, reads, NULL,
min.call=opts[["min-call"]],
interleaved=interleaved,
read1.orientation=opts[["read1-orientation"]],
read2.orientation=opts[["read2-orientation"]],
align.opts=align.opts)
write.QualityScaledDNAStringSet(deloxed.reads, output.file, append=append)
return(TRUE)
}
## First chunk is processed with append=FALSE to start the file
fq <- yield(read1.stream)
if (length(fq) == 0)
warning("No reads were read from the input file.")
proc <- mcparallel.quiet(suppressMessages(process.chunk(fq, append=FALSE)))
reads.processed <- length(fq)
while (length(fq <- yield(read1.stream))) {
prev.result <- mccollect(proc)[[1]]
if (is(prev.result, "try-error")) {
tsmsg("Encountered error in deloxing subprocess:")
stop(attr(prev.result, "condition"))
stop("Encountered error in deloxing")
}
tsmsg("Processed ", reads.processed, " reads")
proc <- mcparallel.quiet(suppressMessages(process.chunk(fq, append=TRUE)))
reads.processed <- reads.processed + length(fq)
}
close(read1.stream)
prev.result <- mccollect(proc)[[1]]
if (is(prev.result, "try-error")) {
tsmsg("Encountered error in deloxing subprocess:")
stop(attr(prev.result, "condition"))
stop("Encountered error in deloxing")
}
tsmsg("Processed ", reads.processed, " reads")
}
tsmsg("Finished successful run")
}
main()