123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725 |
- #!/usr/bin/env Rscript
- default.align.opts <- list(match=1, mismatch=3,
- gapOpening=5, gapExtension=2)
- parse_arguments <- function() {
- suppressMessages({
- library(optparse)
- library(parallel)
- })
- option_list <-
- list(make_option(c("-c", "--min-call"), type="integer", default=10, metavar="10",
- help="Minimum perfect overlap required to call the presence of the subject (paired only). Imperfect overlap will need to be longer, based on specified mismatch and gap penalties."),
- make_option(c("-l", "--min-length"), type="integer", default=36, metavar="36",
- help="Minimum length allowed after trimming a read. Any reads shorter than this after trimming will be discarded."),
- make_option(c("-i", "--interleaved"), action="store_true", default=FALSE,
- help="Specify this option if you have paired-end sequences interleaved in a single FASTQ file. The default is to read paired-end sequences from a matched pair of files, and this option is ignored if two fastq files are provided. When you use this option, skip the \"READ2.fastq\" argument."),
- make_option(c("-o", "--read1-orientation"), type="character", default="in", metavar="in/out",
- help="Orientation of read1. Can be either \"in\" or \"out\" (paired only). Note that Illumina reads are \"in\"."),
- make_option(c("-q", "--read2-orientation"), type="character", default="in", metavar="in/out",
- help="Orientation of read2. Can be either \"in\" or \"out\" (paired only)"),
- make_option(c("-j", "--jobs"), type="integer",
- default=parallel:::detectCores(),
- metavar=as.character(parallel:::detectCores()),
- help="Number of jobs to run in parallel for alignment. This should be autodetected by default."),
- make_option(c("-y", "--yield-size"), type="integer",
- default=100000,
- metavar="100000",
- help="Number of reads to process at a time. Setting this higher will read more data into memory at once and result in faster runtime. Setting this lower will require less memory."),
- make_option(c("-m", "--match-bonus"), type="double",
- default=default.align.opts$match,
- metavar=as.character(default.align.opts$match),
- help="Score bonus for a matching nucleotide"),
- make_option(c("-p", "--mismatch-penalty"), type="double",
- default=default.align.opts$mismatch,
- metavar=as.character(default.align.opts$mismatch),
- help="Score penalty for a mismatched nucleotide (specify as a positive number)"),
- make_option(c("-g", "--gap-open-penalty"), type="double",
- default=default.align.opts$gapOpening,
- metavar=as.character(default.align.opts$gapOpening),
- help="Score penalty for opening a gap in the alignment (specifiy as a positive number)"),
- make_option(c("-e", "--gap-extension-penalty"), type="double",
- default=default.align.opts$match,
- metavar=as.character(default.align.opts$gapExtension),
- help="Score penalty for extending an alignment gap by two nucleotides (specify as a positive number)"),
- make_option(c("-s", "--single-read-mode"), action="store_true", default=FALSE,
- help="Tell DeLoxer to run in single-end mode instead of paired-end mode. In this mode, the only a single input fastq file is provided, and only a single output file is created. No classification is performed, only trimming. When you use this option, skip the \"READ2.fastq\" argument, and specify the full file name for OUTPUT_NAME instead of just the base name."))
- option_parser <- OptionParser(option_list=option_list,
- usage="%prog [options] adapter.fasta READ1.fastq READ2.fastq OUTPUT_NAME")
- opt <- parse_args(option_parser, positional_arguments=TRUE)
- return(opt)
- }
- ## Call this here to handle --help quickly, before we waste 10 seconds
- ## loading all the libraries
- invisible(parse_arguments())
- print.option.list <- function(opt=parse_arguments()) {
- args <- opt$args
- opts <- opt$options
- message("Options:")
- foreach (o=opts, n=names(opts)) %do% {
- if (n != "help")
- message(" ", n, ": ", o)
- }
- message("Args: ", paste("\"", args, "\"", sep="", collapse=", "))
- }
- unimplemented <- function() stop("UNIMPLEMENTED")
- ## Timestampped message
- tsmsg <- function(...) {
- message("# ", date(), ": ", ...)
- }
- tsmsg("Starting deloxer and loading required packages")
- suppressMessages({
- library(ShortRead)
- library(optparse)
- library(foreach)
- library(iterators)
- library(itertools)
- library(doMC)
- registerDoMC()
- mcoptions <- list(preschedule=TRUE, set.seed=FALSE)
- })
- ## Merge l1 and l2 by names
- merge.lists <- function(l1, l2) {
- new.names <- setdiff(names(l2), names(l1))
- l1[new.names] <- l2[new.names]
- l1
- }
- ## Return an object sans names
- strip.names <- function(x) {
- names(x) <- NULL
- x
- }
- ## Define some missing type coercions
- setAs(from="ShortRead", to="DNAStringSet", def=function(from) sread(from))
- setAs(from="PhredQuality", to="FastqQuality", def=function(from) FastqQuality(BStringSet(from)))
- setAs(from="SolexaQuality", to="SFastqQuality", def=function(from) SFastqQuality(BStringSet(from)))
- setAs(from="QualityScaledXStringSet", to="ShortReadQ", def=function(from) {
- q <- quality(from)
- new.quality.class <- switch(class(q),
- SolexaQuality="SFastqQuality",
- PhredQuality="FastqQuality",
- stop("Unknown quality type: ", class(q)))
- q <- as(q, new.quality.class)
- ShortReadQ(sread=as(from, "DNAStringSet"),
- quality=q,
- id=BStringSet(names(from)))
- })
- ## Override the provided method to keep the sequence names
- setAs(from="ShortReadQ", to="QualityScaledDNAStringSet",
- def=function (from, to = "QualityScaledDNAStringSet", strict = TRUE) {
- q <- quality(from)
- new.quality.class <- switch(class(q),
- SFastqQuality="SolexaQuality",
- FastqQuality="PhredQuality",
- "XStringQuality")
- q <- as(q, new.quality.class)
- x <- QualityScaledDNAStringSet(sread(from), q)
- names(x) <- as.character(id(from))
- x
- })
- ## Define functions for reading fastq into standard Biostrings object
- ## and writing it back out. The standard functions readFastq and
- ## writeFastq operate on ShortRead objects. These simply wrap them in
- ## conversion to/from QualityScaledDNAStringSet.
- read.QualityScaledDNAStringSet <- function(filepath, format = "fastq", ...) {
- switch(format,
- fastq=as(readFastq(filepath, withIds=TRUE, ...), "QualityScaledDNAStringSet"),
- ## Default
- stop("Unknown quality-scaled sequence format: ", format))
- }
- write.QualityScaledDNAStringSet <- function (x, filepath, append = FALSE, format = "fastq") {
- if(length(x) > 0) {
- sr <- as(x, "ShortReadQ")
- switch(format,
- fastq={
- if (!append)
- unlink(filepath);
- writeFastq(object=sr,
- file=filepath, mode=ifelse(append, "a", "w"))
- },
- ## Default
- stop("Unknown quality-scaled sequence format: ", format))
- } else {
- ## Zero-length sequence; just truncate/touch the file
- sink(file=filepath, append=append)
- sink()
- }
- }
- discard.short.reads <- function(reads, min.length=1) {
- kept.reads <- reads[width(reads) >= min.length]
- return(kept.reads)
- }
- ## Takes a set of interleaved reads (or anything else) and
- ## de-interleaves them
- deinterleave.pairs <- function(reads) {
- stopifnot(length(reads) %% 2 == 0)
- mask <- seq(from=1, to=length(reads), by=2)
- return(list(read1=reads[mask], read2=reads[-mask]))
- }
- .delox.trimmed.ranges <- function(subj, reads, min.length=36,
- include.scores=TRUE,
- include.deleted.ranges=TRUE,
- align.opts=list()) {
- align.opts <- merge.lists(align.opts, default.align.opts)
- aln <- list(forward=pairwiseAlignment(pattern=reads,
- subject=subj,
- type="overlap",
- substitutionMatrix=nucleotideSubstitutionMatrix(match = align.opts$match, mismatch = -align.opts$mismatch),
- gapOpening=-align.opts$gapOpening, gapExtension=-align.opts$gapExtension),
- revcomp=pairwiseAlignment(pattern=reads,
- subject=reverseComplement(DNAString(subj)),
- type="overlap",
- substitutionMatrix=nucleotideSubstitutionMatrix(match = align.opts$match, mismatch = -align.opts$mismatch),
- gapOpening=-align.opts$gapOpening, gapExtension=-align.opts$gapExtension))
- aln.scores <- Map(score, aln)
- aln.pat <- Map(pattern, aln)
- aln.ranges <- Map(function(x) IRanges(start=start(x), end=end(x)), aln.pat)
- aln.threebands <- Map(function (x) threebands(IRanges(start=1, end=width(reads)),
- start=start(x), end=end(x)),
- aln.ranges)
- ## For each read, decide whether the forward or reverse alignment
- ## was better.
- revcomp.better <- aln.scores$forward < aln.scores$revcomp
- ## For each read, take the threebands for the better alignment.
- best.threebands <- aln.threebands$forward
- for (band in names(best.threebands)) {
- best.threebands[[band]][revcomp.better] <- aln.threebands$revcomp[[band]][revcomp.better]
- }
- ## Use the left band if it is longer than either min.length or
- ## length of right band.
- use.right.band <- width(best.threebands$left) < pmin(min.length, width(best.threebands$right))
- ranges <- best.threebands$left
- ranges[use.right.band] <- best.threebands$right[use.right.band]
- ## Record which ranges are shorter than min.length
- too.short <- width(ranges) < min.length
- ## ranges[too.short] <- IRanges(start=1,end=0)
- ## Record what was trimmed off of each read (NOT what was kept!)
- trim <- factor(ifelse(use.right.band, "left", "right"), levels=c("right", "left", "all", "none"))
- ## If it's too short, then we trim "all", i.e. discard the whole
- ## read.
- trim[too.short] <- "all"
- ## If the read is not shorter after trimming, then nothing was
- ## actually trimmed.
- trim[width(ranges) == width(reads)] <- "none"
- emeta <- list()
- emeta$trim <- trim
- if (include.deleted.ranges) {
- deleted.start <- ifelse(too.short, 1,
- ifelse(use.right.band,
- start(best.threebands$left),
- start(best.threebands$middle)))
- deleted.end <- ifelse(too.short, width(reads),
- ifelse(use.right.band,
- end(best.threebands$middle),
- end(best.threebands$right)))
- emeta$deleted.range <- IRanges(deleted.start, deleted.end)
- }
- if (include.scores) {
- ## If requested, take the best score out of each pair of forward
- ## and reverse scores.
- scores <- ifelse(revcomp.better, aln.scores$revcomp, aln.scores$forward)
- emeta$score <- scores
- }
- mcols(ranges) <- DataFrame(emeta)
- return(ranges)
- }
- ## Always call delox on the underlying DNAStringSet object when called
- ## on something more complicated.
- suppressMessages({
- invisible(setMethod(".delox.trimmed.ranges", signature=c(reads="ShortRead"),
- function (subj, reads, min.length, include.scores, include.deleted.ranges, align.opts) {
- callGeneric(subj, as(reads, "DNAStringSet"), min.length, include.scores, include.deleted.ranges, align.opts)
- }))
- invisible(setMethod(".delox.trimmed.ranges", signature=c(reads="QualityScaledDNAStringSet"),
- function (subj, reads, min.length, include.scores, include.deleted.ranges, align.opts) {
- callGeneric(subj, as(reads, "DNAStringSet"), min.length, include.scores, include.deleted.ranges, align.opts)
- }))
- invisible(setMethod(".delox.trimmed.ranges", signature=c(reads="QualityScaledXStringSet"),
- function (subj, reads, min.length, include.scores, include.deleted.ranges, align.opts) {
- callGeneric(subj, as(reads, "XStringSet"), min.length, include.scores, include.deleted.ranges, align.opts)
- }))
- })
- delox.single <- function(subj, reads , min.length=36,
- include.scores=TRUE, align.opts=list()) {
- tsmsg("Saving read names")
- saved.names <- BStringSet(names(reads))
- reads <- strip.names(reads)
- invisible(gc())
- tsmsg("Doing alignments")
- nchunks <- min(getDoParWorkers(), ceiling(length(reads)/1000))
- deloxed.ranges <- foreach(reads=isplitVector(reads, chunks=nchunks), .combine=c) %dopar% {
- .delox.trimmed.ranges(reads=reads, subj=subj, min.length=min.length,
- include.scores=include.scores,
- include.deleted.ranges=FALSE,
- align.opts=align.opts)
- }
- ## maybe.chunkapply(.delox.trimmed.ranges,
- ## VECTOR.ARGS=list(reads=reads),
- ## SCALAR.ARGS=list(subj=subj, min.length=min.length,
- ## include.scores=include.scores,
- ## include.deleted.ranges=FALSE,
- ## align.opts=align.opts),
- ## min.chunk.size=1000,
- ## MERGE=c)
- tsmsg("Trimming reads")
- trimmed.reads <- narrow(reads, start(deloxed.ranges), end(deloxed.ranges))
- tsmsg("Restoring read names")
- names(trimmed.reads) <- as.character(saved.names)
- tsmsg("Adding metadata")
- emeta <- list()
- if (include.scores) {
- emeta$score <- mcols(deloxed.ranges)$score
- }
- if (length(emeta) > 0) {
- mcols(trimmed.reads) <- DataFrame(emeta)
- }
- return(discard.short.reads(trimmed.reads, min.length))
- }
- delox.paired <- function(subj, read1, read2,
- min.call=10, min.length=36,
- include.scores=TRUE, align.opts=list()) {
- align.opts <- merge.lists(align.opts, default.align.opts)
- tsmsg("Checking read counts")
- stopifnot(length(read1) == length(read2))
- tsmsg("Listing reads")
- original.reads <- list(read1=read1,
- read2=read2)
- rm(read1, read2)
- tsmsg("Saving read names")
- read.names <- foreach(r=original.reads) %do% BStringSet(names(r))
- names(read.names) <- names(original.reads)
- original.reads <- Map(strip.names, original.reads)
- invisible(gc())
- tsmsg("Doing alignments")
- deloxed.ranges <- lapply(original.reads, function(x) {
- nchunks <- min(getDoParWorkers(), ceiling(length(x)/1000))
- foreach(reads=isplitVector(x, chunks=nchunks), .combine=c) %dopar% {
- .delox.trimmed.ranges(reads=reads, subj=subj, min.length=min.length,
- include.scores=include.scores,
- include.deleted.ranges=FALSE,
- align.opts=align.opts)
- }
- ## maybe.chunkapply(.delox.trimmed.ranges,
- ## VECTOR.ARGS=list(reads=strip.names(x)),
- ## SCALAR.ARGS=list(subj=subj,
- ## min.length=min.length,
- ## include.scores=TRUE,
- ## include.deleted.ranges=TRUE,
- ## align.opts=align.opts),
- ## MERGE=c,
- ## min.chunk.size=1000)
- })
- tsmsg("Extracting metadata")
- delox.meta <- lapply(deloxed.ranges, mcols)
- ## Decide whether enough was trimmed on the inside (right end) of
- ## either read to call it a mate-pair.
- tsmsg("Calculating inside trim score")
- inside.trim.score <- Reduce(pmax,
- lapply(delox.meta,
- function(x) ifelse(x$trim == "right", x$score, 0)))
- ## Decide whether enough was trimmed on the outside (left end) of
- ## either read to call it a non-mate-pair.
- tsmsg("Calculating outside trim score")
- outside.trim.score <- Reduce(pmax,
- lapply(delox.meta,
- function(x) ifelse(x$trim == "left", x$score, 0)))
- tsmsg("Calling presence of subject")
- calls <- list(inside=inside.trim.score >= min.call * align.opts$match,
- outside=outside.trim.score >= min.call * align.opts$match)
- tsmsg("Categorizing reads")
- category <- factor(rep(NA, length(original.reads$read1)), levels=c("mate", "non-mate", "negative", "unpaired", "discard"))
- category[calls$inside] <- "mate"
- category[calls$outside] <- "non-mate"
- ## If they're either both true or both false, then it's ambiguous
- category[calls$inside == calls$outside] <- "negative"
- ## All categories should be filled in now
- stopifnot(all(!is.na(category)))
- too.short <- lapply(deloxed.ranges, function(x) width(x) < min.length)
- ## If either read in a pair is too short, then its partner is no
- ## longer paired at all.
- one.too.short <- Reduce(`|`, too.short)
- category[one.too.short] <- "unpaired"
- ## If both reads in a pair are too short, then the entire pair is
- ## discarded. This is highly unlikely, since Cre-Lox should not
- ## appear in the middle of both sequences.
- both.too.short <- Reduce(`&`, too.short)
- category[both.too.short] <- "discard"
- tsmsg("Trimming reads and restoring read names")
- trimmed.reads <- lapply(names(original.reads), function(x) {
- trimmed <- narrow(original.reads[[x]],
- start=start(deloxed.ranges[[x]]),
- end=end(deloxed.ranges[[x]]))
- names(trimmed) <- as.character(read.names[[x]])
- trimmed
- })
- names(trimmed.reads) <- names(original.reads)
- tsmsg("Assembling metadata")
- foreach (r=names(trimmed.reads)) %do% {
- emeta <- list()
- emeta$category <- category
- emeta$category[too.short[[r]]] <- "discard"
- if (include.scores) {
- emeta$score <- delox.meta[[r]]$score
- }
- mcols(trimmed.reads[[r]]) <- DataFrame(emeta)
- }
- return(trimmed.reads)
- }
- ## Wrapper for both single and paired as appropriate
- delox <- function(subj, read1, read2=NULL,
- min.call=10, min.length=36,
- interleaved=FALSE,
- read1.orientation=c("in", "out")[1],
- read2.orientation=c("in", "out")[1],
- align.opts=list()) {
- if (is.null(read2)) {
- if (interleaved) {
- x <- deinterleave.pairs(read1)
- read1 <- x$read1
- read2 <- x$read2
- } else {
- tsmsg("Doing single-read delox")
- return(delox.single(subj=subj, reads=read1, min.length=min.length, align.opts=align.opts))
- }
- }
- ## Make sure both reads are oriented "in" before calling
- tsmsg("Ensuring correct read orientation")
- if (tolower(read1.orientation) == "out") {
- read1 <- reverseComplement(read1)
- }
- if (!is.null(read2) && tolower(read2.orientation) == "out") {
- read2 <- reverseComplement(read2)
- }
- tsmsg("Doing paired-end delox")
- deloxed.reads <- delox.paired(subj, read1, read2,
- min.call=min.call, min.length=min.length,
- align.opts=align.opts)
- ## If reads started "out", put them back that way before returning
- tsmsg("Restoring original read orientation")
- if (tolower(read1.orientation) == "out") {
- deloxed.reads$read1 <- reverseComplement(deloxed.reads$read1)
- }
- if (tolower(read2.orientation) == "out") {
- deloxed.reads$read2 <- reverseComplement(deloxed.reads$read2)
- }
- return(deloxed.reads)
- }
- ## ## Hack to work around a bug in BioConductor that prevents subsetting
- ## ## of named XStringSet objects. Apparently, since DeLoxer was first
- ## ## published, the BioConductor devs broke the XStringSet subsetting
- ## ## code so that it can no longer handle XStringSets with names. The
- ## ## code below strips the names from the XStringSet, then calls the old
- ## ## code to subset the nameless object while subsetting the names
- ## ## separately, then finally puts the names back on and returns the
- ## ## result.
- ## old.XStringSet.subset.method <- selectMethod("[", "XStringSet")
- ## invisible(setMethod("[", signature="XStringSet", definition=function(x, i, j, ..., drop=TRUE) {
- ## ## Save the names into a seaprate variable
- ## xnames <- names(x)
- ## ## Do the old behavior, which works on unnamed objects
- ## x <- old.XStringSet.subset.method(unname(x), i, j, ..., drop=drop)
- ## ## Put the names back on and return
- ## setNames(x, xnames[i])
- ## }))
- save.deloxed.pairs.as.fastq <- function(read1, read2, output.base,
- mate.ext="matepaired",
- nonmate.ext="paired",
- negative.ext="negative",
- unpaired.ext="unpaired",
- append=FALSE) {
- extension <- c(mate=mate.ext,
- `non-mate`=nonmate.ext,
- negative=negative.ext,
- unpaired=unpaired.ext)
- ## ## Make sure that read1 and read2 are a match for each other
- ## stopifnot(identical(as.character(mcols(read1)$category),
- ## as.character(mcols(read2)$category)))
- ## ## Discard the shorter read on "unpaired"
- ## read1.shorter <- width(read1) < width(read2)
- ## mcols(read1)$category[mcols(read1)$category == "unpaired" & read1.shorter] <- NA
- ## mcols(read2)$category[mcols(read2)$category == "unpaired" & !read1.shorter] <- NA
- filename.template <- "%s_read%s.%s.fastq"
- for (cat in names(extension)) {
- read1.for.category <- read1[mcols(read1)$category == cat]
- read1.file.for.category <- sprintf(filename.template, output.base, 1, extension[[cat]])
- tsmsg("Writing ", read1.file.for.category)
- write.QualityScaledDNAStringSet(read1.for.category,
- file=read1.file.for.category,
- append=append)
- read2.for.category <- read2[mcols(read2)$category == cat]
- read2.file.for.category <- sprintf(filename.template, output.base, 2, extension[[cat]])
- tsmsg("Writing ", read2.file.for.category)
- write.QualityScaledDNAStringSet(read2.for.category,
- file=read2.file.for.category,
- append=append)
- }
- return(TRUE)
- }
- get.category.counts <- function(deloxed.pairs) {
- r1cat <- mcols(deloxed.pairs$read1)$category
- r2cat <- mcols(deloxed.pairs$read2)$category
- x <- table(r1cat)[c("mate", "non-mate", "negative")]
- x["r1.single"] <- sum(r1cat == "unpaired")
- x["r2.single"] <- sum(r2cat == "unpaired")
- x["discard"] <- length(r1cat) - sum(x)
- x
- }
- mcparallel.quiet <- function(expr, ...) {
- parallel:::mcparallel(suppressMessages(expr), ...)
- }
- print.stats <- function(category.counts) {
- category.pct <- setNames(sprintf("%.3g%%", category.counts / sum(category.counts) * 100),
- names(category.counts))
- x <- rbind(Counts=category.counts, Fractions=category.pct)
- names(dimnames(x)) <- c("Stat", "Category")
- print(x, quote=FALSE, justify="right")
- }
- main <- function() {
- opt <- parse_arguments()
- print.option.list(opt)
- args <- opt$args
- opts <- opt$options
- if (!(tolower(opts[["read1-orientation"]]) %in% c("in", "out") &&
- tolower(opts[["read2-orientation"]]) %in% c("in", "out") )) {
- stop("Valid orientations are \"in\" and \"out\"")
- }
- align.opts <- list(match = opts[["match-bonus"]],
- mismatch = opts[["mismatch-penalty"]],
- gapOpening = opts[["gap-open-penalty"]],
- gapExtension = opts[["gap-extension-penalty"]])
- stopifnot(opts$`min-call` >= 1 &&
- opts$`min-length` >= 0 &&
- opts$`jobs` >= 0)
- ## Set jobs if requested
- if (opts$jobs > 0) {
- options(cores=opts$jobs)
- }
- tsmsg("Using ", getDoParWorkers(), " cores.")
- paired <- !opts[["single-read-mode"]]
- interleaved <- opts[["interleaved"]]
- if (!paired && interleaved) {
- stop("ERROR: You cannot specify both --interleaved and --single-read-mode")
- } else if (!paired) {
- if (length(args) != 3) {
- stop("DeLoxer in single-read mode requires exactly 3 arguments")
- }
- subject.file <- args[[1]]
- read1.file <- args[[2]]
- read2.file <- NULL
- output.file <- args[[3]]
- } else if (interleaved) {
- if (length(args) != 3) {
- stop("DeLoxer interleaved input mode requires exactly 3 arguments")
- }
- subject.file <- args[[1]]
- read1.file <- args[[2]]
- read2.file <- NULL
- output.basename <- args[[3]]
- } else {
- if (length(args) != 4) {
- stop("DeLoxer requires exactly 4 arguments")
- }
- subject.file <- args[[1]]
- read1.file <- args[[2]]
- read2.file <- args[[3]]
- output.basename <- args[[4]]
- }
- subj <- readDNAStringSet(subject.file, format="fasta", nrec=1)[[1]]
- yieldSize <- opts[["yield-size"]]
- if (paired) {
- tsmsg("Deloxing and classifying paired sequences")
- read1.stream <- FastqStreamer(read1.file, n=yieldSize)
- read2.stream <- if (!interleaved) FastqStreamer(read2.file, n=yieldSize)
- process.chunk <- function(fq1, fq2, append) {
- if (length(fq1) < 1)
- return(TRUE)
- if (interleaved) {
- stopifnot(is.null(fq2))
- deint <- deinterleave.pairs(fq1)
- fq1 <- deint[[1]]
- fq2 <- deint[[2]]
- } else {
- if (length(fq1) != length(fq2))
- stop("Both input files must have equal numbers of reads")
- }
- read1 <- as(fq1, "QualityScaledDNAStringSet")
- read2 <- as(fq2, "QualityScaledDNAStringSet")
- deloxed.pairs <-
- delox(subj, read1, read2,
- min.call=opts[["min-call"]],
- interleaved=interleaved,
- read1.orientation=opts[["read1-orientation"]],
- read2.orientation=opts[["read2-orientation"]],
- align.opts=align.opts)
- save.deloxed.pairs.as.fastq(deloxed.pairs$read1, deloxed.pairs$read2, output.basename, append=append)
- ret <- get.category.counts(deloxed.pairs)
- return(ret)
- }
- fq1 <- yield(read1.stream)
- fq2 <- if (!interleaved) yield(read2.stream)
- if (length(fq1) == 0)
- warning("No reads were read from the input file.")
- proc <- mcparallel.quiet(process.chunk(fq1, fq2, append=FALSE))
- reads.processed <- length(fq1) / ifelse(interleaved, 2, 1)
- category.stats <-
- category.counts <- NULL
- while (length(fq1 <- yield(read1.stream))) {
- if (!interleaved)
- fq2 <- yield(read2.stream)
- prev.result <- mccollect(proc)[[1]]
- if (is(prev.result, "try-error")) {
- tsmsg("Encountered error in deloxing subprocess:")
- stop(attr(prev.result, "condition"))
- }
- if (is.null(category.counts)) {
- category.counts <- prev.result
- } else {
- category.counts <- category.counts + prev.result
- }
- tsmsg("Category stats after processing ", reads.processed, " reads:")
- ## category.pct <- setNames(sprintf("%.3g%%", category.counts / sum(category.counts) * 100),
- ## names(category.counts))
- print.stats(category.counts)
- proc <- mcparallel.quiet(process.chunk(fq1, fq2, append=TRUE))
- reads.processed <- reads.processed + length(fq1) / ifelse(interleaved, 2, 1)
- }
- close(read1.stream)
- if (!interleaved) close(read2.stream)
- prev.result <- mccollect(proc)[[1]]
- if (is.null(category.counts)) {
- category.counts <- prev.result
- } else {
- category.counts <- category.counts + prev.result
- }
- if (is(prev.result, "try-error")) {
- tsmsg("Encountered error in deloxing subprocess:")
- stop(attr(prev.result, "condition"))
- stop("Encountered error in deloxing")
- }
- tsmsg("Final category stats after processing ", reads.processed, " reads:")
- print.stats(category.counts)
- } else {
- tsmsg("Deloxing single sequences")
- read1.stream <- FastqStreamer(read1.file, n=yieldSize)
- process.chunk <- function(fq, append) {
- if (length(fq) < 1)
- return(TRUE)
- reads <- as(fq, "QualityScaledDNAStringSet")
- deloxed.reads <-
- delox(subj, reads, NULL,
- min.call=opts[["min-call"]],
- interleaved=interleaved,
- read1.orientation=opts[["read1-orientation"]],
- read2.orientation=opts[["read2-orientation"]],
- align.opts=align.opts)
- write.QualityScaledDNAStringSet(deloxed.reads, output.file, append=append)
- return(TRUE)
- }
- ## First chunk is processed with append=FALSE to start the file
- fq <- yield(read1.stream)
- if (length(fq) == 0)
- warning("No reads were read from the input file.")
- proc <- mcparallel.quiet(suppressMessages(process.chunk(fq, append=FALSE)))
- reads.processed <- length(fq)
- while (length(fq <- yield(read1.stream))) {
- prev.result <- mccollect(proc)[[1]]
- if (is(prev.result, "try-error")) {
- tsmsg("Encountered error in deloxing subprocess:")
- stop(attr(prev.result, "condition"))
- stop("Encountered error in deloxing")
- }
- tsmsg("Processed ", reads.processed, " reads")
- proc <- mcparallel.quiet(suppressMessages(process.chunk(fq, append=TRUE)))
- reads.processed <- reads.processed + length(fq)
- }
- close(read1.stream)
- prev.result <- mccollect(proc)[[1]]
- if (is(prev.result, "try-error")) {
- tsmsg("Encountered error in deloxing subprocess:")
- stop(attr(prev.result, "condition"))
- stop("Encountered error in deloxing")
- }
- tsmsg("Processed ", reads.processed, " reads")
- }
- tsmsg("Finished successful run")
- }
- main()
|