delox.R 31 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725
  1. #!/usr/bin/env Rscript
  2. default.align.opts <- list(match=1, mismatch=3,
  3. gapOpening=5, gapExtension=2)
  4. parse_arguments <- function() {
  5. suppressMessages({
  6. library(optparse)
  7. library(parallel)
  8. })
  9. option_list <-
  10. list(make_option(c("-c", "--min-call"), type="integer", default=10, metavar="10",
  11. help="Minimum perfect overlap required to call the presence of the subject (paired only). Imperfect overlap will need to be longer, based on specified mismatch and gap penalties."),
  12. make_option(c("-l", "--min-length"), type="integer", default=36, metavar="36",
  13. help="Minimum length allowed after trimming a read. Any reads shorter than this after trimming will be discarded."),
  14. make_option(c("-i", "--interleaved"), action="store_true", default=FALSE,
  15. help="Specify this option if you have paired-end sequences interleaved in a single FASTQ file. The default is to read paired-end sequences from a matched pair of files, and this option is ignored if two fastq files are provided. When you use this option, skip the \"READ2.fastq\" argument."),
  16. make_option(c("-o", "--read1-orientation"), type="character", default="in", metavar="in/out",
  17. help="Orientation of read1. Can be either \"in\" or \"out\" (paired only). Note that Illumina reads are \"in\"."),
  18. make_option(c("-q", "--read2-orientation"), type="character", default="in", metavar="in/out",
  19. help="Orientation of read2. Can be either \"in\" or \"out\" (paired only)"),
  20. make_option(c("-j", "--jobs"), type="integer",
  21. default=parallel:::detectCores(),
  22. metavar=as.character(parallel:::detectCores()),
  23. help="Number of jobs to run in parallel for alignment. This should be autodetected by default."),
  24. make_option(c("-y", "--yield-size"), type="integer",
  25. default=100000,
  26. metavar="100000",
  27. help="Number of reads to process at a time. Setting this higher will read more data into memory at once and result in faster runtime. Setting this lower will require less memory."),
  28. make_option(c("-m", "--match-bonus"), type="double",
  29. default=default.align.opts$match,
  30. metavar=as.character(default.align.opts$match),
  31. help="Score bonus for a matching nucleotide"),
  32. make_option(c("-p", "--mismatch-penalty"), type="double",
  33. default=default.align.opts$mismatch,
  34. metavar=as.character(default.align.opts$mismatch),
  35. help="Score penalty for a mismatched nucleotide (specify as a positive number)"),
  36. make_option(c("-g", "--gap-open-penalty"), type="double",
  37. default=default.align.opts$gapOpening,
  38. metavar=as.character(default.align.opts$gapOpening),
  39. help="Score penalty for opening a gap in the alignment (specifiy as a positive number)"),
  40. make_option(c("-e", "--gap-extension-penalty"), type="double",
  41. default=default.align.opts$match,
  42. metavar=as.character(default.align.opts$gapExtension),
  43. help="Score penalty for extending an alignment gap by two nucleotides (specify as a positive number)"),
  44. make_option(c("-s", "--single-read-mode"), action="store_true", default=FALSE,
  45. help="Tell DeLoxer to run in single-end mode instead of paired-end mode. In this mode, the only a single input fastq file is provided, and only a single output file is created. No classification is performed, only trimming. When you use this option, skip the \"READ2.fastq\" argument, and specify the full file name for OUTPUT_NAME instead of just the base name."))
  46. option_parser <- OptionParser(option_list=option_list,
  47. usage="%prog [options] adapter.fasta READ1.fastq READ2.fastq OUTPUT_NAME")
  48. opt <- parse_args(option_parser, positional_arguments=TRUE)
  49. return(opt)
  50. }
  51. ## Call this here to handle --help quickly, before we waste 10 seconds
  52. ## loading all the libraries
  53. invisible(parse_arguments())
  54. print.option.list <- function(opt=parse_arguments()) {
  55. args <- opt$args
  56. opts <- opt$options
  57. message("Options:")
  58. foreach (o=opts, n=names(opts)) %do% {
  59. if (n != "help")
  60. message(" ", n, ": ", o)
  61. }
  62. message("Args: ", paste("\"", args, "\"", sep="", collapse=", "))
  63. }
  64. unimplemented <- function() stop("UNIMPLEMENTED")
  65. ## Timestampped message
  66. tsmsg <- function(...) {
  67. message("# ", date(), ": ", ...)
  68. }
  69. tsmsg("Starting deloxer and loading required packages")
  70. suppressMessages({
  71. library(ShortRead)
  72. library(optparse)
  73. library(foreach)
  74. library(iterators)
  75. library(itertools)
  76. library(doMC)
  77. registerDoMC()
  78. mcoptions <- list(preschedule=TRUE, set.seed=FALSE)
  79. })
  80. ## Merge l1 and l2 by names
  81. merge.lists <- function(l1, l2) {
  82. new.names <- setdiff(names(l2), names(l1))
  83. l1[new.names] <- l2[new.names]
  84. l1
  85. }
  86. ## Return an object sans names
  87. strip.names <- function(x) {
  88. names(x) <- NULL
  89. x
  90. }
  91. ## Define some missing type coercions
  92. setAs(from="ShortRead", to="DNAStringSet", def=function(from) sread(from))
  93. setAs(from="PhredQuality", to="FastqQuality", def=function(from) FastqQuality(BStringSet(from)))
  94. setAs(from="SolexaQuality", to="SFastqQuality", def=function(from) SFastqQuality(BStringSet(from)))
  95. setAs(from="QualityScaledXStringSet", to="ShortReadQ", def=function(from) {
  96. q <- quality(from)
  97. new.quality.class <- switch(class(q),
  98. SolexaQuality="SFastqQuality",
  99. PhredQuality="FastqQuality",
  100. stop("Unknown quality type: ", class(q)))
  101. q <- as(q, new.quality.class)
  102. ShortReadQ(sread=as(from, "DNAStringSet"),
  103. quality=q,
  104. id=BStringSet(names(from)))
  105. })
  106. ## Override the provided method to keep the sequence names
  107. setAs(from="ShortReadQ", to="QualityScaledDNAStringSet",
  108. def=function (from, to = "QualityScaledDNAStringSet", strict = TRUE) {
  109. q <- quality(from)
  110. new.quality.class <- switch(class(q),
  111. SFastqQuality="SolexaQuality",
  112. FastqQuality="PhredQuality",
  113. "XStringQuality")
  114. q <- as(q, new.quality.class)
  115. x <- QualityScaledDNAStringSet(sread(from), q)
  116. names(x) <- as.character(id(from))
  117. x
  118. })
  119. ## Define functions for reading fastq into standard Biostrings object
  120. ## and writing it back out. The standard functions readFastq and
  121. ## writeFastq operate on ShortRead objects. These simply wrap them in
  122. ## conversion to/from QualityScaledDNAStringSet.
  123. read.QualityScaledDNAStringSet <- function(filepath, format = "fastq", ...) {
  124. switch(format,
  125. fastq=as(readFastq(filepath, withIds=TRUE, ...), "QualityScaledDNAStringSet"),
  126. ## Default
  127. stop("Unknown quality-scaled sequence format: ", format))
  128. }
  129. write.QualityScaledDNAStringSet <- function (x, filepath, append = FALSE, format = "fastq") {
  130. if(length(x) > 0) {
  131. sr <- as(x, "ShortReadQ")
  132. switch(format,
  133. fastq={
  134. if (!append)
  135. unlink(filepath);
  136. writeFastq(object=sr,
  137. file=filepath, mode=ifelse(append, "a", "w"))
  138. },
  139. ## Default
  140. stop("Unknown quality-scaled sequence format: ", format))
  141. } else {
  142. ## Zero-length sequence; just truncate/touch the file
  143. sink(file=filepath, append=append)
  144. sink()
  145. }
  146. }
  147. discard.short.reads <- function(reads, min.length=1) {
  148. kept.reads <- reads[width(reads) >= min.length]
  149. return(kept.reads)
  150. }
  151. ## Takes a set of interleaved reads (or anything else) and
  152. ## de-interleaves them
  153. deinterleave.pairs <- function(reads) {
  154. stopifnot(length(reads) %% 2 == 0)
  155. mask <- seq(from=1, to=length(reads), by=2)
  156. return(list(read1=reads[mask], read2=reads[-mask]))
  157. }
  158. .delox.trimmed.ranges <- function(subj, reads, min.length=36,
  159. include.scores=TRUE,
  160. include.deleted.ranges=TRUE,
  161. align.opts=list()) {
  162. align.opts <- merge.lists(align.opts, default.align.opts)
  163. aln <- list(forward=pairwiseAlignment(pattern=reads,
  164. subject=subj,
  165. type="overlap",
  166. substitutionMatrix=nucleotideSubstitutionMatrix(match = align.opts$match, mismatch = -align.opts$mismatch),
  167. gapOpening=-align.opts$gapOpening, gapExtension=-align.opts$gapExtension),
  168. revcomp=pairwiseAlignment(pattern=reads,
  169. subject=reverseComplement(DNAString(subj)),
  170. type="overlap",
  171. substitutionMatrix=nucleotideSubstitutionMatrix(match = align.opts$match, mismatch = -align.opts$mismatch),
  172. gapOpening=-align.opts$gapOpening, gapExtension=-align.opts$gapExtension))
  173. aln.scores <- Map(score, aln)
  174. aln.pat <- Map(pattern, aln)
  175. aln.ranges <- Map(function(x) IRanges(start=start(x), end=end(x)), aln.pat)
  176. aln.threebands <- Map(function (x) threebands(IRanges(start=1, end=width(reads)),
  177. start=start(x), end=end(x)),
  178. aln.ranges)
  179. ## For each read, decide whether the forward or reverse alignment
  180. ## was better.
  181. revcomp.better <- aln.scores$forward < aln.scores$revcomp
  182. ## For each read, take the threebands for the better alignment.
  183. best.threebands <- aln.threebands$forward
  184. for (band in names(best.threebands)) {
  185. best.threebands[[band]][revcomp.better] <- aln.threebands$revcomp[[band]][revcomp.better]
  186. }
  187. ## Use the left band if it is longer than either min.length or
  188. ## length of right band.
  189. use.right.band <- width(best.threebands$left) < pmin(min.length, width(best.threebands$right))
  190. ranges <- best.threebands$left
  191. ranges[use.right.band] <- best.threebands$right[use.right.band]
  192. ## Record which ranges are shorter than min.length
  193. too.short <- width(ranges) < min.length
  194. ## ranges[too.short] <- IRanges(start=1,end=0)
  195. ## Record what was trimmed off of each read (NOT what was kept!)
  196. trim <- factor(ifelse(use.right.band, "left", "right"), levels=c("right", "left", "all", "none"))
  197. ## If it's too short, then we trim "all", i.e. discard the whole
  198. ## read.
  199. trim[too.short] <- "all"
  200. ## If the read is not shorter after trimming, then nothing was
  201. ## actually trimmed.
  202. trim[width(ranges) == width(reads)] <- "none"
  203. emeta <- list()
  204. emeta$trim <- trim
  205. if (include.deleted.ranges) {
  206. deleted.start <- ifelse(too.short, 1,
  207. ifelse(use.right.band,
  208. start(best.threebands$left),
  209. start(best.threebands$middle)))
  210. deleted.end <- ifelse(too.short, width(reads),
  211. ifelse(use.right.band,
  212. end(best.threebands$middle),
  213. end(best.threebands$right)))
  214. emeta$deleted.range <- IRanges(deleted.start, deleted.end)
  215. }
  216. if (include.scores) {
  217. ## If requested, take the best score out of each pair of forward
  218. ## and reverse scores.
  219. scores <- ifelse(revcomp.better, aln.scores$revcomp, aln.scores$forward)
  220. emeta$score <- scores
  221. }
  222. mcols(ranges) <- DataFrame(emeta)
  223. return(ranges)
  224. }
  225. ## Always call delox on the underlying DNAStringSet object when called
  226. ## on something more complicated.
  227. suppressMessages({
  228. invisible(setMethod(".delox.trimmed.ranges", signature=c(reads="ShortRead"),
  229. function (subj, reads, min.length, include.scores, include.deleted.ranges, align.opts) {
  230. callGeneric(subj, as(reads, "DNAStringSet"), min.length, include.scores, include.deleted.ranges, align.opts)
  231. }))
  232. invisible(setMethod(".delox.trimmed.ranges", signature=c(reads="QualityScaledDNAStringSet"),
  233. function (subj, reads, min.length, include.scores, include.deleted.ranges, align.opts) {
  234. callGeneric(subj, as(reads, "DNAStringSet"), min.length, include.scores, include.deleted.ranges, align.opts)
  235. }))
  236. invisible(setMethod(".delox.trimmed.ranges", signature=c(reads="QualityScaledXStringSet"),
  237. function (subj, reads, min.length, include.scores, include.deleted.ranges, align.opts) {
  238. callGeneric(subj, as(reads, "XStringSet"), min.length, include.scores, include.deleted.ranges, align.opts)
  239. }))
  240. })
  241. delox.single <- function(subj, reads , min.length=36,
  242. include.scores=TRUE, align.opts=list()) {
  243. tsmsg("Saving read names")
  244. saved.names <- BStringSet(names(reads))
  245. reads <- strip.names(reads)
  246. invisible(gc())
  247. tsmsg("Doing alignments")
  248. nchunks <- min(getDoParWorkers(), ceiling(length(reads)/1000))
  249. deloxed.ranges <- foreach(reads=isplitVector(reads, chunks=nchunks), .combine=c) %dopar% {
  250. .delox.trimmed.ranges(reads=reads, subj=subj, min.length=min.length,
  251. include.scores=include.scores,
  252. include.deleted.ranges=FALSE,
  253. align.opts=align.opts)
  254. }
  255. ## maybe.chunkapply(.delox.trimmed.ranges,
  256. ## VECTOR.ARGS=list(reads=reads),
  257. ## SCALAR.ARGS=list(subj=subj, min.length=min.length,
  258. ## include.scores=include.scores,
  259. ## include.deleted.ranges=FALSE,
  260. ## align.opts=align.opts),
  261. ## min.chunk.size=1000,
  262. ## MERGE=c)
  263. tsmsg("Trimming reads")
  264. trimmed.reads <- narrow(reads, start(deloxed.ranges), end(deloxed.ranges))
  265. tsmsg("Restoring read names")
  266. names(trimmed.reads) <- as.character(saved.names)
  267. tsmsg("Adding metadata")
  268. emeta <- list()
  269. if (include.scores) {
  270. emeta$score <- mcols(deloxed.ranges)$score
  271. }
  272. if (length(emeta) > 0) {
  273. mcols(trimmed.reads) <- DataFrame(emeta)
  274. }
  275. return(discard.short.reads(trimmed.reads, min.length))
  276. }
  277. delox.paired <- function(subj, read1, read2,
  278. min.call=10, min.length=36,
  279. include.scores=TRUE, align.opts=list()) {
  280. align.opts <- merge.lists(align.opts, default.align.opts)
  281. tsmsg("Checking read counts")
  282. stopifnot(length(read1) == length(read2))
  283. tsmsg("Listing reads")
  284. original.reads <- list(read1=read1,
  285. read2=read2)
  286. rm(read1, read2)
  287. tsmsg("Saving read names")
  288. read.names <- foreach(r=original.reads) %do% BStringSet(names(r))
  289. names(read.names) <- names(original.reads)
  290. original.reads <- Map(strip.names, original.reads)
  291. invisible(gc())
  292. tsmsg("Doing alignments")
  293. deloxed.ranges <- lapply(original.reads, function(x) {
  294. nchunks <- min(getDoParWorkers(), ceiling(length(x)/1000))
  295. foreach(reads=isplitVector(x, chunks=nchunks), .combine=c) %dopar% {
  296. .delox.trimmed.ranges(reads=reads, subj=subj, min.length=min.length,
  297. include.scores=include.scores,
  298. include.deleted.ranges=FALSE,
  299. align.opts=align.opts)
  300. }
  301. ## maybe.chunkapply(.delox.trimmed.ranges,
  302. ## VECTOR.ARGS=list(reads=strip.names(x)),
  303. ## SCALAR.ARGS=list(subj=subj,
  304. ## min.length=min.length,
  305. ## include.scores=TRUE,
  306. ## include.deleted.ranges=TRUE,
  307. ## align.opts=align.opts),
  308. ## MERGE=c,
  309. ## min.chunk.size=1000)
  310. })
  311. tsmsg("Extracting metadata")
  312. delox.meta <- lapply(deloxed.ranges, mcols)
  313. ## Decide whether enough was trimmed on the inside (right end) of
  314. ## either read to call it a mate-pair.
  315. tsmsg("Calculating inside trim score")
  316. inside.trim.score <- Reduce(pmax,
  317. lapply(delox.meta,
  318. function(x) ifelse(x$trim == "right", x$score, 0)))
  319. ## Decide whether enough was trimmed on the outside (left end) of
  320. ## either read to call it a non-mate-pair.
  321. tsmsg("Calculating outside trim score")
  322. outside.trim.score <- Reduce(pmax,
  323. lapply(delox.meta,
  324. function(x) ifelse(x$trim == "left", x$score, 0)))
  325. tsmsg("Calling presence of subject")
  326. calls <- list(inside=inside.trim.score >= min.call * align.opts$match,
  327. outside=outside.trim.score >= min.call * align.opts$match)
  328. tsmsg("Categorizing reads")
  329. category <- factor(rep(NA, length(original.reads$read1)), levels=c("mate", "non-mate", "negative", "unpaired", "discard"))
  330. category[calls$inside] <- "mate"
  331. category[calls$outside] <- "non-mate"
  332. ## If they're either both true or both false, then it's ambiguous
  333. category[calls$inside == calls$outside] <- "negative"
  334. ## All categories should be filled in now
  335. stopifnot(all(!is.na(category)))
  336. too.short <- lapply(deloxed.ranges, function(x) width(x) < min.length)
  337. ## If either read in a pair is too short, then its partner is no
  338. ## longer paired at all.
  339. one.too.short <- Reduce(`|`, too.short)
  340. category[one.too.short] <- "unpaired"
  341. ## If both reads in a pair are too short, then the entire pair is
  342. ## discarded. This is highly unlikely, since Cre-Lox should not
  343. ## appear in the middle of both sequences.
  344. both.too.short <- Reduce(`&`, too.short)
  345. category[both.too.short] <- "discard"
  346. tsmsg("Trimming reads and restoring read names")
  347. trimmed.reads <- lapply(names(original.reads), function(x) {
  348. trimmed <- narrow(original.reads[[x]],
  349. start=start(deloxed.ranges[[x]]),
  350. end=end(deloxed.ranges[[x]]))
  351. names(trimmed) <- as.character(read.names[[x]])
  352. trimmed
  353. })
  354. names(trimmed.reads) <- names(original.reads)
  355. tsmsg("Assembling metadata")
  356. foreach (r=names(trimmed.reads)) %do% {
  357. emeta <- list()
  358. emeta$category <- category
  359. emeta$category[too.short[[r]]] <- "discard"
  360. if (include.scores) {
  361. emeta$score <- delox.meta[[r]]$score
  362. }
  363. mcols(trimmed.reads[[r]]) <- DataFrame(emeta)
  364. }
  365. return(trimmed.reads)
  366. }
  367. ## Wrapper for both single and paired as appropriate
  368. delox <- function(subj, read1, read2=NULL,
  369. min.call=10, min.length=36,
  370. interleaved=FALSE,
  371. read1.orientation=c("in", "out")[1],
  372. read2.orientation=c("in", "out")[1],
  373. align.opts=list()) {
  374. if (is.null(read2)) {
  375. if (interleaved) {
  376. x <- deinterleave.pairs(read1)
  377. read1 <- x$read1
  378. read2 <- x$read2
  379. } else {
  380. tsmsg("Doing single-read delox")
  381. return(delox.single(subj=subj, reads=read1, min.length=min.length, align.opts=align.opts))
  382. }
  383. }
  384. ## Make sure both reads are oriented "in" before calling
  385. tsmsg("Ensuring correct read orientation")
  386. if (tolower(read1.orientation) == "out") {
  387. read1 <- reverseComplement(read1)
  388. }
  389. if (!is.null(read2) && tolower(read2.orientation) == "out") {
  390. read2 <- reverseComplement(read2)
  391. }
  392. tsmsg("Doing paired-end delox")
  393. deloxed.reads <- delox.paired(subj, read1, read2,
  394. min.call=min.call, min.length=min.length,
  395. align.opts=align.opts)
  396. ## If reads started "out", put them back that way before returning
  397. tsmsg("Restoring original read orientation")
  398. if (tolower(read1.orientation) == "out") {
  399. deloxed.reads$read1 <- reverseComplement(deloxed.reads$read1)
  400. }
  401. if (tolower(read2.orientation) == "out") {
  402. deloxed.reads$read2 <- reverseComplement(deloxed.reads$read2)
  403. }
  404. return(deloxed.reads)
  405. }
  406. ## ## Hack to work around a bug in BioConductor that prevents subsetting
  407. ## ## of named XStringSet objects. Apparently, since DeLoxer was first
  408. ## ## published, the BioConductor devs broke the XStringSet subsetting
  409. ## ## code so that it can no longer handle XStringSets with names. The
  410. ## ## code below strips the names from the XStringSet, then calls the old
  411. ## ## code to subset the nameless object while subsetting the names
  412. ## ## separately, then finally puts the names back on and returns the
  413. ## ## result.
  414. ## old.XStringSet.subset.method <- selectMethod("[", "XStringSet")
  415. ## invisible(setMethod("[", signature="XStringSet", definition=function(x, i, j, ..., drop=TRUE) {
  416. ## ## Save the names into a seaprate variable
  417. ## xnames <- names(x)
  418. ## ## Do the old behavior, which works on unnamed objects
  419. ## x <- old.XStringSet.subset.method(unname(x), i, j, ..., drop=drop)
  420. ## ## Put the names back on and return
  421. ## setNames(x, xnames[i])
  422. ## }))
  423. save.deloxed.pairs.as.fastq <- function(read1, read2, output.base,
  424. mate.ext="matepaired",
  425. nonmate.ext="paired",
  426. negative.ext="negative",
  427. unpaired.ext="unpaired",
  428. append=FALSE) {
  429. extension <- c(mate=mate.ext,
  430. `non-mate`=nonmate.ext,
  431. negative=negative.ext,
  432. unpaired=unpaired.ext)
  433. ## ## Make sure that read1 and read2 are a match for each other
  434. ## stopifnot(identical(as.character(mcols(read1)$category),
  435. ## as.character(mcols(read2)$category)))
  436. ## ## Discard the shorter read on "unpaired"
  437. ## read1.shorter <- width(read1) < width(read2)
  438. ## mcols(read1)$category[mcols(read1)$category == "unpaired" & read1.shorter] <- NA
  439. ## mcols(read2)$category[mcols(read2)$category == "unpaired" & !read1.shorter] <- NA
  440. filename.template <- "%s_read%s.%s.fastq"
  441. for (cat in names(extension)) {
  442. read1.for.category <- read1[mcols(read1)$category == cat]
  443. read1.file.for.category <- sprintf(filename.template, output.base, 1, extension[[cat]])
  444. tsmsg("Writing ", read1.file.for.category)
  445. write.QualityScaledDNAStringSet(read1.for.category,
  446. file=read1.file.for.category,
  447. append=append)
  448. read2.for.category <- read2[mcols(read2)$category == cat]
  449. read2.file.for.category <- sprintf(filename.template, output.base, 2, extension[[cat]])
  450. tsmsg("Writing ", read2.file.for.category)
  451. write.QualityScaledDNAStringSet(read2.for.category,
  452. file=read2.file.for.category,
  453. append=append)
  454. }
  455. return(TRUE)
  456. }
  457. get.category.counts <- function(deloxed.pairs) {
  458. r1cat <- mcols(deloxed.pairs$read1)$category
  459. r2cat <- mcols(deloxed.pairs$read2)$category
  460. x <- table(r1cat)[c("mate", "non-mate", "negative")]
  461. x["r1.single"] <- sum(r1cat == "unpaired")
  462. x["r2.single"] <- sum(r2cat == "unpaired")
  463. x["discard"] <- length(r1cat) - sum(x)
  464. x
  465. }
  466. mcparallel.quiet <- function(expr, ...) {
  467. parallel:::mcparallel(suppressMessages(expr), ...)
  468. }
  469. print.stats <- function(category.counts) {
  470. category.pct <- setNames(sprintf("%.3g%%", category.counts / sum(category.counts) * 100),
  471. names(category.counts))
  472. x <- rbind(Counts=category.counts, Fractions=category.pct)
  473. names(dimnames(x)) <- c("Stat", "Category")
  474. print(x, quote=FALSE, justify="right")
  475. }
  476. main <- function() {
  477. opt <- parse_arguments()
  478. print.option.list(opt)
  479. args <- opt$args
  480. opts <- opt$options
  481. if (!(tolower(opts[["read1-orientation"]]) %in% c("in", "out") &&
  482. tolower(opts[["read2-orientation"]]) %in% c("in", "out") )) {
  483. stop("Valid orientations are \"in\" and \"out\"")
  484. }
  485. align.opts <- list(match = opts[["match-bonus"]],
  486. mismatch = opts[["mismatch-penalty"]],
  487. gapOpening = opts[["gap-open-penalty"]],
  488. gapExtension = opts[["gap-extension-penalty"]])
  489. stopifnot(opts$`min-call` >= 1 &&
  490. opts$`min-length` >= 0 &&
  491. opts$`jobs` >= 0)
  492. ## Set jobs if requested
  493. if (opts$jobs > 0) {
  494. options(cores=opts$jobs)
  495. }
  496. tsmsg("Using ", getDoParWorkers(), " cores.")
  497. paired <- !opts[["single-read-mode"]]
  498. interleaved <- opts[["interleaved"]]
  499. if (!paired && interleaved) {
  500. stop("ERROR: You cannot specify both --interleaved and --single-read-mode")
  501. } else if (!paired) {
  502. if (length(args) != 3) {
  503. stop("DeLoxer in single-read mode requires exactly 3 arguments")
  504. }
  505. subject.file <- args[[1]]
  506. read1.file <- args[[2]]
  507. read2.file <- NULL
  508. output.file <- args[[3]]
  509. } else if (interleaved) {
  510. if (length(args) != 3) {
  511. stop("DeLoxer interleaved input mode requires exactly 3 arguments")
  512. }
  513. subject.file <- args[[1]]
  514. read1.file <- args[[2]]
  515. read2.file <- NULL
  516. output.basename <- args[[3]]
  517. } else {
  518. if (length(args) != 4) {
  519. stop("DeLoxer requires exactly 4 arguments")
  520. }
  521. subject.file <- args[[1]]
  522. read1.file <- args[[2]]
  523. read2.file <- args[[3]]
  524. output.basename <- args[[4]]
  525. }
  526. subj <- readDNAStringSet(subject.file, format="fasta", nrec=1)[[1]]
  527. yieldSize <- opts[["yield-size"]]
  528. if (paired) {
  529. tsmsg("Deloxing and classifying paired sequences")
  530. read1.stream <- FastqStreamer(read1.file, n=yieldSize)
  531. read2.stream <- if (!interleaved) FastqStreamer(read2.file, n=yieldSize)
  532. process.chunk <- function(fq1, fq2, append) {
  533. if (length(fq1) < 1)
  534. return(TRUE)
  535. if (interleaved) {
  536. stopifnot(is.null(fq2))
  537. deint <- deinterleave.pairs(fq1)
  538. fq1 <- deint[[1]]
  539. fq2 <- deint[[2]]
  540. } else {
  541. if (length(fq1) != length(fq2))
  542. stop("Both input files must have equal numbers of reads")
  543. }
  544. read1 <- as(fq1, "QualityScaledDNAStringSet")
  545. read2 <- as(fq2, "QualityScaledDNAStringSet")
  546. deloxed.pairs <-
  547. delox(subj, read1, read2,
  548. min.call=opts[["min-call"]],
  549. interleaved=interleaved,
  550. read1.orientation=opts[["read1-orientation"]],
  551. read2.orientation=opts[["read2-orientation"]],
  552. align.opts=align.opts)
  553. save.deloxed.pairs.as.fastq(deloxed.pairs$read1, deloxed.pairs$read2, output.basename, append=append)
  554. ret <- get.category.counts(deloxed.pairs)
  555. return(ret)
  556. }
  557. fq1 <- yield(read1.stream)
  558. fq2 <- if (!interleaved) yield(read2.stream)
  559. if (length(fq1) == 0)
  560. warning("No reads were read from the input file.")
  561. proc <- mcparallel.quiet(process.chunk(fq1, fq2, append=FALSE))
  562. reads.processed <- length(fq1) / ifelse(interleaved, 2, 1)
  563. category.stats <-
  564. category.counts <- NULL
  565. while (length(fq1 <- yield(read1.stream))) {
  566. if (!interleaved)
  567. fq2 <- yield(read2.stream)
  568. prev.result <- mccollect(proc)[[1]]
  569. if (is(prev.result, "try-error")) {
  570. tsmsg("Encountered error in deloxing subprocess:")
  571. stop(attr(prev.result, "condition"))
  572. }
  573. if (is.null(category.counts)) {
  574. category.counts <- prev.result
  575. } else {
  576. category.counts <- category.counts + prev.result
  577. }
  578. tsmsg("Category stats after processing ", reads.processed, " reads:")
  579. ## category.pct <- setNames(sprintf("%.3g%%", category.counts / sum(category.counts) * 100),
  580. ## names(category.counts))
  581. print.stats(category.counts)
  582. proc <- mcparallel.quiet(process.chunk(fq1, fq2, append=TRUE))
  583. reads.processed <- reads.processed + length(fq1) / ifelse(interleaved, 2, 1)
  584. }
  585. close(read1.stream)
  586. if (!interleaved) close(read2.stream)
  587. prev.result <- mccollect(proc)[[1]]
  588. if (is.null(category.counts)) {
  589. category.counts <- prev.result
  590. } else {
  591. category.counts <- category.counts + prev.result
  592. }
  593. if (is(prev.result, "try-error")) {
  594. tsmsg("Encountered error in deloxing subprocess:")
  595. stop(attr(prev.result, "condition"))
  596. stop("Encountered error in deloxing")
  597. }
  598. tsmsg("Final category stats after processing ", reads.processed, " reads:")
  599. print.stats(category.counts)
  600. } else {
  601. tsmsg("Deloxing single sequences")
  602. read1.stream <- FastqStreamer(read1.file, n=yieldSize)
  603. process.chunk <- function(fq, append) {
  604. if (length(fq) < 1)
  605. return(TRUE)
  606. reads <- as(fq, "QualityScaledDNAStringSet")
  607. deloxed.reads <-
  608. delox(subj, reads, NULL,
  609. min.call=opts[["min-call"]],
  610. interleaved=interleaved,
  611. read1.orientation=opts[["read1-orientation"]],
  612. read2.orientation=opts[["read2-orientation"]],
  613. align.opts=align.opts)
  614. write.QualityScaledDNAStringSet(deloxed.reads, output.file, append=append)
  615. return(TRUE)
  616. }
  617. ## First chunk is processed with append=FALSE to start the file
  618. fq <- yield(read1.stream)
  619. if (length(fq) == 0)
  620. warning("No reads were read from the input file.")
  621. proc <- mcparallel.quiet(suppressMessages(process.chunk(fq, append=FALSE)))
  622. reads.processed <- length(fq)
  623. while (length(fq <- yield(read1.stream))) {
  624. prev.result <- mccollect(proc)[[1]]
  625. if (is(prev.result, "try-error")) {
  626. tsmsg("Encountered error in deloxing subprocess:")
  627. stop(attr(prev.result, "condition"))
  628. stop("Encountered error in deloxing")
  629. }
  630. tsmsg("Processed ", reads.processed, " reads")
  631. proc <- mcparallel.quiet(suppressMessages(process.chunk(fq, append=TRUE)))
  632. reads.processed <- reads.processed + length(fq)
  633. }
  634. close(read1.stream)
  635. prev.result <- mccollect(proc)[[1]]
  636. if (is(prev.result, "try-error")) {
  637. tsmsg("Encountered error in deloxing subprocess:")
  638. stop(attr(prev.result, "condition"))
  639. stop("Encountered error in deloxing")
  640. }
  641. tsmsg("Processed ", reads.processed, " reads")
  642. }
  643. tsmsg("Finished successful run")
  644. }
  645. main()