浏览代码

Convert R files to html

This allows Github Pages to display them instead of telling the
browser to download them. It also allows me to syntax-highlight them
using Pygments.
Ryan C. Thompson 8 年之前
父节点
当前提交
317743fad5

+ 68 - 35
Snakefile

@@ -3,6 +3,7 @@
 import locale
 import os.path
 import regex
+import urllib.parse
 
 from collections import Iterable, Mapping  # in Python 3 use from collections.abc
 from distutils.spawn import find_executable
@@ -16,7 +17,7 @@ except ImportError:
     from scandir import scandir, walk
 
 def unnest(*args):
-    """Un-nest list- and tuple-like elements in arguments.
+    '''Un-nest list- and tuple-like elements in arguments.
 
 "List-like" means anything with a len() and whose elements can be
 accessed with numeric indexing, except for string-like elements. It
@@ -26,7 +27,7 @@ Dict-like elements and iterators/generators are not affected.
 This function always returns a list, even if it is passed a single
 scalar argument.
 
-    """
+    '''
     result = []
     for arg in args:
         if isinstance(arg, str):
@@ -50,37 +51,37 @@ scalar argument.
     return result
 
 def check_output_decode(*args, encoding=locale.getpreferredencoding(), **kwargs):
-    """Shortcut for check.output + str.decode"""
+    '''Shortcut for check_output + bytes.decode.
+
+    Positional arguments and any remaining keyword arguments are
+    forwarded to check_output; its result is decoded with encoding.
+    The default encoding is the locale's preferred encoding, looked up
+    once when this function is defined.
+    '''
     return check_output(*args, **kwargs).decode(encoding)
 
 def find_mac_app(name):
     try:
         return check_output_decode(
-            ["mdfind",
-             "kMDItemDisplayName=={name}&&kMDItemKind==Application".format(name=name)]).split("\n")[0]
+            ['mdfind',
+             'kMDItemDisplayName=={name}&&kMDItemKind==Application'.format(name=name)]).split('\n')[0]
     except Exception:
         return None
 
-def glob_recursive(pattern, top=".", include_hidden=False, *args, **kwargs):
-    """Combination of glob.glob and os.walk.
+def glob_recursive(pattern, top='.', include_hidden=False, *args, **kwargs):
+    '''Combination of glob.glob and os.walk.
 
 Reutrns the relative path to every file or directory matching the
 pattern anywhere in the specified directory hierarchy. Defaults to the
 current working directory. Any additional arguments are passed to
-os.walk."""
+os.walk.'''
     for (path, dirs, files) in walk(top, *args, **kwargs):
         for f in dirs + files:
-            if include_hidden or f.startswith("."):
+            if include_hidden or f.startswith('.'):
                 continue
             if fnmatch(f, pattern):
                 yield os.path.normpath(os.path.join(path, f))
 
-LYXPATH = find_executable("lyx") or \
-    os.path.join(find_mac_app("LyX"), "Contents/MacOS/lyx") or \
+LYXPATH = find_executable('lyx') or \
+    os.path.join(find_mac_app('LyX'), 'Contents/MacOS/lyx') or \
     '/bin/false'
 
 def rsync_list_files(*paths, extra_rsync_args=(), include_dirs=False):
-    """Iterate over the files in path that rsync would copy.
+    '''Iterate over the files in path that rsync would copy.
 
 By default, only files are listed, not directories, since doit doesn't
 like dependencies on directories because it can't hash them.
@@ -89,43 +90,71 @@ This uses "rsync --list-only" to make rsync directly indicate which
 files it would copy, so any exclusion/inclusion rules are taken into
 account.
 
-    """
-    rsync_list_cmd = [ 'rsync', '-r', "--list-only" ] + unnest(extra_rsync_args) + unnest(paths) + [ "." ]
+    '''
+    rsync_list_cmd = [ 'rsync', '-r', '--list-only' ] + unnest(extra_rsync_args) + unnest(paths) + [ '.' ]
     rsync_out = check_output_decode(rsync_list_cmd).splitlines()
     for line in rsync_out:
-        s = regex.search("^(-|d)(?:\S+\s+){4}(.*)", line)
+        s = regex.search('^(-|d)(?:\S+\s+){4}(.*)', line)
         if s is not None:
             if include_dirs or s.group(1) == '-':
                 yield s.group(2)
 
-def lyx_image_deps(wildcards):
-    lyxfile = wildcards.filename + ".lyx"
-
-
-def lyx_bib_deps(wildcards):
-    # Cheat: Assume every bib file is a dependency of any LaTeX
-    # operation
-    return list(glob_recursive('*.bib'))
-
-readme_files = list(glob_recursive("README.mkdn", top="examples"))
-index_files = [ os.path.join(os.path.dirname(f), "index.html") for f in readme_files ]
-
-rsync_common_args = ["-rL", "--size-only", "--delete", "--exclude", ".DS_Store", "--delete-excluded",]
+def lyx_bib_deps(lyxfile):
+    '''Return an iterator over bib files referenced by a Lyx file.'''
+    # Cheat: Assume every bib file in the folder is a dependency of
+    # any LaTeX operation. Doing this properly is tricky without
+    # implementing the full bibfile-finding logic of LyX/LaTeX.
+    return glob_recursive('*.bib')
+
+def lyx_hrefs(lyxfile):
+    '''Return an iterator over hrefs in a LyX file.'''
+    pattern = '''
+    (?xsm)
+    ^ LatexCommand \\s+ href \\s* \\n
+    (?: name \\b [^\\n]+ \\n )?
+    target \\s+ "(.*?)" $
+    '''
+    with open(lyxfile) as f:
+        return (urllib.parse.unquote(m.group(1)) for m in
+                re.finditer(pattern, f.read()))
+
+examples_base_url = 'https://darwinawardwinner.github.io/resume/examples/'
+examples_dir = 'examples'
+
+def resume_example_deps(lyxfile):
+    '''Iterate over all referenced example files in a LyX file.'''
+    for href in lyx_hrefs(lyxfile):
+        if href.startswith(examples_base_url) and not href.endswith('/'):
+            expath = href[len(examples_base_url):]
+            yield os.path.join(examples_dir, expath)
+
+readme_files = list(glob_recursive('README.mkdn', top='examples'))
+index_files = [ os.path.join(os.path.dirname(f), 'index.html') for f in readme_files ]
+
+rsync_common_args = ['-rL', '--size-only', '--delete', '--exclude', '.DS_Store', '--delete-excluded',]
 
 all_example_files = set(rsync_list_files('examples', extra_rsync_args=rsync_common_args))
+r_html_files = [ f + '.html' for f in all_example_files if f.endswith('.R') ]
 all_example_files = all_example_files.union(index_files)
+all_example_files = all_example_files.union(r_html_files)
 
 rule build_all:
+    # Aggregate target: the resume in both formats plus every generated
+    # index.html and highlighted R-script page.
-    input: "ryan_thompson_resume.pdf", "ryan_thompson_resume.html", index_files
+    input: 'ryan_thompson_resume.pdf', 'ryan_thompson_resume.html', index_files, r_html_files
 
 rule create_resume_pdf:
+    # Export the resume to PDF with LyX. The bib files, linked example
+    # files and headshot are declared as inputs, although the shell
+    # command only passes the LyX file itself to LyX.
-    input: lyxfile="ryan_thompson_resume.lyx", bibfile="citations.bib", headshot="headshot-crop.jpg"
-    output: pdf="ryan_thompson_resume.pdf"
+    input: lyxfile='ryan_thompson_resume.lyx',
+           bibfiles=list(lyx_bib_deps('ryan_thompson_resume.lyx')),
+           example_files=list(resume_example_deps('ryan_thompson_resume.lyx')),
+           headshot='headshot-crop.jpg',
+    output: pdf='ryan_thompson_resume.pdf'
     shell: '{LYXPATH:q} --export-to pdf4 {output.pdf:q} {input.lyxfile:q}'
 
 rule create_resume_html:
-    input: lyxfile="ryan_thompson_resume.lyx", bibfile="citations.bib", headshot="headshot-crop.jpg"
-    output: html="ryan_thompson_resume.html"
+    input: lyxfile='ryan_thompson_resume.lyx',
+           bibfiles=list(lyx_bib_deps('ryan_thompson_resume.lyx')),
+           example_files=list(resume_example_deps('ryan_thompson_resume.lyx')),
+           headshot='headshot-crop.jpg',
+    output: html='ryan_thompson_resume.html'
     run:
         with NamedTemporaryFile() as tempf:
             shell('{LYXPATH:q} --export-to xhtml {tempf.name:q} {input.lyxfile:q}')
@@ -137,7 +166,11 @@ rule link_resume_to_index_html:
     shell: 'ln -s {input:q} {output:q}'
 
 rule readme_to_index_html:
+    # Convert a directory's README.mkdn into an index.html in the same
+    # directory using pandoc.
-    input: "{dirname}/README.mkdn"
-    output: "{dirname}/index.html"
+    input: '{dirname}/README.mkdn'
+    output: '{dirname}/index.html'
     shell: 'pandoc -t html -o {output[0]:q} {input[0]:q}'
 
+rule R_to_html:
+    # Render an R script as a full, syntax-highlighted HTML page with
+    # pygmentize. The basename wildcard is constrained to contain no
+    # slash; the output keeps the .R suffix and appends .html.
+    input: '{dirname}/{basename,[^/]+}.R'
+    output: '{dirname}/{basename}.R.html'
+    shell: 'pygmentize -f html -O full -l R -o {output:q} {input:q}'

+ 725 - 0
examples/Gaasterland/delox.R

@@ -0,0 +1,725 @@
+#!/usr/bin/env Rscript
+
+default.align.opts <- list(match=1, mismatch=3,
+                           gapOpening=5, gapExtension=2)
+
+## Build the optparse option list and parse the command line.
+##
+## Returns the result of parse_args(positional_arguments=TRUE); the
+## rest of this script reads its "options" and "args" elements.
+## Defaults for the alignment scoring options come from
+## default.align.opts.
+parse_arguments <- function() {
+    suppressMessages({
+        library(optparse)
+        library(parallel)
+    })
+    option_list <-
+        list(make_option(c("-c", "--min-call"), type="integer", default=10, metavar="10",
+                         help="Minimum perfect overlap required to call the presence of the subject (paired only). Imperfect overlap will need to be longer, based on specified mismatch and gap penalties."),
+             make_option(c("-l", "--min-length"), type="integer", default=36, metavar="36",
+                         help="Minimum length allowed after trimming a read. Any reads shorter than this after trimming will be discarded."),
+             make_option(c("-i", "--interleaved"), action="store_true", default=FALSE,
+                         help="Specify this option if you have paired-end sequences interleaved in a single FASTQ file. The default is to read paired-end sequences from a matched pair of files, and this option is ignored if two fastq files are provided. When you use this option, skip the \"READ2.fastq\" argument."),
+             make_option(c("-o", "--read1-orientation"), type="character", default="in", metavar="in/out",
+                         help="Orientation of read1. Can be either \"in\" or \"out\" (paired only). Note that Illumina reads are \"in\"."),
+             make_option(c("-q", "--read2-orientation"), type="character", default="in", metavar="in/out",
+                         help="Orientation of read2. Can be either \"in\" or \"out\" (paired only)"),
+             ## detectCores is exported by parallel, so "::" suffices
+             make_option(c("-j", "--jobs"), type="integer",
+                         default=parallel::detectCores(),
+                         metavar=as.character(parallel::detectCores()),
+                         help="Number of jobs to run in parallel for alignment. This should be autodetected by default."),
+             make_option(c("-y", "--yield-size"), type="integer",
+                         default=100000,
+                         metavar="100000",
+                         help="Number of reads to process at a time. Setting this higher will read more data into memory at once and result in faster runtime. Setting this lower will require less memory."),
+             make_option(c("-m", "--match-bonus"), type="double",
+                         default=default.align.opts$match,
+                         metavar=as.character(default.align.opts$match),
+                         help="Score bonus for a matching nucleotide"),
+             make_option(c("-p", "--mismatch-penalty"), type="double",
+                         default=default.align.opts$mismatch,
+                         metavar=as.character(default.align.opts$mismatch),
+                         help="Score penalty for a mismatched nucleotide (specify as a positive number)"),
+             make_option(c("-g", "--gap-open-penalty"), type="double",
+                         default=default.align.opts$gapOpening,
+                         metavar=as.character(default.align.opts$gapOpening),
+                         help="Score penalty for opening a gap in the alignment (specifiy as a positive number)"),
+             ## The default must be the gap extension penalty (it used
+             ## to be the match bonus, contradicting the metavar).
+             make_option(c("-e", "--gap-extension-penalty"), type="double",
+                         default=default.align.opts$gapExtension,
+                         metavar=as.character(default.align.opts$gapExtension),
+                         help="Score penalty for extending an alignment gap by two nucleotides (specify as a positive number)"),
+             make_option(c("-s", "--single-read-mode"), action="store_true", default=FALSE,
+                         help="Tell DeLoxer to run in single-end mode instead of paired-end mode. In this mode, the only a single input fastq file is provided, and only a single output file is created. No classification is performed, only trimming.  When you use this option, skip the \"READ2.fastq\" argument, and specify the full file name for OUTPUT_NAME instead of just the base name."))
+    option_parser <- OptionParser(option_list=option_list,
+                                  usage="%prog [options] adapter.fasta READ1.fastq READ2.fastq OUTPUT_NAME")
+    opt <- parse_args(option_parser, positional_arguments=TRUE)
+    return(opt)
+}
+
+## Call this here to handle --help quickly, before we waste 10 seconds
+## loading all the libraries
+invisible(parse_arguments())
+
+## Report the parsed options (except "help") and the positional
+## arguments via message().
+print.option.list <- function(opt=parse_arguments()) {
+    positional <- opt$args
+    opts <- opt$options
+    message("Options:")
+    for (i in seq_along(opts)) {
+        n <- names(opts)[[i]]
+        if (n != "help") {
+            message(" ", n, ": ", opts[[i]])
+        }
+    }
+    quoted.args <- paste0("\"", positional, "\"", collapse=", ")
+    message("Args: ", quoted.args)
+}
+
+## Placeholder that aborts with an "UNIMPLEMENTED" error when called
+unimplemented <- function() stop("UNIMPLEMENTED")
+
+## Emit a message prefixed with "# " and the current date/time
+tsmsg <- function(...) {
+    stamp <- paste0("# ", date(), ": ")
+    message(stamp, ...)
+}
+
+tsmsg("Starting deloxer and loading required packages")
+
+suppressMessages({
+    library(ShortRead)
+    library(optparse)
+    library(foreach)
+    library(iterators)
+    library(itertools)
+    library(doMC)
+    registerDoMC()
+    mcoptions <- list(preschedule=TRUE, set.seed=FALSE)
+})
+
+## Fill l1 with those named elements of l2 whose names l1 lacks;
+## elements already present in l1 win.
+merge.lists <- function(l1, l2) {
+    only.in.l2 <- setdiff(names(l2), names(l1))
+    return(replace(l1, only.in.l2, l2[only.in.l2]))
+}
+
+## Return a copy of x with its names removed
+strip.names <- function(x) {
+    nameless <- x
+    names(nameless) <- NULL
+    return(nameless)
+}
+
+## Define some missing type coercions
+## (each setAs registers the conversion used by as(from, to) below)
+setAs(from="ShortRead", to="DNAStringSet", def=function(from) sread(from))
+setAs(from="PhredQuality", to="FastqQuality", def=function(from) FastqQuality(BStringSet(from)))
+setAs(from="SolexaQuality", to="SFastqQuality", def=function(from) SFastqQuality(BStringSet(from)))
+setAs(from="QualityScaledXStringSet", to="ShortReadQ", def=function(from) {
+    q <- quality(from)
+    ## Map the quality class to its ShortRead counterpart; any other
+    ## quality class is an error.
+    new.quality.class <- switch(class(q),
+                                SolexaQuality="SFastqQuality",
+                                PhredQuality="FastqQuality",
+                                stop("Unknown quality type: ", class(q)))
+    q <- as(q, new.quality.class)
+    ## The sequence names become the read ids
+    ShortReadQ(sread=as(from, "DNAStringSet"),
+               quality=q,
+               id=BStringSet(names(from)))
+})
+## Override the provided method to keep the sequence names
+setAs(from="ShortReadQ", to="QualityScaledDNAStringSet",
+      def=function (from, to = "QualityScaledDNAStringSet", strict = TRUE) {
+          q <- quality(from)
+          ## Inverse of the mapping above; unrecognised quality classes
+          ## fall back to "XStringQuality" instead of failing.
+          new.quality.class <- switch(class(q),
+                                      SFastqQuality="SolexaQuality",
+                                      FastqQuality="PhredQuality",
+                                      "XStringQuality")
+          q <- as(q, new.quality.class)
+          x <- QualityScaledDNAStringSet(sread(from), q)
+          ## The read ids become the sequence names
+          names(x) <- as.character(id(from))
+          x
+      })
+
+## Define functions for reading fastq into standard Biostrings object
+## and writing it back out. The standard functions readFastq and
+## writeFastq operate on ShortRead objects. These simply wrap them in
+## conversion to/from QualityScaledDNAStringSet.
+##
+## read.QualityScaledDNAStringSet: read filepath (only format "fastq"
+## is handled; anything else is an error) and return it coerced to a
+## QualityScaledDNAStringSet. Extra arguments go to readFastq.
+read.QualityScaledDNAStringSet <- function(filepath, format = "fastq", ...) {
+    switch(format,
+           fastq=as(readFastq(filepath, withIds=TRUE, ...), "QualityScaledDNAStringSet"),
+           ## Default
+           stop("Unknown quality-scaled sequence format: ", format))
+}
+
+## Write x to filepath in the given format (only "fastq" is handled;
+## anything else is an error). With append=FALSE an existing file is
+## removed first; with append=TRUE the reads are appended.
+write.QualityScaledDNAStringSet <- function (x, filepath, append = FALSE, format = "fastq") {
+    if(length(x) > 0) {
+        sr <- as(x, "ShortReadQ")
+        switch(format,
+               fastq={
+                   ## Delete any old file before writing in "w" mode
+                   if (!append)
+                       unlink(filepath);
+                   writeFastq(object=sr,
+                              file=filepath, mode=ifelse(append, "a", "w"))
+               },
+               ## Default
+               stop("Unknown quality-scaled sequence format: ", format))
+    } else {
+        ## Zero-length sequence; just truncate/touch the file
+        ## (done by opening it as a sink and closing it right away)
+        sink(file=filepath, append=append)
+        sink()
+    }
+}
+
+## Keep only the reads whose width is at least min.length
+discard.short.reads <- function(reads, min.length=1) {
+    long.enough <- width(reads) >= min.length
+    reads[long.enough]
+}
+
+## Takes a set of interleaved reads (or anything else) and
+## de-interleaves them.
+##
+## Returns list(read1=<elements 1, 3, 5, ...>, read2=<elements 2, 4,
+## 6, ...>). The input length must be even; an empty input yields two
+## empty halves.
+deinterleave.pairs <- function(reads) {
+    stopifnot(length(reads) %% 2 == 0)
+    ## Use a logical mask rather than seq(from=1, to=length(reads),
+    ## by=2), which raises an error when length(reads) is 0.
+    is.read1 <- seq_along(reads) %% 2 == 1
+    return(list(read1=reads[is.read1], read2=reads[!is.read1]))
+}
+
+## Align every read against subj and against its reverse complement
+## (overlap alignment) and decide which part of each read to keep.
+##
+## Returns an IRanges with one range per read: the band to the left or
+## to the right of the aligned region. Metadata columns: "trim" (what
+## was removed: right/left/all/none) and, when requested,
+## "deleted.range" and "score" (the better of the two alignment
+## scores). Reads whose kept range is shorter than min.length are
+## marked trim == "all", but the returned range itself is not altered.
+## align.opts entries override default.align.opts.
+.delox.trimmed.ranges <- function(subj, reads, min.length=36,
+                                  include.scores=TRUE,
+                                  include.deleted.ranges=TRUE,
+                                  align.opts=list()) {
+
+    align.opts <- merge.lists(align.opts, default.align.opts)
+
+    ## Penalties are given as positive numbers and negated here
+    aln <- list(forward=pairwiseAlignment(pattern=reads,
+                subject=subj,
+                type="overlap",
+                substitutionMatrix=nucleotideSubstitutionMatrix(match = align.opts$match, mismatch = -align.opts$mismatch),
+                gapOpening=-align.opts$gapOpening, gapExtension=-align.opts$gapExtension),
+                revcomp=pairwiseAlignment(pattern=reads,
+                subject=reverseComplement(DNAString(subj)),
+                type="overlap",
+                substitutionMatrix=nucleotideSubstitutionMatrix(match = align.opts$match, mismatch = -align.opts$mismatch),
+                gapOpening=-align.opts$gapOpening, gapExtension=-align.opts$gapExtension))
+
+    aln.scores <- Map(score, aln)
+    aln.pat <- Map(pattern, aln)
+    aln.ranges <- Map(function(x) IRanges(start=start(x), end=end(x)), aln.pat)
+    ## Split each full read range into left / middle (aligned) / right
+    aln.threebands <- Map(function (x) threebands(IRanges(start=1, end=width(reads)),
+                                                  start=start(x), end=end(x)),
+                          aln.ranges)
+
+    ## For each read, decide whether the forward or reverse alignment
+    ## was better.
+    revcomp.better <- aln.scores$forward < aln.scores$revcomp
+
+    ## For each read, take the threebands for the better alignment.
+    best.threebands <- aln.threebands$forward
+    for (band in names(best.threebands)) {
+        best.threebands[[band]][revcomp.better] <- aln.threebands$revcomp[[band]][revcomp.better]
+    }
+
+    ## Use the left band if it is longer than either min.length or
+    ## length of right band.
+    use.right.band <- width(best.threebands$left) < pmin(min.length, width(best.threebands$right))
+    ranges <- best.threebands$left
+    ranges[use.right.band] <- best.threebands$right[use.right.band]
+
+    ## Record which ranges are shorter than min.length
+    too.short <- width(ranges) < min.length
+    ## ranges[too.short] <- IRanges(start=1,end=0)
+
+    ## Record what was trimmed off of each read (NOT what was kept!)
+    trim <- factor(ifelse(use.right.band, "left", "right"), levels=c("right", "left", "all", "none"))
+    ## If it's too short, then we trim "all", i.e. discard the whole
+    ## read.
+    trim[too.short] <- "all"
+    ## If the read is not shorter after trimming, then nothing was
+    ## actually trimmed.
+    trim[width(ranges) == width(reads)] <- "none"
+
+    ## Per-read metadata columns, collected by name
+    emeta <- list()
+
+    emeta$trim <- trim
+
+    if (include.deleted.ranges) {
+        ## The deleted range is the whole read when too short;
+        ## otherwise the middle band plus the band that was not kept.
+        deleted.start <- ifelse(too.short, 1,
+                                ifelse(use.right.band,
+                                       start(best.threebands$left),
+                                       start(best.threebands$middle)))
+        deleted.end <- ifelse(too.short, width(reads),
+                              ifelse(use.right.band,
+                                     end(best.threebands$middle),
+                                     end(best.threebands$right)))
+        emeta$deleted.range <- IRanges(deleted.start, deleted.end)
+    }
+
+    if (include.scores) {
+        ## If requested, take the best score out of each pair of forward
+        ## and reverse scores.
+        scores <- ifelse(revcomp.better, aln.scores$revcomp, aln.scores$forward)
+        emeta$score <- scores
+    }
+
+    mcols(ranges) <- DataFrame(emeta)
+
+    return(ranges)
+}
+
+## Always call delox on the underlying DNAStringSet object when called
+## on something more complicated.
+## NOTE(review): .delox.trimmed.ranges is defined above as an ordinary
+## function; setMethod presumably promotes it to an S4 generic here,
+## which would explain the suppressMessages wrapper -- confirm.
+suppressMessages({
+    invisible(setMethod(".delox.trimmed.ranges", signature=c(reads="ShortRead"),
+                        function (subj, reads, min.length, include.scores, include.deleted.ranges, align.opts) {
+                            callGeneric(subj, as(reads, "DNAStringSet"), min.length, include.scores, include.deleted.ranges, align.opts)
+                        }))
+    invisible(setMethod(".delox.trimmed.ranges", signature=c(reads="QualityScaledDNAStringSet"),
+                        function (subj, reads, min.length, include.scores, include.deleted.ranges, align.opts) {
+                            callGeneric(subj, as(reads, "DNAStringSet"), min.length, include.scores, include.deleted.ranges, align.opts)
+                        }))
+    ## Unlike the two methods above, this one coerces to XStringSet
+    invisible(setMethod(".delox.trimmed.ranges", signature=c(reads="QualityScaledXStringSet"),
+                        function (subj, reads, min.length, include.scores, include.deleted.ranges, align.opts) {
+                            callGeneric(subj, as(reads, "XStringSet"), min.length, include.scores, include.deleted.ranges, align.opts)
+                        }))
+})
+
+## Trim subj out of a single set of reads.
+##
+## The reads are aligned in parallel chunks, narrowed to the range
+## kept by .delox.trimmed.ranges, given their original names back and,
+## if include.scores is set, annotated with a "score" metadata column.
+## Reads shorter than min.length after trimming are dropped from the
+## result.
+delox.single <- function(subj, reads , min.length=36,
+                         include.scores=TRUE, align.opts=list()) {
+    tsmsg("Saving read names")
+    saved.names <- BStringSet(names(reads))
+    reads <- strip.names(reads)
+    invisible(gc())
+
+    tsmsg("Doing alignments")
+    ## One chunk per worker, but no more than one chunk per 1000 reads
+    nchunks <- min(getDoParWorkers(), ceiling(length(reads)/1000))
+    deloxed.ranges <- foreach(reads=isplitVector(reads, chunks=nchunks), .combine=c) %dopar% {
+        .delox.trimmed.ranges(reads=reads, subj=subj, min.length=min.length,
+                              include.scores=include.scores,
+                              include.deleted.ranges=FALSE,
+                              align.opts=align.opts)
+    }
+    ## maybe.chunkapply(.delox.trimmed.ranges,
+    ##                  VECTOR.ARGS=list(reads=reads),
+    ##                  SCALAR.ARGS=list(subj=subj, min.length=min.length,
+    ##                    include.scores=include.scores,
+    ##                    include.deleted.ranges=FALSE,
+    ##                    align.opts=align.opts),
+    ##                  min.chunk.size=1000,
+    ##                  MERGE=c)
+
+    tsmsg("Trimming reads")
+    trimmed.reads <- narrow(reads, start(deloxed.ranges), end(deloxed.ranges))
+
+    tsmsg("Restoring read names")
+    names(trimmed.reads) <- as.character(saved.names)
+
+    tsmsg("Adding metadata")
+    emeta <- list()
+    if (include.scores) {
+        emeta$score <- mcols(deloxed.ranges)$score
+    }
+    ## Only attach metadata when there is at least one column
+    if (length(emeta) > 0) {
+        mcols(trimmed.reads) <- DataFrame(emeta)
+    }
+
+    return(discard.short.reads(trimmed.reads, min.length))
+}
+
+## Trim subj out of a pair of read sets and classify each pair.
+##
+## read1 and read2 must have the same length. Returns
+## list(read1=..., read2=...) holding the trimmed reads with their
+## original names and a "category" metadata column (mate, non-mate,
+## negative, unpaired or discard), plus "score" when include.scores is
+## set. A pair is called when its best alignment score reaches
+## min.call * align.opts$match. No reads are removed here; reads that
+## are too short are only marked "discard".
+delox.paired <- function(subj, read1, read2,
+                         min.call=10, min.length=36,
+                         include.scores=TRUE, align.opts=list()) {
+    align.opts <- merge.lists(align.opts, default.align.opts)
+
+    tsmsg("Checking read counts")
+    stopifnot(length(read1) == length(read2))
+
+    tsmsg("Listing reads")
+    original.reads <- list(read1=read1,
+                           read2=read2)
+    rm(read1, read2)
+
+    tsmsg("Saving read names")
+    read.names <- foreach(r=original.reads) %do% BStringSet(names(r))
+    names(read.names) <- names(original.reads)
+    original.reads <- Map(strip.names, original.reads)
+    invisible(gc())
+
+    tsmsg("Doing alignments")
+    deloxed.ranges <- lapply(original.reads, function(x) {
+        ## One chunk per worker, but no more than one per 1000 reads
+        nchunks <- min(getDoParWorkers(), ceiling(length(x)/1000))
+        foreach(reads=isplitVector(x, chunks=nchunks), .combine=c) %dopar% {
+            ## Scores are always computed here because the category
+            ## calls below depend on them; the include.scores argument
+            ## only controls whether they appear in the result.
+            .delox.trimmed.ranges(reads=reads, subj=subj, min.length=min.length,
+                                  include.scores=TRUE,
+                                  include.deleted.ranges=FALSE,
+                                  align.opts=align.opts)
+        }
+    })
+
+    tsmsg("Extracting metadata")
+    delox.meta <- lapply(deloxed.ranges, mcols)
+
+    ## Decide whether enough was trimmed on the inside (right end) of
+    ## either read to call it a mate-pair.
+    tsmsg("Calculating inside trim score")
+    inside.trim.score <- Reduce(pmax,
+                                lapply(delox.meta,
+                                       function(x) ifelse(x$trim == "right", x$score, 0)))
+
+    ## Decide whether enough was trimmed on the outside (left end) of
+    ## either read to call it a non-mate-pair.
+    tsmsg("Calculating outside trim score")
+    outside.trim.score <- Reduce(pmax,
+                                 lapply(delox.meta,
+                                        function(x) ifelse(x$trim == "left", x$score, 0)))
+
+    tsmsg("Calling presence of subject")
+    calls <- list(inside=inside.trim.score >= min.call * align.opts$match,
+                  outside=outside.trim.score >= min.call * align.opts$match)
+
+    tsmsg("Categorizing reads")
+    category <- factor(rep(NA, length(original.reads$read1)), levels=c("mate", "non-mate", "negative", "unpaired", "discard"))
+    category[calls$inside] <- "mate"
+    category[calls$outside] <- "non-mate"
+    ## If they're either both true or both false, then it's ambiguous
+    category[calls$inside == calls$outside] <- "negative"
+    ## All categories should be filled in now
+    stopifnot(all(!is.na(category)))
+
+    too.short <- lapply(deloxed.ranges, function(x) width(x) < min.length)
+    ## If either read in a pair is too short, then its partner is no
+    ## longer paired at all.
+    one.too.short <- Reduce(`|`, too.short)
+    category[one.too.short] <- "unpaired"
+    ## If both reads in a pair are too short, then the entire pair is
+    ## discarded. This is highly unlikely, since Cre-Lox should not
+    ## appear in the middle of both sequences.
+    both.too.short <- Reduce(`&`, too.short)
+    category[both.too.short] <- "discard"
+
+    tsmsg("Trimming reads and restoring read names")
+    trimmed.reads <- lapply(names(original.reads), function(x) {
+        trimmed <- narrow(original.reads[[x]],
+                          start=start(deloxed.ranges[[x]]),
+                          end=end(deloxed.ranges[[x]]))
+        names(trimmed) <- as.character(read.names[[x]])
+        trimmed
+    })
+    names(trimmed.reads) <- names(original.reads)
+
+    tsmsg("Assembling metadata")
+    foreach (r=names(trimmed.reads)) %do% {
+        emeta <- list()
+        emeta$category <- category
+        ## The read that is itself too short is discarded, while its
+        ## partner keeps the pair-level category ("unpaired").
+        emeta$category[too.short[[r]]] <- "discard"
+        if (include.scores) {
+            emeta$score <- delox.meta[[r]]$score
+        }
+        mcols(trimmed.reads[[r]]) <- DataFrame(emeta)
+    }
+
+    return(trimmed.reads)
+}
+
+## Dispatch to single-read or paired-end deloxing.
+##
+## Without read2, read1 is either split into mates (interleaved=TRUE)
+## or handed to delox.single. For paired data, reads oriented "out"
+## are reverse-complemented before delox.paired runs and flipped back
+## afterwards.
+delox <- function(subj, read1, read2=NULL,
+                  min.call=10, min.length=36,
+                  interleaved=FALSE,
+                  read1.orientation=c("in", "out")[1],
+                  read2.orientation=c("in", "out")[1],
+                  align.opts=list()) {
+    if (is.null(read2) && !interleaved) {
+        tsmsg("Doing single-read delox")
+        return(delox.single(subj=subj, reads=read1, min.length=min.length, align.opts=align.opts))
+    }
+    if (is.null(read2)) {
+        ## Interleaved input: separate the two mates
+        halves <- deinterleave.pairs(read1)
+        read1 <- halves$read1
+        read2 <- halves$read2
+    }
+
+    ## delox.paired expects both reads oriented "in"
+    tsmsg("Ensuring correct read orientation")
+    flip.read1 <- tolower(read1.orientation) == "out"
+    if (flip.read1) {
+        read1 <- reverseComplement(read1)
+    }
+    flip.read2 <- !is.null(read2) && tolower(read2.orientation) == "out"
+    if (flip.read2) {
+        read2 <- reverseComplement(read2)
+    }
+
+    tsmsg("Doing paired-end delox")
+    deloxed.reads <- delox.paired(subj, read1, read2,
+                                  min.call=min.call, min.length=min.length,
+                                  align.opts=align.opts)
+
+    ## Reads that came in as "out" are returned as "out"
+    tsmsg("Restoring original read orientation")
+    if (flip.read1) {
+        deloxed.reads$read1 <- reverseComplement(deloxed.reads$read1)
+    }
+    if (flip.read2) {
+        deloxed.reads$read2 <- reverseComplement(deloxed.reads$read2)
+    }
+
+    return(deloxed.reads)
+}
+
+## ## Hack to work around a bug in BioConductor that prevents subsetting
+## ## of named XStringSet objects. Apparently, since DeLoxer was first
+## ## published, the BioConductor devs broke the XStringSet subsetting
+## ## code so that it can no longer handle XStringSets with names. The
+## ## code below strips the names from the XStringSet, then calls the old
+## ## code to subset the nameless object while subsetting the names
+## ## separately, then finally puts the names back on and returns the
+## ## result.
+## old.XStringSet.subset.method <- selectMethod("[", "XStringSet")
+## invisible(setMethod("[", signature="XStringSet", definition=function(x, i, j, ..., drop=TRUE) {
+## ##     ## Save the names into a separate variable
+##     xnames <- names(x)
+##     ## Do the old behavior, which works on unnamed objects
+##     x <- old.XStringSet.subset.method(unname(x), i, j, ..., drop=drop)
+##     ## Put the names back on and return
+##     setNames(x, xnames[i])
+## }))
+
+## Write the deloxed reads of a pair to one FASTQ file per category
+## and per read.
+##
+## Files are named "<output.base>_read<1|2>.<extension>.fastq", where
+## the extension for each of the categories mate, non-mate, negative
+## and unpaired is given by the *.ext arguments. Reads in any other
+## category (e.g. "discard") are not written. Returns TRUE.
+save.deloxed.pairs.as.fastq <- function(read1, read2, output.base,
+                                        mate.ext="matepaired",
+                                        nonmate.ext="paired",
+                                        negative.ext="negative",
+                                        unpaired.ext="unpaired",
+                                        append=FALSE) {
+
+    extension <- c(mate=mate.ext,
+                   `non-mate`=nonmate.ext,
+                   negative=negative.ext,
+                   unpaired=unpaired.ext)
+
+    ## ## Make sure that read1 and read2 are a match for each other
+    ## stopifnot(identical(as.character(mcols(read1)$category),
+    ##                     as.character(mcols(read2)$category)))
+
+    ## ## Discard the shorter read on "unpaired"
+    ## read1.shorter <- width(read1) < width(read2)
+    ## mcols(read1)$category[mcols(read1)$category == "unpaired" & read1.shorter] <- NA
+    ## mcols(read2)$category[mcols(read2)$category == "unpaired" & !read1.shorter] <- NA
+
+    filename.template <- "%s_read%s.%s.fastq"
+
+    ## The loop variable is not called "cat" so that base::cat is not
+    ## masked inside this function.
+    for (category in names(extension)) {
+        read1.for.category <- read1[mcols(read1)$category == category]
+        read1.file.for.category <- sprintf(filename.template, output.base, 1, extension[[category]])
+        tsmsg("Writing ", read1.file.for.category)
+        ## Pass the path under its real argument name (filepath)
+        ## instead of relying on partial matching of "file".
+        write.QualityScaledDNAStringSet(read1.for.category,
+                                        filepath=read1.file.for.category,
+                                        append=append)
+
+        read2.for.category <- read2[mcols(read2)$category == category]
+        read2.file.for.category <- sprintf(filename.template, output.base, 2, extension[[category]])
+        tsmsg("Writing ", read2.file.for.category)
+        write.QualityScaledDNAStringSet(read2.for.category,
+                                        filepath=read2.file.for.category,
+                                        append=append)
+    }
+
+    return(TRUE)
+}
+
+## Tally the read categories for one chunk of deloxed pairs.
+##
+## deloxed.pairs must have elements `read1` and `read2`, each carrying
+## a `category` metadata column. The result has six entries: "mate",
+## "non-mate" and "negative" (counted from read1's categories),
+## "r1.single" and "r2.single" (number of reads categorised "unpaired"
+## in read1 and read2 respectively) and "discard" (number of read1
+## entries not accounted for by any of the other five counts).
+##
+## NOTE(review): if a category is missing from table(r1cat), or the
+## category column contains NA, the corresponding counts become NA --
+## confirm that delox() always supplies a factor with all levels and
+## no missing values.
+get.category.counts <- function(deloxed.pairs) {
+    r1cat <- mcols(deloxed.pairs$read1)$category
+    r2cat <- mcols(deloxed.pairs$read2)$category
+    ## Select the three paired categories by name, in a fixed order
+    x <- table(r1cat)[c("mate", "non-mate", "negative")]
+    x["r1.single"] <- sum(r1cat == "unpaired")
+    x["r2.single"] <- sum(r2cat == "unpaired")
+    ## Whatever is left over after the five counts above
+    x["discard"] <- length(r1cat) - sum(x)
+    x
+}
+
+## Start evaluating `expr` in a forked process, as mcparallel does, but
+## with messages emitted by the expression suppressed. Extra arguments
+## are forwarded to mcparallel, whose return value is returned as is.
+## `expr` is passed on unevaluated, so it is only run in the child.
+mcparallel.quiet <- function(expr, ...)
+    parallel:::mcparallel(expr = suppressMessages(expr), ...)
+
+## Print a two-row table of category statistics: the raw counts and
+## each count's share of the total, formatted as a percentage with
+## three significant digits. Columns are labelled with the names of
+## category.counts.
+print.stats <- function(category.counts) {
+    fractions <- category.counts / sum(category.counts)
+    pct.labels <- sprintf("%.3g%%", fractions * 100)
+    names(pct.labels) <- names(category.counts)
+    stats <- rbind(Counts=category.counts, Fractions=pct.labels)
+    dimnames(stats) <- setNames(dimnames(stats), c("Stat", "Category"))
+    print(stats, quote=FALSE, justify="right")
+}
+
+## Script entry point.
+##
+## Parses and validates the command line, reads the subject sequence
+## from the first record of the FASTA file, then streams the input
+## FASTQ file(s) in chunks of --yield-size reads. Each chunk is
+## deloxed in a forked process while the next chunk is being read, so
+## at most one chunk job is outstanding at any time.
+##
+## Three modes are supported:
+##   * paired (default): SUBJECT READ1 READ2 OUTPUT_BASENAME
+##   * interleaved:      SUBJECT READS OUTPUT_BASENAME
+##   * single-read:      SUBJECT READS OUTPUT_FILE
+## The two paired modes also print running category statistics.
+##
+## Stops with an error on invalid options, on a wrong number of
+## positional arguments, or when a chunk job fails; in the latter case
+## the original condition from the subprocess is re-signalled.
+main <- function() {
+    opt <- parse_arguments()
+    print.option.list(opt)
+    args <- opt$args
+    opts <- opt$options
+
+    if (!(tolower(opts[["read1-orientation"]]) %in% c("in", "out") &&
+          tolower(opts[["read2-orientation"]]) %in% c("in", "out") )) {
+        stop("Valid orientations are \"in\" and \"out\"")
+    }
+
+    ## NOTE(review): in the rendered copy of this script the option
+    ## parser appears to default --gap-extension-penalty to
+    ## default.align.opts$match rather than
+    ## default.align.opts$gapExtension -- confirm in parse_arguments.
+    align.opts <- list(match = opts[["match-bonus"]],
+                       mismatch = opts[["mismatch-penalty"]],
+                       gapOpening = opts[["gap-open-penalty"]],
+                       gapExtension = opts[["gap-extension-penalty"]])
+
+    ## NOTE(review): --min-length is validated here but is not
+    ## forwarded to delox() anywhere in this function -- confirm that
+    ## it is applied elsewhere.
+    stopifnot(opts$`min-call` >= 1 &&
+              opts$`min-length` >= 0 &&
+              opts$`jobs` >= 0)
+
+    ## Set jobs if requested
+    if (opts$jobs > 0) {
+        options(cores=opts$jobs)
+    }
+    tsmsg("Using ", getDoParWorkers(), " cores.")
+
+    paired <- !opts[["single-read-mode"]]
+    interleaved <- opts[["interleaved"]]
+
+    if (!paired && interleaved) {
+        stop("ERROR: You cannot specify both --interleaved and --single-read-mode")
+    } else if (!paired) {
+        if (length(args) != 3) {
+            stop("DeLoxer in single-read mode requires exactly 3 arguments")
+        }
+        subject.file <- args[[1]]
+        read1.file <- args[[2]]
+        read2.file <- NULL
+        output.file <- args[[3]]
+    } else if (interleaved) {
+        if (length(args) != 3) {
+            stop("DeLoxer interleaved input mode requires exactly 3 arguments")
+        }
+        subject.file <- args[[1]]
+        read1.file <- args[[2]]
+        read2.file <- NULL
+        output.basename <- args[[3]]
+    } else {
+        if (length(args) != 4) {
+            stop("DeLoxer requires exactly 4 arguments")
+        }
+        subject.file <- args[[1]]
+        read1.file <- args[[2]]
+        read2.file <- args[[3]]
+        output.basename <- args[[4]]
+    }
+
+    ## Only the first record of the FASTA file is used as the subject
+    subj <- readDNAStringSet(subject.file, format="fasta", nrec=1)[[1]]
+
+    ## Wait for a forked chunk job and return its value. If the job
+    ## failed, re-signal the condition raised in the subprocess.
+    collect.result <- function(proc) {
+        result <- mccollect(proc)[[1]]
+        if (is(result, "try-error")) {
+            tsmsg("Encountered error in deloxing subprocess:")
+            stop(attr(result, "condition"))
+        }
+        result
+    }
+
+    ## Add one chunk's category counts to the running total.
+    ## process.chunk returns TRUE instead of counts for an empty chunk,
+    ## and that placeholder must not be mistaken for a count.
+    add.counts <- function(total, result) {
+        if (isTRUE(result))
+            return(total)
+        if (is.null(total)) result else total + result
+    }
+
+    yieldSize <- opts[["yield-size"]]
+    if (paired) {
+        tsmsg("Deloxing and classifying paired sequences")
+        read1.stream <- FastqStreamer(read1.file, n=yieldSize)
+        read2.stream <- if (!interleaved) FastqStreamer(read2.file, n=yieldSize)
+        ## Delox one chunk, write the categorised reads to disk and
+        ## return the chunk's category counts (TRUE for an empty chunk)
+        process.chunk <- function(fq1, fq2, append) {
+            if (length(fq1) < 1)
+                return(TRUE)
+            if (interleaved) {
+                stopifnot(is.null(fq2))
+                deint <- deinterleave.pairs(fq1)
+                fq1 <- deint[[1]]
+                fq2 <- deint[[2]]
+            } else {
+                if (length(fq1) != length(fq2))
+                    stop("Both input files must have equal numbers of reads")
+            }
+            read1 <- as(fq1, "QualityScaledDNAStringSet")
+            read2 <- as(fq2, "QualityScaledDNAStringSet")
+            deloxed.pairs <-
+                delox(subj, read1, read2,
+                      min.call=opts[["min-call"]],
+                      interleaved=interleaved,
+                      read1.orientation=opts[["read1-orientation"]],
+                      read2.orientation=opts[["read2-orientation"]],
+                      align.opts=align.opts)
+            save.deloxed.pairs.as.fastq(deloxed.pairs$read1, deloxed.pairs$read2, output.basename, append=append)
+
+            get.category.counts(deloxed.pairs)
+        }
+        ## First chunk is processed with append=FALSE to start the files
+        fq1 <- yield(read1.stream)
+        fq2 <- if (!interleaved) yield(read2.stream)
+        if (length(fq1) == 0)
+            warning("No reads were read from the input file.")
+        proc <- mcparallel.quiet(process.chunk(fq1, fq2, append=FALSE))
+        ## An interleaved chunk holds two reads per pair
+        reads.processed <- length(fq1) / ifelse(interleaved, 2, 1)
+        category.counts <- NULL
+        while (length(fq1 <- yield(read1.stream))) {
+            if (!interleaved)
+                fq2 <- yield(read2.stream)
+            ## Collect the previous chunk before starting the next one
+            prev.result <- collect.result(proc)
+            category.counts <- add.counts(category.counts, prev.result)
+            tsmsg("Category stats after processing ", reads.processed, " reads:")
+            print.stats(category.counts)
+            proc <- mcparallel.quiet(process.chunk(fq1, fq2, append=TRUE))
+            reads.processed <- reads.processed + length(fq1) / ifelse(interleaved, 2, 1)
+        }
+        close(read1.stream)
+        if (!interleaved) close(read2.stream)
+        ## Check the last chunk for errors before adding its counts
+        prev.result <- collect.result(proc)
+        category.counts <- add.counts(category.counts, prev.result)
+        tsmsg("Final category stats after processing ", reads.processed, " reads:")
+        ## There are no counts to print when the input was empty
+        if (!is.null(category.counts))
+            print.stats(category.counts)
+    } else {
+        tsmsg("Deloxing single sequences")
+        read1.stream <- FastqStreamer(read1.file, n=yieldSize)
+        ## Delox one chunk and write the result to the output file
+        process.chunk <- function(fq, append) {
+            if (length(fq) < 1)
+                return(TRUE)
+            reads <- as(fq, "QualityScaledDNAStringSet")
+            deloxed.reads <-
+                delox(subj, reads, NULL,
+                      min.call=opts[["min-call"]],
+                      interleaved=interleaved,
+                      read1.orientation=opts[["read1-orientation"]],
+                      read2.orientation=opts[["read2-orientation"]],
+                      align.opts=align.opts)
+            write.QualityScaledDNAStringSet(deloxed.reads, output.file, append=append)
+            return(TRUE)
+        }
+        ## First chunk is processed with append=FALSE to start the file
+        fq <- yield(read1.stream)
+        if (length(fq) == 0)
+            warning("No reads were read from the input file.")
+        ## mcparallel.quiet already suppresses messages in the child
+        proc <- mcparallel.quiet(process.chunk(fq, append=FALSE))
+        reads.processed <- length(fq)
+        while (length(fq <- yield(read1.stream))) {
+            ## Collect the previous chunk before starting the next one
+            collect.result(proc)
+            tsmsg("Processed ", reads.processed, " reads")
+            proc <- mcparallel.quiet(process.chunk(fq, append=TRUE))
+            reads.processed <- reads.processed + length(fq)
+        }
+        close(read1.stream)
+        collect.result(proc)
+        tsmsg("Processed ", reads.processed, " reads")
+    }
+    tsmsg("Finished successful run")
+}
+
+## Run the script: everything above only defines functions and setup
+main()

+ 814 - 0
examples/Gaasterland/delox.R.html

@@ -0,0 +1,814 @@
+<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01//EN"
+   "http://www.w3.org/TR/html4/strict.dtd">
+
+<html>
+<head>
+  <title></title>
+  <meta http-equiv="content-type" content="text/html; charset=utf-8">
+  <style type="text/css">
+td.linenos { background-color: #f0f0f0; padding-right: 10px; }
+span.lineno { background-color: #f0f0f0; padding: 0 5px 0 5px; }
+pre { line-height: 125%; }
+body .hll { background-color: #ffffcc }
+body  { background: #f8f8f8; }
+body .c { color: #408080; font-style: italic } /* Comment */
+body .err { border: 1px solid #FF0000 } /* Error */
+body .k { color: #008000; font-weight: bold } /* Keyword */
+body .o { color: #666666 } /* Operator */
+body .ch { color: #408080; font-style: italic } /* Comment.Hashbang */
+body .cm { color: #408080; font-style: italic } /* Comment.Multiline */
+body .cp { color: #BC7A00 } /* Comment.Preproc */
+body .cpf { color: #408080; font-style: italic } /* Comment.PreprocFile */
+body .c1 { color: #408080; font-style: italic } /* Comment.Single */
+body .cs { color: #408080; font-style: italic } /* Comment.Special */
+body .gd { color: #A00000 } /* Generic.Deleted */
+body .ge { font-style: italic } /* Generic.Emph */
+body .gr { color: #FF0000 } /* Generic.Error */
+body .gh { color: #000080; font-weight: bold } /* Generic.Heading */
+body .gi { color: #00A000 } /* Generic.Inserted */
+body .go { color: #888888 } /* Generic.Output */
+body .gp { color: #000080; font-weight: bold } /* Generic.Prompt */
+body .gs { font-weight: bold } /* Generic.Strong */
+body .gu { color: #800080; font-weight: bold } /* Generic.Subheading */
+body .gt { color: #0044DD } /* Generic.Traceback */
+body .kc { color: #008000; font-weight: bold } /* Keyword.Constant */
+body .kd { color: #008000; font-weight: bold } /* Keyword.Declaration */
+body .kn { color: #008000; font-weight: bold } /* Keyword.Namespace */
+body .kp { color: #008000 } /* Keyword.Pseudo */
+body .kr { color: #008000; font-weight: bold } /* Keyword.Reserved */
+body .kt { color: #B00040 } /* Keyword.Type */
+body .m { color: #666666 } /* Literal.Number */
+body .s { color: #BA2121 } /* Literal.String */
+body .na { color: #7D9029 } /* Name.Attribute */
+body .nb { color: #008000 } /* Name.Builtin */
+body .nc { color: #0000FF; font-weight: bold } /* Name.Class */
+body .no { color: #880000 } /* Name.Constant */
+body .nd { color: #AA22FF } /* Name.Decorator */
+body .ni { color: #999999; font-weight: bold } /* Name.Entity */
+body .ne { color: #D2413A; font-weight: bold } /* Name.Exception */
+body .nf { color: #0000FF } /* Name.Function */
+body .nl { color: #A0A000 } /* Name.Label */
+body .nn { color: #0000FF; font-weight: bold } /* Name.Namespace */
+body .nt { color: #008000; font-weight: bold } /* Name.Tag */
+body .nv { color: #19177C } /* Name.Variable */
+body .ow { color: #AA22FF; font-weight: bold } /* Operator.Word */
+body .w { color: #bbbbbb } /* Text.Whitespace */
+body .mb { color: #666666 } /* Literal.Number.Bin */
+body .mf { color: #666666 } /* Literal.Number.Float */
+body .mh { color: #666666 } /* Literal.Number.Hex */
+body .mi { color: #666666 } /* Literal.Number.Integer */
+body .mo { color: #666666 } /* Literal.Number.Oct */
+body .sa { color: #BA2121 } /* Literal.String.Affix */
+body .sb { color: #BA2121 } /* Literal.String.Backtick */
+body .sc { color: #BA2121 } /* Literal.String.Char */
+body .dl { color: #BA2121 } /* Literal.String.Delimiter */
+body .sd { color: #BA2121; font-style: italic } /* Literal.String.Doc */
+body .s2 { color: #BA2121 } /* Literal.String.Double */
+body .se { color: #BB6622; font-weight: bold } /* Literal.String.Escape */
+body .sh { color: #BA2121 } /* Literal.String.Heredoc */
+body .si { color: #BB6688; font-weight: bold } /* Literal.String.Interpol */
+body .sx { color: #008000 } /* Literal.String.Other */
+body .sr { color: #BB6688 } /* Literal.String.Regex */
+body .s1 { color: #BA2121 } /* Literal.String.Single */
+body .ss { color: #19177C } /* Literal.String.Symbol */
+body .bp { color: #008000 } /* Name.Builtin.Pseudo */
+body .fm { color: #0000FF } /* Name.Function.Magic */
+body .vc { color: #19177C } /* Name.Variable.Class */
+body .vg { color: #19177C } /* Name.Variable.Global */
+body .vi { color: #19177C } /* Name.Variable.Instance */
+body .vm { color: #19177C } /* Name.Variable.Magic */
+body .il { color: #666666 } /* Literal.Number.Integer.Long */
+
+  </style>
+</head>
+<body>
+<h2></h2>
+
+<div class="highlight"><pre><span></span><span class="c1">#!/usr/bin/env Rscript</span>
+
+default.align.opts <span class="o">&lt;-</span> <span class="kt">list</span><span class="p">(</span>match<span class="o">=</span><span class="m">1</span><span class="p">,</span> mismatch<span class="o">=</span><span class="m">3</span><span class="p">,</span>
+                           gapOpening<span class="o">=</span><span class="m">5</span><span class="p">,</span> gapExtension<span class="o">=</span><span class="m">2</span><span class="p">)</span>
+
+parse_arguments <span class="o">&lt;-</span> <span class="kr">function</span><span class="p">()</span> <span class="p">{</span>
+    <span class="kp">suppressMessages</span><span class="p">({</span>
+        <span class="kn">library</span><span class="p">(</span>optparse<span class="p">)</span>
+        <span class="kn">library</span><span class="p">(</span>parallel<span class="p">)</span>
+    <span class="p">})</span>
+    option_list <span class="o">&lt;-</span>
+        <span class="kt">list</span><span class="p">(</span>make_option<span class="p">(</span><span class="kt">c</span><span class="p">(</span><span class="s">&quot;-c&quot;</span><span class="p">,</span> <span class="s">&quot;--min-call&quot;</span><span class="p">),</span> type<span class="o">=</span><span class="s">&quot;integer&quot;</span><span class="p">,</span> default<span class="o">=</span><span class="m">10</span><span class="p">,</span> metavar<span class="o">=</span><span class="s">&quot;10&quot;</span><span class="p">,</span>
+                         help<span class="o">=</span><span class="s">&quot;Minimum perfect overlap required to call the presence of the subject (paired only). Imperfect overlap will need to be longer, based on specified mismatch and gap penalties.&quot;</span><span class="p">),</span>
+             make_option<span class="p">(</span><span class="kt">c</span><span class="p">(</span><span class="s">&quot;-l&quot;</span><span class="p">,</span> <span class="s">&quot;--min-length&quot;</span><span class="p">),</span> type<span class="o">=</span><span class="s">&quot;integer&quot;</span><span class="p">,</span> default<span class="o">=</span><span class="m">36</span><span class="p">,</span> metavar<span class="o">=</span><span class="s">&quot;36&quot;</span><span class="p">,</span>
+                         help<span class="o">=</span><span class="s">&quot;Minimum length allowed after trimming a read. Any reads shorter than this after trimming will be discarded.&quot;</span><span class="p">),</span>
+             make_option<span class="p">(</span><span class="kt">c</span><span class="p">(</span><span class="s">&quot;-i&quot;</span><span class="p">,</span> <span class="s">&quot;--interleaved&quot;</span><span class="p">),</span> action<span class="o">=</span><span class="s">&quot;store_true&quot;</span><span class="p">,</span> default<span class="o">=</span><span class="kc">FALSE</span><span class="p">,</span>
+                         help<span class="o">=</span><span class="s">&quot;Specify this option if you have paired-end sequences interleaved in a single FASTQ file. The default is to read paired-end sequences from a matched pair of files, and this option is ignored if two fastq files are provided. When you use this option, skip the \&quot;READ2.fastq\&quot; argument.&quot;</span><span class="p">),</span>
+             make_option<span class="p">(</span><span class="kt">c</span><span class="p">(</span><span class="s">&quot;-o&quot;</span><span class="p">,</span> <span class="s">&quot;--read1-orientation&quot;</span><span class="p">),</span> type<span class="o">=</span><span class="s">&quot;character&quot;</span><span class="p">,</span> default<span class="o">=</span><span class="s">&quot;in&quot;</span><span class="p">,</span> metavar<span class="o">=</span><span class="s">&quot;in/out&quot;</span><span class="p">,</span>
+                         help<span class="o">=</span><span class="s">&quot;Orientation of read1. Can be either \&quot;in\&quot; or \&quot;out\&quot; (paired only). Note that Illumina reads are \&quot;in\&quot;.&quot;</span><span class="p">),</span>
+             make_option<span class="p">(</span><span class="kt">c</span><span class="p">(</span><span class="s">&quot;-q&quot;</span><span class="p">,</span> <span class="s">&quot;--read2-orientation&quot;</span><span class="p">),</span> type<span class="o">=</span><span class="s">&quot;character&quot;</span><span class="p">,</span> default<span class="o">=</span><span class="s">&quot;in&quot;</span><span class="p">,</span> metavar<span class="o">=</span><span class="s">&quot;in/out&quot;</span><span class="p">,</span>
+                         help<span class="o">=</span><span class="s">&quot;Orientation of read2. Can be either \&quot;in\&quot; or \&quot;out\&quot; (paired only)&quot;</span><span class="p">),</span>
+             make_option<span class="p">(</span><span class="kt">c</span><span class="p">(</span><span class="s">&quot;-j&quot;</span><span class="p">,</span> <span class="s">&quot;--jobs&quot;</span><span class="p">),</span> type<span class="o">=</span><span class="s">&quot;integer&quot;</span><span class="p">,</span>
+                         default<span class="o">=</span>parallel<span class="o">:::</span>detectCores<span class="p">(),</span>
+                         metavar<span class="o">=</span><span class="kp">as.character</span><span class="p">(</span>parallel<span class="o">:::</span>detectCores<span class="p">()),</span>
+                         help<span class="o">=</span><span class="s">&quot;Number of jobs to run in parallel for alignment. This should be autodetected by default.&quot;</span><span class="p">),</span>
+             make_option<span class="p">(</span><span class="kt">c</span><span class="p">(</span><span class="s">&quot;-y&quot;</span><span class="p">,</span> <span class="s">&quot;--yield-size&quot;</span><span class="p">),</span> type<span class="o">=</span><span class="s">&quot;integer&quot;</span><span class="p">,</span>
+                         default<span class="o">=</span><span class="m">100000</span><span class="p">,</span>
+                         metavar<span class="o">=</span><span class="s">&quot;100000&quot;</span><span class="p">,</span>
+                         help<span class="o">=</span><span class="s">&quot;Number of reads to process at a time. Setting this higher will read more data into memory at once and result in faster runtime. Setting this lower will require less memory.&quot;</span><span class="p">),</span>
+             make_option<span class="p">(</span><span class="kt">c</span><span class="p">(</span><span class="s">&quot;-m&quot;</span><span class="p">,</span> <span class="s">&quot;--match-bonus&quot;</span><span class="p">),</span> type<span class="o">=</span><span class="s">&quot;double&quot;</span><span class="p">,</span>
+                         default<span class="o">=</span>default.align.opts<span class="o">$</span><span class="kp">match</span><span class="p">,</span>
+                         metavar<span class="o">=</span><span class="kp">as.character</span><span class="p">(</span>default.align.opts<span class="o">$</span><span class="kp">match</span><span class="p">),</span>
+                         help<span class="o">=</span><span class="s">&quot;Score bonus for a matching nucleotide&quot;</span><span class="p">),</span>
+             make_option<span class="p">(</span><span class="kt">c</span><span class="p">(</span><span class="s">&quot;-p&quot;</span><span class="p">,</span> <span class="s">&quot;--mismatch-penalty&quot;</span><span class="p">),</span> type<span class="o">=</span><span class="s">&quot;double&quot;</span><span class="p">,</span>
+                         default<span class="o">=</span>default.align.opts<span class="o">$</span>mismatch<span class="p">,</span>
+                         metavar<span class="o">=</span><span class="kp">as.character</span><span class="p">(</span>default.align.opts<span class="o">$</span>mismatch<span class="p">),</span>
+                         help<span class="o">=</span><span class="s">&quot;Score penalty for a mismatched nucleotide (specify as a positive number)&quot;</span><span class="p">),</span>
+             make_option<span class="p">(</span><span class="kt">c</span><span class="p">(</span><span class="s">&quot;-g&quot;</span><span class="p">,</span> <span class="s">&quot;--gap-open-penalty&quot;</span><span class="p">),</span> type<span class="o">=</span><span class="s">&quot;double&quot;</span><span class="p">,</span>
+                         default<span class="o">=</span>default.align.opts<span class="o">$</span>gapOpening<span class="p">,</span>
+                         metavar<span class="o">=</span><span class="kp">as.character</span><span class="p">(</span>default.align.opts<span class="o">$</span>gapOpening<span class="p">),</span>
+                         help<span class="o">=</span><span class="s">&quot;Score penalty for opening a gap in the alignment (specifiy as a positive number)&quot;</span><span class="p">),</span>
+             make_option<span class="p">(</span><span class="kt">c</span><span class="p">(</span><span class="s">&quot;-e&quot;</span><span class="p">,</span> <span class="s">&quot;--gap-extension-penalty&quot;</span><span class="p">),</span> type<span class="o">=</span><span class="s">&quot;double&quot;</span><span class="p">,</span>
+                         default<span class="o">=</span>default.align.opts<span class="o">$</span><span class="kp">match</span><span class="p">,</span>
+                         metavar<span class="o">=</span><span class="kp">as.character</span><span class="p">(</span>default.align.opts<span class="o">$</span>gapExtension<span class="p">),</span>
+                         help<span class="o">=</span><span class="s">&quot;Score penalty for extending an alignment gap by two nucleotides (specify as a positive number)&quot;</span><span class="p">),</span>
+             make_option<span class="p">(</span><span class="kt">c</span><span class="p">(</span><span class="s">&quot;-s&quot;</span><span class="p">,</span> <span class="s">&quot;--single-read-mode&quot;</span><span class="p">),</span> action<span class="o">=</span><span class="s">&quot;store_true&quot;</span><span class="p">,</span> default<span class="o">=</span><span class="kc">FALSE</span><span class="p">,</span>
+                         help<span class="o">=</span><span class="s">&quot;Tell DeLoxer to run in single-end mode instead of paired-end mode. In this mode, the only a single input fastq file is provided, and only a single output file is created. No classification is performed, only trimming.  When you use this option, skip the \&quot;READ2.fastq\&quot; argument, and specify the full file name for OUTPUT_NAME instead of just the base name.&quot;</span><span class="p">))</span>
+    option_parser <span class="o">&lt;-</span> OptionParser<span class="p">(</span>option_list<span class="o">=</span>option_list<span class="p">,</span>
+                                  usage<span class="o">=</span><span class="s">&quot;%prog [options] adapter.fasta READ1.fastq READ2.fastq OUTPUT_NAME&quot;</span><span class="p">)</span>
+    opt <span class="o">&lt;-</span> parse_args<span class="p">(</span>option_parser<span class="p">,</span> positional_arguments<span class="o">=</span><span class="kc">TRUE</span><span class="p">)</span>
+    <span class="kr">return</span><span class="p">(</span>opt<span class="p">)</span>
+<span class="p">}</span>
+
+<span class="c1">## Call this here to handle --help quickly, before we waste 10 seconds</span>
+<span class="c1">## loading all the libraries</span>
+<span class="kp">invisible</span><span class="p">(</span>parse_arguments<span class="p">())</span>
+
+print.option.list <span class="o">&lt;-</span> <span class="kr">function</span><span class="p">(</span>opt<span class="o">=</span>parse_arguments<span class="p">())</span> <span class="p">{</span>
+    args <span class="o">&lt;-</span> opt<span class="o">$</span><span class="kp">args</span>
+    opts <span class="o">&lt;-</span> opt<span class="o">$</span><span class="kp">options</span>
+    <span class="kp">message</span><span class="p">(</span><span class="s">&quot;Options:&quot;</span><span class="p">)</span>
+    foreach <span class="p">(</span>o<span class="o">=</span>opts<span class="p">,</span> n<span class="o">=</span><span class="kp">names</span><span class="p">(</span>opts<span class="p">))</span> <span class="o">%do%</span> <span class="p">{</span>
+        <span class="kr">if</span> <span class="p">(</span>n <span class="o">!=</span> <span class="s">&quot;help&quot;</span><span class="p">)</span>
+            <span class="kp">message</span><span class="p">(</span><span class="s">&quot; &quot;</span><span class="p">,</span> n<span class="p">,</span> <span class="s">&quot;: &quot;</span><span class="p">,</span> o<span class="p">)</span>
+    <span class="p">}</span>
+    <span class="kp">message</span><span class="p">(</span><span class="s">&quot;Args: &quot;</span><span class="p">,</span> <span class="kp">paste</span><span class="p">(</span><span class="s">&quot;\&quot;&quot;</span><span class="p">,</span> <span class="kp">args</span><span class="p">,</span> <span class="s">&quot;\&quot;&quot;</span><span class="p">,</span> sep<span class="o">=</span><span class="s">&quot;&quot;</span><span class="p">,</span> collapse<span class="o">=</span><span class="s">&quot;, &quot;</span><span class="p">))</span>
+<span class="p">}</span>
+
+unimplemented <span class="o">&lt;-</span> <span class="kr">function</span><span class="p">()</span> <span class="kp">stop</span><span class="p">(</span><span class="s">&quot;UNIMPLEMENTED&quot;</span><span class="p">)</span>
+
+<span class="c1">## Timestampped message</span>
+tsmsg <span class="o">&lt;-</span> <span class="kr">function</span><span class="p">(</span><span class="kc">...</span><span class="p">)</span> <span class="p">{</span>
+    <span class="kp">message</span><span class="p">(</span><span class="s">&quot;# &quot;</span><span class="p">,</span> <span class="kp">date</span><span class="p">(),</span> <span class="s">&quot;: &quot;</span><span class="p">,</span> <span class="kc">...</span><span class="p">)</span>
+<span class="p">}</span>
+
+tsmsg<span class="p">(</span><span class="s">&quot;Starting deloxer and loading required packages&quot;</span><span class="p">)</span>
+
+<span class="kp">suppressMessages</span><span class="p">({</span>
+    <span class="kn">library</span><span class="p">(</span>ShortRead<span class="p">)</span>
+    <span class="kn">library</span><span class="p">(</span>optparse<span class="p">)</span>
+    <span class="kn">library</span><span class="p">(</span>foreach<span class="p">)</span>
+    <span class="kn">library</span><span class="p">(</span>iterators<span class="p">)</span>
+    <span class="kn">library</span><span class="p">(</span>itertools<span class="p">)</span>
+    <span class="kn">library</span><span class="p">(</span>doMC<span class="p">)</span>
+    registerDoMC<span class="p">()</span>
+    mcoptions <span class="o">&lt;-</span> <span class="kt">list</span><span class="p">(</span>preschedule<span class="o">=</span><span class="kc">TRUE</span><span class="p">,</span> set.seed<span class="o">=</span><span class="kc">FALSE</span><span class="p">)</span>
+<span class="p">})</span>
+
+<span class="c1">## Merge l1 and l2 by names</span>
+merge.lists <span class="o">&lt;-</span> <span class="kr">function</span><span class="p">(</span>l1<span class="p">,</span> l2<span class="p">)</span> <span class="p">{</span>
+    new.names <span class="o">&lt;-</span> <span class="kp">setdiff</span><span class="p">(</span><span class="kp">names</span><span class="p">(</span>l2<span class="p">),</span> <span class="kp">names</span><span class="p">(</span>l1<span class="p">))</span>
+    l1<span class="p">[</span>new.names<span class="p">]</span> <span class="o">&lt;-</span> l2<span class="p">[</span>new.names<span class="p">]</span>
+    l1
+<span class="p">}</span>
+
+<span class="c1">## Return an object sans names</span>
+strip.names <span class="o">&lt;-</span> <span class="kr">function</span><span class="p">(</span>x<span class="p">)</span> <span class="p">{</span>
+    <span class="kp">names</span><span class="p">(</span>x<span class="p">)</span> <span class="o">&lt;-</span> <span class="kc">NULL</span>
+    x
+<span class="p">}</span>
+
+<span class="c1">## Define some missing type coercions</span>
+setAs<span class="p">(</span>from<span class="o">=</span><span class="s">&quot;ShortRead&quot;</span><span class="p">,</span> to<span class="o">=</span><span class="s">&quot;DNAStringSet&quot;</span><span class="p">,</span> def<span class="o">=</span><span class="kr">function</span><span class="p">(</span>from<span class="p">)</span> sread<span class="p">(</span>from<span class="p">))</span>
+setAs<span class="p">(</span>from<span class="o">=</span><span class="s">&quot;PhredQuality&quot;</span><span class="p">,</span> to<span class="o">=</span><span class="s">&quot;FastqQuality&quot;</span><span class="p">,</span> def<span class="o">=</span><span class="kr">function</span><span class="p">(</span>from<span class="p">)</span> FastqQuality<span class="p">(</span>BStringSet<span class="p">(</span>from<span class="p">)))</span>
+setAs<span class="p">(</span>from<span class="o">=</span><span class="s">&quot;SolexaQuality&quot;</span><span class="p">,</span> to<span class="o">=</span><span class="s">&quot;SFastqQuality&quot;</span><span class="p">,</span> def<span class="o">=</span><span class="kr">function</span><span class="p">(</span>from<span class="p">)</span> SFastqQuality<span class="p">(</span>BStringSet<span class="p">(</span>from<span class="p">)))</span>
+setAs<span class="p">(</span>from<span class="o">=</span><span class="s">&quot;QualityScaledXStringSet&quot;</span><span class="p">,</span> to<span class="o">=</span><span class="s">&quot;ShortReadQ&quot;</span><span class="p">,</span> def<span class="o">=</span><span class="kr">function</span><span class="p">(</span>from<span class="p">)</span> <span class="p">{</span>
+    q <span class="o">&lt;-</span> quality<span class="p">(</span>from<span class="p">)</span>
+    new.quality.class <span class="o">&lt;-</span> <span class="kr">switch</span><span class="p">(</span><span class="kp">class</span><span class="p">(</span><span class="kp">q</span><span class="p">),</span>
+                                SolexaQuality<span class="o">=</span><span class="s">&quot;SFastqQuality&quot;</span><span class="p">,</span>
+                                PhredQuality<span class="o">=</span><span class="s">&quot;FastqQuality&quot;</span><span class="p">,</span>
+                                <span class="kp">stop</span><span class="p">(</span><span class="s">&quot;Unknown quality type: &quot;</span><span class="p">,</span> <span class="kp">class</span><span class="p">(</span><span class="kp">q</span><span class="p">)))</span>
+    q <span class="o">&lt;-</span> as<span class="p">(</span><span class="kp">q</span><span class="p">,</span> new.quality.class<span class="p">)</span>
+    ShortReadQ<span class="p">(</span>sread<span class="o">=</span>as<span class="p">(</span>from<span class="p">,</span> <span class="s">&quot;DNAStringSet&quot;</span><span class="p">),</span>
+               quality<span class="o">=</span><span class="kp">q</span><span class="p">,</span>
+               id<span class="o">=</span>BStringSet<span class="p">(</span><span class="kp">names</span><span class="p">(</span>from<span class="p">)))</span>
+<span class="p">})</span>
+<span class="c1">## Override the provided method to keep the sequence names</span>
+setAs<span class="p">(</span>from<span class="o">=</span><span class="s">&quot;ShortReadQ&quot;</span><span class="p">,</span> to<span class="o">=</span><span class="s">&quot;QualityScaledDNAStringSet&quot;</span><span class="p">,</span>
+      def<span class="o">=</span><span class="kr">function</span> <span class="p">(</span>from<span class="p">,</span> to <span class="o">=</span> <span class="s">&quot;QualityScaledDNAStringSet&quot;</span><span class="p">,</span> strict <span class="o">=</span> <span class="kc">TRUE</span><span class="p">)</span> <span class="p">{</span>
+          q <span class="o">&lt;-</span> quality<span class="p">(</span>from<span class="p">)</span>
+          new.quality.class <span class="o">&lt;-</span> <span class="kr">switch</span><span class="p">(</span><span class="kp">class</span><span class="p">(</span><span class="kp">q</span><span class="p">),</span>
+                                      SFastqQuality<span class="o">=</span><span class="s">&quot;SolexaQuality&quot;</span><span class="p">,</span>
+                                      FastqQuality<span class="o">=</span><span class="s">&quot;PhredQuality&quot;</span><span class="p">,</span>
+                                      <span class="s">&quot;XStringQuality&quot;</span><span class="p">)</span>
+          q <span class="o">&lt;-</span> as<span class="p">(</span><span class="kp">q</span><span class="p">,</span> new.quality.class<span class="p">)</span>
+          x <span class="o">&lt;-</span> QualityScaledDNAStringSet<span class="p">(</span>sread<span class="p">(</span>from<span class="p">),</span> <span class="kp">q</span><span class="p">)</span>
+          <span class="kp">names</span><span class="p">(</span>x<span class="p">)</span> <span class="o">&lt;-</span> <span class="kp">as.character</span><span class="p">(</span>id<span class="p">(</span>from<span class="p">))</span>
+          x
+      <span class="p">})</span>
+
+<span class="c1">## Define functions for reading fastq into standard Biostrings object</span>
+<span class="c1">## and writing it back out. The standard functions readFastq and</span>
+<span class="c1">## writeFastq operate on ShortRead objects. These simply wrap them in</span>
+<span class="c1">## conversion to/from QualityScaledDNAStringSet.</span>
+read.QualityScaledDNAStringSet <span class="o">&lt;-</span> <span class="kr">function</span><span class="p">(</span>filepath<span class="p">,</span> format <span class="o">=</span> <span class="s">&quot;fastq&quot;</span><span class="p">,</span> <span class="kc">...</span><span class="p">)</span> <span class="p">{</span>
+    <span class="kr">switch</span><span class="p">(</span><span class="kp">format</span><span class="p">,</span>
+           fastq<span class="o">=</span>as<span class="p">(</span>readFastq<span class="p">(</span>filepath<span class="p">,</span> withIds<span class="o">=</span><span class="kc">TRUE</span><span class="p">,</span> <span class="kc">...</span><span class="p">),</span> <span class="s">&quot;QualityScaledDNAStringSet&quot;</span><span class="p">),</span>
+           <span class="c1">## Default</span>
+           <span class="kp">stop</span><span class="p">(</span><span class="s">&quot;Unknown quality-scaled sequence format: &quot;</span><span class="p">,</span> <span class="kp">format</span><span class="p">))</span>
+<span class="p">}</span>
+
+write.QualityScaledDNAStringSet <span class="o">&lt;-</span> <span class="kr">function</span> <span class="p">(</span>x<span class="p">,</span> filepath<span class="p">,</span> append <span class="o">=</span> <span class="kc">FALSE</span><span class="p">,</span> format <span class="o">=</span> <span class="s">&quot;fastq&quot;</span><span class="p">)</span> <span class="p">{</span>
+    <span class="kr">if</span><span class="p">(</span><span class="kp">length</span><span class="p">(</span>x<span class="p">)</span> <span class="o">&gt;</span> <span class="m">0</span><span class="p">)</span> <span class="p">{</span>
+        sr <span class="o">&lt;-</span> as<span class="p">(</span>x<span class="p">,</span> <span class="s">&quot;ShortReadQ&quot;</span><span class="p">)</span>
+        <span class="kr">switch</span><span class="p">(</span><span class="kp">format</span><span class="p">,</span>
+               fastq<span class="o">=</span><span class="p">{</span>
+                   <span class="kr">if</span> <span class="p">(</span><span class="o">!</span><span class="kp">append</span><span class="p">)</span>
+                       <span class="kp">unlink</span><span class="p">(</span>filepath<span class="p">);</span>
+                   writeFastq<span class="p">(</span>object<span class="o">=</span>sr<span class="p">,</span>
+                              file<span class="o">=</span>filepath<span class="p">,</span> mode<span class="o">=</span><span class="kp">ifelse</span><span class="p">(</span><span class="kp">append</span><span class="p">,</span> <span class="s">&quot;a&quot;</span><span class="p">,</span> <span class="s">&quot;w&quot;</span><span class="p">))</span>
+               <span class="p">},</span>
+               <span class="c1">## Default</span>
+               <span class="kp">stop</span><span class="p">(</span><span class="s">&quot;Unknown quality-scaled sequence format: &quot;</span><span class="p">,</span> <span class="kp">format</span><span class="p">))</span>
+    <span class="p">}</span> <span class="kr">else</span> <span class="p">{</span>
+        <span class="c1">## Zero-length sequence; just truncate/touch the file</span>
+        <span class="kp">sink</span><span class="p">(</span>file<span class="o">=</span>filepath<span class="p">,</span> append<span class="o">=</span><span class="kp">append</span><span class="p">)</span>
+        <span class="kp">sink</span><span class="p">()</span>
+    <span class="p">}</span>
+<span class="p">}</span>
+
+discard.short.reads <span class="o">&lt;-</span> <span class="kr">function</span><span class="p">(</span>reads<span class="p">,</span> min.length<span class="o">=</span><span class="m">1</span><span class="p">)</span> <span class="p">{</span>
+    kept.reads <span class="o">&lt;-</span> reads<span class="p">[</span>width<span class="p">(</span>reads<span class="p">)</span> <span class="o">&gt;=</span> min.length<span class="p">]</span>
+    <span class="kr">return</span><span class="p">(</span>kept.reads<span class="p">)</span>
+<span class="p">}</span>
+
+<span class="c1">## Takes a set of interleaved reads (or anything else) and</span>
+<span class="c1">## de-interleaves them</span>
+deinterleave.pairs <span class="o">&lt;-</span> <span class="kr">function</span><span class="p">(</span>reads<span class="p">)</span> <span class="p">{</span>
+    <span class="kp">stopifnot</span><span class="p">(</span><span class="kp">length</span><span class="p">(</span>reads<span class="p">)</span> <span class="o">%%</span> <span class="m">2</span> <span class="o">==</span> <span class="m">0</span><span class="p">)</span>
+    mask <span class="o">&lt;-</span> <span class="kp">seq</span><span class="p">(</span>from<span class="o">=</span><span class="m">1</span><span class="p">,</span> to<span class="o">=</span><span class="kp">length</span><span class="p">(</span>reads<span class="p">),</span> by<span class="o">=</span><span class="m">2</span><span class="p">)</span>
+    <span class="kr">return</span><span class="p">(</span><span class="kt">list</span><span class="p">(</span>read1<span class="o">=</span>reads<span class="p">[</span>mask<span class="p">],</span> read2<span class="o">=</span>reads<span class="p">[</span><span class="o">-</span>mask<span class="p">]))</span>
+<span class="p">}</span>
+
+<span class="m">.</span>delox.trimmed.ranges <span class="o">&lt;-</span> <span class="kr">function</span><span class="p">(</span>subj<span class="p">,</span> reads<span class="p">,</span> min.length<span class="o">=</span><span class="m">36</span><span class="p">,</span>
+                                  include.scores<span class="o">=</span><span class="kc">TRUE</span><span class="p">,</span>
+                                  include.deleted.ranges<span class="o">=</span><span class="kc">TRUE</span><span class="p">,</span>
+                                  align.opts<span class="o">=</span><span class="kt">list</span><span class="p">())</span> <span class="p">{</span>
+
+    align.opts <span class="o">&lt;-</span> merge.lists<span class="p">(</span>align.opts<span class="p">,</span> default.align.opts<span class="p">)</span>
+
+    aln <span class="o">&lt;-</span> <span class="kt">list</span><span class="p">(</span>forward<span class="o">=</span>pairwiseAlignment<span class="p">(</span>pattern<span class="o">=</span>reads<span class="p">,</span>
+                subject<span class="o">=</span>subj<span class="p">,</span>
+                type<span class="o">=</span><span class="s">&quot;overlap&quot;</span><span class="p">,</span>
+                substitutionMatrix<span class="o">=</span>nucleotideSubstitutionMatrix<span class="p">(</span>match <span class="o">=</span> align.opts<span class="o">$</span><span class="kp">match</span><span class="p">,</span> mismatch <span class="o">=</span> <span class="o">-</span>align.opts<span class="o">$</span>mismatch<span class="p">),</span>
+                gapOpening<span class="o">=-</span>align.opts<span class="o">$</span>gapOpening<span class="p">,</span> gapExtension<span class="o">=-</span>align.opts<span class="o">$</span>gapExtension<span class="p">),</span>
+                revcomp<span class="o">=</span>pairwiseAlignment<span class="p">(</span>pattern<span class="o">=</span>reads<span class="p">,</span>
+                subject<span class="o">=</span>reverseComplement<span class="p">(</span>DNAString<span class="p">(</span>subj<span class="p">)),</span>
+                type<span class="o">=</span><span class="s">&quot;overlap&quot;</span><span class="p">,</span>
+                substitutionMatrix<span class="o">=</span>nucleotideSubstitutionMatrix<span class="p">(</span>match <span class="o">=</span> align.opts<span class="o">$</span><span class="kp">match</span><span class="p">,</span> mismatch <span class="o">=</span> <span class="o">-</span>align.opts<span class="o">$</span>mismatch<span class="p">),</span>
+                gapOpening<span class="o">=-</span>align.opts<span class="o">$</span>gapOpening<span class="p">,</span> gapExtension<span class="o">=-</span>align.opts<span class="o">$</span>gapExtension<span class="p">))</span>
+
+    aln.scores <span class="o">&lt;-</span> <span class="kp">Map</span><span class="p">(</span>score<span class="p">,</span> aln<span class="p">)</span>
+    aln.pat <span class="o">&lt;-</span> <span class="kp">Map</span><span class="p">(</span>pattern<span class="p">,</span> aln<span class="p">)</span>
+    aln.ranges <span class="o">&lt;-</span> <span class="kp">Map</span><span class="p">(</span><span class="kr">function</span><span class="p">(</span>x<span class="p">)</span> IRanges<span class="p">(</span>start<span class="o">=</span>start<span class="p">(</span>x<span class="p">),</span> end<span class="o">=</span>end<span class="p">(</span>x<span class="p">)),</span> aln.pat<span class="p">)</span>
+    aln.threebands <span class="o">&lt;-</span> <span class="kp">Map</span><span class="p">(</span><span class="kr">function</span> <span class="p">(</span>x<span class="p">)</span> threebands<span class="p">(</span>IRanges<span class="p">(</span>start<span class="o">=</span><span class="m">1</span><span class="p">,</span> end<span class="o">=</span>width<span class="p">(</span>reads<span class="p">)),</span>
+                                                  start<span class="o">=</span>start<span class="p">(</span>x<span class="p">),</span> end<span class="o">=</span>end<span class="p">(</span>x<span class="p">)),</span>
+                          aln.ranges<span class="p">)</span>
+
+    <span class="c1">## For each read, decide whether the forward or reverse alignment</span>
+    <span class="c1">## was better.</span>
+    revcomp.better <span class="o">&lt;-</span> aln.scores<span class="o">$</span>forward <span class="o">&lt;</span> aln.scores<span class="o">$</span>revcomp
+
+    <span class="c1">## For each read, take the threebands for the better alignment.</span>
+    best.threebands <span class="o">&lt;-</span> aln.threebands<span class="o">$</span>forward
+    <span class="kr">for</span> <span class="p">(</span>band <span class="kr">in</span> <span class="kp">names</span><span class="p">(</span>best.threebands<span class="p">))</span> <span class="p">{</span>
+        best.threebands<span class="p">[[</span>band<span class="p">]][</span>revcomp.better<span class="p">]</span> <span class="o">&lt;-</span> aln.threebands<span class="o">$</span>revcomp<span class="p">[[</span>band<span class="p">]][</span>revcomp.better<span class="p">]</span>
+    <span class="p">}</span>
+
+    <span class="c1">## Use the left band if it is longer than either min.length or</span>
+    <span class="c1">## length of right band.</span>
+    use.right.band <span class="o">&lt;-</span> width<span class="p">(</span>best.threebands<span class="o">$</span>left<span class="p">)</span> <span class="o">&lt;</span> <span class="kp">pmin</span><span class="p">(</span>min.length<span class="p">,</span> width<span class="p">(</span>best.threebands<span class="o">$</span>right<span class="p">))</span>
+    ranges <span class="o">&lt;-</span> best.threebands<span class="o">$</span>left
+    ranges<span class="p">[</span>use.right.band<span class="p">]</span> <span class="o">&lt;-</span> best.threebands<span class="o">$</span>right<span class="p">[</span>use.right.band<span class="p">]</span>
+
+    <span class="c1">## Record which ranges are shorter than min.length</span>
+    too.short <span class="o">&lt;-</span> width<span class="p">(</span>ranges<span class="p">)</span> <span class="o">&lt;</span> min.length
+    <span class="c1">## ranges[too.short] &lt;- IRanges(start=1,end=0)</span>
+
+    <span class="c1">## Record what was trimmed off of each read (NOT what was kept!)</span>
+    trim <span class="o">&lt;-</span> <span class="kp">factor</span><span class="p">(</span><span class="kp">ifelse</span><span class="p">(</span>use.right.band<span class="p">,</span> <span class="s">&quot;left&quot;</span><span class="p">,</span> <span class="s">&quot;right&quot;</span><span class="p">),</span> levels<span class="o">=</span><span class="kt">c</span><span class="p">(</span><span class="s">&quot;right&quot;</span><span class="p">,</span> <span class="s">&quot;left&quot;</span><span class="p">,</span> <span class="s">&quot;all&quot;</span><span class="p">,</span> <span class="s">&quot;none&quot;</span><span class="p">))</span>
+    <span class="c1">## If it&#39;s too short, then we trim &quot;all&quot;, i.e. discard the whole</span>
+    <span class="c1">## read.</span>
+    trim<span class="p">[</span>too.short<span class="p">]</span> <span class="o">&lt;-</span> <span class="s">&quot;all&quot;</span>
+    <span class="c1">## If the read is not shorter after trimming, then nothing was</span>
+    <span class="c1">## actually trimmed.</span>
+    trim<span class="p">[</span>width<span class="p">(</span>ranges<span class="p">)</span> <span class="o">==</span> width<span class="p">(</span>reads<span class="p">)]</span> <span class="o">&lt;-</span> <span class="s">&quot;none&quot;</span>
+
+    emeta <span class="o">&lt;-</span> <span class="kt">list</span><span class="p">()</span>
+
+    emeta<span class="o">$</span>trim <span class="o">&lt;-</span> trim
+
+    <span class="kr">if</span> <span class="p">(</span>include.deleted.ranges<span class="p">)</span> <span class="p">{</span>
+        deleted.start <span class="o">&lt;-</span> <span class="kp">ifelse</span><span class="p">(</span>too.short<span class="p">,</span> <span class="m">1</span><span class="p">,</span>
+                                <span class="kp">ifelse</span><span class="p">(</span>use.right.band<span class="p">,</span>
+                                       start<span class="p">(</span>best.threebands<span class="o">$</span>left<span class="p">),</span>
+                                       start<span class="p">(</span>best.threebands<span class="o">$</span>middle<span class="p">)))</span>
+        deleted.end <span class="o">&lt;-</span> <span class="kp">ifelse</span><span class="p">(</span>too.short<span class="p">,</span> width<span class="p">(</span>reads<span class="p">),</span>
+                              <span class="kp">ifelse</span><span class="p">(</span>use.right.band<span class="p">,</span>
+                                     end<span class="p">(</span>best.threebands<span class="o">$</span>middle<span class="p">),</span>
+                                     end<span class="p">(</span>best.threebands<span class="o">$</span>right<span class="p">)))</span>
+        emeta<span class="o">$</span>deleted.range <span class="o">&lt;-</span> IRanges<span class="p">(</span>deleted.start<span class="p">,</span> deleted.end<span class="p">)</span>
+    <span class="p">}</span>
+
+    <span class="kr">if</span> <span class="p">(</span>include.scores<span class="p">)</span> <span class="p">{</span>
+        <span class="c1">## If requested, take the best score out of each pair of forward</span>
+        <span class="c1">## and reverse scores.</span>
+        scores <span class="o">&lt;-</span> <span class="kp">ifelse</span><span class="p">(</span>revcomp.better<span class="p">,</span> aln.scores<span class="o">$</span>revcomp<span class="p">,</span> aln.scores<span class="o">$</span>forward<span class="p">)</span>
+        emeta<span class="o">$</span>score <span class="o">&lt;-</span> scores
+    <span class="p">}</span>
+
+    mcols<span class="p">(</span>ranges<span class="p">)</span> <span class="o">&lt;-</span> DataFrame<span class="p">(</span>emeta<span class="p">)</span>
+
+    <span class="kr">return</span><span class="p">(</span>ranges<span class="p">)</span>
+<span class="p">}</span>
+
+<span class="c1">## Always call delox on the underlying DNAStringSet object when called</span>
+<span class="c1">## on something more complicated.</span>
+<span class="kp">suppressMessages</span><span class="p">({</span>
+    <span class="kp">invisible</span><span class="p">(</span>setMethod<span class="p">(</span><span class="s">&quot;.delox.trimmed.ranges&quot;</span><span class="p">,</span> signature<span class="o">=</span><span class="kt">c</span><span class="p">(</span>reads<span class="o">=</span><span class="s">&quot;ShortRead&quot;</span><span class="p">),</span>
+                        <span class="kr">function</span> <span class="p">(</span>subj<span class="p">,</span> reads<span class="p">,</span> min.length<span class="p">,</span> include.scores<span class="p">,</span> include.deleted.ranges<span class="p">,</span> align.opts<span class="p">)</span> <span class="p">{</span>
+                            callGeneric<span class="p">(</span>subj<span class="p">,</span> as<span class="p">(</span>reads<span class="p">,</span> <span class="s">&quot;DNAStringSet&quot;</span><span class="p">),</span> min.length<span class="p">,</span> include.scores<span class="p">,</span> include.deleted.ranges<span class="p">,</span> align.opts<span class="p">)</span>
+                        <span class="p">}))</span>
+    <span class="kp">invisible</span><span class="p">(</span>setMethod<span class="p">(</span><span class="s">&quot;.delox.trimmed.ranges&quot;</span><span class="p">,</span> signature<span class="o">=</span><span class="kt">c</span><span class="p">(</span>reads<span class="o">=</span><span class="s">&quot;QualityScaledDNAStringSet&quot;</span><span class="p">),</span>
+                        <span class="kr">function</span> <span class="p">(</span>subj<span class="p">,</span> reads<span class="p">,</span> min.length<span class="p">,</span> include.scores<span class="p">,</span> include.deleted.ranges<span class="p">,</span> align.opts<span class="p">)</span> <span class="p">{</span>
+                            callGeneric<span class="p">(</span>subj<span class="p">,</span> as<span class="p">(</span>reads<span class="p">,</span> <span class="s">&quot;DNAStringSet&quot;</span><span class="p">),</span> min.length<span class="p">,</span> include.scores<span class="p">,</span> include.deleted.ranges<span class="p">,</span> align.opts<span class="p">)</span>
+                        <span class="p">}))</span>
+    <span class="kp">invisible</span><span class="p">(</span>setMethod<span class="p">(</span><span class="s">&quot;.delox.trimmed.ranges&quot;</span><span class="p">,</span> signature<span class="o">=</span><span class="kt">c</span><span class="p">(</span>reads<span class="o">=</span><span class="s">&quot;QualityScaledXStringSet&quot;</span><span class="p">),</span>
+                        <span class="kr">function</span> <span class="p">(</span>subj<span class="p">,</span> reads<span class="p">,</span> min.length<span class="p">,</span> include.scores<span class="p">,</span> include.deleted.ranges<span class="p">,</span> align.opts<span class="p">)</span> <span class="p">{</span>
+                            callGeneric<span class="p">(</span>subj<span class="p">,</span> as<span class="p">(</span>reads<span class="p">,</span> <span class="s">&quot;XStringSet&quot;</span><span class="p">),</span> min.length<span class="p">,</span> include.scores<span class="p">,</span> include.deleted.ranges<span class="p">,</span> align.opts<span class="p">)</span>
+                        <span class="p">}))</span>
+<span class="p">})</span>
+
+delox.single <span class="o">&lt;-</span> <span class="kr">function</span><span class="p">(</span>subj<span class="p">,</span> reads <span class="p">,</span> min.length<span class="o">=</span><span class="m">36</span><span class="p">,</span>
+                         include.scores<span class="o">=</span><span class="kc">TRUE</span><span class="p">,</span> align.opts<span class="o">=</span><span class="kt">list</span><span class="p">())</span> <span class="p">{</span>
+    tsmsg<span class="p">(</span><span class="s">&quot;Saving read names&quot;</span><span class="p">)</span>
+    saved.names <span class="o">&lt;-</span> BStringSet<span class="p">(</span><span class="kp">names</span><span class="p">(</span>reads<span class="p">))</span>
+    reads <span class="o">&lt;-</span> strip.names<span class="p">(</span>reads<span class="p">)</span>
+    <span class="kp">invisible</span><span class="p">(</span><span class="kp">gc</span><span class="p">())</span>
+
+    tsmsg<span class="p">(</span><span class="s">&quot;Doing alignments&quot;</span><span class="p">)</span>
+    nchunks <span class="o">&lt;-</span> <span class="kp">min</span><span class="p">(</span>getDoParWorkers<span class="p">(),</span> <span class="kp">ceiling</span><span class="p">(</span><span class="kp">length</span><span class="p">(</span>reads<span class="p">)</span><span class="o">/</span><span class="m">1000</span><span class="p">))</span>
+    deloxed.ranges <span class="o">&lt;-</span> foreach<span class="p">(</span>reads<span class="o">=</span>isplitVector<span class="p">(</span>reads<span class="p">,</span> chunks<span class="o">=</span>nchunks<span class="p">),</span> <span class="m">.</span>combine<span class="o">=</span><span class="kt">c</span><span class="p">)</span> <span class="o">%dopar%</span> <span class="p">{</span>
+        <span class="m">.</span>delox.trimmed.ranges<span class="p">(</span>reads<span class="o">=</span>reads<span class="p">,</span> subj<span class="o">=</span>subj<span class="p">,</span> min.length<span class="o">=</span>min.length<span class="p">,</span>
+                              include.scores<span class="o">=</span>include.scores<span class="p">,</span>
+                              include.deleted.ranges<span class="o">=</span><span class="kc">FALSE</span><span class="p">,</span>
+                              align.opts<span class="o">=</span>align.opts<span class="p">)</span>
+    <span class="p">}</span>
+    <span class="c1">## maybe.chunkapply(.delox.trimmed.ranges,</span>
+    <span class="c1">##                  VECTOR.ARGS=list(reads=reads),</span>
+    <span class="c1">##                  SCALAR.ARGS=list(subj=subj, min.length=min.length,</span>
+    <span class="c1">##                    include.scores=include.scores,</span>
+    <span class="c1">##                    include.deleted.ranges=FALSE,</span>
+    <span class="c1">##                    align.opts=align.opts),</span>
+    <span class="c1">##                  min.chunk.size=1000,</span>
+    <span class="c1">##                  MERGE=c)</span>
+
+    tsmsg<span class="p">(</span><span class="s">&quot;Trimming reads&quot;</span><span class="p">)</span>
+    trimmed.reads <span class="o">&lt;-</span> narrow<span class="p">(</span>reads<span class="p">,</span> start<span class="p">(</span>deloxed.ranges<span class="p">),</span> end<span class="p">(</span>deloxed.ranges<span class="p">))</span>
+
+    tsmsg<span class="p">(</span><span class="s">&quot;Restoring read names&quot;</span><span class="p">)</span>
+    <span class="kp">names</span><span class="p">(</span>trimmed.reads<span class="p">)</span> <span class="o">&lt;-</span> <span class="kp">as.character</span><span class="p">(</span>saved.names<span class="p">)</span>
+
+    tsmsg<span class="p">(</span><span class="s">&quot;Adding metadata&quot;</span><span class="p">)</span>
+    emeta <span class="o">&lt;-</span> <span class="kt">list</span><span class="p">()</span>
+    <span class="kr">if</span> <span class="p">(</span>include.scores<span class="p">)</span> <span class="p">{</span>
+        emeta<span class="o">$</span>score <span class="o">&lt;-</span> mcols<span class="p">(</span>deloxed.ranges<span class="p">)</span><span class="o">$</span>score
+    <span class="p">}</span>
+    <span class="kr">if</span> <span class="p">(</span><span class="kp">length</span><span class="p">(</span>emeta<span class="p">)</span> <span class="o">&gt;</span> <span class="m">0</span><span class="p">)</span> <span class="p">{</span>
+        mcols<span class="p">(</span>trimmed.reads<span class="p">)</span> <span class="o">&lt;-</span> DataFrame<span class="p">(</span>emeta<span class="p">)</span>
+    <span class="p">}</span>
+
+    <span class="kr">return</span><span class="p">(</span>discard.short.reads<span class="p">(</span>trimmed.reads<span class="p">,</span> min.length<span class="p">))</span>
+<span class="p">}</span>
+
+delox.paired <span class="o">&lt;-</span> <span class="kr">function</span><span class="p">(</span>subj<span class="p">,</span> read1<span class="p">,</span> read2<span class="p">,</span>
+                         min.call<span class="o">=</span><span class="m">10</span><span class="p">,</span> min.length<span class="o">=</span><span class="m">36</span><span class="p">,</span>
+                         include.scores<span class="o">=</span><span class="kc">TRUE</span><span class="p">,</span> align.opts<span class="o">=</span><span class="kt">list</span><span class="p">())</span> <span class="p">{</span>
+    align.opts <span class="o">&lt;-</span> merge.lists<span class="p">(</span>align.opts<span class="p">,</span> default.align.opts<span class="p">)</span>
+
+    tsmsg<span class="p">(</span><span class="s">&quot;Checking read counts&quot;</span><span class="p">)</span>
+    <span class="kp">stopifnot</span><span class="p">(</span><span class="kp">length</span><span class="p">(</span>read1<span class="p">)</span> <span class="o">==</span> <span class="kp">length</span><span class="p">(</span>read2<span class="p">))</span>
+
+    tsmsg<span class="p">(</span><span class="s">&quot;Listing reads&quot;</span><span class="p">)</span>
+    original.reads <span class="o">&lt;-</span> <span class="kt">list</span><span class="p">(</span>read1<span class="o">=</span>read1<span class="p">,</span>
+                           read2<span class="o">=</span>read2<span class="p">)</span>
+    <span class="kp">rm</span><span class="p">(</span>read1<span class="p">,</span> read2<span class="p">)</span>
+
+    tsmsg<span class="p">(</span><span class="s">&quot;Saving read names&quot;</span><span class="p">)</span>
+    read.names <span class="o">&lt;-</span> foreach<span class="p">(</span>r<span class="o">=</span>original.reads<span class="p">)</span> <span class="o">%do%</span> BStringSet<span class="p">(</span><span class="kp">names</span><span class="p">(</span>r<span class="p">))</span>
+    <span class="kp">names</span><span class="p">(</span>read.names<span class="p">)</span> <span class="o">&lt;-</span> <span class="kp">names</span><span class="p">(</span>original.reads<span class="p">)</span>
+    original.reads <span class="o">&lt;-</span> <span class="kp">Map</span><span class="p">(</span>strip.names<span class="p">,</span> original.reads<span class="p">)</span>
+    <span class="kp">invisible</span><span class="p">(</span><span class="kp">gc</span><span class="p">())</span>
+
+    tsmsg<span class="p">(</span><span class="s">&quot;Doing alignments&quot;</span><span class="p">)</span>
+    deloxed.ranges <span class="o">&lt;-</span> <span class="kp">lapply</span><span class="p">(</span>original.reads<span class="p">,</span> <span class="kr">function</span><span class="p">(</span>x<span class="p">)</span> <span class="p">{</span>
+        nchunks <span class="o">&lt;-</span> <span class="kp">min</span><span class="p">(</span>getDoParWorkers<span class="p">(),</span> <span class="kp">ceiling</span><span class="p">(</span><span class="kp">length</span><span class="p">(</span>x<span class="p">)</span><span class="o">/</span><span class="m">1000</span><span class="p">))</span>
+        foreach<span class="p">(</span>reads<span class="o">=</span>isplitVector<span class="p">(</span>x<span class="p">,</span> chunks<span class="o">=</span>nchunks<span class="p">),</span> <span class="m">.</span>combine<span class="o">=</span><span class="kt">c</span><span class="p">)</span> <span class="o">%dopar%</span> <span class="p">{</span>
+            <span class="m">.</span>delox.trimmed.ranges<span class="p">(</span>reads<span class="o">=</span>reads<span class="p">,</span> subj<span class="o">=</span>subj<span class="p">,</span> min.length<span class="o">=</span>min.length<span class="p">,</span>
+                                  include.scores<span class="o">=</span>include.scores<span class="p">,</span>
+                                  include.deleted.ranges<span class="o">=</span><span class="kc">FALSE</span><span class="p">,</span>
+                                  align.opts<span class="o">=</span>align.opts<span class="p">)</span>
+        <span class="p">}</span>
+
+        <span class="c1">## maybe.chunkapply(.delox.trimmed.ranges,</span>
+        <span class="c1">##                  VECTOR.ARGS=list(reads=strip.names(x)),</span>
+        <span class="c1">##                  SCALAR.ARGS=list(subj=subj,</span>
+        <span class="c1">##                    min.length=min.length,</span>
+        <span class="c1">##                    include.scores=TRUE,</span>
+        <span class="c1">##                    include.deleted.ranges=TRUE,</span>
+        <span class="c1">##                    align.opts=align.opts),</span>
+        <span class="c1">##                  MERGE=c,</span>
+        <span class="c1">##                  min.chunk.size=1000)</span>
+    <span class="p">})</span>
+
+    tsmsg<span class="p">(</span><span class="s">&quot;Extracting metadata&quot;</span><span class="p">)</span>
+    delox.meta <span class="o">&lt;-</span> <span class="kp">lapply</span><span class="p">(</span>deloxed.ranges<span class="p">,</span> mcols<span class="p">)</span>
+
+    <span class="c1">## Decide whether enough was trimmed on the inside (right end) of</span>
+    <span class="c1">## either read to call it a mate-pair.</span>
+    tsmsg<span class="p">(</span><span class="s">&quot;Calculating inside trim score&quot;</span><span class="p">)</span>
+    inside.trim.score <span class="o">&lt;-</span> <span class="kp">Reduce</span><span class="p">(</span><span class="kp">pmax</span><span class="p">,</span>
+                                <span class="kp">lapply</span><span class="p">(</span>delox.meta<span class="p">,</span>
+                                       <span class="kr">function</span><span class="p">(</span>x<span class="p">)</span> <span class="kp">ifelse</span><span class="p">(</span>x<span class="o">$</span>trim <span class="o">==</span> <span class="s">&quot;right&quot;</span><span class="p">,</span> x<span class="o">$</span>score<span class="p">,</span> <span class="m">0</span><span class="p">)))</span>
+
+    <span class="c1">## Decide whether enough was trimmed on the outside (left end) of</span>
+    <span class="c1">## either read to call it a non-mate-pair.</span>
+    tsmsg<span class="p">(</span><span class="s">&quot;Calculating outside trim score&quot;</span><span class="p">)</span>
+    outside.trim.score <span class="o">&lt;-</span> <span class="kp">Reduce</span><span class="p">(</span><span class="kp">pmax</span><span class="p">,</span>
+                                 <span class="kp">lapply</span><span class="p">(</span>delox.meta<span class="p">,</span>
+                                        <span class="kr">function</span><span class="p">(</span>x<span class="p">)</span> <span class="kp">ifelse</span><span class="p">(</span>x<span class="o">$</span>trim <span class="o">==</span> <span class="s">&quot;left&quot;</span><span class="p">,</span> x<span class="o">$</span>score<span class="p">,</span> <span class="m">0</span><span class="p">)))</span>
+
+    tsmsg<span class="p">(</span><span class="s">&quot;Calling presence of subject&quot;</span><span class="p">)</span>
+    calls <span class="o">&lt;-</span> <span class="kt">list</span><span class="p">(</span>inside<span class="o">=</span>inside.trim.score <span class="o">&gt;=</span> min.call <span class="o">*</span> align.opts<span class="o">$</span><span class="kp">match</span><span class="p">,</span>
+                  outside<span class="o">=</span>outside.trim.score <span class="o">&gt;=</span> min.call <span class="o">*</span> align.opts<span class="o">$</span><span class="kp">match</span><span class="p">)</span>
+
+    tsmsg<span class="p">(</span><span class="s">&quot;Categorizing reads&quot;</span><span class="p">)</span>
+    <span class="kt">category</span> <span class="o">&lt;-</span> <span class="kp">factor</span><span class="p">(</span><span class="kp">rep</span><span class="p">(</span><span class="kc">NA</span><span class="p">,</span> <span class="kp">length</span><span class="p">(</span>original.reads<span class="o">$</span>read1<span class="p">)),</span> levels<span class="o">=</span><span class="kt">c</span><span class="p">(</span><span class="s">&quot;mate&quot;</span><span class="p">,</span> <span class="s">&quot;non-mate&quot;</span><span class="p">,</span> <span class="s">&quot;negative&quot;</span><span class="p">,</span> <span class="s">&quot;unpaired&quot;</span><span class="p">,</span> <span class="s">&quot;discard&quot;</span><span class="p">))</span>
+    <span class="kp">category</span><span class="p">[</span>calls<span class="o">$</span>inside<span class="p">]</span> <span class="o">&lt;-</span> <span class="s">&quot;mate&quot;</span>
+    <span class="kp">category</span><span class="p">[</span>calls<span class="o">$</span>outside<span class="p">]</span> <span class="o">&lt;-</span> <span class="s">&quot;non-mate&quot;</span>
+    <span class="c1">## If they&#39;re either both true or both false, then it&#39;s ambiguous</span>
+    <span class="kp">category</span><span class="p">[</span>calls<span class="o">$</span>inside <span class="o">==</span> calls<span class="o">$</span>outside<span class="p">]</span> <span class="o">&lt;-</span> <span class="s">&quot;negative&quot;</span>
+    <span class="c1">## All categories should be filled in now</span>
+    <span class="kp">stopifnot</span><span class="p">(</span><span class="kp">all</span><span class="p">(</span><span class="o">!</span><span class="kp">is.na</span><span class="p">(</span><span class="kp">category</span><span class="p">)))</span>
+
+    too.short <span class="o">&lt;-</span> <span class="kp">lapply</span><span class="p">(</span>deloxed.ranges<span class="p">,</span> <span class="kr">function</span><span class="p">(</span>x<span class="p">)</span> width<span class="p">(</span>x<span class="p">)</span> <span class="o">&lt;</span> min.length<span class="p">)</span>
+    <span class="c1">## If either read in a pair is too short, then its partner is no</span>
+    <span class="c1">## longer paired at all.</span>
+    one.too.short <span class="o">&lt;-</span> <span class="kp">Reduce</span><span class="p">(</span><span class="sb">`|`</span><span class="p">,</span> too.short<span class="p">)</span>
+    <span class="kp">category</span><span class="p">[</span>one.too.short<span class="p">]</span> <span class="o">&lt;-</span> <span class="s">&quot;unpaired&quot;</span>
+    <span class="c1">## If both reads in a pair are too short, then the entire pair is</span>
+    <span class="c1">## discarded. This is highly unlikely, since Cre-Lox should not</span>
+    <span class="c1">## appear in the middle of both sequences.</span>
+    both.too.short <span class="o">&lt;-</span> <span class="kp">Reduce</span><span class="p">(</span><span class="sb">`&amp;`</span><span class="p">,</span> too.short<span class="p">)</span>
+    <span class="kp">category</span><span class="p">[</span>both.too.short<span class="p">]</span> <span class="o">&lt;-</span> <span class="s">&quot;discard&quot;</span>
+
+    tsmsg<span class="p">(</span><span class="s">&quot;Trimming reads and restoring read names&quot;</span><span class="p">)</span>
+    trimmed.reads <span class="o">&lt;-</span> <span class="kp">lapply</span><span class="p">(</span><span class="kp">names</span><span class="p">(</span>original.reads<span class="p">),</span> <span class="kr">function</span><span class="p">(</span>x<span class="p">)</span> <span class="p">{</span>
+        trimmed <span class="o">&lt;-</span> narrow<span class="p">(</span>original.reads<span class="p">[[</span>x<span class="p">]],</span>
+                          start<span class="o">=</span>start<span class="p">(</span>deloxed.ranges<span class="p">[[</span>x<span class="p">]]),</span>
+                          end<span class="o">=</span>end<span class="p">(</span>deloxed.ranges<span class="p">[[</span>x<span class="p">]]))</span>
+        <span class="kp">names</span><span class="p">(</span>trimmed<span class="p">)</span> <span class="o">&lt;-</span> <span class="kp">as.character</span><span class="p">(</span>read.names<span class="p">[[</span>x<span class="p">]])</span>
+        trimmed
+    <span class="p">})</span>
+    <span class="kp">names</span><span class="p">(</span>trimmed.reads<span class="p">)</span> <span class="o">&lt;-</span> <span class="kp">names</span><span class="p">(</span>original.reads<span class="p">)</span>
+
+    tsmsg<span class="p">(</span><span class="s">&quot;Assembling metadata&quot;</span><span class="p">)</span>
+    foreach <span class="p">(</span>r<span class="o">=</span><span class="kp">names</span><span class="p">(</span>trimmed.reads<span class="p">))</span> <span class="o">%do%</span> <span class="p">{</span>
+        emeta <span class="o">&lt;-</span> <span class="kt">list</span><span class="p">()</span>
+        emeta<span class="o">$</span><span class="kt">category</span> <span class="o">&lt;-</span> <span class="kp">category</span>
+        emeta<span class="o">$</span><span class="kp">category</span><span class="p">[</span>too.short<span class="p">[[</span>r<span class="p">]]]</span> <span class="o">&lt;-</span> <span class="s">&quot;discard&quot;</span>
+        <span class="kr">if</span> <span class="p">(</span>include.scores<span class="p">)</span> <span class="p">{</span>
+            emeta<span class="o">$</span>score <span class="o">&lt;-</span> delox.meta<span class="p">[[</span>r<span class="p">]]</span><span class="o">$</span>score
+        <span class="p">}</span>
+        mcols<span class="p">(</span>trimmed.reads<span class="p">[[</span>r<span class="p">]])</span> <span class="o">&lt;-</span> DataFrame<span class="p">(</span>emeta<span class="p">)</span>
+    <span class="p">}</span>
+
+    <span class="kr">return</span><span class="p">(</span>trimmed.reads<span class="p">)</span>
+<span class="p">}</span>
+
+<span class="c1">## Wrapper for both single and paired as appropriate</span>
+delox <span class="o">&lt;-</span> <span class="kr">function</span><span class="p">(</span>subj<span class="p">,</span> read1<span class="p">,</span> read2<span class="o">=</span><span class="kc">NULL</span><span class="p">,</span>
+                  min.call<span class="o">=</span><span class="m">10</span><span class="p">,</span> min.length<span class="o">=</span><span class="m">36</span><span class="p">,</span>
+                  interleaved<span class="o">=</span><span class="kc">FALSE</span><span class="p">,</span>
+                  read1.orientation<span class="o">=</span><span class="kt">c</span><span class="p">(</span><span class="s">&quot;in&quot;</span><span class="p">,</span> <span class="s">&quot;out&quot;</span><span class="p">)[</span><span class="m">1</span><span class="p">],</span>
+                  read2.orientation<span class="o">=</span><span class="kt">c</span><span class="p">(</span><span class="s">&quot;in&quot;</span><span class="p">,</span> <span class="s">&quot;out&quot;</span><span class="p">)[</span><span class="m">1</span><span class="p">],</span>
+                  align.opts<span class="o">=</span><span class="kt">list</span><span class="p">())</span> <span class="p">{</span>
+    <span class="kr">if</span> <span class="p">(</span><span class="kp">is.null</span><span class="p">(</span>read2<span class="p">))</span> <span class="p">{</span>
+        <span class="kr">if</span> <span class="p">(</span>interleaved<span class="p">)</span> <span class="p">{</span>
+            x <span class="o">&lt;-</span> deinterleave.pairs<span class="p">(</span>read1<span class="p">)</span>
+            read1 <span class="o">&lt;-</span> x<span class="o">$</span>read1
+            read2 <span class="o">&lt;-</span> x<span class="o">$</span>read2
+        <span class="p">}</span> <span class="kr">else</span> <span class="p">{</span>
+            tsmsg<span class="p">(</span><span class="s">&quot;Doing single-read delox&quot;</span><span class="p">)</span>
+            <span class="kr">return</span><span class="p">(</span>delox.single<span class="p">(</span>subj<span class="o">=</span>subj<span class="p">,</span> reads<span class="o">=</span>read1<span class="p">,</span> min.length<span class="o">=</span>min.length<span class="p">,</span> align.opts<span class="o">=</span>align.opts<span class="p">))</span>
+        <span class="p">}</span>
+    <span class="p">}</span>
+
+    <span class="c1">## Make sure both reads are oriented &quot;in&quot; before calling</span>
+    tsmsg<span class="p">(</span><span class="s">&quot;Ensuring correct read orientation&quot;</span><span class="p">)</span>
+    <span class="kr">if</span> <span class="p">(</span><span class="kp">tolower</span><span class="p">(</span>read1.orientation<span class="p">)</span> <span class="o">==</span> <span class="s">&quot;out&quot;</span><span class="p">)</span> <span class="p">{</span>
+        read1 <span class="o">&lt;-</span> reverseComplement<span class="p">(</span>read1<span class="p">)</span>
+    <span class="p">}</span>
+    <span class="kr">if</span> <span class="p">(</span><span class="o">!</span><span class="kp">is.null</span><span class="p">(</span>read2<span class="p">)</span> <span class="o">&amp;&amp;</span> <span class="kp">tolower</span><span class="p">(</span>read2.orientation<span class="p">)</span> <span class="o">==</span> <span class="s">&quot;out&quot;</span><span class="p">)</span> <span class="p">{</span>
+        read2 <span class="o">&lt;-</span> reverseComplement<span class="p">(</span>read2<span class="p">)</span>
+    <span class="p">}</span>
+
+    tsmsg<span class="p">(</span><span class="s">&quot;Doing paired-end delox&quot;</span><span class="p">)</span>
+    deloxed.reads <span class="o">&lt;-</span> delox.paired<span class="p">(</span>subj<span class="p">,</span> read1<span class="p">,</span> read2<span class="p">,</span>
+                                  min.call<span class="o">=</span>min.call<span class="p">,</span> min.length<span class="o">=</span>min.length<span class="p">,</span>
+                                  align.opts<span class="o">=</span>align.opts<span class="p">)</span>
+
+    <span class="c1">## If reads started &quot;out&quot;, put them back that way before returning</span>
+    tsmsg<span class="p">(</span><span class="s">&quot;Restoring original read orientation&quot;</span><span class="p">)</span>
+    <span class="kr">if</span> <span class="p">(</span><span class="kp">tolower</span><span class="p">(</span>read1.orientation<span class="p">)</span> <span class="o">==</span> <span class="s">&quot;out&quot;</span><span class="p">)</span> <span class="p">{</span>
+        deloxed.reads<span class="o">$</span>read1 <span class="o">&lt;-</span> reverseComplement<span class="p">(</span>deloxed.reads<span class="o">$</span>read1<span class="p">)</span>
+    <span class="p">}</span>
+    <span class="kr">if</span> <span class="p">(</span><span class="kp">tolower</span><span class="p">(</span>read2.orientation<span class="p">)</span> <span class="o">==</span> <span class="s">&quot;out&quot;</span><span class="p">)</span> <span class="p">{</span>
+        deloxed.reads<span class="o">$</span>read2 <span class="o">&lt;-</span> reverseComplement<span class="p">(</span>deloxed.reads<span class="o">$</span>read2<span class="p">)</span>
+    <span class="p">}</span>
+
+    <span class="kr">return</span><span class="p">(</span>deloxed.reads<span class="p">)</span>
+<span class="p">}</span>
+
+<span class="c1">## ## Hack to work around a bug in BioConductor that prevents subsetting</span>
+<span class="c1">## ## of named XStringSet objects. Apparently, since DeLoxer was first</span>
+<span class="c1">## ## published, the BioConductor devs broke the XStringSet subsetting</span>
+<span class="c1">## ## code so that it can no longer handle XStringSets with names. The</span>
+<span class="c1">## ## code below strips the names from the XStringSet, then calls the old</span>
+<span class="c1">## ## code to subset the nameless object while subsetting the names</span>
+<span class="c1">## ## separately, then finally puts the names back on and returns the</span>
+<span class="c1">## ## result.</span>
+<span class="c1">## old.XStringSet.subset.method &lt;- selectMethod(&quot;[&quot;, &quot;XStringSet&quot;)</span>
+<span class="c1">## invisible(setMethod(&quot;[&quot;, signature=&quot;XStringSet&quot;, definition=function(x, i, j, ..., drop=TRUE) {</span>
+<span class="c1">##     ## Save the names into a separate variable</span>
+<span class="c1">##     xnames &lt;- names(x)</span>
+<span class="c1">##     ## Do the old behavior, which works on unnamed objects</span>
+<span class="c1">##     x &lt;- old.XStringSet.subset.method(unname(x), i, j, ..., drop=drop)</span>
+<span class="c1">##     ## Put the names back on and return</span>
+<span class="c1">##     setNames(x, xnames[i])</span>
+<span class="c1">## }))</span>
+
+save.deloxed.pairs.as.fastq <span class="o">&lt;-</span> <span class="kr">function</span><span class="p">(</span>read1<span class="p">,</span> read2<span class="p">,</span> output.base<span class="p">,</span>
+                                        mate.ext<span class="o">=</span><span class="s">&quot;matepaired&quot;</span><span class="p">,</span>
+                                        nonmate.ext<span class="o">=</span><span class="s">&quot;paired&quot;</span><span class="p">,</span>
+                                        negative.ext<span class="o">=</span><span class="s">&quot;negative&quot;</span><span class="p">,</span>
+                                        unpaired.ext<span class="o">=</span><span class="s">&quot;unpaired&quot;</span><span class="p">,</span>
+                                        append<span class="o">=</span><span class="kc">FALSE</span><span class="p">)</span> <span class="p">{</span>
+
+    extension <span class="o">&lt;-</span> <span class="kt">c</span><span class="p">(</span>mate<span class="o">=</span>mate.ext<span class="p">,</span>
+                   <span class="sb">`non-mate`</span><span class="o">=</span>nonmate.ext<span class="p">,</span>
+                   negative<span class="o">=</span>negative.ext<span class="p">,</span>
+                   unpaired<span class="o">=</span>unpaired.ext<span class="p">)</span>
+
+    <span class="c1">## ## Make sure that read1 and read2 are a match for each other</span>
+    <span class="c1">## stopifnot(identical(as.character(mcols(read1)$category),</span>
+    <span class="c1">##                     as.character(mcols(read2)$category)))</span>
+
+    <span class="c1">## ## Discard the shorter read on &quot;unpaired&quot;</span>
+    <span class="c1">## read1.shorter &lt;- width(read1) &lt; width(read2)</span>
+    <span class="c1">## mcols(read1)$category[mcols(read1)$category == &quot;unpaired&quot; &amp; read1.shorter] &lt;- NA</span>
+    <span class="c1">## mcols(read2)$category[mcols(read2)$category == &quot;unpaired&quot; &amp; !read1.shorter] &lt;- NA</span>
+
+    filename.template <span class="o">&lt;-</span> <span class="s">&quot;%s_read%s.%s.fastq&quot;</span>
+
+    <span class="kr">for</span> <span class="p">(</span>cat <span class="kr">in</span> <span class="kp">names</span><span class="p">(</span>extension<span class="p">))</span> <span class="p">{</span>
+        read1.for.category <span class="o">&lt;-</span> read1<span class="p">[</span>mcols<span class="p">(</span>read1<span class="p">)</span><span class="o">$</span><span class="kt">category</span> <span class="o">==</span> <span class="kp">cat</span><span class="p">]</span>
+        read1.file.for.category <span class="o">&lt;-</span> <span class="kp">sprintf</span><span class="p">(</span>filename.template<span class="p">,</span> output.base<span class="p">,</span> <span class="m">1</span><span class="p">,</span> extension<span class="p">[[</span><span class="kp">cat</span><span class="p">]])</span>
+        tsmsg<span class="p">(</span><span class="s">&quot;Writing &quot;</span><span class="p">,</span> read1.file.for.category<span class="p">)</span>
+        write.QualityScaledDNAStringSet<span class="p">(</span>read1.for.category<span class="p">,</span>
+                                        file<span class="o">=</span>read1.file.for.category<span class="p">,</span>
+                                        append<span class="o">=</span><span class="kp">append</span><span class="p">)</span>
+
+        read2.for.category <span class="o">&lt;-</span> read2<span class="p">[</span>mcols<span class="p">(</span>read2<span class="p">)</span><span class="o">$</span><span class="kt">category</span> <span class="o">==</span> <span class="kp">cat</span><span class="p">]</span>
+        read2.file.for.category <span class="o">&lt;-</span> <span class="kp">sprintf</span><span class="p">(</span>filename.template<span class="p">,</span> output.base<span class="p">,</span> <span class="m">2</span><span class="p">,</span> extension<span class="p">[[</span><span class="kp">cat</span><span class="p">]])</span>
+        tsmsg<span class="p">(</span><span class="s">&quot;Writing &quot;</span><span class="p">,</span> read2.file.for.category<span class="p">)</span>
+        write.QualityScaledDNAStringSet<span class="p">(</span>read2.for.category<span class="p">,</span>
+                                        file<span class="o">=</span>read2.file.for.category<span class="p">,</span>
+                                        append<span class="o">=</span><span class="kp">append</span><span class="p">)</span>
+    <span class="p">}</span>
+
+    <span class="kr">return</span><span class="p">(</span><span class="kc">TRUE</span><span class="p">)</span>
+<span class="p">}</span>
+
+get.category.counts <span class="o">&lt;-</span> <span class="kr">function</span><span class="p">(</span>deloxed.pairs<span class="p">)</span> <span class="p">{</span>
+    r1cat <span class="o">&lt;-</span> mcols<span class="p">(</span>deloxed.pairs<span class="o">$</span>read1<span class="p">)</span><span class="o">$</span><span class="kp">category</span>
+    r2cat <span class="o">&lt;-</span> mcols<span class="p">(</span>deloxed.pairs<span class="o">$</span>read2<span class="p">)</span><span class="o">$</span><span class="kp">category</span>
+    x <span class="o">&lt;-</span> <span class="kp">table</span><span class="p">(</span>r1cat<span class="p">)[</span><span class="kt">c</span><span class="p">(</span><span class="s">&quot;mate&quot;</span><span class="p">,</span> <span class="s">&quot;non-mate&quot;</span><span class="p">,</span> <span class="s">&quot;negative&quot;</span><span class="p">)]</span>
+    x<span class="p">[</span><span class="s">&quot;r1.single&quot;</span><span class="p">]</span> <span class="o">&lt;-</span> <span class="kp">sum</span><span class="p">(</span>r1cat <span class="o">==</span> <span class="s">&quot;unpaired&quot;</span><span class="p">)</span>
+    x<span class="p">[</span><span class="s">&quot;r2.single&quot;</span><span class="p">]</span> <span class="o">&lt;-</span> <span class="kp">sum</span><span class="p">(</span>r2cat <span class="o">==</span> <span class="s">&quot;unpaired&quot;</span><span class="p">)</span>
+    x<span class="p">[</span><span class="s">&quot;discard&quot;</span><span class="p">]</span> <span class="o">&lt;-</span> <span class="kp">length</span><span class="p">(</span>r1cat<span class="p">)</span> <span class="o">-</span> <span class="kp">sum</span><span class="p">(</span>x<span class="p">)</span>
+    x
+<span class="p">}</span>
+
+mcparallel.quiet <span class="o">&lt;-</span> <span class="kr">function</span><span class="p">(</span>expr<span class="p">,</span> <span class="kc">...</span><span class="p">)</span> <span class="p">{</span>
+    parallel<span class="o">:::</span>mcparallel<span class="p">(</span><span class="kp">suppressMessages</span><span class="p">(</span>expr<span class="p">),</span> <span class="kc">...</span><span class="p">)</span>
+<span class="p">}</span>
+
+print.stats <span class="o">&lt;-</span> <span class="kr">function</span><span class="p">(</span>category.counts<span class="p">)</span> <span class="p">{</span>
+    category.pct <span class="o">&lt;-</span> setNames<span class="p">(</span><span class="kp">sprintf</span><span class="p">(</span><span class="s">&quot;%.3g%%&quot;</span><span class="p">,</span> category.counts <span class="o">/</span> <span class="kp">sum</span><span class="p">(</span>category.counts<span class="p">)</span> <span class="o">*</span> <span class="m">100</span><span class="p">),</span>
+                             <span class="kp">names</span><span class="p">(</span>category.counts<span class="p">))</span>
+    x <span class="o">&lt;-</span> <span class="kp">rbind</span><span class="p">(</span>Counts<span class="o">=</span>category.counts<span class="p">,</span> Fractions<span class="o">=</span>category.pct<span class="p">)</span>
+    <span class="kp">names</span><span class="p">(</span><span class="kp">dimnames</span><span class="p">(</span>x<span class="p">))</span> <span class="o">&lt;-</span> <span class="kt">c</span><span class="p">(</span><span class="s">&quot;Stat&quot;</span><span class="p">,</span> <span class="s">&quot;Category&quot;</span><span class="p">)</span>
+    <span class="kp">print</span><span class="p">(</span>x<span class="p">,</span> quote<span class="o">=</span><span class="kc">FALSE</span><span class="p">,</span> justify<span class="o">=</span><span class="s">&quot;right&quot;</span><span class="p">)</span>
+<span class="p">}</span>
+
+main <span class="o">&lt;-</span> <span class="kr">function</span><span class="p">()</span> <span class="p">{</span>
+    opt <span class="o">&lt;-</span> parse_arguments<span class="p">()</span>
+    print.option.list<span class="p">(</span>opt<span class="p">)</span>
+    args <span class="o">&lt;-</span> opt<span class="o">$</span><span class="kp">args</span>
+    opts <span class="o">&lt;-</span> opt<span class="o">$</span><span class="kp">options</span>
+
+    <span class="kr">if</span> <span class="p">(</span><span class="o">!</span><span class="p">(</span><span class="kp">tolower</span><span class="p">(</span>opts<span class="p">[[</span><span class="s">&quot;read1-orientation&quot;</span><span class="p">]])</span> <span class="o">%in%</span> <span class="kt">c</span><span class="p">(</span><span class="s">&quot;in&quot;</span><span class="p">,</span> <span class="s">&quot;out&quot;</span><span class="p">)</span> <span class="o">&amp;&amp;</span>
+          <span class="kp">tolower</span><span class="p">(</span>opts<span class="p">[[</span><span class="s">&quot;read2-orientation&quot;</span><span class="p">]])</span> <span class="o">%in%</span> <span class="kt">c</span><span class="p">(</span><span class="s">&quot;in&quot;</span><span class="p">,</span> <span class="s">&quot;out&quot;</span><span class="p">)</span> <span class="p">))</span> <span class="p">{</span>
+        <span class="kp">stop</span><span class="p">(</span><span class="s">&quot;Valid orientations are \&quot;in\&quot; and \&quot;out\&quot;&quot;</span><span class="p">)</span>
+    <span class="p">}</span>
+
+    align.opts <span class="o">&lt;-</span> <span class="kt">list</span><span class="p">(</span>match <span class="o">=</span> opts<span class="p">[[</span><span class="s">&quot;match-bonus&quot;</span><span class="p">]],</span>
+                       mismatch <span class="o">=</span> opts<span class="p">[[</span><span class="s">&quot;mismatch-penalty&quot;</span><span class="p">]],</span>
+                       gapOpening <span class="o">=</span> opts<span class="p">[[</span><span class="s">&quot;gap-open-penalty&quot;</span><span class="p">]],</span>
+                       gapExtension <span class="o">=</span> opts<span class="p">[[</span><span class="s">&quot;gap-extension-penalty&quot;</span><span class="p">]])</span>
+
+    <span class="kp">stopifnot</span><span class="p">(</span>opts<span class="o">$</span><span class="sb">`min-call`</span> <span class="o">&gt;=</span> <span class="m">1</span> <span class="o">&amp;&amp;</span>
+              opts<span class="o">$</span><span class="sb">`min-length`</span> <span class="o">&gt;=</span> <span class="m">0</span> <span class="o">&amp;&amp;</span>
+              opts<span class="o">$</span><span class="sb">`jobs`</span> <span class="o">&gt;=</span> <span class="m">0</span><span class="p">)</span>
+
+    <span class="c1">## Set jobs if requested</span>
+    <span class="kr">if</span> <span class="p">(</span>opts<span class="o">$</span>jobs <span class="o">&gt;</span> <span class="m">0</span><span class="p">)</span> <span class="p">{</span>
+        <span class="kp">options</span><span class="p">(</span>cores<span class="o">=</span>opts<span class="o">$</span>jobs<span class="p">)</span>
+    <span class="p">}</span>
+    tsmsg<span class="p">(</span><span class="s">&quot;Using &quot;</span><span class="p">,</span> getDoParWorkers<span class="p">(),</span> <span class="s">&quot; cores.&quot;</span><span class="p">)</span>
+
+    paired <span class="o">&lt;-</span> <span class="o">!</span>opts<span class="p">[[</span><span class="s">&quot;single-read-mode&quot;</span><span class="p">]]</span>
+    interleaved <span class="o">&lt;-</span> opts<span class="p">[[</span><span class="s">&quot;interleaved&quot;</span><span class="p">]]</span>
+
+    <span class="kr">if</span> <span class="p">(</span><span class="o">!</span>paired <span class="o">&amp;&amp;</span> interleaved<span class="p">)</span> <span class="p">{</span>
+        <span class="kp">stop</span><span class="p">(</span><span class="s">&quot;ERROR: You cannot specify both --interleaved and --single-read-mode&quot;</span><span class="p">)</span>
+    <span class="p">}</span> <span class="kr">else</span> <span class="kr">if</span> <span class="p">(</span><span class="o">!</span>paired<span class="p">)</span> <span class="p">{</span>
+        <span class="kr">if</span> <span class="p">(</span><span class="kp">length</span><span class="p">(</span><span class="kp">args</span><span class="p">)</span> <span class="o">!=</span> <span class="m">3</span><span class="p">)</span> <span class="p">{</span>
+            <span class="kp">stop</span><span class="p">(</span><span class="s">&quot;DeLoxer in single-read mode requires exactly 3 arguments&quot;</span><span class="p">)</span>
+        <span class="p">}</span>
+        subject.file <span class="o">&lt;-</span> <span class="kp">args</span><span class="p">[[</span><span class="m">1</span><span class="p">]]</span>
+        read1.file <span class="o">&lt;-</span> <span class="kp">args</span><span class="p">[[</span><span class="m">2</span><span class="p">]]</span>
+        read2.file <span class="o">&lt;-</span> <span class="kc">NULL</span>
+        output.file <span class="o">&lt;-</span> <span class="kp">args</span><span class="p">[[</span><span class="m">3</span><span class="p">]]</span>
+    <span class="p">}</span> <span class="kr">else</span> <span class="kr">if</span> <span class="p">(</span>interleaved<span class="p">)</span> <span class="p">{</span>
+        <span class="kr">if</span> <span class="p">(</span><span class="kp">length</span><span class="p">(</span><span class="kp">args</span><span class="p">)</span> <span class="o">!=</span> <span class="m">3</span><span class="p">)</span> <span class="p">{</span>
+            <span class="kp">stop</span><span class="p">(</span><span class="s">&quot;DeLoxer interleaved input mode requires exactly 3 arguments&quot;</span><span class="p">)</span>
+        <span class="p">}</span>
+        subject.file <span class="o">&lt;-</span> <span class="kp">args</span><span class="p">[[</span><span class="m">1</span><span class="p">]]</span>
+        read1.file <span class="o">&lt;-</span> <span class="kp">args</span><span class="p">[[</span><span class="m">2</span><span class="p">]]</span>
+        read2.file <span class="o">&lt;-</span> <span class="kc">NULL</span>
+        output.basename <span class="o">&lt;-</span> <span class="kp">args</span><span class="p">[[</span><span class="m">3</span><span class="p">]]</span>
+    <span class="p">}</span> <span class="kr">else</span> <span class="p">{</span>
+        <span class="kr">if</span> <span class="p">(</span><span class="kp">length</span><span class="p">(</span><span class="kp">args</span><span class="p">)</span> <span class="o">!=</span> <span class="m">4</span><span class="p">)</span> <span class="p">{</span>
+            <span class="kp">stop</span><span class="p">(</span><span class="s">&quot;DeLoxer requires exactly 4 arguments&quot;</span><span class="p">)</span>
+        <span class="p">}</span>
+        subject.file <span class="o">&lt;-</span> <span class="kp">args</span><span class="p">[[</span><span class="m">1</span><span class="p">]]</span>
+        read1.file <span class="o">&lt;-</span> <span class="kp">args</span><span class="p">[[</span><span class="m">2</span><span class="p">]]</span>
+        read2.file <span class="o">&lt;-</span> <span class="kp">args</span><span class="p">[[</span><span class="m">3</span><span class="p">]]</span>
+        output.basename <span class="o">&lt;-</span> <span class="kp">args</span><span class="p">[[</span><span class="m">4</span><span class="p">]]</span>
+    <span class="p">}</span>
+
+    subj <span class="o">&lt;-</span> readDNAStringSet<span class="p">(</span>subject.file<span class="p">,</span> format<span class="o">=</span><span class="s">&quot;fasta&quot;</span><span class="p">,</span> nrec<span class="o">=</span><span class="m">1</span><span class="p">)[[</span><span class="m">1</span><span class="p">]]</span>
+
+    yieldSize <span class="o">&lt;-</span> opts<span class="p">[[</span><span class="s">&quot;yield-size&quot;</span><span class="p">]]</span>
+    <span class="kr">if</span> <span class="p">(</span>paired<span class="p">)</span> <span class="p">{</span>
+        tsmsg<span class="p">(</span><span class="s">&quot;Deloxing and classifying paired sequences&quot;</span><span class="p">)</span>
+        read1.stream <span class="o">&lt;-</span> FastqStreamer<span class="p">(</span>read1.file<span class="p">,</span> n<span class="o">=</span>yieldSize<span class="p">)</span>
+        read2.stream <span class="o">&lt;-</span> <span class="kr">if</span> <span class="p">(</span><span class="o">!</span>interleaved<span class="p">)</span> FastqStreamer<span class="p">(</span>read2.file<span class="p">,</span> n<span class="o">=</span>yieldSize<span class="p">)</span>
+        process.chunk <span class="o">&lt;-</span> <span class="kr">function</span><span class="p">(</span>fq1<span class="p">,</span> fq2<span class="p">,</span> <span class="kp">append</span><span class="p">)</span> <span class="p">{</span>
+            <span class="kr">if</span> <span class="p">(</span><span class="kp">length</span><span class="p">(</span>fq1<span class="p">)</span> <span class="o">&lt;</span> <span class="m">1</span><span class="p">)</span>
+                <span class="kr">return</span><span class="p">(</span><span class="kc">TRUE</span><span class="p">)</span>
+            <span class="kr">if</span> <span class="p">(</span>interleaved<span class="p">)</span> <span class="p">{</span>
+                <span class="kp">stopifnot</span><span class="p">(</span><span class="kp">is.null</span><span class="p">(</span>fq2<span class="p">))</span>
+                deint <span class="o">&lt;-</span> deinterleave.pairs<span class="p">(</span>fq1<span class="p">)</span>
+                fq1 <span class="o">&lt;-</span> deint<span class="p">[[</span><span class="m">1</span><span class="p">]]</span>
+                fq2 <span class="o">&lt;-</span> deint<span class="p">[[</span><span class="m">2</span><span class="p">]]</span>
+            <span class="p">}</span> <span class="kr">else</span> <span class="p">{</span>
+                <span class="kr">if</span> <span class="p">(</span><span class="kp">length</span><span class="p">(</span>fq1<span class="p">)</span> <span class="o">!=</span> <span class="kp">length</span><span class="p">(</span>fq2<span class="p">))</span>
+                    <span class="kp">stop</span><span class="p">(</span><span class="s">&quot;Both input files must have equal numbers of reads&quot;</span><span class="p">)</span>
+            <span class="p">}</span>
+            read1 <span class="o">&lt;-</span> as<span class="p">(</span>fq1<span class="p">,</span> <span class="s">&quot;QualityScaledDNAStringSet&quot;</span><span class="p">)</span>
+            read2 <span class="o">&lt;-</span> as<span class="p">(</span>fq2<span class="p">,</span> <span class="s">&quot;QualityScaledDNAStringSet&quot;</span><span class="p">)</span>
+            deloxed.pairs <span class="o">&lt;-</span>
+                delox<span class="p">(</span>subj<span class="p">,</span> read1<span class="p">,</span> read2<span class="p">,</span>
+                      min.call<span class="o">=</span>opts<span class="p">[[</span><span class="s">&quot;min-call&quot;</span><span class="p">]],</span>
+                      interleaved<span class="o">=</span>interleaved<span class="p">,</span>
+                      read1.orientation<span class="o">=</span>opts<span class="p">[[</span><span class="s">&quot;read1-orientation&quot;</span><span class="p">]],</span>
+                      read2.orientation<span class="o">=</span>opts<span class="p">[[</span><span class="s">&quot;read2-orientation&quot;</span><span class="p">]],</span>
+                      align.opts<span class="o">=</span>align.opts<span class="p">)</span>
+            save.deloxed.pairs.as.fastq<span class="p">(</span>deloxed.pairs<span class="o">$</span>read1<span class="p">,</span> deloxed.pairs<span class="o">$</span>read2<span class="p">,</span> output.basename<span class="p">,</span> append<span class="o">=</span><span class="kp">append</span><span class="p">)</span>
+
+            ret <span class="o">&lt;-</span> get.category.counts<span class="p">(</span>deloxed.pairs<span class="p">)</span>
+            <span class="kr">return</span><span class="p">(</span>ret<span class="p">)</span>
+        <span class="p">}</span>
+        fq1 <span class="o">&lt;-</span> yield<span class="p">(</span>read1.stream<span class="p">)</span>
+        fq2 <span class="o">&lt;-</span> <span class="kr">if</span> <span class="p">(</span><span class="o">!</span>interleaved<span class="p">)</span> yield<span class="p">(</span>read2.stream<span class="p">)</span>
+        <span class="kr">if</span> <span class="p">(</span><span class="kp">length</span><span class="p">(</span>fq1<span class="p">)</span> <span class="o">==</span> <span class="m">0</span><span class="p">)</span>
+            <span class="kp">warning</span><span class="p">(</span><span class="s">&quot;No reads were read from the input file.&quot;</span><span class="p">)</span>
+        proc <span class="o">&lt;-</span> mcparallel.quiet<span class="p">(</span>process.chunk<span class="p">(</span>fq1<span class="p">,</span> fq2<span class="p">,</span> append<span class="o">=</span><span class="kc">FALSE</span><span class="p">))</span>
+        reads.processed <span class="o">&lt;-</span> <span class="kp">length</span><span class="p">(</span>fq1<span class="p">)</span> <span class="o">/</span> <span class="kp">ifelse</span><span class="p">(</span>interleaved<span class="p">,</span> <span class="m">2</span><span class="p">,</span> <span class="m">1</span><span class="p">)</span>
+        category.stats <span class="o">&lt;-</span>
+            category.counts <span class="o">&lt;-</span> <span class="kc">NULL</span>
+        <span class="kr">while</span> <span class="p">(</span><span class="kp">length</span><span class="p">(</span>fq1 <span class="o">&lt;-</span> yield<span class="p">(</span>read1.stream<span class="p">)))</span> <span class="p">{</span>
+            <span class="kr">if</span> <span class="p">(</span><span class="o">!</span>interleaved<span class="p">)</span>
+                fq2 <span class="o">&lt;-</span> yield<span class="p">(</span>read2.stream<span class="p">)</span>
+            prev.result <span class="o">&lt;-</span> mccollect<span class="p">(</span>proc<span class="p">)[[</span><span class="m">1</span><span class="p">]]</span>
+            <span class="kr">if</span> <span class="p">(</span>is<span class="p">(</span>prev.result<span class="p">,</span> <span class="s">&quot;try-error&quot;</span><span class="p">))</span> <span class="p">{</span>
+                tsmsg<span class="p">(</span><span class="s">&quot;Encountered error in deloxing subprocess:&quot;</span><span class="p">)</span>
+                <span class="kp">stop</span><span class="p">(</span><span class="kp">attr</span><span class="p">(</span>prev.result<span class="p">,</span> <span class="s">&quot;condition&quot;</span><span class="p">))</span>
+            <span class="p">}</span>
+            <span class="kr">if</span> <span class="p">(</span><span class="kp">is.null</span><span class="p">(</span>category.counts<span class="p">))</span> <span class="p">{</span>
+                category.counts <span class="o">&lt;-</span> prev.result
+            <span class="p">}</span> <span class="kr">else</span> <span class="p">{</span>
+                category.counts <span class="o">&lt;-</span> category.counts <span class="o">+</span> prev.result
+            <span class="p">}</span>
+            tsmsg<span class="p">(</span><span class="s">&quot;Category stats after processing &quot;</span><span class="p">,</span> reads.processed<span class="p">,</span> <span class="s">&quot; reads:&quot;</span><span class="p">)</span>
+            <span class="c1">## category.pct &lt;- setNames(sprintf(&quot;%.3g%%&quot;, category.counts / sum(category.counts) * 100),</span>
+            <span class="c1">##                          names(category.counts))</span>
+            print.stats<span class="p">(</span>category.counts<span class="p">)</span>
+            proc <span class="o">&lt;-</span> mcparallel.quiet<span class="p">(</span>process.chunk<span class="p">(</span>fq1<span class="p">,</span> fq2<span class="p">,</span> append<span class="o">=</span><span class="kc">TRUE</span><span class="p">))</span>
+            reads.processed <span class="o">&lt;-</span> reads.processed <span class="o">+</span> <span class="kp">length</span><span class="p">(</span>fq1<span class="p">)</span> <span class="o">/</span> <span class="kp">ifelse</span><span class="p">(</span>interleaved<span class="p">,</span> <span class="m">2</span><span class="p">,</span> <span class="m">1</span><span class="p">)</span>
+        <span class="p">}</span>
+        <span class="kp">close</span><span class="p">(</span>read1.stream<span class="p">)</span>
+        <span class="kr">if</span> <span class="p">(</span><span class="o">!</span>interleaved<span class="p">)</span> <span class="kp">close</span><span class="p">(</span>read2.stream<span class="p">)</span>
+        prev.result <span class="o">&lt;-</span> mccollect<span class="p">(</span>proc<span class="p">)[[</span><span class="m">1</span><span class="p">]]</span>
+        <span class="kr">if</span> <span class="p">(</span><span class="kp">is.null</span><span class="p">(</span>category.counts<span class="p">))</span> <span class="p">{</span>
+            category.counts <span class="o">&lt;-</span> prev.result
+        <span class="p">}</span> <span class="kr">else</span> <span class="p">{</span>
+            category.counts <span class="o">&lt;-</span> category.counts <span class="o">+</span> prev.result
+        <span class="p">}</span>
+        <span class="kr">if</span> <span class="p">(</span>is<span class="p">(</span>prev.result<span class="p">,</span> <span class="s">&quot;try-error&quot;</span><span class="p">))</span> <span class="p">{</span>
+            tsmsg<span class="p">(</span><span class="s">&quot;Encountered error in deloxing subprocess:&quot;</span><span class="p">)</span>
+            <span class="kp">stop</span><span class="p">(</span><span class="kp">attr</span><span class="p">(</span>prev.result<span class="p">,</span> <span class="s">&quot;condition&quot;</span><span class="p">))</span>
+            <span class="kp">stop</span><span class="p">(</span><span class="s">&quot;Encountered error in deloxing&quot;</span><span class="p">)</span>
+        <span class="p">}</span>
+        tsmsg<span class="p">(</span><span class="s">&quot;Final category stats after processing &quot;</span><span class="p">,</span> reads.processed<span class="p">,</span> <span class="s">&quot; reads:&quot;</span><span class="p">)</span>
+        print.stats<span class="p">(</span>category.counts<span class="p">)</span>
+    <span class="p">}</span> <span class="kr">else</span> <span class="p">{</span>
+        tsmsg<span class="p">(</span><span class="s">&quot;Deloxing single sequences&quot;</span><span class="p">)</span>
+        read1.stream <span class="o">&lt;-</span> FastqStreamer<span class="p">(</span>read1.file<span class="p">,</span> n<span class="o">=</span>yieldSize<span class="p">)</span>
+        process.chunk <span class="o">&lt;-</span> <span class="kr">function</span><span class="p">(</span>fq<span class="p">,</span> <span class="kp">append</span><span class="p">)</span> <span class="p">{</span>
+            <span class="kr">if</span> <span class="p">(</span><span class="kp">length</span><span class="p">(</span>fq<span class="p">)</span> <span class="o">&lt;</span> <span class="m">1</span><span class="p">)</span>
+                <span class="kr">return</span><span class="p">(</span><span class="kc">TRUE</span><span class="p">)</span>
+            reads <span class="o">&lt;-</span> as<span class="p">(</span>fq<span class="p">,</span> <span class="s">&quot;QualityScaledDNAStringSet&quot;</span><span class="p">)</span>
+            deloxed.reads <span class="o">&lt;-</span>
+                delox<span class="p">(</span>subj<span class="p">,</span> reads<span class="p">,</span> <span class="kc">NULL</span><span class="p">,</span>
+                      min.call<span class="o">=</span>opts<span class="p">[[</span><span class="s">&quot;min-call&quot;</span><span class="p">]],</span>
+                      interleaved<span class="o">=</span>interleaved<span class="p">,</span>
+                      read1.orientation<span class="o">=</span>opts<span class="p">[[</span><span class="s">&quot;read1-orientation&quot;</span><span class="p">]],</span>
+                      read2.orientation<span class="o">=</span>opts<span class="p">[[</span><span class="s">&quot;read2-orientation&quot;</span><span class="p">]],</span>
+                      align.opts<span class="o">=</span>align.opts<span class="p">)</span>
+            write.QualityScaledDNAStringSet<span class="p">(</span>deloxed.reads<span class="p">,</span> output.file<span class="p">,</span> append<span class="o">=</span><span class="kp">append</span><span class="p">)</span>
+            <span class="kr">return</span><span class="p">(</span><span class="kc">TRUE</span><span class="p">)</span>
+        <span class="p">}</span>
+        <span class="c1">## First chunk is processed with append=FALSE to start the file</span>
+        fq <span class="o">&lt;-</span> yield<span class="p">(</span>read1.stream<span class="p">)</span>
+        <span class="kr">if</span> <span class="p">(</span><span class="kp">length</span><span class="p">(</span>fq<span class="p">)</span> <span class="o">==</span> <span class="m">0</span><span class="p">)</span>
+            <span class="kp">warning</span><span class="p">(</span><span class="s">&quot;No reads were read from the input file.&quot;</span><span class="p">)</span>
+        proc <span class="o">&lt;-</span> mcparallel.quiet<span class="p">(</span><span class="kp">suppressMessages</span><span class="p">(</span>process.chunk<span class="p">(</span>fq<span class="p">,</span> append<span class="o">=</span><span class="kc">FALSE</span><span class="p">)))</span>
+        reads.processed <span class="o">&lt;-</span> <span class="kp">length</span><span class="p">(</span>fq<span class="p">)</span>
+        <span class="kr">while</span> <span class="p">(</span><span class="kp">length</span><span class="p">(</span>fq <span class="o">&lt;-</span> yield<span class="p">(</span>read1.stream<span class="p">)))</span> <span class="p">{</span>
+            prev.result <span class="o">&lt;-</span> mccollect<span class="p">(</span>proc<span class="p">)[[</span><span class="m">1</span><span class="p">]]</span>
+            <span class="kr">if</span> <span class="p">(</span>is<span class="p">(</span>prev.result<span class="p">,</span> <span class="s">&quot;try-error&quot;</span><span class="p">))</span> <span class="p">{</span>
+                tsmsg<span class="p">(</span><span class="s">&quot;Encountered error in deloxing subprocess:&quot;</span><span class="p">)</span>
+                <span class="kp">stop</span><span class="p">(</span><span class="kp">attr</span><span class="p">(</span>prev.result<span class="p">,</span> <span class="s">&quot;condition&quot;</span><span class="p">))</span>
+                <span class="kp">stop</span><span class="p">(</span><span class="s">&quot;Encountered error in deloxing&quot;</span><span class="p">)</span>
+            <span class="p">}</span>
+            tsmsg<span class="p">(</span><span class="s">&quot;Processed &quot;</span><span class="p">,</span> reads.processed<span class="p">,</span> <span class="s">&quot; reads&quot;</span><span class="p">)</span>
+            proc <span class="o">&lt;-</span> mcparallel.quiet<span class="p">(</span><span class="kp">suppressMessages</span><span class="p">(</span>process.chunk<span class="p">(</span>fq<span class="p">,</span> append<span class="o">=</span><span class="kc">TRUE</span><span class="p">)))</span>
+            reads.processed <span class="o">&lt;-</span> reads.processed <span class="o">+</span> <span class="kp">length</span><span class="p">(</span>fq<span class="p">)</span>
+        <span class="p">}</span>
+        <span class="kp">close</span><span class="p">(</span>read1.stream<span class="p">)</span>
+        prev.result <span class="o">&lt;-</span> mccollect<span class="p">(</span>proc<span class="p">)[[</span><span class="m">1</span><span class="p">]]</span>
+        <span class="kr">if</span> <span class="p">(</span>is<span class="p">(</span>prev.result<span class="p">,</span> <span class="s">&quot;try-error&quot;</span><span class="p">))</span> <span class="p">{</span>
+            tsmsg<span class="p">(</span><span class="s">&quot;Encountered error in deloxing subprocess:&quot;</span><span class="p">)</span>
+            <span class="kp">stop</span><span class="p">(</span><span class="kp">attr</span><span class="p">(</span>prev.result<span class="p">,</span> <span class="s">&quot;condition&quot;</span><span class="p">))</span>
+            <span class="kp">stop</span><span class="p">(</span><span class="s">&quot;Encountered error in deloxing&quot;</span><span class="p">)</span>
+        <span class="p">}</span>
+        tsmsg<span class="p">(</span><span class="s">&quot;Processed &quot;</span><span class="p">,</span> reads.processed<span class="p">,</span> <span class="s">&quot; reads&quot;</span><span class="p">)</span>
+    <span class="p">}</span>
+    tsmsg<span class="p">(</span><span class="s">&quot;Finished successful run&quot;</span><span class="p">)</span>
+<span class="p">}</span>
+
+main<span class="p">()</span>
+</pre></div>
+</body>
+</html>

+ 306 - 0
examples/Salomon/blockbuster-pipeline.R.html

@@ -0,0 +1,306 @@
+<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01//EN"
+   "http://www.w3.org/TR/html4/strict.dtd">
+
+<html>
+<head>
+  <title></title>
+  <meta http-equiv="content-type" content="text/html; charset=utf-8">
+  <style type="text/css">
+td.linenos { background-color: #f0f0f0; padding-right: 10px; }
+span.lineno { background-color: #f0f0f0; padding: 0 5px 0 5px; }
+pre { line-height: 125%; }
+body .hll { background-color: #ffffcc }
+body  { background: #f8f8f8; }
+body .c { color: #408080; font-style: italic } /* Comment */
+body .err { border: 1px solid #FF0000 } /* Error */
+body .k { color: #008000; font-weight: bold } /* Keyword */
+body .o { color: #666666 } /* Operator */
+body .ch { color: #408080; font-style: italic } /* Comment.Hashbang */
+body .cm { color: #408080; font-style: italic } /* Comment.Multiline */
+body .cp { color: #BC7A00 } /* Comment.Preproc */
+body .cpf { color: #408080; font-style: italic } /* Comment.PreprocFile */
+body .c1 { color: #408080; font-style: italic } /* Comment.Single */
+body .cs { color: #408080; font-style: italic } /* Comment.Special */
+body .gd { color: #A00000 } /* Generic.Deleted */
+body .ge { font-style: italic } /* Generic.Emph */
+body .gr { color: #FF0000 } /* Generic.Error */
+body .gh { color: #000080; font-weight: bold } /* Generic.Heading */
+body .gi { color: #00A000 } /* Generic.Inserted */
+body .go { color: #888888 } /* Generic.Output */
+body .gp { color: #000080; font-weight: bold } /* Generic.Prompt */
+body .gs { font-weight: bold } /* Generic.Strong */
+body .gu { color: #800080; font-weight: bold } /* Generic.Subheading */
+body .gt { color: #0044DD } /* Generic.Traceback */
+body .kc { color: #008000; font-weight: bold } /* Keyword.Constant */
+body .kd { color: #008000; font-weight: bold } /* Keyword.Declaration */
+body .kn { color: #008000; font-weight: bold } /* Keyword.Namespace */
+body .kp { color: #008000 } /* Keyword.Pseudo */
+body .kr { color: #008000; font-weight: bold } /* Keyword.Reserved */
+body .kt { color: #B00040 } /* Keyword.Type */
+body .m { color: #666666 } /* Literal.Number */
+body .s { color: #BA2121 } /* Literal.String */
+body .na { color: #7D9029 } /* Name.Attribute */
+body .nb { color: #008000 } /* Name.Builtin */
+body .nc { color: #0000FF; font-weight: bold } /* Name.Class */
+body .no { color: #880000 } /* Name.Constant */
+body .nd { color: #AA22FF } /* Name.Decorator */
+body .ni { color: #999999; font-weight: bold } /* Name.Entity */
+body .ne { color: #D2413A; font-weight: bold } /* Name.Exception */
+body .nf { color: #0000FF } /* Name.Function */
+body .nl { color: #A0A000 } /* Name.Label */
+body .nn { color: #0000FF; font-weight: bold } /* Name.Namespace */
+body .nt { color: #008000; font-weight: bold } /* Name.Tag */
+body .nv { color: #19177C } /* Name.Variable */
+body .ow { color: #AA22FF; font-weight: bold } /* Operator.Word */
+body .w { color: #bbbbbb } /* Text.Whitespace */
+body .mb { color: #666666 } /* Literal.Number.Bin */
+body .mf { color: #666666 } /* Literal.Number.Float */
+body .mh { color: #666666 } /* Literal.Number.Hex */
+body .mi { color: #666666 } /* Literal.Number.Integer */
+body .mo { color: #666666 } /* Literal.Number.Oct */
+body .sa { color: #BA2121 } /* Literal.String.Affix */
+body .sb { color: #BA2121 } /* Literal.String.Backtick */
+body .sc { color: #BA2121 } /* Literal.String.Char */
+body .dl { color: #BA2121 } /* Literal.String.Delimiter */
+body .sd { color: #BA2121; font-style: italic } /* Literal.String.Doc */
+body .s2 { color: #BA2121 } /* Literal.String.Double */
+body .se { color: #BB6622; font-weight: bold } /* Literal.String.Escape */
+body .sh { color: #BA2121 } /* Literal.String.Heredoc */
+body .si { color: #BB6688; font-weight: bold } /* Literal.String.Interpol */
+body .sx { color: #008000 } /* Literal.String.Other */
+body .sr { color: #BB6688 } /* Literal.String.Regex */
+body .s1 { color: #BA2121 } /* Literal.String.Single */
+body .ss { color: #19177C } /* Literal.String.Symbol */
+body .bp { color: #008000 } /* Name.Builtin.Pseudo */
+body .fm { color: #0000FF } /* Name.Function.Magic */
+body .vc { color: #19177C } /* Name.Variable.Class */
+body .vg { color: #19177C } /* Name.Variable.Global */
+body .vi { color: #19177C } /* Name.Variable.Instance */
+body .vm { color: #19177C } /* Name.Variable.Magic */
+body .il { color: #666666 } /* Literal.Number.Integer.Long */
+
+  </style>
+</head>
+<body>
+<h2></h2>
+
+<div class="highlight"><pre><span></span><span class="c1">#!/usr/bin/env Rscript</span>
+
+<span class="kn">source</span><span class="p">(</span><span class="s">&quot;common.R&quot;</span><span class="p">)</span>
+<span class="kn">library</span><span class="p">(</span>BSgenome.Hsapiens.UCSC.hg19<span class="p">)</span>
+
+textConnectionFromLines <span class="o">&lt;-</span> <span class="kr">function</span><span class="p">(</span>lines<span class="p">,</span> linesep<span class="o">=</span><span class="s">&quot;\n&quot;</span><span class="p">)</span> <span class="p">{</span>
+  <span class="kp">textConnection</span><span class="p">(</span>str_c<span class="p">(</span><span class="kp">sprintf</span><span class="p">(</span><span class="s">&quot;%s%s&quot;</span><span class="p">,</span> lines<span class="p">,</span> linesep<span class="p">),</span> collapse<span class="o">=</span><span class="s">&quot;&quot;</span><span class="p">))</span>
+<span class="p">}</span>
+
+<span class="c1">## Takes a GRangesList</span>
+calculate.block.entropy <span class="o">&lt;-</span> <span class="kr">function</span><span class="p">(</span>grl<span class="p">,</span> expr.column<span class="o">=</span><span class="s">&quot;tagExpression&quot;</span><span class="p">)</span> <span class="p">{</span>
+  groupfac <span class="o">&lt;-</span> <span class="kp">factor</span><span class="p">(</span><span class="kp">rep</span><span class="p">(</span><span class="kp">names</span><span class="p">(</span>grl<span class="p">),</span> elementLengths<span class="p">(</span>grl<span class="p">)))</span>
+  exprs <span class="o">&lt;-</span> <span class="kp">as.vector</span><span class="p">(</span>elementMetadata<span class="p">(</span><span class="kp">unlist</span><span class="p">(</span>grl<span class="p">))[[</span>expr.column<span class="p">]])</span>
+  total.exprs <span class="o">&lt;-</span> aggregate<span class="p">(</span>exprs<span class="p">,</span> by<span class="o">=</span><span class="kt">list</span><span class="p">(</span>groupfac<span class="p">),</span> FUN<span class="o">=</span><span class="kp">sum</span><span class="p">)</span><span class="o">$</span>x
+  qi <span class="o">&lt;-</span> exprs <span class="o">/</span> <span class="kp">rep</span><span class="p">(</span>total.exprs<span class="p">,</span> elementLengths<span class="p">(</span>grl<span class="p">))</span>
+  qi.times.log <span class="o">&lt;-</span> qi <span class="o">*</span> <span class="kp">log2</span><span class="p">(</span>qi<span class="p">)</span>
+  results <span class="o">&lt;-</span> <span class="o">-</span>aggregate<span class="p">(</span>qi.times.log<span class="p">,</span> by<span class="o">=</span><span class="kt">list</span><span class="p">(</span>groupfac<span class="p">),</span> FUN<span class="o">=</span><span class="kp">sum</span><span class="p">)</span><span class="o">$</span>x
+  <span class="kp">names</span><span class="p">(</span>results<span class="p">)</span> <span class="o">&lt;-</span> <span class="kp">names</span><span class="p">(</span>grl<span class="p">)</span>
+  results
+<span class="p">}</span>
+
+<span class="c1">## http://hoffmann.bioinf.uni-leipzig.de/LIFE/blockbuster.html</span>
+read.blockbuster.tag.output <span class="o">&lt;-</span> <span class="kr">function</span><span class="p">(</span>bbout<span class="p">)</span> <span class="p">{</span>
+  x <span class="o">&lt;-</span> <span class="kp">readLines</span><span class="p">(</span>bbout<span class="p">)</span>
+  cluster.line.numbers <span class="o">&lt;-</span> <span class="kp">which</span><span class="p">(</span>str_sub<span class="p">(</span>x<span class="p">,</span> <span class="m">1</span><span class="p">,</span><span class="m">1</span><span class="p">)</span> <span class="o">==</span> <span class="s">&quot;&gt;&quot;</span><span class="p">)</span>
+  cluster.lines <span class="o">&lt;-</span> str_sub<span class="p">(</span>x<span class="p">[</span>cluster.line.numbers<span class="p">],</span> <span class="m">2</span><span class="p">)</span>
+  cluster.table <span class="o">&lt;-</span> read.table<span class="p">(</span>textConnectionFromLines<span class="p">(</span>cluster.lines<span class="p">),</span>
+                              col.names<span class="o">=</span><span class="kt">c</span><span class="p">(</span><span class="s">&quot;clusterID&quot;</span><span class="p">,</span> <span class="s">&quot;chrom&quot;</span><span class="p">,</span> <span class="s">&quot;clusterStart&quot;</span><span class="p">,</span> <span class="s">&quot;clusterEnd&quot;</span><span class="p">,</span> <span class="s">&quot;strand&quot;</span><span class="p">,</span> <span class="s">&quot;ClusterExpression&quot;</span><span class="p">,</span> <span class="s">&quot;tagCount&quot;</span><span class="p">,</span> <span class="s">&quot;blockCount&quot;</span><span class="p">),</span>
+                              colClasses<span class="o">=</span><span class="kt">c</span><span class="p">(</span><span class="s">&quot;character&quot;</span><span class="p">,</span> <span class="s">&quot;factor&quot;</span><span class="p">,</span> <span class="s">&quot;integer&quot;</span><span class="p">,</span> <span class="s">&quot;integer&quot;</span><span class="p">,</span> <span class="s">&quot;factor&quot;</span><span class="p">,</span> <span class="s">&quot;numeric&quot;</span><span class="p">,</span> <span class="s">&quot;integer&quot;</span><span class="p">,</span> <span class="s">&quot;integer&quot;</span><span class="p">))</span>
+  tag.lines <span class="o">&lt;-</span> x<span class="p">[</span><span class="o">-</span>cluster.line.numbers<span class="p">]</span>
+  cluster.tag.line.counts <span class="o">&lt;-</span> cluster.line.numbers<span class="p">[</span><span class="m">-1</span><span class="p">]</span> <span class="o">-</span> cluster.line.numbers<span class="p">[</span><span class="o">-</span><span class="kp">length</span><span class="p">(</span>cluster.line.numbers<span class="p">)]</span> <span class="o">-</span> <span class="m">1</span>
+  cluster.tag.line.counts <span class="o">&lt;-</span> <span class="kt">c</span><span class="p">(</span>cluster.tag.line.counts<span class="p">,</span> <span class="kp">length</span><span class="p">(</span>tag.lines<span class="p">)</span> <span class="o">-</span> <span class="kp">sum</span><span class="p">(</span>cluster.tag.line.counts<span class="p">))</span>
+  tag.table <span class="o">&lt;-</span> read.table<span class="p">(</span>textConnectionFromLines<span class="p">(</span>tag.lines<span class="p">),</span>
+                          col.names<span class="o">=</span><span class="kt">c</span><span class="p">(</span><span class="s">&quot;tagChrom&quot;</span><span class="p">,</span> <span class="s">&quot;tagStart&quot;</span><span class="p">,</span> <span class="s">&quot;tagEnd&quot;</span><span class="p">,</span> <span class="s">&quot;tagID&quot;</span><span class="p">,</span> <span class="s">&quot;tagExpression&quot;</span><span class="p">,</span> <span class="s">&quot;tagStrand&quot;</span><span class="p">,</span> <span class="s">&quot;blockNb&quot;</span><span class="p">),</span>
+                          colClasses<span class="o">=</span><span class="kt">c</span><span class="p">(</span><span class="s">&quot;factor&quot;</span><span class="p">,</span> <span class="s">&quot;integer&quot;</span><span class="p">,</span> <span class="s">&quot;integer&quot;</span><span class="p">,</span> <span class="s">&quot;character&quot;</span><span class="p">,</span> <span class="s">&quot;numeric&quot;</span><span class="p">,</span> <span class="s">&quot;factor&quot;</span><span class="p">,</span> <span class="s">&quot;integer&quot;</span><span class="p">))</span>
+  tag.table<span class="o">$</span>clusterID <span class="o">&lt;-</span> <span class="kp">rep</span><span class="p">(</span>cluster.table<span class="o">$</span>clusterID<span class="p">,</span> cluster.tag.line.counts<span class="p">)</span>
+  tag.table<span class="o">$</span>blockID <span class="o">&lt;-</span> <span class="kp">sprintf</span><span class="p">(</span><span class="s">&quot;%s/B%s&quot;</span><span class="p">,</span> tag.table<span class="o">$</span>clusterID<span class="p">,</span> tag.table<span class="o">$</span>blockNb<span class="p">)</span>
+  tags <span class="o">&lt;-</span> table.to.granges<span class="p">(</span>tag.table<span class="p">,</span> seqnames.column<span class="o">=</span><span class="s">&quot;tagChrom&quot;</span><span class="p">,</span> start.column<span class="o">=</span><span class="s">&quot;tagStart&quot;</span><span class="p">,</span> end.column<span class="o">=</span><span class="s">&quot;tagEnd&quot;</span><span class="p">,</span> strand.column<span class="o">=</span><span class="s">&quot;tagStrand&quot;</span><span class="p">,</span> seqlengths<span class="o">=</span><span class="s">&quot;hg19&quot;</span><span class="p">)</span>
+  tags <span class="o">&lt;-</span> <span class="kp">split</span><span class="p">(</span>tags<span class="p">,</span> <span class="kp">as.vector</span><span class="p">(</span>elementMetadata<span class="p">(</span>tags<span class="p">)</span><span class="o">$</span>blockID<span class="p">))</span>
+  <span class="kr">return</span><span class="p">(</span>tags<span class="p">)</span>
+<span class="p">}</span>
+
+read.blockbuster.output <span class="o">&lt;-</span> <span class="kr">function</span><span class="p">(</span>bbout<span class="p">,</span> bbout.tags<span class="o">=</span><span class="kc">NULL</span><span class="p">)</span> <span class="p">{</span>
+  x <span class="o">&lt;-</span> <span class="kp">readLines</span><span class="p">(</span>bbout<span class="p">)</span>
+  cluster.line.numbers <span class="o">&lt;-</span> <span class="kp">which</span><span class="p">(</span>str_sub<span class="p">(</span>x<span class="p">,</span> <span class="m">1</span><span class="p">,</span><span class="m">1</span><span class="p">)</span> <span class="o">==</span> <span class="s">&quot;&gt;&quot;</span><span class="p">)</span>
+  cluster.lines <span class="o">&lt;-</span> str_sub<span class="p">(</span>x<span class="p">[</span>cluster.line.numbers<span class="p">],</span> <span class="m">2</span><span class="p">)</span>
+  block.lines <span class="o">&lt;-</span> x<span class="p">[</span><span class="o">-</span>cluster.line.numbers<span class="p">]</span>
+  cluster.table <span class="o">&lt;-</span> read.table<span class="p">(</span>textConnectionFromLines<span class="p">(</span>cluster.lines<span class="p">),</span>
+                              col.names<span class="o">=</span><span class="kt">c</span><span class="p">(</span><span class="s">&quot;clusterID&quot;</span><span class="p">,</span> <span class="s">&quot;chrom&quot;</span><span class="p">,</span> <span class="s">&quot;clusterStart&quot;</span><span class="p">,</span> <span class="s">&quot;clusterEnd&quot;</span><span class="p">,</span> <span class="s">&quot;strand&quot;</span><span class="p">,</span> <span class="s">&quot;ClusterExpression&quot;</span><span class="p">,</span> <span class="s">&quot;tagCount&quot;</span><span class="p">,</span> <span class="s">&quot;blockCount&quot;</span><span class="p">),</span>
+                              colClasses<span class="o">=</span><span class="kt">c</span><span class="p">(</span><span class="s">&quot;character&quot;</span><span class="p">,</span> <span class="s">&quot;factor&quot;</span><span class="p">,</span> <span class="s">&quot;integer&quot;</span><span class="p">,</span> <span class="s">&quot;integer&quot;</span><span class="p">,</span> <span class="s">&quot;factor&quot;</span><span class="p">,</span> <span class="s">&quot;numeric&quot;</span><span class="p">,</span> <span class="s">&quot;integer&quot;</span><span class="p">,</span> <span class="s">&quot;integer&quot;</span><span class="p">))</span>
+  clusters <span class="o">&lt;-</span> table.to.granges<span class="p">(</span>cluster.table<span class="p">,</span> seqnames.column<span class="o">=</span><span class="s">&quot;chrom&quot;</span><span class="p">,</span> start.column<span class="o">=</span><span class="s">&quot;clusterStart&quot;</span><span class="p">,</span> end.column<span class="o">=</span><span class="s">&quot;clusterEnd&quot;</span><span class="p">,</span> seqlengths<span class="o">=</span><span class="s">&quot;hg19&quot;</span><span class="p">)</span>
+  block.table <span class="o">&lt;-</span> read.table<span class="p">(</span>textConnectionFromLines<span class="p">(</span>block.lines<span class="p">),</span>
+                            col.names<span class="o">=</span><span class="kt">c</span><span class="p">(</span><span class="s">&quot;blockNb&quot;</span><span class="p">,</span> <span class="s">&quot;blockChrom&quot;</span><span class="p">,</span> <span class="s">&quot;blockStart&quot;</span><span class="p">,</span> <span class="s">&quot;blockEnd&quot;</span><span class="p">,</span> <span class="s">&quot;blockStrand&quot;</span><span class="p">,</span> <span class="s">&quot;blockExpression&quot;</span><span class="p">,</span> <span class="s">&quot;readCount&quot;</span><span class="p">),</span>
+                            colClasses<span class="o">=</span><span class="kt">c</span><span class="p">(</span><span class="s">&quot;integer&quot;</span><span class="p">,</span> <span class="s">&quot;factor&quot;</span><span class="p">,</span> <span class="s">&quot;integer&quot;</span><span class="p">,</span> <span class="s">&quot;integer&quot;</span><span class="p">,</span> <span class="s">&quot;factor&quot;</span><span class="p">,</span> <span class="s">&quot;numeric&quot;</span><span class="p">,</span> <span class="s">&quot;integer&quot;</span><span class="p">))</span>
+  block.table<span class="o">$</span>clusterID <span class="o">&lt;-</span> <span class="kp">rep</span><span class="p">(</span>cluster.table<span class="o">$</span>clusterID<span class="p">,</span> cluster.table<span class="o">$</span>blockCount<span class="p">)</span>
+  block.table<span class="o">$</span>blockID <span class="o">&lt;-</span> <span class="kp">sprintf</span><span class="p">(</span><span class="s">&quot;%s/B%s&quot;</span><span class="p">,</span> block.table<span class="o">$</span>clusterID<span class="p">,</span> block.table<span class="o">$</span>blockNb<span class="p">)</span>
+  blocks. <span class="o">&lt;-</span> table.to.granges<span class="p">(</span>block.table<span class="p">,</span> seqnames.column<span class="o">=</span><span class="s">&quot;blockChrom&quot;</span><span class="p">,</span> start.column<span class="o">=</span><span class="s">&quot;blockStart&quot;</span><span class="p">,</span> end.column<span class="o">=</span><span class="s">&quot;blockEnd&quot;</span><span class="p">,</span> strand.column<span class="o">=</span><span class="s">&quot;blockStrand&quot;</span><span class="p">,</span> seqlengths<span class="o">=</span><span class="s">&quot;hg19&quot;</span><span class="p">)</span>
+  <span class="c1">## blocks. &lt;- split(blocks., as.vector(elementMetadata(blocks.)$clusterID))</span>
+  retval <span class="o">&lt;-</span> <span class="kt">list</span><span class="p">(</span>clusters<span class="o">=</span>clusters<span class="p">,</span> blocks<span class="o">=</span>blocks.<span class="p">)</span>
+  <span class="kr">if</span> <span class="p">(</span><span class="o">!</span><span class="kp">is.null</span><span class="p">(</span>bbout.tags<span class="p">))</span> <span class="p">{</span>
+    retval<span class="o">$</span>tags <span class="o">&lt;-</span> read.blockbuster.tag.output<span class="p">(</span>bbout.tags<span class="p">)</span>
+  <span class="p">}</span>
+  <span class="kr">return</span><span class="p">(</span>retval<span class="p">)</span>
+<span class="p">}</span>
+
+
+blockbuster.path <span class="o">&lt;-</span> <span class="s">&quot;/home/ryan/bin/blockbuster&quot;</span>
+
+run.blockbuster <span class="o">&lt;-</span> <span class="kr">function</span><span class="p">(</span>gr<span class="p">,</span> <span class="kc">...</span><span class="p">)</span> <span class="p">{</span>
+  temp.bed.file <span class="o">&lt;-</span> <span class="kp">tempfile</span><span class="p">(</span>fileext<span class="o">=</span><span class="s">&quot;.bed&quot;</span><span class="p">)</span>
+  temp.bbout.file <span class="o">&lt;-</span> <span class="kp">tempfile</span><span class="p">(</span>fileext<span class="o">=</span><span class="s">&quot;.bbout&quot;</span><span class="p">)</span>
+  temp.bbout.tag.file <span class="o">&lt;-</span> <span class="kp">tempfile</span><span class="p">(</span>fileext<span class="o">=</span><span class="s">&quot;.tag.bbout&quot;</span><span class="p">)</span>
+  <span class="kp">tryCatch</span><span class="p">({</span>
+    export<span class="p">(</span><span class="kp">sort</span><span class="p">(</span>gr<span class="p">),</span> temp.bed.file<span class="p">,</span> format<span class="o">=</span><span class="s">&quot;BED&quot;</span><span class="p">)</span>
+    extra.args <span class="o">&lt;-</span> <span class="kt">list</span><span class="p">(</span><span class="kc">...</span><span class="p">)</span>
+    bbargs <span class="o">&lt;-</span> <span class="kt">c</span><span class="p">(</span><span class="kp">rbind</span><span class="p">(</span><span class="kp">sprintf</span><span class="p">(</span><span class="s">&quot;-%s&quot;</span><span class="p">,</span> <span class="kp">names</span><span class="p">(</span>extra.args<span class="p">)),</span> extra.args<span class="p">),</span> <span class="s">&quot;-print&quot;</span><span class="p">,</span> <span class="s">&quot;1&quot;</span><span class="p">,</span> temp.bed.file<span class="p">)</span>
+    bbargs.tags <span class="o">&lt;-</span> <span class="kt">c</span><span class="p">(</span><span class="kp">rbind</span><span class="p">(</span><span class="kp">sprintf</span><span class="p">(</span><span class="s">&quot;-%s&quot;</span><span class="p">,</span> <span class="kp">names</span><span class="p">(</span>extra.args<span class="p">)),</span> extra.args<span class="p">),</span> <span class="s">&quot;-print&quot;</span><span class="p">,</span> <span class="s">&quot;2&quot;</span><span class="p">,</span> temp.bed.file<span class="p">)</span>
+    <span class="kp">system2</span><span class="p">(</span>blockbuster.path<span class="p">,</span> args<span class="o">=</span>bbargs<span class="p">,</span>
+            stdout<span class="o">=</span>temp.bbout.file<span class="p">)</span>
+    <span class="kp">system2</span><span class="p">(</span>blockbuster.path<span class="p">,</span> args<span class="o">=</span>bbargs.tags<span class="p">,</span>
+            stdout<span class="o">=</span>temp.bbout.tag.file<span class="p">)</span>
+    x <span class="o">&lt;-</span> read.blockbuster.output<span class="p">(</span>temp.bbout.file<span class="p">,</span> temp.bbout.tag.file<span class="p">)</span>
+    x
+  <span class="p">},</span> finally<span class="o">=</span><span class="p">{</span>
+    <span class="kp">unlink</span><span class="p">(</span><span class="kt">c</span><span class="p">(</span>temp.bed.file<span class="p">,</span> temp.bbout.file<span class="p">,</span> temp.bbout.tag.file<span class="p">))</span>
+  <span class="p">})</span>
+<span class="p">}</span>
+
+<span class="c1">## Load the reads from the bam file</span>
+infile <span class="o">&lt;-</span> <span class="s">&quot;./all-results.bam&quot;</span>
+read.ranges <span class="o">&lt;-</span> <span class="p">{</span>
+  x <span class="o">&lt;-</span> readAlignedRanges<span class="p">(</span>infile<span class="p">,</span> include.reads<span class="o">=</span><span class="kc">TRUE</span><span class="p">)</span>
+  <span class="c1">## Throw away unneeded columns</span>
+  elementMetadata<span class="p">(</span>x<span class="p">)</span> <span class="o">&lt;-</span> <span class="kp">subset</span><span class="p">(</span>elementMetadata<span class="p">(</span>x<span class="p">),</span> select<span class="o">=-</span><span class="kt">c</span><span class="p">(</span>id<span class="p">,</span>qual<span class="p">,</span>flag<span class="p">))</span>
+  read.multi.map.counts <span class="o">&lt;-</span> <span class="kp">table</span><span class="p">(</span><span class="kp">as.vector</span><span class="p">(</span>elementMetadata<span class="p">(</span>x<span class="p">)</span><span class="o">$</span><span class="kp">seq</span><span class="p">))</span>
+  elementMetadata<span class="p">(</span>x<span class="p">)</span><span class="o">$</span>multimap <span class="o">&lt;-</span> Rle<span class="p">(</span><span class="kp">as.vector</span><span class="p">(</span>read.multi.map.counts<span class="p">[</span><span class="kp">as.vector</span><span class="p">(</span>elementMetadata<span class="p">(</span>x<span class="p">)</span><span class="o">$</span><span class="kp">seq</span><span class="p">)]))</span>
+  x
+<span class="p">}</span>
+
+<span class="c1">## Get needed annotations</span>
+
+<span class="c1">## rRNA &amp; tRNA (from the repeats table)</span>
+repeat.table <span class="o">&lt;-</span> get.ucsc.table<span class="p">(</span><span class="s">&quot;rmsk&quot;</span><span class="p">,</span> <span class="s">&quot;rmsk&quot;</span><span class="p">,</span> genome<span class="o">=</span><span class="s">&quot;hg19&quot;</span><span class="p">)</span>
+repeat.ranges <span class="o">&lt;-</span> table.to.granges<span class="p">(</span>repeat.table<span class="p">,</span> seqnames.column<span class="o">=</span><span class="s">&quot;genoName&quot;</span><span class="p">,</span> start.column<span class="o">=</span><span class="s">&quot;genoStart&quot;</span><span class="p">,</span> end.column<span class="o">=</span><span class="s">&quot;genoEnd&quot;</span><span class="p">,</span> seqlengths<span class="o">=</span><span class="s">&quot;hg19&quot;</span><span class="p">)</span>
+trna.ranges <span class="o">&lt;-</span> repeat.ranges<span class="p">[</span>elementMetadata<span class="p">(</span>repeat.ranges<span class="p">)</span><span class="o">$</span>repClass <span class="o">==</span> <span class="s">&quot;tRNA&quot;</span><span class="p">]</span>
+rrna.ranges <span class="o">&lt;-</span> repeat.ranges<span class="p">[</span>elementMetadata<span class="p">(</span>repeat.ranges<span class="p">)</span><span class="o">$</span>repClass <span class="o">==</span> <span class="s">&quot;rRNA&quot;</span><span class="p">]</span>
+
+<span class="c1">## miRNA &amp; snoRNA</span>
+small.rna.table <span class="o">&lt;-</span> <span class="kp">subset</span><span class="p">(</span>get.ucsc.table<span class="p">(</span><span class="s">&quot;wgRna&quot;</span><span class="p">,</span> <span class="s">&quot;wgRna&quot;</span><span class="p">,</span> genome<span class="o">=</span><span class="s">&quot;hg19&quot;</span><span class="p">),</span> select<span class="o">=-</span><span class="kt">c</span><span class="p">(</span>thickStart<span class="p">,</span>thickEnd<span class="p">,</span>bin<span class="p">))</span>
+small.rna.ranges <span class="o">&lt;-</span> table.to.granges<span class="p">(</span>small.rna.table<span class="p">,</span> seqnames.column<span class="o">=</span><span class="s">&quot;chrom&quot;</span><span class="p">,</span> start.column<span class="o">=</span><span class="s">&quot;chromStart&quot;</span><span class="p">,</span> end.column<span class="o">=</span><span class="s">&quot;chromEnd&quot;</span><span class="p">,</span> seqlengths<span class="o">=</span><span class="s">&quot;hg19&quot;</span><span class="p">)</span>
+miRNA.types <span class="o">&lt;-</span> <span class="s">&quot;miRNA&quot;</span>
+snoRNA.types <span class="o">&lt;-</span> <span class="kt">c</span><span class="p">(</span><span class="s">&quot;CDBox&quot;</span><span class="p">,</span> <span class="s">&quot;HAcaBox&quot;</span><span class="p">)</span>
+miRNA.ranges <span class="o">&lt;-</span> small.rna.ranges<span class="p">[</span> elementMetadata<span class="p">(</span>small.rna.ranges<span class="p">)</span><span class="o">$</span>type <span class="o">%in%</span> miRNA.types <span class="p">]</span>
+snoRNA.ranges <span class="o">&lt;-</span> small.rna.ranges<span class="p">[</span> elementMetadata<span class="p">(</span>small.rna.ranges<span class="p">)</span><span class="o">$</span>type <span class="o">%in%</span> snoRNA.types <span class="p">]</span>
+
+<span class="c1">## chrM</span>
+chrM.range <span class="o">&lt;-</span> GRanges<span class="p">(</span>seqnames<span class="o">=</span><span class="s">&quot;chrM&quot;</span><span class="p">,</span> IRanges<span class="p">(</span>start<span class="o">=</span><span class="m">1</span><span class="p">,</span>end<span class="o">=</span>seqlengths<span class="p">(</span>read.ranges<span class="p">)[[</span><span class="s">&quot;chrM&quot;</span><span class="p">]]),</span> seqlengths<span class="o">=</span>seqlengths<span class="p">(</span>read.ranges<span class="p">))</span>
+
+<span class="c1">## For each sequence that maps once to an annotated rRNA, tRNA, miRNA,</span>
+<span class="c1">## or chrM, remove *all* mappings for that sequence.</span>
+forbidden.ranges <span class="o">&lt;-</span>
+  <span class="kp">Reduce</span><span class="p">(</span><span class="kp">append</span><span class="p">,</span>
+         llply<span class="p">(</span><span class="kt">list</span><span class="p">(</span>rrna.ranges<span class="p">,</span>
+                    trna.ranges<span class="p">,</span>
+                    <span class="c1">## miRNA.ranges,</span>
+                    chrM.range<span class="p">),</span>
+               <span class="kr">function</span><span class="p">(</span>x<span class="p">)</span> <span class="p">{</span> elementMetadata<span class="p">(</span>x<span class="p">)</span> <span class="o">&lt;-</span> <span class="kc">NULL</span><span class="p">;</span> x <span class="p">}))</span>
+
+generate.null.ranges <span class="o">&lt;-</span> <span class="kr">function</span><span class="p">(</span>y<span class="p">)</span> <span class="p">{</span>
+  x <span class="o">&lt;-</span> GRanges<span class="p">(</span>seqnames<span class="o">=</span><span class="kp">names</span><span class="p">(</span>seqlengths<span class="p">(</span>y<span class="p">)),</span> ranges<span class="o">=</span>IRanges<span class="p">(</span><span class="m">1</span><span class="p">,</span><span class="m">0</span><span class="p">),</span> strand<span class="o">=</span><span class="s">&quot;*&quot;</span><span class="p">,</span> seqlengths<span class="o">=</span>seqlengths<span class="p">(</span>y<span class="p">))</span>
+  <span class="kr">for</span> <span class="p">(</span>i <span class="kr">in</span> <span class="kp">names</span><span class="p">(</span>elementMetadata<span class="p">(</span>y<span class="p">)))</span> <span class="p">{</span>
+    elementMetadata<span class="p">(</span>x<span class="p">)[[</span>i<span class="p">]]</span> <span class="o">&lt;-</span> as<span class="p">(</span><span class="kc">NA</span><span class="p">,</span> <span class="kp">class</span><span class="p">(</span><span class="kp">as.vector</span><span class="p">(</span>elementMetadata<span class="p">(</span>y<span class="p">)[[</span>i<span class="p">]])))</span>
+  <span class="p">}</span>
+  x
+<span class="p">}</span>
+
+select.nearest <span class="o">&lt;-</span> <span class="kr">function</span><span class="p">(</span>x<span class="p">,</span> y<span class="p">)</span> <span class="p">{</span>
+  y <span class="o">&lt;-</span> <span class="kp">append</span><span class="p">(</span>y<span class="p">,</span> generate.null.ranges<span class="p">(</span>y<span class="p">))</span>
+  y<span class="p">[</span>nearest<span class="p">(</span>x<span class="p">,</span>y<span class="p">)]</span>
+<span class="p">}</span>
+
+annotate.by.granges <span class="o">&lt;-</span> <span class="kr">function</span><span class="p">(</span>peaks<span class="p">,</span> gr<span class="p">,</span> annot.columns<span class="p">)</span> <span class="p">{</span>
+  <span class="kr">for</span> <span class="p">(</span>i <span class="kr">in</span> <span class="kp">names</span><span class="p">(</span>elementMetadata<span class="p">(</span>gr<span class="p">)))</span> <span class="p">{</span>
+    elementMetadata<span class="p">(</span>gr<span class="p">)[[</span>i<span class="p">]]</span> <span class="o">&lt;-</span> <span class="kp">as.vector</span><span class="p">(</span>elementMetadata<span class="p">(</span>gr<span class="p">)[[</span>i<span class="p">]])</span>
+  <span class="p">}</span>
+  nearest.ranges <span class="o">&lt;-</span> select.nearest<span class="p">(</span>peaks<span class="p">,</span> gr<span class="p">)</span>
+  nearest.distance <span class="o">&lt;-</span> distance<span class="p">(</span>peaks<span class="p">,</span> nearest.ranges<span class="p">,</span> ignore.strand<span class="o">=</span><span class="kc">TRUE</span><span class="p">)</span>
+  in.ranges <span class="o">&lt;-</span> Rle<span class="p">(</span>nearest.distance <span class="o">==</span> <span class="m">0</span><span class="p">)</span>
+  annot.data <span class="o">&lt;-</span> elementMetadata<span class="p">(</span>nearest.ranges<span class="p">)[</span>annot.columns<span class="p">]</span>
+  <span class="kr">if</span> <span class="p">(</span><span class="o">!</span><span class="kp">is.null</span><span class="p">(</span><span class="kp">names</span><span class="p">(</span>annot.columns<span class="p">)))</span> <span class="p">{</span>
+    <span class="kp">names</span><span class="p">(</span>annot.data<span class="p">)</span> <span class="o">&lt;-</span> <span class="kp">names</span><span class="p">(</span>annot.columns<span class="p">)</span>
+  <span class="p">}</span>
+  DataFrame<span class="p">(</span>overlap<span class="o">=</span>in.ranges<span class="p">,</span> distance<span class="o">=</span>nearest.distance<span class="p">,</span> annot.data<span class="p">)</span>
+<span class="p">}</span>
+
+forbidden.seqs <span class="o">&lt;-</span> <span class="kp">unique</span><span class="p">(</span><span class="kp">as.vector</span><span class="p">(</span>elementMetadata<span class="p">(</span>read.ranges<span class="p">)</span><span class="o">$</span><span class="kp">seq</span><span class="p">[</span>read.ranges <span class="o">%in%</span> forbidden.ranges<span class="p">]))</span>
+forbidden.indices <span class="o">&lt;-</span> <span class="o">!</span><span class="kp">is.na</span><span class="p">(</span>findOverlaps<span class="p">(</span>read.ranges<span class="p">,</span> forbidden.ranges<span class="p">,</span> select<span class="o">=</span><span class="s">&quot;first&quot;</span><span class="p">,</span> ignore.strand<span class="o">=</span><span class="kc">TRUE</span><span class="p">))</span>
+forbidden.read.ranges <span class="o">&lt;-</span> read.ranges<span class="p">[</span>forbidden.indices<span class="p">]</span>
+read.ranges <span class="o">&lt;-</span> read.ranges<span class="p">[</span><span class="o">!</span>forbidden.indices<span class="p">]</span>
+
+<span class="c1">## Read the counts table</span>
+read.counts <span class="o">&lt;-</span> <span class="p">{</span>
+  x <span class="o">&lt;-</span> read.csv<span class="p">(</span><span class="s">&quot;./data/R21_R82_read_expr_matrix_no_blanks&quot;</span><span class="p">,</span>
+                stringsAsFactors<span class="o">=</span><span class="kc">FALSE</span><span class="p">,</span> sep<span class="o">=</span><span class="s">&quot;\t&quot;</span><span class="p">,</span> header<span class="o">=</span><span class="kc">TRUE</span><span class="p">,</span> row.names<span class="o">=</span><span class="m">1</span><span class="p">)</span>
+  <span class="kp">row.names</span><span class="p">(</span>x<span class="p">)</span> <span class="o">&lt;-</span> str_trim<span class="p">(</span><span class="kp">row.names</span><span class="p">(</span>x<span class="p">))</span>
+  <span class="kp">names</span><span class="p">(</span>x<span class="p">)</span> <span class="o">&lt;-</span> str_replace<span class="p">(</span><span class="kp">names</span><span class="p">(</span>x<span class="p">),</span> <span class="s">&quot;_collapsed_sorted_with_zeros$&quot;</span><span class="p">,</span> <span class="s">&quot;&quot;</span><span class="p">)</span>
+  x <span class="o">&lt;-</span> x<span class="p">[</span><span class="kp">row.names</span><span class="p">(</span>x<span class="p">)</span> <span class="o">%in%</span> elementMetadata<span class="p">(</span>read.ranges<span class="p">)</span><span class="o">$</span><span class="kp">seq</span><span class="p">,]</span>
+  x <span class="o">&lt;-</span> x<span class="p">[</span><span class="kp">order</span><span class="p">(</span><span class="kp">names</span><span class="p">(</span>x<span class="p">))]</span>
+  x
+<span class="p">}</span>
+
+<span class="c1">## Create per-sample bed files with score = count / multimap</span>
+sample.read.ranges <span class="o">&lt;-</span>
+  llply<span class="p">(</span><span class="kp">names</span><span class="p">(</span>read.counts<span class="p">),</span>
+        <span class="kr">function</span> <span class="p">(</span><span class="kp">sample</span><span class="p">)</span> <span class="p">{</span>
+          x <span class="o">&lt;-</span> read.ranges
+          <span class="c1">## Score = sample count / multimap</span>
+          elementMetadata<span class="p">(</span>x<span class="p">)</span><span class="o">$</span>score <span class="o">&lt;-</span>
+            <span class="kp">as.vector</span><span class="p">(</span>read.counts<span class="p">[</span><span class="kp">as.vector</span><span class="p">(</span>elementMetadata<span class="p">(</span>x<span class="p">)</span><span class="o">$</span><span class="kp">seq</span><span class="p">),</span> <span class="kp">sample</span><span class="p">]</span> <span class="o">/</span> elementMetadata<span class="p">(</span>x<span class="p">)</span><span class="o">$</span>multimap<span class="p">)</span>
+          <span class="c1">## Eliminate reads with zero score</span>
+          x <span class="o">&lt;-</span> x<span class="p">[</span>elementMetadata<span class="p">(</span>x<span class="p">)</span><span class="o">$</span>score <span class="o">&gt;</span> <span class="m">0</span><span class="p">]</span>
+          x
+        <span class="p">},</span> <span class="m">.</span>parallel<span class="o">=</span><span class="kc">TRUE</span><span class="p">)</span>
+<span class="kp">names</span><span class="p">(</span>sample.read.ranges<span class="p">)</span> <span class="o">&lt;-</span> <span class="kp">names</span><span class="p">(</span>read.counts<span class="p">)</span>
+
+<span class="c1">## Run blockbuster on each sample</span>
+<span class="c1">## x &lt;- sample.read.ranges[[1]][1:500]</span>
+<span class="c1">## y &lt;- run.blockbuster(x)</span>
+<span class="c1">## z &lt;- calculate.block.entropy(y$tags)</span>
+blockbuster.results <span class="o">&lt;-</span> llply<span class="p">(</span>sample.read.ranges<span class="p">,</span> <span class="kr">function</span><span class="p">(</span>x<span class="p">)</span> run.blockbuster<span class="p">(</span>unstranded<span class="p">(</span>x<span class="p">),</span> minBlockHeight<span class="o">=</span><span class="m">5</span><span class="p">),</span> <span class="m">.</span>parallel<span class="o">=</span><span class="kc">TRUE</span><span class="p">)</span>
+
+<span class="c1">## calculate.block.entropy(blockbuster.results[[1]]$tags[1:5])</span>
+
+<span class="c1">## Calculate entropy of blocks</span>
+block.entropy <span class="o">&lt;-</span> llply<span class="p">(</span>blockbuster.results<span class="p">,</span> <span class="kr">function</span><span class="p">(</span>x<span class="p">)</span> calculate.block.entropy<span class="p">(</span>x<span class="o">$</span>tags<span class="p">),</span> <span class="m">.</span>parallel<span class="o">=</span><span class="kc">TRUE</span><span class="p">)</span>
+
+<span class="kr">for</span> <span class="p">(</span>i <span class="kr">in</span> <span class="kp">names</span><span class="p">(</span>blockbuster.results<span class="p">))</span> <span class="p">{</span>
+  elementMetadata<span class="p">(</span>blockbuster.results<span class="p">[[</span>i<span class="p">]]</span><span class="o">$</span>blocks<span class="p">)</span><span class="o">$</span>entropy <span class="o">&lt;-</span> block.entropy<span class="p">[[</span>i<span class="p">]][</span><span class="kp">as.character</span><span class="p">(</span>elementMetadata<span class="p">(</span>blockbuster.results<span class="p">[[</span>i<span class="p">]]</span><span class="o">$</span>blocks<span class="p">)</span><span class="o">$</span>blockID<span class="p">)]</span>
+<span class="p">}</span>
+
+<span class="c1">## Plot entropy vs block length</span>
+x <span class="o">&lt;-</span> blockbuster.results<span class="p">[[</span><span class="m">1</span><span class="p">]]</span><span class="o">$</span>blocks
+y <span class="o">&lt;-</span> <span class="kp">cbind</span><span class="p">(</span><span class="kp">as.data.frame</span><span class="p">(</span>elementMetadata<span class="p">(</span>x<span class="p">)),</span> width<span class="o">=</span>width<span class="p">(</span>x<span class="p">))</span>
+ggplot<span class="p">(</span>y<span class="p">,</span> aes<span class="p">(</span>x<span class="o">=</span>width<span class="p">,</span> y<span class="o">=</span>entropy<span class="p">,</span> color<span class="o">=</span><span class="m">..</span>density..<span class="p">))</span> <span class="o">+</span> stat_density2d<span class="p">(</span>geom<span class="o">=</span><span class="s">&quot;tile&quot;</span><span class="p">,</span> aes<span class="p">(</span>fill<span class="o">=</span><span class="m">..</span>density..<span class="p">),</span> contour<span class="o">=</span><span class="kc">FALSE</span><span class="p">)</span> <span class="o">+</span> scale_fill_gradient<span class="p">(</span>low<span class="o">=</span><span class="s">&quot;blue&quot;</span><span class="p">,</span> high<span class="o">=</span><span class="s">&quot;yellow&quot;</span><span class="p">)</span> <span class="o">+</span> scale_color_gradient<span class="p">(</span>low<span class="o">=</span><span class="s">&quot;blue&quot;</span><span class="p">,</span> high<span class="o">=</span><span class="s">&quot;yellow&quot;</span><span class="p">)</span>
+
+<span class="c1">## Compute nearest miRNA/snoRNA to each block and add annotation</span>
+block.annot <span class="o">&lt;-</span> llply<span class="p">(</span>blockbuster.results<span class="p">,</span> <span class="kr">function</span><span class="p">(</span>br<span class="p">)</span> <span class="p">{</span>
+  annotate.by.granges<span class="p">(</span>br<span class="o">$</span>blocks<span class="p">,</span> small.rna.ranges<span class="p">,</span> <span class="kt">c</span><span class="p">(</span>nearest.ncRNA<span class="o">=</span><span class="s">&quot;name&quot;</span><span class="p">,</span> ncRNA.type<span class="o">=</span><span class="s">&quot;type&quot;</span><span class="p">))</span>
+<span class="p">},</span> <span class="m">.</span>parallel<span class="o">=</span><span class="kc">TRUE</span><span class="p">)</span>
+<span class="kr">for</span> <span class="p">(</span>i <span class="kr">in</span> <span class="kp">names</span><span class="p">(</span>blockbuster.results<span class="p">))</span> <span class="p">{</span>
+  elementMetadata<span class="p">(</span>blockbuster.results<span class="p">[[</span>i<span class="p">]]</span><span class="o">$</span>blocks<span class="p">)[</span><span class="kp">names</span><span class="p">(</span>block.annot<span class="p">[[</span>i<span class="p">]])]</span> <span class="o">&lt;-</span> block.annot<span class="p">[[</span>i<span class="p">]]</span>
+<span class="p">}</span>
+
+<span class="c1">## write output</span>
+<span class="kp">saveRDS</span><span class="p">(</span>blockbuster.results<span class="p">,</span> <span class="s">&quot;blockbuster_results.RDS&quot;</span><span class="p">)</span>
+block.tables <span class="o">&lt;-</span> llply<span class="p">(</span>blockbuster.results<span class="p">,</span> <span class="kr">function</span><span class="p">(</span>br<span class="p">)</span> as<span class="p">(</span>granges.to.dataframe<span class="p">(</span>br<span class="o">$</span>blocks<span class="p">,</span> ignore.strand<span class="o">=</span><span class="kc">TRUE</span><span class="p">,</span> include.width<span class="o">=</span><span class="kc">TRUE</span><span class="p">),</span> <span class="s">&quot;data.frame&quot;</span><span class="p">))</span>
+write.xlsx.multisheet<span class="p">(</span>block.tables<span class="p">,</span> <span class="s">&quot;blockbuster_results.xlsx&quot;</span><span class="p">,</span> row.names<span class="o">=</span><span class="kc">FALSE</span><span class="p">)</span>
+</pre></div>
+</body>
+</html>

+ 4 - 4
examples/Salomon/fRMA/README.mkdn

@@ -20,28 +20,28 @@ data with them. The second pair, `consistency-train.R` and
 `consistency-evaluate.R`, handle (respectively) training five separate
 fRMA vector sets and testing their consistency.
 
-## [`train.R`](train.R): Creating the fRMA vectors ##
+## [`train.R`](train.R.html): Creating the fRMA vectors ##
 
 This script reads the sample metadata tables, assembles the full file
 lists for BX and PAX tissues, and trains a set of fRMA vectors for
 each tissue. It exports each of these vector sets to an installable R
 package.
 
-## [`test.R`](test.R): Testing the fRMA vectors ##
+## [`test.R`](test.R.html): Testing the fRMA vectors ##
 
 This script simply loads all the arrays and normalizes them using the
 appropriate fRMA vectors that were generated by `train.R`. It should
 be run after installing the packages produced by `train.R`. It is
 simply used for testing to make sure the fRMA vectors work.
 
-## [`consistency-train.R`](consistency-train.R): Train several vector sets for each tissue ##
+## [`consistency-train.R`](consistency-train.R.html): Train several vector sets for each tissue ##
 
 This script essentially does the same thing as `train.R`, only it does
 it five times with five different subsamplings of the arrays to
 generate five different fRMA vector sets and saves them all in an R
 data file.
 
-## [`consistency-evaluate.R`](consistency-evaluate.R): Verify consistency of fRMA vectors ##
+## [`consistency-evaluate.R`](consistency-evaluate.R.html): Verify consistency of fRMA vectors ##
 
 This script loads the data file from `consistency-train.R`, then loads
 20 random arrays from each tissue and normalizes them with all five

+ 163 - 0
examples/Salomon/fRMA/consistency-evaluate.R.html

@@ -0,0 +1,163 @@
+<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01//EN"
+   "http://www.w3.org/TR/html4/strict.dtd">
+
+<html>
+<head>
+  <title>consistency-evaluate.R</title>
+  <meta http-equiv="content-type" content="text/html; charset=utf-8">
+  <style type="text/css">
+td.linenos { background-color: #f0f0f0; padding-right: 10px; }
+span.lineno { background-color: #f0f0f0; padding: 0 5px 0 5px; }
+pre { line-height: 125%; }
+body .hll { background-color: #ffffcc }
+body  { background: #f8f8f8; }
+body .c { color: #408080; font-style: italic } /* Comment */
+body .err { border: 1px solid #FF0000 } /* Error */
+body .k { color: #008000; font-weight: bold } /* Keyword */
+body .o { color: #666666 } /* Operator */
+body .ch { color: #408080; font-style: italic } /* Comment.Hashbang */
+body .cm { color: #408080; font-style: italic } /* Comment.Multiline */
+body .cp { color: #BC7A00 } /* Comment.Preproc */
+body .cpf { color: #408080; font-style: italic } /* Comment.PreprocFile */
+body .c1 { color: #408080; font-style: italic } /* Comment.Single */
+body .cs { color: #408080; font-style: italic } /* Comment.Special */
+body .gd { color: #A00000 } /* Generic.Deleted */
+body .ge { font-style: italic } /* Generic.Emph */
+body .gr { color: #FF0000 } /* Generic.Error */
+body .gh { color: #000080; font-weight: bold } /* Generic.Heading */
+body .gi { color: #00A000 } /* Generic.Inserted */
+body .go { color: #888888 } /* Generic.Output */
+body .gp { color: #000080; font-weight: bold } /* Generic.Prompt */
+body .gs { font-weight: bold } /* Generic.Strong */
+body .gu { color: #800080; font-weight: bold } /* Generic.Subheading */
+body .gt { color: #0044DD } /* Generic.Traceback */
+body .kc { color: #008000; font-weight: bold } /* Keyword.Constant */
+body .kd { color: #008000; font-weight: bold } /* Keyword.Declaration */
+body .kn { color: #008000; font-weight: bold } /* Keyword.Namespace */
+body .kp { color: #008000 } /* Keyword.Pseudo */
+body .kr { color: #008000; font-weight: bold } /* Keyword.Reserved */
+body .kt { color: #B00040 } /* Keyword.Type */
+body .m { color: #666666 } /* Literal.Number */
+body .s { color: #BA2121 } /* Literal.String */
+body .na { color: #7D9029 } /* Name.Attribute */
+body .nb { color: #008000 } /* Name.Builtin */
+body .nc { color: #0000FF; font-weight: bold } /* Name.Class */
+body .no { color: #880000 } /* Name.Constant */
+body .nd { color: #AA22FF } /* Name.Decorator */
+body .ni { color: #999999; font-weight: bold } /* Name.Entity */
+body .ne { color: #D2413A; font-weight: bold } /* Name.Exception */
+body .nf { color: #0000FF } /* Name.Function */
+body .nl { color: #A0A000 } /* Name.Label */
+body .nn { color: #0000FF; font-weight: bold } /* Name.Namespace */
+body .nt { color: #008000; font-weight: bold } /* Name.Tag */
+body .nv { color: #19177C } /* Name.Variable */
+body .ow { color: #AA22FF; font-weight: bold } /* Operator.Word */
+body .w { color: #bbbbbb } /* Text.Whitespace */
+body .mb { color: #666666 } /* Literal.Number.Bin */
+body .mf { color: #666666 } /* Literal.Number.Float */
+body .mh { color: #666666 } /* Literal.Number.Hex */
+body .mi { color: #666666 } /* Literal.Number.Integer */
+body .mo { color: #666666 } /* Literal.Number.Oct */
+body .sa { color: #BA2121 } /* Literal.String.Affix */
+body .sb { color: #BA2121 } /* Literal.String.Backtick */
+body .sc { color: #BA2121 } /* Literal.String.Char */
+body .dl { color: #BA2121 } /* Literal.String.Delimiter */
+body .sd { color: #BA2121; font-style: italic } /* Literal.String.Doc */
+body .s2 { color: #BA2121 } /* Literal.String.Double */
+body .se { color: #BB6622; font-weight: bold } /* Literal.String.Escape */
+body .sh { color: #BA2121 } /* Literal.String.Heredoc */
+body .si { color: #BB6688; font-weight: bold } /* Literal.String.Interpol */
+body .sx { color: #008000 } /* Literal.String.Other */
+body .sr { color: #BB6688 } /* Literal.String.Regex */
+body .s1 { color: #BA2121 } /* Literal.String.Single */
+body .ss { color: #19177C } /* Literal.String.Symbol */
+body .bp { color: #008000 } /* Name.Builtin.Pseudo */
+body .fm { color: #0000FF } /* Name.Function.Magic */
+body .vc { color: #19177C } /* Name.Variable.Class */
+body .vg { color: #19177C } /* Name.Variable.Global */
+body .vi { color: #19177C } /* Name.Variable.Instance */
+body .vm { color: #19177C } /* Name.Variable.Magic */
+body .il { color: #666666 } /* Literal.Number.Integer.Long */
+
+  </style>
+</head>
+<body>
+<h2>consistency-evaluate.R</h2>
+
+<div class="highlight"><pre><span></span><span class="c1">#!/usr/bin/env Rscript</span>
+
+<span class="c1"># Script to check reproducibility of fRMA training process</span>
+
+<span class="kn">library</span><span class="p">(</span>xlsx<span class="p">)</span>
+<span class="kn">library</span><span class="p">(</span>frma<span class="p">)</span>
+<span class="kn">library</span><span class="p">(</span>frmaTools<span class="p">)</span>
+<span class="kn">library</span><span class="p">(</span>stringr<span class="p">)</span>
+<span class="kn">library</span><span class="p">(</span>magrittr<span class="p">)</span>
+<span class="kn">library</span><span class="p">(</span>plyr<span class="p">)</span>
+<span class="kn">library</span><span class="p">(</span>affy<span class="p">)</span>
+<span class="kn">library</span><span class="p">(</span>preprocessCore<span class="p">)</span>
+<span class="kn">library</span><span class="p">(</span>ggplot2<span class="p">)</span>
+<span class="kn">library</span><span class="p">(</span>proto<span class="p">)</span>
+<span class="kn">library</span><span class="p">(</span>dplyr<span class="p">)</span>
+<span class="kp">load</span><span class="p">(</span><span class="s">&quot;consistency.rda&quot;</span><span class="p">)</span>
+
+<span class="c1">## Select a random subset of 20 arrays from each tissue (Would rather</span>
+<span class="c1">## use entire dataset but ENOMEM)</span>
+<span class="kp">set.seed</span><span class="p">(</span><span class="m">1986</span><span class="p">)</span>
+norm.exprs <span class="o">&lt;-</span> <span class="kp">lapply</span><span class="p">(</span><span class="kp">names</span><span class="p">(</span>vectors<span class="p">),</span> <span class="kr">function</span><span class="p">(</span>ttype<span class="p">)</span> <span class="p">{</span>
+    stab <span class="o">&lt;-</span> sample.tables<span class="p">[[</span>ttype<span class="p">]]</span> <span class="o">%&gt;%</span> sample_n<span class="p">(</span><span class="m">20</span><span class="p">)</span>
+    tsmsg<span class="p">(</span><span class="s">&quot;Reading 20 random arrays for &quot;</span><span class="p">,</span> ttype<span class="p">)</span>
+    affy <span class="o">&lt;-</span> ReadAffy<span class="p">(</span>filenames<span class="o">=</span>stab<span class="o">$</span>Filename<span class="p">,</span> sampleNames<span class="o">=</span><span class="kp">rownames</span><span class="p">(</span>stab<span class="p">))</span>
+    tsmsg<span class="p">(</span><span class="s">&quot;Normalizing with RMA for comparison&quot;</span><span class="p">)</span>
+    eset.rma <span class="o">&lt;-</span> rma<span class="p">(</span>affy<span class="p">)</span>
+    rma.exprs <span class="o">&lt;-</span> eset.rma <span class="o">%&gt;%</span> exprs <span class="o">%&gt;%</span> <span class="kp">as.vector</span>
+    tsmsg<span class="p">(</span><span class="s">&quot;Normalizing with 5 trains fRMA vector sets&quot;</span><span class="p">)</span>
+    esets.frma <span class="o">&lt;-</span> <span class="kp">lapply</span><span class="p">(</span>vectors<span class="p">[[</span>ttype<span class="p">]],</span> <span class="m">.</span> <span class="o">%&gt;%</span> frma<span class="p">(</span>affy<span class="p">,</span> input.vecs<span class="o">=</span><span class="m">.</span><span class="p">))</span>
+    frma.exprs <span class="o">&lt;-</span> <span class="kp">sapply</span><span class="p">(</span>esets.frma<span class="p">,</span> <span class="m">.</span> <span class="o">%&gt;%</span> exprs <span class="o">%&gt;%</span> <span class="kp">as.vector</span><span class="p">)</span>
+    <span class="kt">data.frame</span><span class="p">(</span>RMA<span class="o">=</span>rma.exprs<span class="p">,</span> fRMA<span class="o">=</span>frma.exprs<span class="p">)</span>
+<span class="p">})</span> <span class="o">%&gt;%</span> setNames<span class="p">(</span><span class="kp">names</span><span class="p">(</span>vectors<span class="p">))</span>
+
+<span class="c1">## Save because the above takes a while</span>
+<span class="kp">save.image</span><span class="p">(</span><span class="s">&quot;consistency.rda&quot;</span><span class="p">)</span>
+
+<span class="kp">dir.create</span><span class="p">(</span><span class="s">&quot;fRMA_consistency_results&quot;</span><span class="p">,</span> <span class="kc">FALSE</span><span class="p">)</span>
+
+<span class="c1">## Compute M/A data for all pairwise comparisons</span>
+ma.data <span class="o">&lt;-</span> <span class="kp">lapply</span><span class="p">(</span>norm.exprs<span class="p">,</span> <span class="kr">function</span><span class="p">(</span>normexprs<span class="p">)</span> <span class="p">{</span>
+    normexprs <span class="o">%&gt;%</span> names <span class="o">%&gt;%</span> combn<span class="p">(</span><span class="m">2</span><span class="p">,</span> simplify<span class="o">=</span><span class="kc">FALSE</span><span class="p">)</span> <span class="o">%&gt;%</span> ldply<span class="p">(</span><span class="m">.</span> <span class="o">%&gt;%</span> <span class="p">{</span>
+        col1 <span class="o">&lt;-</span> <span class="m">.</span><span class="p">[</span><span class="m">1</span><span class="p">]</span>
+        col2 <span class="o">&lt;-</span> <span class="m">.</span><span class="p">[</span><span class="m">2</span><span class="p">]</span>
+        x1 <span class="o">&lt;-</span> normexprs<span class="p">[[</span>col1<span class="p">]]</span>
+        x2 <span class="o">&lt;-</span> normexprs<span class="p">[[</span>col2<span class="p">]]</span>
+        <span class="kt">data.frame</span><span class="p">(</span>Comparison <span class="o">=</span> str_c<span class="p">(</span>col1<span class="p">,</span><span class="s">&quot;.vs.&quot;</span><span class="p">,</span>col2<span class="p">),</span>
+                   x1<span class="o">=</span>x1<span class="p">,</span> x2<span class="o">=</span>x2<span class="p">,</span>
+                   M <span class="o">=</span> x2 <span class="o">-</span> x1<span class="p">,</span>
+                   A <span class="o">=</span> <span class="p">(</span>x1<span class="o">+</span>x2<span class="p">)</span><span class="o">/</span><span class="m">2</span><span class="p">)</span>
+    <span class="p">})</span>
+<span class="p">})</span>
+
+<span class="kr">for</span> <span class="p">(</span>ttype <span class="kr">in</span> <span class="kp">names</span><span class="p">(</span>norm.exprs<span class="p">))</span> <span class="p">{</span>
+    madata <span class="o">&lt;-</span> ma.data<span class="p">[[</span>ttype<span class="p">]]</span>
+
+    pdf<span class="p">(</span>str_c<span class="p">(</span><span class="s">&quot;fRMA_consistency_results/MA Plots for &quot;</span><span class="p">,</span> ttype<span class="p">,</span> <span class="s">&quot;.pdf&quot;</span><span class="p">))</span>
+    <span class="c1">## MA plot for every pair of normalizations</span>
+    ddply<span class="p">(</span>madata<span class="p">,</span> <span class="m">.</span><span class="p">(</span>Comparison<span class="p">),</span> <span class="m">.</span> <span class="o">%$%</span> <span class="p">{</span>
+        smoothScatter<span class="p">(</span>x<span class="o">=</span>A<span class="p">,</span> y<span class="o">=</span>M<span class="p">,</span> nbin<span class="o">=</span><span class="m">512</span><span class="p">,</span> main<span class="o">=</span>Comparison<span class="p">[</span><span class="m">1</span><span class="p">],</span> xlab<span class="o">=</span><span class="s">&quot;A&quot;</span><span class="p">,</span> ylab<span class="o">=</span><span class="s">&quot;M&quot;</span><span class="p">)</span>
+    <span class="p">})</span>
+    dev.off<span class="p">()</span>
+
+    <span class="c1">## M boxplots &amp; violin plots</span>
+    pdf<span class="p">(</span>str_c<span class="p">(</span><span class="s">&quot;fRMA_consistency_results/M Boxplots for &quot;</span><span class="p">,</span> ttype<span class="p">,</span> <span class="s">&quot;.pdf&quot;</span><span class="p">),</span>
+        width<span class="o">=</span><span class="m">8</span><span class="p">,</span> height<span class="o">=</span><span class="m">12</span><span class="p">)</span>
+    p <span class="o">&lt;-</span> ggplot<span class="p">(</span>madata<span class="p">)</span> <span class="o">+</span> aes<span class="p">(</span>x<span class="o">=</span>Comparison<span class="p">,</span> y<span class="o">=</span>M<span class="p">)</span> <span class="o">+</span>
+        scale_x_discrete<span class="p">(</span>limits <span class="o">=</span> <span class="kp">rev</span><span class="p">(</span><span class="kp">levels</span><span class="p">(</span>madata<span class="o">$</span>Comparison<span class="p">)))</span> <span class="o">+</span>
+            coord_flip<span class="p">()</span>
+    <span class="kp">print</span><span class="p">(</span>p <span class="o">+</span> geom_boxplot<span class="p">(</span>notch<span class="o">=</span><span class="kc">TRUE</span><span class="p">,</span> outlier.shape <span class="o">=</span> <span class="kc">NA</span><span class="p">)</span> <span class="o">+</span>
+          ggtitle<span class="p">(</span>str_c<span class="p">(</span><span class="s">&quot;Boxplots of M value distributions for &quot;</span><span class="p">,</span> ttype<span class="p">)))</span>
+    <span class="kp">print</span><span class="p">(</span>p <span class="o">+</span> geom_violin<span class="p">(</span>scale<span class="o">=</span><span class="s">&quot;width&quot;</span><span class="p">)</span> <span class="o">+</span>
+          ggtitle<span class="p">(</span>str_c<span class="p">(</span><span class="s">&quot;Violin plots of M value distributions for &quot;</span><span class="p">,</span> ttype<span class="p">)))</span>
+    dev.off<span class="p">()</span>
+<span class="p">}</span>
+</pre></div>
+</body>
+</html>

+ 198 - 0
examples/Salomon/fRMA/consistency-train.R.html

@@ -0,0 +1,198 @@
+<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01//EN"
+   "http://www.w3.org/TR/html4/strict.dtd">
+
+<html>
+<head>
+  <title>consistency-train.R</title>
+  <meta http-equiv="content-type" content="text/html; charset=utf-8">
+  <style type="text/css">
+td.linenos { background-color: #f0f0f0; padding-right: 10px; }
+span.lineno { background-color: #f0f0f0; padding: 0 5px 0 5px; }
+pre { line-height: 125%; }
+body .hll { background-color: #ffffcc }
+body  { background: #f8f8f8; }
+body .c { color: #408080; font-style: italic } /* Comment */
+body .err { border: 1px solid #FF0000 } /* Error */
+body .k { color: #008000; font-weight: bold } /* Keyword */
+body .o { color: #666666 } /* Operator */
+body .ch { color: #408080; font-style: italic } /* Comment.Hashbang */
+body .cm { color: #408080; font-style: italic } /* Comment.Multiline */
+body .cp { color: #BC7A00 } /* Comment.Preproc */
+body .cpf { color: #408080; font-style: italic } /* Comment.PreprocFile */
+body .c1 { color: #408080; font-style: italic } /* Comment.Single */
+body .cs { color: #408080; font-style: italic } /* Comment.Special */
+body .gd { color: #A00000 } /* Generic.Deleted */
+body .ge { font-style: italic } /* Generic.Emph */
+body .gr { color: #FF0000 } /* Generic.Error */
+body .gh { color: #000080; font-weight: bold } /* Generic.Heading */
+body .gi { color: #00A000 } /* Generic.Inserted */
+body .go { color: #888888 } /* Generic.Output */
+body .gp { color: #000080; font-weight: bold } /* Generic.Prompt */
+body .gs { font-weight: bold } /* Generic.Strong */
+body .gu { color: #800080; font-weight: bold } /* Generic.Subheading */
+body .gt { color: #0044DD } /* Generic.Traceback */
+body .kc { color: #008000; font-weight: bold } /* Keyword.Constant */
+body .kd { color: #008000; font-weight: bold } /* Keyword.Declaration */
+body .kn { color: #008000; font-weight: bold } /* Keyword.Namespace */
+body .kp { color: #008000 } /* Keyword.Pseudo */
+body .kr { color: #008000; font-weight: bold } /* Keyword.Reserved */
+body .kt { color: #B00040 } /* Keyword.Type */
+body .m { color: #666666 } /* Literal.Number */
+body .s { color: #BA2121 } /* Literal.String */
+body .na { color: #7D9029 } /* Name.Attribute */
+body .nb { color: #008000 } /* Name.Builtin */
+body .nc { color: #0000FF; font-weight: bold } /* Name.Class */
+body .no { color: #880000 } /* Name.Constant */
+body .nd { color: #AA22FF } /* Name.Decorator */
+body .ni { color: #999999; font-weight: bold } /* Name.Entity */
+body .ne { color: #D2413A; font-weight: bold } /* Name.Exception */
+body .nf { color: #0000FF } /* Name.Function */
+body .nl { color: #A0A000 } /* Name.Label */
+body .nn { color: #0000FF; font-weight: bold } /* Name.Namespace */
+body .nt { color: #008000; font-weight: bold } /* Name.Tag */
+body .nv { color: #19177C } /* Name.Variable */
+body .ow { color: #AA22FF; font-weight: bold } /* Operator.Word */
+body .w { color: #bbbbbb } /* Text.Whitespace */
+body .mb { color: #666666 } /* Literal.Number.Bin */
+body .mf { color: #666666 } /* Literal.Number.Float */
+body .mh { color: #666666 } /* Literal.Number.Hex */
+body .mi { color: #666666 } /* Literal.Number.Integer */
+body .mo { color: #666666 } /* Literal.Number.Oct */
+body .sa { color: #BA2121 } /* Literal.String.Affix */
+body .sb { color: #BA2121 } /* Literal.String.Backtick */
+body .sc { color: #BA2121 } /* Literal.String.Char */
+body .dl { color: #BA2121 } /* Literal.String.Delimiter */
+body .sd { color: #BA2121; font-style: italic } /* Literal.String.Doc */
+body .s2 { color: #BA2121 } /* Literal.String.Double */
+body .se { color: #BB6622; font-weight: bold } /* Literal.String.Escape */
+body .sh { color: #BA2121 } /* Literal.String.Heredoc */
+body .si { color: #BB6688; font-weight: bold } /* Literal.String.Interpol */
+body .sx { color: #008000 } /* Literal.String.Other */
+body .sr { color: #BB6688 } /* Literal.String.Regex */
+body .s1 { color: #BA2121 } /* Literal.String.Single */
+body .ss { color: #19177C } /* Literal.String.Symbol */
+body .bp { color: #008000 } /* Name.Builtin.Pseudo */
+body .fm { color: #0000FF } /* Name.Function.Magic */
+body .vc { color: #19177C } /* Name.Variable.Class */
+body .vg { color: #19177C } /* Name.Variable.Global */
+body .vi { color: #19177C } /* Name.Variable.Instance */
+body .vm { color: #19177C } /* Name.Variable.Magic */
+body .il { color: #666666 } /* Literal.Number.Integer.Long */
+
+  </style>
+</head>
+<body>
+<h2>consistency-train.R</h2>
+
+<div class="highlight"><pre><span></span><span class="c1">#!/usr/bin/env Rscript</span>
+
+<span class="c1"># Script to train multiple fRMA vectors in preparation for consistency</span>
+<span class="c1"># evaluation</span>
+
+<span class="kn">library</span><span class="p">(</span>xlsx<span class="p">)</span>
+<span class="kn">library</span><span class="p">(</span>frma<span class="p">)</span>
+<span class="kn">library</span><span class="p">(</span>frmaTools<span class="p">)</span>
+<span class="kn">library</span><span class="p">(</span>stringr<span class="p">)</span>
+<span class="kn">library</span><span class="p">(</span>magrittr<span class="p">)</span>
+<span class="kn">library</span><span class="p">(</span>plyr<span class="p">)</span>
+<span class="kn">library</span><span class="p">(</span>affy<span class="p">)</span>
+<span class="kn">library</span><span class="p">(</span>preprocessCore<span class="p">)</span>
+<span class="kn">library</span><span class="p">(</span>ggplot2<span class="p">)</span>
+<span class="kn">library</span><span class="p">(</span>proto<span class="p">)</span>
+<span class="kn">library</span><span class="p">(</span>dplyr<span class="p">)</span>
+
+training.data.dir <span class="o">&lt;-</span> <span class="s">&quot;Training Data&quot;</span>
+datasets <span class="o">&lt;-</span> <span class="kt">data.frame</span><span class="p">(</span>Dataset<span class="o">=</span><span class="kp">list.files</span><span class="p">(</span>training.data.dir<span class="p">))</span>
+<span class="kp">rownames</span><span class="p">(</span>datasets<span class="p">)</span> <span class="o">&lt;-</span> datasets<span class="o">$</span>Dataset
+datasets<span class="o">$</span>Tissue <span class="o">&lt;-</span> <span class="kp">factor</span><span class="p">(</span>str_extract<span class="p">(</span>datasets<span class="o">$</span>Dataset<span class="p">,</span> <span class="s">&quot;\\b(PAX|BX)\\b&quot;</span><span class="p">))</span>
+
+tsmsg <span class="o">&lt;-</span> <span class="kr">function</span><span class="p">(</span><span class="kc">...</span><span class="p">)</span> <span class="p">{</span>
+  <span class="kp">message</span><span class="p">(</span><span class="kp">date</span><span class="p">(),</span> <span class="s">&quot;: &quot;</span><span class="p">,</span> <span class="kc">...</span><span class="p">)</span>
+<span class="p">}</span>
+
+<span class="c1">## Some Scan Dates are marked as identical for multiple batches, which</span>
+<span class="c1">## is bad. But the dates embedded in the file names for these batches</span>
+<span class="c1">## are different, so we use those dates instead.</span>
+parse.date.from.filename <span class="o">&lt;-</span> <span class="kr">function</span><span class="p">(</span>fname<span class="p">)</span> <span class="p">{</span>
+    res1 <span class="o">&lt;-</span> str_match<span class="p">(</span>fname<span class="p">,</span> <span class="s">&quot;^(\\d\\d)(\\d\\d)(\\d\\d)&quot;</span><span class="p">)[,</span><span class="kt">c</span><span class="p">(</span><span class="m">4</span><span class="p">,</span><span class="m">2</span><span class="p">,</span><span class="m">3</span><span class="p">)]</span>
+    res2 <span class="o">&lt;-</span> str_match<span class="p">(</span>fname<span class="p">,</span> <span class="s">&quot;^20(\\d\\d)_(\\d\\d)_(\\d\\d)&quot;</span><span class="p">)[,</span><span class="m">-1</span><span class="p">]</span>
+    res1<span class="p">[</span><span class="kp">is.na</span><span class="p">(</span>res1<span class="p">)]</span> <span class="o">&lt;-</span> res2<span class="p">[</span><span class="kp">is.na</span><span class="p">(</span>res1<span class="p">)]</span>
+    <span class="kp">colnames</span><span class="p">(</span>res1<span class="p">)</span> <span class="o">&lt;-</span> <span class="kt">c</span><span class="p">(</span><span class="s">&quot;year&quot;</span><span class="p">,</span> <span class="s">&quot;month&quot;</span><span class="p">,</span> <span class="s">&quot;day&quot;</span><span class="p">)</span>
+    res1<span class="p">[,</span><span class="s">&quot;year&quot;</span><span class="p">]</span> <span class="o">%&lt;&gt;%</span> str_c<span class="p">(</span><span class="s">&quot;20&quot;</span><span class="p">,</span> <span class="m">.</span><span class="p">)</span>
+    <span class="kp">as.Date</span><span class="p">(</span><span class="kp">do.call</span><span class="p">(</span><span class="kp">ISOdate</span><span class="p">,</span> <span class="kt">data.frame</span><span class="p">(</span>res1<span class="p">)))</span>
+<span class="p">}</span>
+
+<span class="c1">## This reads in the xlsx file for each of the 7 datasets and combines</span>
+<span class="c1">## them into one big table of all samples. The Batch column contains</span>
+<span class="c1">## the partitioning of samples into unique combinations of Dataset,</span>
+<span class="c1">## Scan Date, and Phenotype. Finally, we split based on Tissue type to</span>
+<span class="c1">## get one table for biopsies (BX), and one for blood (PAX).</span>
+sample.tables <span class="o">&lt;-</span> ddply<span class="p">(</span>datasets<span class="p">,</span> <span class="m">.</span><span class="p">(</span>Dataset<span class="p">),</span> <span class="kr">function</span><span class="p">(</span>df<span class="p">)</span> <span class="p">{</span>
+    df <span class="o">&lt;-</span> df<span class="p">[</span><span class="m">1</span><span class="p">,]</span>
+    <span class="kp">rownames</span><span class="p">(</span>df<span class="p">)</span> <span class="o">&lt;-</span> <span class="kc">NULL</span>
+    dset.dir <span class="o">&lt;-</span> <span class="kp">file.path</span><span class="p">(</span>training.data.dir<span class="p">,</span> df<span class="o">$</span>Dataset<span class="p">)</span>
+    x <span class="o">&lt;-</span> read.xlsx<span class="p">(</span><span class="kp">list.files</span><span class="p">(</span>dset.dir<span class="p">,</span> pattern<span class="o">=</span>glob2rx<span class="p">(</span><span class="s">&quot;*.xlsx&quot;</span><span class="p">),</span> full.names<span class="o">=</span><span class="kc">TRUE</span><span class="p">)[</span><span class="m">1</span><span class="p">],</span> <span class="m">1</span><span class="p">)</span> <span class="o">%&gt;%</span>
+        setNames<span class="p">(</span><span class="kt">c</span><span class="p">(</span><span class="s">&quot;Filename&quot;</span><span class="p">,</span> <span class="s">&quot;Phenotype&quot;</span><span class="p">,</span> <span class="s">&quot;ScanDate&quot;</span><span class="p">))</span>
+    x<span class="o">$</span>Filename <span class="o">&lt;-</span> <span class="kp">as.character</span><span class="p">(</span>x<span class="o">$</span>Filename<span class="p">)</span>
+    missing.CEL <span class="o">&lt;-</span> <span class="o">!</span>str_detect<span class="p">(</span>x<span class="o">$</span>Filename<span class="p">,</span> <span class="s">&quot;\\.CEL$&quot;</span><span class="p">)</span>
+    x<span class="o">$</span>Filename<span class="p">[</span>missing.CEL<span class="p">]</span> <span class="o">&lt;-</span> str_c<span class="p">(</span>x<span class="o">$</span>Filename<span class="p">[</span>missing.CEL<span class="p">],</span> <span class="s">&quot;.CEL&quot;</span><span class="p">)</span>
+    <span class="kp">stopifnot</span><span class="p">(</span><span class="kp">all</span><span class="p">(</span>str_detect<span class="p">(</span>x<span class="o">$</span>Filename<span class="p">,</span> <span class="s">&quot;\\.CEL$&quot;</span><span class="p">)))</span>
+    parsed.date <span class="o">&lt;-</span> parse.date.from.filename<span class="p">(</span>x<span class="o">$</span>Filename<span class="p">)</span>
+    x<span class="o">$</span>ScanDate<span class="p">[</span><span class="o">!</span><span class="kp">is.na</span><span class="p">(</span>parsed.date<span class="p">)]</span> <span class="o">&lt;-</span> parsed.date<span class="p">[</span><span class="o">!</span><span class="kp">is.na</span><span class="p">(</span>parsed.date<span class="p">)]</span>
+    x <span class="o">%&gt;%</span> <span class="kp">cbind</span><span class="p">(</span>df<span class="p">)</span> <span class="o">%&gt;%</span>
+        <span class="kp">transform</span><span class="p">(</span>Filename<span class="o">=</span><span class="kp">file.path</span><span class="p">(</span>dset.dir<span class="p">,</span> Filename<span class="p">),</span>
+                  Batch<span class="o">=</span><span class="kp">droplevels</span><span class="p">(</span>Tissue<span class="o">:</span>Dataset<span class="o">:</span><span class="kp">factor</span><span class="p">(</span>ScanDate<span class="p">)</span><span class="o">:</span>Phenotype<span class="p">))</span> <span class="o">%&gt;%</span>
+                      <span class="kp">subset</span><span class="p">(</span><span class="o">!</span> Filename <span class="o">%in%</span> blacklist<span class="p">)</span> <span class="o">%&gt;%</span>
+                          <span class="kp">subset</span><span class="p">(</span><span class="o">!</span><span class="kp">duplicated</span><span class="p">(</span>Filename<span class="p">))</span>
+<span class="p">})</span> <span class="o">%&gt;%</span>
+    <span class="kp">split</span><span class="p">(</span><span class="m">.</span><span class="o">$</span>Tissue<span class="p">)</span> <span class="o">%&gt;%</span>
+        <span class="kp">lapply</span><span class="p">(</span><span class="kp">droplevels</span><span class="p">)</span>
+
+<span class="c1">## fRMA requires equal-sized batches, so for each batch size from 3 to</span>
+<span class="c1">## 15, compute how many batches have at least that many samples.</span>
+x <span class="o">&lt;-</span> <span class="kp">sapply</span><span class="p">(</span><span class="m">3</span><span class="o">:</span><span class="m">15</span><span class="p">,</span> <span class="kr">function</span><span class="p">(</span>i<span class="p">)</span> <span class="kp">sapply</span><span class="p">(</span>sample.tables<span class="p">,</span> <span class="m">.</span> <span class="o">%$%</span> Batch <span class="o">%&gt;%</span> table <span class="o">%&gt;%</span> as.vector <span class="o">%&gt;%</span> <span class="p">{</span><span class="kp">sum</span><span class="p">(</span><span class="m">.</span> <span class="o">&gt;=</span> i<span class="p">)}))</span>
+<span class="kp">colnames</span><span class="p">(</span>x<span class="p">)</span> <span class="o">&lt;-</span> <span class="m">3</span><span class="o">:</span><span class="m">15</span>
+
+<span class="c1">## Based on the above and the recommendations in the frmaTools paper,</span>
+<span class="c1">## I chose 5 as the optimal batch size. This could be optimized</span>
+<span class="c1">## empirically, though.</span>
+arrays.per.batch <span class="o">&lt;-</span> <span class="m">5</span>
+
+vectors <span class="o">&lt;-</span> <span class="kp">lapply</span><span class="p">(</span><span class="kp">names</span><span class="p">(</span>sample.tables<span class="p">),</span> <span class="kr">function</span><span class="p">(</span>ttype<span class="p">)</span> <span class="p">{</span>
+    stab <span class="o">&lt;-</span> sample.tables<span class="p">[[</span>ttype<span class="p">]]</span>
+    tsmsg<span class="p">(</span><span class="s">&quot;Reading full dataset for &quot;</span><span class="p">,</span> ttype<span class="p">)</span>
+    affy <span class="o">&lt;-</span> ReadAffy<span class="p">(</span>filenames<span class="o">=</span>stab<span class="o">$</span>Filename<span class="p">,</span> sampleNames<span class="o">=</span><span class="kp">rownames</span><span class="p">(</span>stab<span class="p">))</span>
+    tsmsg<span class="p">(</span><span class="s">&quot;Getting reference normalziation distribution from full dataset for &quot;</span><span class="p">,</span> ttype<span class="p">)</span>
+    normVec <span class="o">&lt;-</span> normalize.quantiles.determine.target<span class="p">(</span>pm<span class="p">(</span>bg.correct.rma<span class="p">(</span>affy<span class="p">)))</span>
+    <span class="kp">rm</span><span class="p">(</span>affy<span class="p">);</span> <span class="kp">gc</span><span class="p">()</span>
+    <span class="c1">## Set the random seed for reproducibility.</span>
+    <span class="kp">set.seed</span><span class="p">(</span><span class="m">1986</span><span class="p">)</span>
+
+    <span class="kp">lapply</span><span class="p">(</span><span class="m">1</span><span class="o">:</span><span class="m">5</span><span class="p">,</span> <span class="kr">function</span><span class="p">(</span>i<span class="p">)</span> <span class="p">{</span>
+        <span class="kp">on.exit</span><span class="p">(</span><span class="kp">gc</span><span class="p">())</span>
+        tsmsg<span class="p">(</span><span class="s">&quot;Starting training run number &quot;</span><span class="p">,</span> i<span class="p">,</span> <span class="s">&quot; for &quot;</span><span class="p">,</span> ttype<span class="p">)</span>
+        tsmsg<span class="p">(</span><span class="s">&quot;Selecting batches for &quot;</span><span class="p">,</span> ttype<span class="p">)</span>
+        <span class="c1">## Keep only batches with enough samples</span>
+        big.enough <span class="o">&lt;-</span> stab<span class="o">$</span>Batch <span class="o">%&gt;%</span> table <span class="o">%&gt;%</span> <span class="m">.</span><span class="p">[</span><span class="m">.</span><span class="o">&gt;=</span> arrays.per.batch<span class="p">]</span> <span class="o">%&gt;%</span> <span class="kp">names</span>
+        stab <span class="o">&lt;-</span> stab<span class="p">[</span>stab<span class="o">$</span>Batch <span class="o">%in%</span> big.enough<span class="p">,]</span> <span class="o">%&gt;%</span> <span class="kp">droplevels</span>
+
+        <span class="c1">## Sample an equal number of arrays from each batch</span>
+        subtab <span class="o">&lt;-</span> ddply<span class="p">(</span>stab<span class="p">,</span> <span class="m">.</span><span class="p">(</span>Batch<span class="p">),</span> <span class="kr">function</span><span class="p">(</span>df<span class="p">)</span> <span class="p">{</span>
+            df<span class="p">[</span><span class="kp">sample</span><span class="p">(</span><span class="kp">seq</span><span class="p">(</span><span class="kp">nrow</span><span class="p">(</span>df<span class="p">)),</span> size<span class="o">=</span>arrays.per.batch<span class="p">),]</span>
+        <span class="p">})</span>
+
+        tsmsg<span class="p">(</span><span class="s">&quot;Making fRMA vectors&quot;</span><span class="p">)</span>
+        <span class="c1">## Make fRMA vectors, using normVec from full dataset</span>
+        res <span class="o">&lt;-</span> makeVectorsAffyBatch<span class="p">(</span>subtab<span class="o">$</span>Filename<span class="p">,</span> subtab<span class="o">$</span>Batch<span class="p">,</span> normVec<span class="o">=</span>normVec<span class="p">)</span>
+
+        tsmsg<span class="p">(</span><span class="s">&quot;Finished training run number &quot;</span><span class="p">,</span> i<span class="p">,</span> <span class="s">&quot; for &quot;</span><span class="p">,</span> ttype<span class="p">)</span>
+        res
+    <span class="p">})</span> <span class="o">%&gt;%</span> setNames<span class="p">(</span><span class="m">.</span><span class="p">,</span> str_c<span class="p">(</span><span class="s">&quot;V&quot;</span><span class="p">,</span> <span class="kp">seq_along</span><span class="p">(</span><span class="m">.</span><span class="p">)))</span>
+<span class="p">})</span> <span class="o">%&gt;%</span> setNames<span class="p">(</span><span class="kp">names</span><span class="p">(</span>sample.tables<span class="p">))</span>
+
+<span class="kp">saveRDS</span><span class="p">(</span>vectors<span class="p">,</span> <span class="s">&quot;consistency-vectors.RDS&quot;</span><span class="p">)</span>
+<span class="kp">save.image</span><span class="p">(</span><span class="s">&quot;consistency.rda&quot;</span><span class="p">)</span>
+<span class="c1">## Continues in consistency-evaluate.R</span>
+</pre></div>
+</body>
+</html>

+ 4 - 4
examples/Salomon/fRMA/index.html

@@ -2,11 +2,11 @@
 <p>The scripts below were used to evaluate the consistency of the fRMA normalization vectors by repeating the training process with 5 different random samples and then comparing a random selection of arrays normalized by all five trained vectors as well as by ordinary RMA. <a href="fRMA_consistency_results">This folder</a> shows the results.</p>
 <h1 id="scripts">Scripts</h1>
 <p>There are two pairs of scripts. The first pair, <code>train.R</code> and <code>test.R</code>, handle the tasks of (respectively) generating/training the main fRMA vectors and ensuring that they work by normalizing all the data with them. The second pair, <code>consistency-train.R</code> and <code>consistency-evaluate.R</code>, handle (respectively) training five separate fRMA vector sets and testing their consistency.</p>
-<h2 id="train.r-creating-the-frma-vectors"><a href="train.R"><code>train.R</code></a>: Creating the fRMA vectors</h2>
+<h2 id="train.r-creating-the-frma-vectors"><a href="train.R.html"><code>train.R</code></a>: Creating the fRMA vectors</h2>
 <p>This script reads the sample metadata tables, assembles the full file lists for BX and PAX tissues, and trains a set of fRMA vectors for each tissue. It exports each of these vector sets to an installable R package.</p>
-<h2 id="test.r-testing-the-frma-vectors"><a href="test.R"><code>test.R</code></a>: Testing the fRMA vectors</h2>
+<h2 id="test.r-testing-the-frma-vectors"><a href="test.R.html"><code>test.R</code></a>: Testing the fRMA vectors</h2>
 <p>This script simply loads all the arrays and normalizes them using the appropriate fRMA vectors that were generated by <code>train.R</code>. It should be run after installing the packages produced by <code>train.R</code>. It is simply used for testing to make sure the fRMA vectors work.</p>
-<h2 id="consistency-train.r-train-several-vector-sets-for-each-tissue"><a href="consistency-train.R"><code>consistency-train.R</code></a>: Train several vector sets for each tissue</h2>
+<h2 id="consistency-train.r-train-several-vector-sets-for-each-tissue"><a href="consistency-train.R.html"><code>consistency-train.R</code></a>: Train several vector sets for each tissue</h2>
 <p>This script essentially does the same thing as <code>train.R</code>, only it does it five times with five different subsamplings of the arrays to generate five different fRMA vector sets and saves them all in an R data file.</p>
-<h2 id="consistency-evaluate.r-verify-consistency-of-frma-vectors"><a href="consistency-evaluate.R"><code>consistency-evaluate.R</code></a>: Verify consistency of fRMA vectors</h2>
+<h2 id="consistency-evaluate.r-verify-consistency-of-frma-vectors"><a href="consistency-evaluate.R.html"><code>consistency-evaluate.R</code></a>: Verify consistency of fRMA vectors</h2>
 <p>This script loads the data file from <code>consistency-train.R</code>, then loads 20 random arrays from each tissue and normalizes them with all five fRMA vector sets, and also by ordinary RMA. It then produces plots of M vs A for every pair of normalizations. Unlike regular MA plots, these do <em>not</em> plot arrays against each other; instead, each array is plotted against itself as normalized by two different methods. So if two normalizations were perfectly consistent, the MA plot would be a flat horizontal line at M=0. It also produces boxplots and violin plots showing the M distribution for each of the pairwise comparisons.</p>

+ 116 - 0
examples/Salomon/fRMA/run-frma-example.R.html

@@ -0,0 +1,116 @@
+<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01//EN"
+   "http://www.w3.org/TR/html4/strict.dtd">
+
+<html>
+<head>
+  <title></title>
+  <meta http-equiv="content-type" content="text/html; charset=utf-8">
+  <style type="text/css">
+td.linenos { background-color: #f0f0f0; padding-right: 10px; }
+span.lineno { background-color: #f0f0f0; padding: 0 5px 0 5px; }
+pre { line-height: 125%; }
+body .hll { background-color: #ffffcc }
+body  { background: #f8f8f8; }
+body .c { color: #408080; font-style: italic } /* Comment */
+body .err { border: 1px solid #FF0000 } /* Error */
+body .k { color: #008000; font-weight: bold } /* Keyword */
+body .o { color: #666666 } /* Operator */
+body .ch { color: #408080; font-style: italic } /* Comment.Hashbang */
+body .cm { color: #408080; font-style: italic } /* Comment.Multiline */
+body .cp { color: #BC7A00 } /* Comment.Preproc */
+body .cpf { color: #408080; font-style: italic } /* Comment.PreprocFile */
+body .c1 { color: #408080; font-style: italic } /* Comment.Single */
+body .cs { color: #408080; font-style: italic } /* Comment.Special */
+body .gd { color: #A00000 } /* Generic.Deleted */
+body .ge { font-style: italic } /* Generic.Emph */
+body .gr { color: #FF0000 } /* Generic.Error */
+body .gh { color: #000080; font-weight: bold } /* Generic.Heading */
+body .gi { color: #00A000 } /* Generic.Inserted */
+body .go { color: #888888 } /* Generic.Output */
+body .gp { color: #000080; font-weight: bold } /* Generic.Prompt */
+body .gs { font-weight: bold } /* Generic.Strong */
+body .gu { color: #800080; font-weight: bold } /* Generic.Subheading */
+body .gt { color: #0044DD } /* Generic.Traceback */
+body .kc { color: #008000; font-weight: bold } /* Keyword.Constant */
+body .kd { color: #008000; font-weight: bold } /* Keyword.Declaration */
+body .kn { color: #008000; font-weight: bold } /* Keyword.Namespace */
+body .kp { color: #008000 } /* Keyword.Pseudo */
+body .kr { color: #008000; font-weight: bold } /* Keyword.Reserved */
+body .kt { color: #B00040 } /* Keyword.Type */
+body .m { color: #666666 } /* Literal.Number */
+body .s { color: #BA2121 } /* Literal.String */
+body .na { color: #7D9029 } /* Name.Attribute */
+body .nb { color: #008000 } /* Name.Builtin */
+body .nc { color: #0000FF; font-weight: bold } /* Name.Class */
+body .no { color: #880000 } /* Name.Constant */
+body .nd { color: #AA22FF } /* Name.Decorator */
+body .ni { color: #999999; font-weight: bold } /* Name.Entity */
+body .ne { color: #D2413A; font-weight: bold } /* Name.Exception */
+body .nf { color: #0000FF } /* Name.Function */
+body .nl { color: #A0A000 } /* Name.Label */
+body .nn { color: #0000FF; font-weight: bold } /* Name.Namespace */
+body .nt { color: #008000; font-weight: bold } /* Name.Tag */
+body .nv { color: #19177C } /* Name.Variable */
+body .ow { color: #AA22FF; font-weight: bold } /* Operator.Word */
+body .w { color: #bbbbbb } /* Text.Whitespace */
+body .mb { color: #666666 } /* Literal.Number.Bin */
+body .mf { color: #666666 } /* Literal.Number.Float */
+body .mh { color: #666666 } /* Literal.Number.Hex */
+body .mi { color: #666666 } /* Literal.Number.Integer */
+body .mo { color: #666666 } /* Literal.Number.Oct */
+body .sa { color: #BA2121 } /* Literal.String.Affix */
+body .sb { color: #BA2121 } /* Literal.String.Backtick */
+body .sc { color: #BA2121 } /* Literal.String.Char */
+body .dl { color: #BA2121 } /* Literal.String.Delimiter */
+body .sd { color: #BA2121; font-style: italic } /* Literal.String.Doc */
+body .s2 { color: #BA2121 } /* Literal.String.Double */
+body .se { color: #BB6622; font-weight: bold } /* Literal.String.Escape */
+body .sh { color: #BA2121 } /* Literal.String.Heredoc */
+body .si { color: #BB6688; font-weight: bold } /* Literal.String.Interpol */
+body .sx { color: #008000 } /* Literal.String.Other */
+body .sr { color: #BB6688 } /* Literal.String.Regex */
+body .s1 { color: #BA2121 } /* Literal.String.Single */
+body .ss { color: #19177C } /* Literal.String.Symbol */
+body .bp { color: #008000 } /* Name.Builtin.Pseudo */
+body .fm { color: #0000FF } /* Name.Function.Magic */
+body .vc { color: #19177C } /* Name.Variable.Class */
+body .vg { color: #19177C } /* Name.Variable.Global */
+body .vi { color: #19177C } /* Name.Variable.Instance */
+body .vm { color: #19177C } /* Name.Variable.Magic */
+body .il { color: #666666 } /* Literal.Number.Integer.Long */
+
+  </style>
+</head>
+<body>
+<h2></h2>
+
+<div class="highlight"><pre><span></span><span class="c1">## Load packages</span>
+<span class="kn">library</span><span class="p">(</span>openxlsx<span class="p">)</span>
+<span class="kn">library</span><span class="p">(</span>affy<span class="p">)</span>
+<span class="kn">library</span><span class="p">(</span>frma<span class="p">)</span>
+
+<span class="c1">## Load the fRMA data package and then load the data</span>
+<span class="kn">library</span><span class="p">(</span>DSalomon.PAX.hthgu133pluspmfrmavecs<span class="p">)</span>
+data<span class="p">(</span>DSalomon.PAX.hthgu133pluspmfrmavecs<span class="p">)</span>
+
+<span class="c1">## Get the sample table and list of CEL files</span>
+cel.dir <span class="o">&lt;-</span> <span class="s">&quot;Training Data/01 - CTOT08 ABECASSIS PAX Samples&quot;</span>
+<span class="c1">## Read the first xlsx file in the directory, which is a spreadsheet</span>
+<span class="c1">## containing all of the CEL file names</span>
+sample.table <span class="o">&lt;-</span> read.xlsx<span class="p">(</span><span class="kp">list.files</span><span class="p">(</span>cel.dir<span class="p">,</span> pattern<span class="o">=</span>glob2rx<span class="p">(</span><span class="s">&quot;*.xlsx&quot;</span><span class="p">),</span> full.names<span class="o">=</span><span class="kc">TRUE</span><span class="p">)[</span><span class="m">1</span><span class="p">])</span>
+cel.files <span class="o">&lt;-</span> <span class="kp">file.path</span><span class="p">(</span>cel.dir<span class="p">,</span> sample.table<span class="o">$</span>Filename<span class="p">)</span>
+
+<span class="c1">## Read the data from the CEL files</span>
+affy <span class="o">&lt;-</span> ReadAffy<span class="p">(</span>filenames<span class="o">=</span>cel.files<span class="p">,</span> phenoData<span class="o">=</span>sample.table<span class="p">)</span>
+
+<span class="c1">## Apply fRMA</span>
+eset <span class="o">&lt;-</span> frma<span class="p">(</span>affy<span class="p">,</span> input.vecs<span class="o">=</span>DSalomon.PAX.hthgu133pluspmfrmavecs<span class="p">)</span>
+
+<span class="c1">## Extract expression matrix</span>
+expr <span class="o">&lt;-</span> exprs<span class="p">(</span>eset<span class="p">)</span>
+
+<span class="c1">## Extract sample table</span>
+sample.table <span class="o">&lt;-</span> pData<span class="p">(</span>eset<span class="p">)</span>
+</pre></div>
+</body>
+</html>

+ 155 - 0
examples/Salomon/fRMA/test.R.html

@@ -0,0 +1,155 @@
+<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01//EN"
+   "http://www.w3.org/TR/html4/strict.dtd">
+
+<html>
+<head>
+  <title></title>
+  <meta http-equiv="content-type" content="text/html; charset=utf-8">
+  <style type="text/css">
+td.linenos { background-color: #f0f0f0; padding-right: 10px; }
+span.lineno { background-color: #f0f0f0; padding: 0 5px 0 5px; }
+pre { line-height: 125%; }
+body .hll { background-color: #ffffcc }
+body  { background: #f8f8f8; }
+body .c { color: #408080; font-style: italic } /* Comment */
+body .err { border: 1px solid #FF0000 } /* Error */
+body .k { color: #008000; font-weight: bold } /* Keyword */
+body .o { color: #666666 } /* Operator */
+body .ch { color: #408080; font-style: italic } /* Comment.Hashbang */
+body .cm { color: #408080; font-style: italic } /* Comment.Multiline */
+body .cp { color: #BC7A00 } /* Comment.Preproc */
+body .cpf { color: #408080; font-style: italic } /* Comment.PreprocFile */
+body .c1 { color: #408080; font-style: italic } /* Comment.Single */
+body .cs { color: #408080; font-style: italic } /* Comment.Special */
+body .gd { color: #A00000 } /* Generic.Deleted */
+body .ge { font-style: italic } /* Generic.Emph */
+body .gr { color: #FF0000 } /* Generic.Error */
+body .gh { color: #000080; font-weight: bold } /* Generic.Heading */
+body .gi { color: #00A000 } /* Generic.Inserted */
+body .go { color: #888888 } /* Generic.Output */
+body .gp { color: #000080; font-weight: bold } /* Generic.Prompt */
+body .gs { font-weight: bold } /* Generic.Strong */
+body .gu { color: #800080; font-weight: bold } /* Generic.Subheading */
+body .gt { color: #0044DD } /* Generic.Traceback */
+body .kc { color: #008000; font-weight: bold } /* Keyword.Constant */
+body .kd { color: #008000; font-weight: bold } /* Keyword.Declaration */
+body .kn { color: #008000; font-weight: bold } /* Keyword.Namespace */
+body .kp { color: #008000 } /* Keyword.Pseudo */
+body .kr { color: #008000; font-weight: bold } /* Keyword.Reserved */
+body .kt { color: #B00040 } /* Keyword.Type */
+body .m { color: #666666 } /* Literal.Number */
+body .s { color: #BA2121 } /* Literal.String */
+body .na { color: #7D9029 } /* Name.Attribute */
+body .nb { color: #008000 } /* Name.Builtin */
+body .nc { color: #0000FF; font-weight: bold } /* Name.Class */
+body .no { color: #880000 } /* Name.Constant */
+body .nd { color: #AA22FF } /* Name.Decorator */
+body .ni { color: #999999; font-weight: bold } /* Name.Entity */
+body .ne { color: #D2413A; font-weight: bold } /* Name.Exception */
+body .nf { color: #0000FF } /* Name.Function */
+body .nl { color: #A0A000 } /* Name.Label */
+body .nn { color: #0000FF; font-weight: bold } /* Name.Namespace */
+body .nt { color: #008000; font-weight: bold } /* Name.Tag */
+body .nv { color: #19177C } /* Name.Variable */
+body .ow { color: #AA22FF; font-weight: bold } /* Operator.Word */
+body .w { color: #bbbbbb } /* Text.Whitespace */
+body .mb { color: #666666 } /* Literal.Number.Bin */
+body .mf { color: #666666 } /* Literal.Number.Float */
+body .mh { color: #666666 } /* Literal.Number.Hex */
+body .mi { color: #666666 } /* Literal.Number.Integer */
+body .mo { color: #666666 } /* Literal.Number.Oct */
+body .sa { color: #BA2121 } /* Literal.String.Affix */
+body .sb { color: #BA2121 } /* Literal.String.Backtick */
+body .sc { color: #BA2121 } /* Literal.String.Char */
+body .dl { color: #BA2121 } /* Literal.String.Delimiter */
+body .sd { color: #BA2121; font-style: italic } /* Literal.String.Doc */
+body .s2 { color: #BA2121 } /* Literal.String.Double */
+body .se { color: #BB6622; font-weight: bold } /* Literal.String.Escape */
+body .sh { color: #BA2121 } /* Literal.String.Heredoc */
+body .si { color: #BB6688; font-weight: bold } /* Literal.String.Interpol */
+body .sx { color: #008000 } /* Literal.String.Other */
+body .sr { color: #BB6688 } /* Literal.String.Regex */
+body .s1 { color: #BA2121 } /* Literal.String.Single */
+body .ss { color: #19177C } /* Literal.String.Symbol */
+body .bp { color: #008000 } /* Name.Builtin.Pseudo */
+body .fm { color: #0000FF } /* Name.Function.Magic */
+body .vc { color: #19177C } /* Name.Variable.Class */
+body .vg { color: #19177C } /* Name.Variable.Global */
+body .vi { color: #19177C } /* Name.Variable.Instance */
+body .vm { color: #19177C } /* Name.Variable.Magic */
+body .il { color: #666666 } /* Literal.Number.Integer.Long */
+
+  </style>
+</head>
+<body>
+<h2></h2>
+
+<div class="highlight"><pre><span></span><span class="c1">#!/usr/bin/env Rscript</span>
+
+<span class="kn">library</span><span class="p">(</span>xlsx<span class="p">)</span>
+<span class="kn">library</span><span class="p">(</span>frma<span class="p">)</span>
+<span class="kn">library</span><span class="p">(</span>frmaTools<span class="p">)</span>
+<span class="kn">library</span><span class="p">(</span>stringr<span class="p">)</span>
+<span class="kn">library</span><span class="p">(</span>magrittr<span class="p">)</span>
+<span class="kn">library</span><span class="p">(</span>plyr<span class="p">)</span>
+<span class="kn">library</span><span class="p">(</span>affy<span class="p">)</span>
+<span class="kn">library</span><span class="p">(</span>preprocessCore<span class="p">)</span>
+
+training.data.dir <span class="o">&lt;-</span> <span class="s">&quot;Training Data&quot;</span>
+datasets <span class="o">&lt;-</span> <span class="kt">data.frame</span><span class="p">(</span>Dataset<span class="o">=</span><span class="kp">list.files</span><span class="p">(</span>training.data.dir<span class="p">))</span>
+<span class="kp">rownames</span><span class="p">(</span>datasets<span class="p">)</span> <span class="o">&lt;-</span> datasets<span class="o">$</span>Dataset
+datasets<span class="o">$</span>Tissue <span class="o">&lt;-</span> <span class="kp">factor</span><span class="p">(</span>str_extract<span class="p">(</span>datasets<span class="o">$</span>Dataset<span class="p">,</span> <span class="s">&quot;\\b(PAX|BX)\\b&quot;</span><span class="p">))</span>
+
+tsmsg <span class="o">&lt;-</span> <span class="kr">function</span><span class="p">(</span><span class="kc">...</span><span class="p">)</span> <span class="p">{</span>
+  <span class="kp">message</span><span class="p">(</span><span class="kp">date</span><span class="p">(),</span> <span class="s">&quot;: &quot;</span><span class="p">,</span> <span class="kc">...</span><span class="p">)</span>
+<span class="p">}</span>
+
+parse.date.from.filename <span class="o">&lt;-</span> <span class="kr">function</span><span class="p">(</span>fname<span class="p">)</span> <span class="p">{</span>
+    res1 <span class="o">&lt;-</span> str_match<span class="p">(</span>fname<span class="p">,</span> <span class="s">&quot;^(\\d\\d)(\\d\\d)(\\d\\d)&quot;</span><span class="p">)[,</span><span class="kt">c</span><span class="p">(</span><span class="m">4</span><span class="p">,</span><span class="m">2</span><span class="p">,</span><span class="m">3</span><span class="p">)]</span>
+    res2 <span class="o">&lt;-</span> str_match<span class="p">(</span>fname<span class="p">,</span> <span class="s">&quot;^20(\\d\\d)_(\\d\\d)_(\\d\\d)&quot;</span><span class="p">)[,</span><span class="m">-1</span><span class="p">]</span>
+    res1<span class="p">[</span><span class="kp">is.na</span><span class="p">(</span>res1<span class="p">)]</span> <span class="o">&lt;-</span> res2<span class="p">[</span><span class="kp">is.na</span><span class="p">(</span>res1<span class="p">)]</span>
+    <span class="kp">colnames</span><span class="p">(</span>res1<span class="p">)</span> <span class="o">&lt;-</span> <span class="kt">c</span><span class="p">(</span><span class="s">&quot;year&quot;</span><span class="p">,</span> <span class="s">&quot;month&quot;</span><span class="p">,</span> <span class="s">&quot;day&quot;</span><span class="p">)</span>
+    res1<span class="p">[,</span><span class="s">&quot;year&quot;</span><span class="p">]</span> <span class="o">%&lt;&gt;%</span> str_c<span class="p">(</span><span class="s">&quot;20&quot;</span><span class="p">,</span> <span class="m">.</span><span class="p">)</span>
+    <span class="kp">as.Date</span><span class="p">(</span><span class="kp">do.call</span><span class="p">(</span><span class="kp">ISOdate</span><span class="p">,</span> <span class="kt">data.frame</span><span class="p">(</span>res1<span class="p">)))</span>
+<span class="p">}</span>
+
+sample.tables <span class="o">&lt;-</span> ddply<span class="p">(</span>datasets<span class="p">,</span> <span class="m">.</span><span class="p">(</span>Dataset<span class="p">),</span> <span class="kr">function</span><span class="p">(</span>df<span class="p">)</span> <span class="p">{</span>
+    df <span class="o">&lt;-</span> df<span class="p">[</span><span class="m">1</span><span class="p">,]</span>
+    <span class="kp">rownames</span><span class="p">(</span>df<span class="p">)</span> <span class="o">&lt;-</span> <span class="kc">NULL</span>
+    dset.dir <span class="o">&lt;-</span> <span class="kp">file.path</span><span class="p">(</span>training.data.dir<span class="p">,</span> df<span class="o">$</span>Dataset<span class="p">)</span>
+    x <span class="o">&lt;-</span> read.xlsx<span class="p">(</span><span class="kp">list.files</span><span class="p">(</span>dset.dir<span class="p">,</span> pattern<span class="o">=</span>glob2rx<span class="p">(</span><span class="s">&quot;*.xlsx&quot;</span><span class="p">),</span> full.names<span class="o">=</span><span class="kc">TRUE</span><span class="p">)[</span><span class="m">1</span><span class="p">],</span> <span class="m">1</span><span class="p">)</span> <span class="o">%&gt;%</span>
+        setNames<span class="p">(</span><span class="kt">c</span><span class="p">(</span><span class="s">&quot;Filename&quot;</span><span class="p">,</span> <span class="s">&quot;Phenotype&quot;</span><span class="p">,</span> <span class="s">&quot;ScanDate&quot;</span><span class="p">))</span>
+    x<span class="o">$</span>Filename <span class="o">&lt;-</span> <span class="kp">as.character</span><span class="p">(</span>x<span class="o">$</span>Filename<span class="p">)</span>
+    missing.CEL <span class="o">&lt;-</span> <span class="o">!</span>str_detect<span class="p">(</span>x<span class="o">$</span>Filename<span class="p">,</span> <span class="s">&quot;\\.CEL$&quot;</span><span class="p">)</span>
+    x<span class="o">$</span>Filename<span class="p">[</span>missing.CEL<span class="p">]</span> <span class="o">&lt;-</span> str_c<span class="p">(</span>x<span class="o">$</span>Filename<span class="p">[</span>missing.CEL<span class="p">],</span> <span class="s">&quot;.CEL&quot;</span><span class="p">)</span>
+    <span class="kp">stopifnot</span><span class="p">(</span><span class="kp">all</span><span class="p">(</span>str_detect<span class="p">(</span>x<span class="o">$</span>Filename<span class="p">,</span> <span class="s">&quot;\\.CEL$&quot;</span><span class="p">)))</span>
+    parsed.date <span class="o">&lt;-</span> parse.date.from.filename<span class="p">(</span>x<span class="o">$</span>Filename<span class="p">)</span>
+    x<span class="o">$</span>ScanDate<span class="p">[</span><span class="o">!</span><span class="kp">is.na</span><span class="p">(</span>parsed.date<span class="p">)]</span> <span class="o">&lt;-</span> parsed.date<span class="p">[</span><span class="o">!</span><span class="kp">is.na</span><span class="p">(</span>parsed.date<span class="p">)]</span>
+    x <span class="o">%&gt;%</span> <span class="kp">cbind</span><span class="p">(</span>df<span class="p">)</span> <span class="o">%&gt;%</span>
+        <span class="kp">transform</span><span class="p">(</span>Filename<span class="o">=</span><span class="kp">file.path</span><span class="p">(</span>dset.dir<span class="p">,</span> Filename<span class="p">),</span>
+                  Batch<span class="o">=</span><span class="kp">droplevels</span><span class="p">(</span>Tissue<span class="o">:</span>Dataset<span class="o">:</span><span class="kp">factor</span><span class="p">(</span>ScanDate<span class="p">)</span><span class="o">:</span>Phenotype<span class="p">))</span> <span class="o">%&gt;%</span>
+                      <span class="kp">subset</span><span class="p">(</span><span class="o">!</span> Filename <span class="o">%in%</span> blacklist<span class="p">)</span>
+<span class="p">})</span> <span class="o">%&gt;%</span> <span class="kp">split</span><span class="p">(</span><span class="m">.</span><span class="o">$</span>Tissue<span class="p">)</span> <span class="o">%&gt;%</span> <span class="kp">lapply</span><span class="p">(</span><span class="kp">droplevels</span><span class="p">)</span>
+
+annotation <span class="o">&lt;-</span> cleancdfname<span class="p">(</span>affyio<span class="o">:::</span>read.celfile.header<span class="p">(</span>sample.tables<span class="p">[[</span><span class="m">1</span><span class="p">]]</span><span class="o">$</span>Filename<span class="p">[</span><span class="m">1</span><span class="p">])</span><span class="o">$</span>cdfName<span class="p">,</span> <span class="kc">FALSE</span><span class="p">)</span>
+
+esets <span class="o">&lt;-</span> <span class="kt">list</span><span class="p">()</span>
+
+<span class="kr">for</span> <span class="p">(</span>i <span class="kr">in</span> <span class="kp">names</span><span class="p">(</span>sample.tables<span class="p">))</span> <span class="p">{</span>
+    pkgname <span class="o">&lt;-</span> <span class="kp">sprintf</span><span class="p">(</span><span class="s">&quot;DSalomon.%s.%sfrmavecs&quot;</span><span class="p">,</span> i<span class="p">,</span> annotation<span class="p">)</span>
+    <span class="kp">message</span><span class="p">(</span><span class="s">&quot;Loading &quot;</span><span class="p">,</span> pkgname<span class="p">)</span>
+    <span class="kn">require</span><span class="p">(</span>pkgname<span class="p">,</span> character.only<span class="o">=</span><span class="kc">TRUE</span><span class="p">,</span> quietly<span class="o">=</span><span class="kc">TRUE</span><span class="p">)</span>
+    data<span class="p">(</span><span class="kt">list</span> <span class="o">=</span> pkgname<span class="p">)</span>
+
+    <span class="kp">message</span><span class="p">(</span><span class="s">&quot;Loading raw data for &quot;</span><span class="p">,</span> i<span class="p">)</span>
+    stab <span class="o">&lt;-</span> sample.tables<span class="p">[[</span>i<span class="p">]]</span>
+    affy <span class="o">&lt;-</span> ReadAffy<span class="p">(</span>filenames<span class="o">=</span>stab<span class="o">$</span>Filename<span class="p">,</span> phenoData<span class="o">=</span>stab<span class="p">)</span>
+
+    <span class="kp">message</span><span class="p">(</span><span class="s">&quot;Running fRMA for &quot;</span><span class="p">,</span> i<span class="p">)</span>
+    esets<span class="p">[[</span>i<span class="p">]]</span> <span class="o">&lt;-</span> frma<span class="p">(</span>affy<span class="p">,</span> input.vecs<span class="o">=</span><span class="kp">get</span><span class="p">(</span>pkgname<span class="p">),</span> verbose<span class="o">=</span><span class="kc">TRUE</span><span class="p">)</span>
+    <span class="kp">rm</span><span class="p">(</span>affy<span class="p">)</span>
+    <span class="kp">gc</span><span class="p">()</span>
+<span class="p">}</span>
+</pre></div>
+</body>
+</html>

+ 351 - 0
examples/Salomon/fRMA/train.R.html

@@ -0,0 +1,351 @@
+<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01//EN"
+   "http://www.w3.org/TR/html4/strict.dtd">
+
+<html>
+<head>
+  <title></title>
+  <meta http-equiv="content-type" content="text/html; charset=utf-8">
+  <style type="text/css">
+td.linenos { background-color: #f0f0f0; padding-right: 10px; }
+span.lineno { background-color: #f0f0f0; padding: 0 5px 0 5px; }
+pre { line-height: 125%; }
+body .hll { background-color: #ffffcc }
+body  { background: #f8f8f8; }
+body .c { color: #408080; font-style: italic } /* Comment */
+body .err { border: 1px solid #FF0000 } /* Error */
+body .k { color: #008000; font-weight: bold } /* Keyword */
+body .o { color: #666666 } /* Operator */
+body .ch { color: #408080; font-style: italic } /* Comment.Hashbang */
+body .cm { color: #408080; font-style: italic } /* Comment.Multiline */
+body .cp { color: #BC7A00 } /* Comment.Preproc */
+body .cpf { color: #408080; font-style: italic } /* Comment.PreprocFile */
+body .c1 { color: #408080; font-style: italic } /* Comment.Single */
+body .cs { color: #408080; font-style: italic } /* Comment.Special */
+body .gd { color: #A00000 } /* Generic.Deleted */
+body .ge { font-style: italic } /* Generic.Emph */
+body .gr { color: #FF0000 } /* Generic.Error */
+body .gh { color: #000080; font-weight: bold } /* Generic.Heading */
+body .gi { color: #00A000 } /* Generic.Inserted */
+body .go { color: #888888 } /* Generic.Output */
+body .gp { color: #000080; font-weight: bold } /* Generic.Prompt */
+body .gs { font-weight: bold } /* Generic.Strong */
+body .gu { color: #800080; font-weight: bold } /* Generic.Subheading */
+body .gt { color: #0044DD } /* Generic.Traceback */
+body .kc { color: #008000; font-weight: bold } /* Keyword.Constant */
+body .kd { color: #008000; font-weight: bold } /* Keyword.Declaration */
+body .kn { color: #008000; font-weight: bold } /* Keyword.Namespace */
+body .kp { color: #008000 } /* Keyword.Pseudo */
+body .kr { color: #008000; font-weight: bold } /* Keyword.Reserved */
+body .kt { color: #B00040 } /* Keyword.Type */
+body .m { color: #666666 } /* Literal.Number */
+body .s { color: #BA2121 } /* Literal.String */
+body .na { color: #7D9029 } /* Name.Attribute */
+body .nb { color: #008000 } /* Name.Builtin */
+body .nc { color: #0000FF; font-weight: bold } /* Name.Class */
+body .no { color: #880000 } /* Name.Constant */
+body .nd { color: #AA22FF } /* Name.Decorator */
+body .ni { color: #999999; font-weight: bold } /* Name.Entity */
+body .ne { color: #D2413A; font-weight: bold } /* Name.Exception */
+body .nf { color: #0000FF } /* Name.Function */
+body .nl { color: #A0A000 } /* Name.Label */
+body .nn { color: #0000FF; font-weight: bold } /* Name.Namespace */
+body .nt { color: #008000; font-weight: bold } /* Name.Tag */
+body .nv { color: #19177C } /* Name.Variable */
+body .ow { color: #AA22FF; font-weight: bold } /* Operator.Word */
+body .w { color: #bbbbbb } /* Text.Whitespace */
+body .mb { color: #666666 } /* Literal.Number.Bin */
+body .mf { color: #666666 } /* Literal.Number.Float */
+body .mh { color: #666666 } /* Literal.Number.Hex */
+body .mi { color: #666666 } /* Literal.Number.Integer */
+body .mo { color: #666666 } /* Literal.Number.Oct */
+body .sa { color: #BA2121 } /* Literal.String.Affix */
+body .sb { color: #BA2121 } /* Literal.String.Backtick */
+body .sc { color: #BA2121 } /* Literal.String.Char */
+body .dl { color: #BA2121 } /* Literal.String.Delimiter */
+body .sd { color: #BA2121; font-style: italic } /* Literal.String.Doc */
+body .s2 { color: #BA2121 } /* Literal.String.Double */
+body .se { color: #BB6622; font-weight: bold } /* Literal.String.Escape */
+body .sh { color: #BA2121 } /* Literal.String.Heredoc */
+body .si { color: #BB6688; font-weight: bold } /* Literal.String.Interpol */
+body .sx { color: #008000 } /* Literal.String.Other */
+body .sr { color: #BB6688 } /* Literal.String.Regex */
+body .s1 { color: #BA2121 } /* Literal.String.Single */
+body .ss { color: #19177C } /* Literal.String.Symbol */
+body .bp { color: #008000 } /* Name.Builtin.Pseudo */
+body .fm { color: #0000FF } /* Name.Function.Magic */
+body .vc { color: #19177C } /* Name.Variable.Class */
+body .vg { color: #19177C } /* Name.Variable.Global */
+body .vi { color: #19177C } /* Name.Variable.Instance */
+body .vm { color: #19177C } /* Name.Variable.Magic */
+body .il { color: #666666 } /* Literal.Number.Integer.Long */
+
+  </style>
+</head>
+<body>
+<h2></h2>
+
+<div class="highlight"><pre><span></span><span class="c1">#!/usr/bin/env Rscript</span>
+
+<span class="kn">library</span><span class="p">(</span>xlsx<span class="p">)</span>
+<span class="kn">library</span><span class="p">(</span>frmaTools<span class="p">)</span>
+<span class="kn">library</span><span class="p">(</span>stringr<span class="p">)</span>
+<span class="kn">library</span><span class="p">(</span>magrittr<span class="p">)</span>
+<span class="kn">library</span><span class="p">(</span>plyr<span class="p">)</span>
+<span class="kn">library</span><span class="p">(</span>affy<span class="p">)</span>
+<span class="kn">library</span><span class="p">(</span>preprocessCore<span class="p">)</span>
+
+training.data.dir <span class="o">&lt;-</span> <span class="s">&quot;Training Data&quot;</span>
+datasets <span class="o">&lt;-</span> <span class="kt">data.frame</span><span class="p">(</span>Dataset<span class="o">=</span><span class="kp">list.files</span><span class="p">(</span>training.data.dir<span class="p">))</span>
+<span class="kp">rownames</span><span class="p">(</span>datasets<span class="p">)</span> <span class="o">&lt;-</span> datasets<span class="o">$</span>Dataset
+datasets<span class="o">$</span>Tissue <span class="o">&lt;-</span> <span class="kp">factor</span><span class="p">(</span>str_extract<span class="p">(</span>datasets<span class="o">$</span>Dataset<span class="p">,</span> <span class="s">&quot;\\b(PAX|BX)\\b&quot;</span><span class="p">))</span>
+
+tsmsg <span class="o">&lt;-</span> <span class="kr">function</span><span class="p">(</span><span class="kc">...</span><span class="p">)</span> <span class="p">{</span>
+  <span class="kp">message</span><span class="p">(</span><span class="kp">date</span><span class="p">(),</span> <span class="s">&quot;: &quot;</span><span class="p">,</span> <span class="kc">...</span><span class="p">)</span>
+<span class="p">}</span>
+
+<span class="c1">## Some Scan Dates are marked as identical for multiple batches, which</span>
+<span class="c1">## is bad. But the dates embedded in the file names for these batches</span>
+<span class="c1">## are different, so we use those dates instead.</span>
+parse.date.from.filename <span class="o">&lt;-</span> <span class="kr">function</span><span class="p">(</span>fname<span class="p">)</span> <span class="p">{</span>
+    res1 <span class="o">&lt;-</span> str_match<span class="p">(</span>fname<span class="p">,</span> <span class="s">&quot;^(\\d\\d)(\\d\\d)(\\d\\d)&quot;</span><span class="p">)[,</span><span class="kt">c</span><span class="p">(</span><span class="m">4</span><span class="p">,</span><span class="m">2</span><span class="p">,</span><span class="m">3</span><span class="p">)]</span>
+    res2 <span class="o">&lt;-</span> str_match<span class="p">(</span>fname<span class="p">,</span> <span class="s">&quot;^20(\\d\\d)_(\\d\\d)_(\\d\\d)&quot;</span><span class="p">)[,</span><span class="m">-1</span><span class="p">]</span>
+    res1<span class="p">[</span><span class="kp">is.na</span><span class="p">(</span>res1<span class="p">)]</span> <span class="o">&lt;-</span> res2<span class="p">[</span><span class="kp">is.na</span><span class="p">(</span>res1<span class="p">)]</span>
+    <span class="kp">colnames</span><span class="p">(</span>res1<span class="p">)</span> <span class="o">&lt;-</span> <span class="kt">c</span><span class="p">(</span><span class="s">&quot;year&quot;</span><span class="p">,</span> <span class="s">&quot;month&quot;</span><span class="p">,</span> <span class="s">&quot;day&quot;</span><span class="p">)</span>
+    res1<span class="p">[,</span><span class="s">&quot;year&quot;</span><span class="p">]</span> <span class="o">%&lt;&gt;%</span> str_c<span class="p">(</span><span class="s">&quot;20&quot;</span><span class="p">,</span> <span class="m">.</span><span class="p">)</span>
+    <span class="kp">as.Date</span><span class="p">(</span><span class="kp">do.call</span><span class="p">(</span><span class="kp">ISOdate</span><span class="p">,</span> <span class="kt">data.frame</span><span class="p">(</span>res1<span class="p">)))</span>
+<span class="p">}</span>
+
+makeVectorsAffyBatch <span class="o">&lt;-</span> <span class="kr">function</span> <span class="p">(</span>files<span class="p">,</span> batch.id<span class="p">,</span> background <span class="o">=</span> <span class="s">&quot;rma&quot;</span><span class="p">,</span> normalize <span class="o">=</span> <span class="s">&quot;quantile&quot;</span><span class="p">,</span>
+    normVec <span class="o">=</span> <span class="kc">NULL</span><span class="p">,</span> cdfname <span class="o">=</span> <span class="kc">NULL</span><span class="p">,</span> file.dir <span class="o">=</span> <span class="s">&quot;.&quot;</span><span class="p">,</span> verbose <span class="o">=</span> <span class="kc">TRUE</span><span class="p">)</span>
+<span class="p">{</span>
+    wd <span class="o">&lt;-</span> <span class="kp">getwd</span><span class="p">()</span>
+    <span class="kp">setwd</span><span class="p">(</span>file.dir<span class="p">)</span>
+    object <span class="o">&lt;-</span> ReadAffy<span class="p">(</span>filenames <span class="o">=</span> files<span class="p">,</span> cdfname <span class="o">=</span> cdfname<span class="p">,</span>
+        verbose <span class="o">=</span> verbose<span class="p">)</span>
+    <span class="kp">setwd</span><span class="p">(</span>wd<span class="p">)</span>
+    <span class="kr">if</span> <span class="p">(</span>verbose<span class="p">)</span>
+        <span class="kp">message</span><span class="p">(</span><span class="s">&quot;Data loaded \n&quot;</span><span class="p">)</span>
+    batch.size <span class="o">&lt;-</span> <span class="kp">table</span><span class="p">(</span>batch.id<span class="p">)[</span><span class="m">1</span><span class="p">]</span>
+    <span class="kr">if</span> <span class="p">(</span><span class="o">!</span><span class="kp">all</span><span class="p">(</span><span class="kp">table</span><span class="p">(</span>batch.id<span class="p">)</span> <span class="o">==</span> batch.size<span class="p">))</span>
+        <span class="kp">stop</span><span class="p">(</span><span class="s">&quot;Batches must be of the same size.&quot;</span><span class="p">)</span>
+    <span class="kr">if</span> <span class="p">(</span>background <span class="o">==</span> <span class="s">&quot;rma&quot;</span><span class="p">)</span> <span class="p">{</span>
+        object <span class="o">&lt;-</span> bg.correct.rma<span class="p">(</span>object<span class="p">)</span>
+        <span class="kr">if</span> <span class="p">(</span>verbose<span class="p">)</span>
+            <span class="kp">message</span><span class="p">(</span><span class="s">&quot;Background Corrected \n&quot;</span><span class="p">)</span>
+        <span class="kp">gc</span><span class="p">()</span>
+    <span class="p">}</span>
+    pms <span class="o">&lt;-</span> pm<span class="p">(</span>object<span class="p">)</span>
+    pns <span class="o">&lt;-</span> probeNames<span class="p">(</span>object<span class="p">)</span>
+    pmi <span class="o">&lt;-</span> <span class="kp">unlist</span><span class="p">(</span>pmindex<span class="p">(</span>object<span class="p">))</span>
+    <span class="kr">if</span> <span class="p">(</span><span class="o">!</span><span class="kp">all</span><span class="p">(</span><span class="kp">sprintf</span><span class="p">(</span><span class="s">&quot;%i&quot;</span><span class="p">,</span> pmi<span class="p">)</span> <span class="o">==</span> <span class="kp">rownames</span><span class="p">(</span>pms<span class="p">)))</span>
+        <span class="kp">stop</span><span class="p">(</span><span class="s">&quot;Mismatch between pmindex and rownames of pms&quot;</span><span class="p">)</span>
+    <span class="kp">rm</span><span class="p">(</span>object<span class="p">)</span>
+    <span class="kp">gc</span><span class="p">()</span>
+    <span class="kr">if</span> <span class="p">(</span>normalize <span class="o">==</span> <span class="s">&quot;quantile&quot;</span><span class="p">)</span> <span class="p">{</span>
+        <span class="kr">if</span> <span class="p">(</span><span class="kp">is.null</span><span class="p">(</span>normVec<span class="p">))</span>
+            normVec <span class="o">&lt;-</span> normalize.quantiles.determine.target<span class="p">(</span>pms<span class="p">)</span>
+        pms <span class="o">&lt;-</span> normalize.quantiles.use.target<span class="p">(</span>pms<span class="p">,</span> normVec<span class="p">)</span>
+        <span class="kp">names</span><span class="p">(</span>normVec<span class="p">)</span> <span class="o">&lt;-</span> <span class="kp">as.character</span><span class="p">(</span>pmi<span class="p">)</span>
+        <span class="kr">if</span> <span class="p">(</span>verbose<span class="p">)</span>
+            <span class="kp">message</span><span class="p">(</span><span class="s">&quot;Normalized \n&quot;</span><span class="p">)</span>
+    <span class="p">}</span>
+    pms <span class="o">&lt;-</span> <span class="kp">log2</span><span class="p">(</span>pms<span class="p">)</span>
+    <span class="kp">gc</span><span class="p">()</span>
+    N <span class="o">&lt;-</span> <span class="m">1</span><span class="o">:</span><span class="kp">dim</span><span class="p">(</span>pms<span class="p">)[</span><span class="m">1</span><span class="p">]</span>
+    S <span class="o">&lt;-</span> <span class="kp">split</span><span class="p">(</span>N<span class="p">,</span> pns<span class="p">)</span>
+    nc <span class="o">&lt;-</span> <span class="kp">ncol</span><span class="p">(</span>pms<span class="p">)</span>
+    nr <span class="o">&lt;-</span> <span class="kp">nrow</span><span class="p">(</span>pms<span class="p">)</span>
+    resids <span class="o">&lt;-</span> <span class="kt">matrix</span><span class="p">(</span>ncol <span class="o">=</span> nc<span class="p">,</span> nrow <span class="o">=</span> nr<span class="p">)</span>
+    probeVec <span class="o">&lt;-</span> <span class="kt">vector</span><span class="p">(</span>length <span class="o">=</span> nr<span class="p">)</span>
+    <span class="kr">if</span> <span class="p">(</span>verbose<span class="p">)</span>
+        <span class="kp">message</span><span class="p">(</span><span class="s">&quot;Beginning Probe Effect Calculation ... \n&quot;</span><span class="p">)</span>
+    <span class="kr">for</span> <span class="p">(</span>k <span class="kr">in</span> <span class="m">1</span><span class="o">:</span><span class="kp">length</span><span class="p">(</span>S<span class="p">))</span> <span class="p">{</span>
+        fit <span class="o">&lt;-</span> rcModelPLM<span class="p">(</span>pms<span class="p">[</span>S<span class="p">[[</span>k<span class="p">]],</span> <span class="p">,</span> drop <span class="o">=</span> <span class="kc">FALSE</span><span class="p">])</span>
+        resids<span class="p">[</span>S<span class="p">[[</span>k<span class="p">]],</span> <span class="p">]</span> <span class="o">&lt;-</span> fit<span class="o">$</span>Residuals
+        probeVec<span class="p">[</span>S<span class="p">[[</span>k<span class="p">]]]</span> <span class="o">&lt;-</span> fit<span class="o">$</span>Estimates<span class="p">[(</span>nc <span class="o">+</span> <span class="m">1</span><span class="p">)</span><span class="o">:</span><span class="kp">length</span><span class="p">(</span>fit<span class="o">$</span>Estimates<span class="p">)]</span>
+        <span class="kr">if</span> <span class="p">((</span>k<span class="o">%%</span><span class="m">1000</span><span class="p">)</span> <span class="o">==</span> <span class="m">0</span><span class="p">)</span> <span class="p">{</span>
+            <span class="kp">message</span><span class="p">(</span><span class="kp">paste</span><span class="p">(</span><span class="s">&quot;Finished probeset:&quot;</span><span class="p">,</span> k<span class="p">,</span> <span class="s">&quot;\n&quot;</span><span class="p">))</span>
+            <span class="kp">gc</span><span class="p">()</span>
+        <span class="p">}</span>
+    <span class="p">}</span>
+    <span class="kp">names</span><span class="p">(</span>probeVec<span class="p">)</span> <span class="o">&lt;-</span> <span class="kp">as.character</span><span class="p">(</span>pmi<span class="p">)</span>
+    <span class="kr">if</span> <span class="p">(</span>verbose<span class="p">)</span>
+        <span class="kp">message</span><span class="p">(</span><span class="s">&quot;Probe Effects Calculated \n&quot;</span><span class="p">)</span>
+    <span class="kp">gc</span><span class="p">()</span>
+    tmp <span class="o">&lt;-</span> <span class="kp">split</span><span class="p">(</span><span class="kp">t</span><span class="p">(</span>resids<span class="p">),</span> batch.id<span class="p">)</span>
+    withinMean <span class="o">&lt;-</span> <span class="kp">lapply</span><span class="p">(</span>tmp<span class="p">,</span> frmaTools<span class="o">:::</span>getProbeMean<span class="p">,</span> batch.size<span class="p">)</span>
+    withinVar <span class="o">&lt;-</span> <span class="kp">lapply</span><span class="p">(</span>tmp<span class="p">,</span> frmaTools<span class="o">:::</span>getProbeVar<span class="p">,</span> batch.size<span class="p">)</span>
+    withinAvgVar <span class="o">&lt;-</span> <span class="kp">rowMeans</span><span class="p">(</span><span class="kt">matrix</span><span class="p">(</span><span class="kp">unlist</span><span class="p">(</span>withinVar<span class="p">),</span> ncol <span class="o">=</span> <span class="kp">length</span><span class="p">(</span>withinVar<span class="p">)))</span>
+    btwVar <span class="o">&lt;-</span> <span class="kp">apply</span><span class="p">(</span><span class="kt">matrix</span><span class="p">(</span><span class="kp">unlist</span><span class="p">(</span>withinMean<span class="p">),</span> ncol <span class="o">=</span> <span class="kp">length</span><span class="p">(</span>withinMean<span class="p">)),</span>
+        <span class="m">1</span><span class="p">,</span> var<span class="p">)</span>
+    <span class="kp">rm</span><span class="p">(</span>tmp<span class="p">)</span>
+    <span class="kp">rm</span><span class="p">(</span>withinMean<span class="p">)</span>
+    <span class="kp">rm</span><span class="p">(</span>withinVar<span class="p">)</span>
+    <span class="kp">names</span><span class="p">(</span>withinAvgVar<span class="p">)</span> <span class="o">&lt;-</span> <span class="kp">names</span><span class="p">(</span>btwVar<span class="p">)</span> <span class="o">&lt;-</span> <span class="kp">as.character</span><span class="p">(</span>pmi<span class="p">)</span>
+    <span class="kr">if</span> <span class="p">(</span>verbose<span class="p">)</span>
+        <span class="kp">message</span><span class="p">(</span><span class="s">&quot;Probe Variances Calculated \n&quot;</span><span class="p">)</span>
+    <span class="kp">gc</span><span class="p">()</span>
+    tmp <span class="o">&lt;-</span> <span class="kp">split</span><span class="p">(</span>resids<span class="p">,</span> pns<span class="p">)</span>
+    psetMAD <span class="o">&lt;-</span> <span class="kp">unlist</span><span class="p">(</span><span class="kp">lapply</span><span class="p">(</span>tmp<span class="p">,</span> frmaTools<span class="o">:::</span>getPsetMAD<span class="p">,</span> nc<span class="p">,</span> batch.id<span class="p">))</span>
+    <span class="kp">names</span><span class="p">(</span>psetMAD<span class="p">)</span> <span class="o">&lt;-</span> <span class="kp">names</span><span class="p">(</span>tmp<span class="p">)</span>
+    <span class="kp">rm</span><span class="p">(</span>tmp<span class="p">)</span>
+    <span class="kp">rm</span><span class="p">(</span>resids<span class="p">)</span>
+    <span class="kr">if</span> <span class="p">(</span>verbose<span class="p">)</span>
+        <span class="kp">message</span><span class="p">(</span><span class="s">&quot;Probe Set SDs Calculated \n&quot;</span><span class="p">)</span>
+    <span class="kp">gc</span><span class="p">()</span>
+    w <span class="o">&lt;-</span> <span class="m">1</span><span class="o">/</span><span class="p">(</span>withinAvgVar <span class="o">+</span> btwVar<span class="p">)</span>
+    w<span class="p">[</span>w <span class="o">==</span> <span class="kc">Inf</span><span class="p">]</span> <span class="o">&lt;-</span> <span class="m">1</span>
+    medianSE <span class="o">&lt;-</span> <span class="kt">vector</span><span class="p">(</span>length <span class="o">=</span> <span class="kp">length</span><span class="p">(</span>psetMAD<span class="p">))</span>
+    <span class="kr">if</span> <span class="p">(</span>verbose<span class="p">)</span>
+        <span class="kp">message</span><span class="p">(</span><span class="s">&quot;Beginning Median SE Calculation ... \n&quot;</span><span class="p">)</span>
+    <span class="kr">for</span> <span class="p">(</span>k <span class="kr">in</span> <span class="m">1</span><span class="o">:</span><span class="kp">length</span><span class="p">(</span>S<span class="p">))</span> <span class="p">{</span>
+        fit <span class="o">&lt;-</span> frmaTools<span class="o">:::</span>rwaFit2<span class="p">(</span>pms<span class="p">[</span>S<span class="p">[[</span>k<span class="p">]],</span> <span class="p">,</span> drop <span class="o">=</span> <span class="kc">FALSE</span><span class="p">],</span> w<span class="p">[</span>S<span class="p">[[</span>k<span class="p">]]],</span>
+            probeVec<span class="p">[</span>S<span class="p">[[</span>k<span class="p">]]],</span> psetMAD<span class="p">[</span>k<span class="p">])</span>
+        medianSE<span class="p">[</span>k<span class="p">]</span> <span class="o">&lt;-</span> median<span class="p">(</span>fit<span class="o">$</span>StdErrors<span class="p">)</span>
+        <span class="kr">if</span> <span class="p">((</span>k<span class="o">%%</span><span class="m">1000</span><span class="p">)</span> <span class="o">==</span> <span class="m">0</span><span class="p">)</span> <span class="p">{</span>
+            <span class="kp">message</span><span class="p">(</span><span class="kp">paste</span><span class="p">(</span><span class="s">&quot;Finished probeset:&quot;</span><span class="p">,</span> k<span class="p">,</span> <span class="s">&quot;\n&quot;</span><span class="p">))</span>
+            <span class="kp">gc</span><span class="p">()</span>
+        <span class="p">}</span>
+    <span class="p">}</span>
+    <span class="kp">names</span><span class="p">(</span>medianSE<span class="p">)</span> <span class="o">&lt;-</span> <span class="kp">names</span><span class="p">(</span>psetMAD<span class="p">)</span>
+    <span class="kr">if</span> <span class="p">(</span>verbose<span class="p">)</span>
+        <span class="kp">message</span><span class="p">(</span><span class="s">&quot;Median SEs Calculated \n&quot;</span><span class="p">)</span>
+    <span class="kp">gc</span><span class="p">()</span>
+    <span class="kp">rm</span><span class="p">(</span>w<span class="p">)</span>
+    <span class="kp">rm</span><span class="p">(</span>pms<span class="p">)</span>
+    <span class="kp">rm</span><span class="p">(</span>pns<span class="p">)</span>
+    <span class="kp">gc</span><span class="p">()</span>
+    <span class="kr">if</span> <span class="p">(</span><span class="kp">is.null</span><span class="p">(</span>cdfname<span class="p">))</span> <span class="p">{</span>
+        vers <span class="o">&lt;-</span> <span class="s">&quot;&quot;</span>
+    <span class="p">}</span> <span class="kr">else</span> <span class="p">{</span>
+        vers <span class="o">&lt;-</span> <span class="kp">as.character</span><span class="p">(</span>packageVersion<span class="p">(</span>cdfname<span class="p">))</span>
+    <span class="p">}</span>
+    <span class="c1">## vers &lt;- ifelse(!is.null(cdfname), as.character(packageVersion(cdfname)),</span>
+    <span class="c1">##     &quot;&quot;)</span>
+    <span class="kr">return</span><span class="p">(</span><span class="kt">list</span><span class="p">(</span>normVec <span class="o">=</span> normVec<span class="p">,</span> probeVec <span class="o">=</span> probeVec<span class="p">,</span> probeVarWithin <span class="o">=</span> withinAvgVar<span class="p">,</span>
+        probeVarBetween <span class="o">=</span> btwVar<span class="p">,</span> probesetSD <span class="o">=</span> psetMAD<span class="p">,</span> medianSE <span class="o">=</span> medianSE<span class="p">,</span>
+        version <span class="o">=</span> vers<span class="p">))</span>
+<span class="p">}</span>
+
+<span class="c1">## This reads in the xlsx file for each of the 7 datasets and combines</span>
+<span class="c1">## them into one big table of all samples. The Batch column contains</span>
+<span class="c1">## the partitioning of samples into unique combinations of Dataset,</span>
+<span class="c1">## Scan Date, and Phenotype. Finally, we split based on Tissue type to</span>
+<span class="c1">## get one table for biopsies (BX), and one for blood (PAX).</span>
+sample.tables <span class="o">&lt;-</span> ddply<span class="p">(</span>datasets<span class="p">,</span> <span class="m">.</span><span class="p">(</span>Dataset<span class="p">),</span> <span class="kr">function</span><span class="p">(</span>df<span class="p">)</span> <span class="p">{</span>
+    df <span class="o">&lt;-</span> df<span class="p">[</span><span class="m">1</span><span class="p">,]</span>
+    <span class="kp">rownames</span><span class="p">(</span>df<span class="p">)</span> <span class="o">&lt;-</span> <span class="kc">NULL</span>
+    dset.dir <span class="o">&lt;-</span> <span class="kp">file.path</span><span class="p">(</span>training.data.dir<span class="p">,</span> df<span class="o">$</span>Dataset<span class="p">)</span>
+    x <span class="o">&lt;-</span> read.xlsx<span class="p">(</span><span class="kp">list.files</span><span class="p">(</span>dset.dir<span class="p">,</span> pattern<span class="o">=</span>glob2rx<span class="p">(</span><span class="s">&quot;*.xlsx&quot;</span><span class="p">),</span> full.names<span class="o">=</span><span class="kc">TRUE</span><span class="p">)[</span><span class="m">1</span><span class="p">],</span> <span class="m">1</span><span class="p">)</span> <span class="o">%&gt;%</span>
+        setNames<span class="p">(</span><span class="kt">c</span><span class="p">(</span><span class="s">&quot;Filename&quot;</span><span class="p">,</span> <span class="s">&quot;Phenotype&quot;</span><span class="p">,</span> <span class="s">&quot;ScanDate&quot;</span><span class="p">))</span>
+    x<span class="o">$</span>Filename <span class="o">&lt;-</span> <span class="kp">as.character</span><span class="p">(</span>x<span class="o">$</span>Filename<span class="p">)</span>
+    missing.CEL <span class="o">&lt;-</span> <span class="o">!</span>str_detect<span class="p">(</span>x<span class="o">$</span>Filename<span class="p">,</span> <span class="s">&quot;\\.CEL$&quot;</span><span class="p">)</span>
+    x<span class="o">$</span>Filename<span class="p">[</span>missing.CEL<span class="p">]</span> <span class="o">&lt;-</span> str_c<span class="p">(</span>x<span class="o">$</span>Filename<span class="p">[</span>missing.CEL<span class="p">],</span> <span class="s">&quot;.CEL&quot;</span><span class="p">)</span>
+    <span class="kp">stopifnot</span><span class="p">(</span><span class="kp">all</span><span class="p">(</span>str_detect<span class="p">(</span>x<span class="o">$</span>Filename<span class="p">,</span> <span class="s">&quot;\\.CEL$&quot;</span><span class="p">)))</span>
+    parsed.date <span class="o">&lt;-</span> parse.date.from.filename<span class="p">(</span>x<span class="o">$</span>Filename<span class="p">)</span>
+    x<span class="o">$</span>ScanDate<span class="p">[</span><span class="o">!</span><span class="kp">is.na</span><span class="p">(</span>parsed.date<span class="p">)]</span> <span class="o">&lt;-</span> parsed.date<span class="p">[</span><span class="o">!</span><span class="kp">is.na</span><span class="p">(</span>parsed.date<span class="p">)]</span>
+    x <span class="o">%&gt;%</span> <span class="kp">cbind</span><span class="p">(</span>df<span class="p">)</span> <span class="o">%&gt;%</span>
+        <span class="kp">transform</span><span class="p">(</span>Filename<span class="o">=</span><span class="kp">file.path</span><span class="p">(</span>dset.dir<span class="p">,</span> Filename<span class="p">),</span>
+                  Batch<span class="o">=</span><span class="kp">droplevels</span><span class="p">(</span>Tissue<span class="o">:</span>Dataset<span class="o">:</span><span class="kp">factor</span><span class="p">(</span>ScanDate<span class="p">)</span><span class="o">:</span>Phenotype<span class="p">))</span> <span class="o">%&gt;%</span>
+                      <span class="kp">subset</span><span class="p">(</span><span class="o">!</span> Filename <span class="o">%in%</span> blacklist<span class="p">)</span> <span class="o">%&gt;%</span>
+                          <span class="kp">subset</span><span class="p">(</span><span class="o">!</span><span class="kp">duplicated</span><span class="p">(</span>Filename<span class="p">))</span>
+<span class="p">})</span> <span class="o">%&gt;%</span>
+    <span class="kp">split</span><span class="p">(</span><span class="m">.</span><span class="o">$</span>Tissue<span class="p">)</span> <span class="o">%&gt;%</span>
+        <span class="kp">lapply</span><span class="p">(</span><span class="kp">droplevels</span><span class="p">)</span>
+
+<span class="c1">## fRMA requires equal-sized batches, so for each batch size from 3 to</span>
+<span class="c1">## 15, compute how many batches have at least that many samples.</span>
+x <span class="o">&lt;-</span> <span class="kp">sapply</span><span class="p">(</span><span class="m">3</span><span class="o">:</span><span class="m">15</span><span class="p">,</span> <span class="kr">function</span><span class="p">(</span>i<span class="p">)</span> <span class="kp">sapply</span><span class="p">(</span>sample.tables<span class="p">,</span> <span class="m">.</span> <span class="o">%$%</span> Batch <span class="o">%&gt;%</span> table <span class="o">%&gt;%</span> as.vector <span class="o">%&gt;%</span> <span class="p">{</span><span class="kp">sum</span><span class="p">(</span><span class="m">.</span> <span class="o">&gt;=</span> i<span class="p">)}))</span>
+<span class="kp">colnames</span><span class="p">(</span>x<span class="p">)</span> <span class="o">&lt;-</span> <span class="m">3</span><span class="o">:</span><span class="m">15</span>
+
+<span class="c1">## Based on the above and the recommendations in the frmaTools paper,</span>
+<span class="c1">## I chose 5 as the optimal batch size. This could be optimized</span>
+<span class="c1">## empirically, though.</span>
+arrays.per.batch <span class="o">&lt;-</span> <span class="m">5</span>
+
+<span class="c1">## For each tissue type, compute fRMA vectors.</span>
+vectors <span class="o">&lt;-</span> <span class="kp">lapply</span><span class="p">(</span>sample.tables<span class="p">,</span> <span class="kr">function</span><span class="p">(</span>stab<span class="p">)</span> <span class="p">{</span>
+    <span class="kp">set.seed</span><span class="p">(</span><span class="m">1986</span><span class="p">)</span>
+
+    tsmsg<span class="p">(</span><span class="s">&quot;Reading full dataset&quot;</span><span class="p">)</span>
+    affy <span class="o">&lt;-</span> ReadAffy<span class="p">(</span>filenames<span class="o">=</span>stab<span class="o">$</span>Filename<span class="p">,</span> sampleNames<span class="o">=</span><span class="kp">rownames</span><span class="p">(</span>stab<span class="p">))</span>
+    tsmsg<span class="p">(</span><span class="s">&quot;Getting reference normalization distribution from full dataset&quot;</span><span class="p">)</span>
+    normVec <span class="o">&lt;-</span> normalize.quantiles.determine.target<span class="p">(</span>pm<span class="p">(</span>bg.correct.rma<span class="p">(</span>affy<span class="p">)))</span>
+    <span class="kp">rm</span><span class="p">(</span>affy<span class="p">);</span> <span class="kp">gc</span><span class="p">()</span>
+
+    tsmsg<span class="p">(</span><span class="s">&quot;Selecting batches&quot;</span><span class="p">)</span>
+    <span class="c1">## Keep only batches with enough arrays</span>
+    big.enough <span class="o">&lt;-</span> stab<span class="o">$</span>Batch <span class="o">%&gt;%</span> table <span class="o">%&gt;%</span> <span class="m">.</span><span class="p">[</span><span class="m">.</span><span class="o">&gt;=</span> arrays.per.batch<span class="p">]</span> <span class="o">%&gt;%</span> <span class="kp">names</span>
+    stab <span class="o">&lt;-</span> stab<span class="p">[</span>stab<span class="o">$</span>Batch <span class="o">%in%</span> big.enough<span class="p">,]</span> <span class="o">%&gt;%</span> <span class="kp">droplevels</span>
+
+    <span class="c1">## Sample an equal number of arrays from each batch</span>
+    subtab <span class="o">&lt;-</span> ddply<span class="p">(</span>stab<span class="p">,</span> <span class="m">.</span><span class="p">(</span>Batch<span class="p">),</span> <span class="kr">function</span><span class="p">(</span>df<span class="p">)</span> <span class="p">{</span>
+        df<span class="p">[</span><span class="kp">sample</span><span class="p">(</span><span class="kp">seq</span><span class="p">(</span><span class="kp">nrow</span><span class="p">(</span>df<span class="p">)),</span> size<span class="o">=</span>arrays.per.batch<span class="p">),]</span>
+    <span class="p">})</span>
+
+    tsmsg<span class="p">(</span><span class="s">&quot;Making fRMA vectors&quot;</span><span class="p">)</span>
+    <span class="c1">## Make fRMA vectors, using normVec from full dataset</span>
+    res <span class="o">&lt;-</span> makeVectorsAffyBatch<span class="p">(</span>subtab<span class="o">$</span>Filename<span class="p">,</span> subtab<span class="o">$</span>Batch<span class="p">,</span> normVec<span class="o">=</span>normVec<span class="p">)</span>
+
+    tsmsg<span class="p">(</span><span class="s">&quot;Finished.&quot;</span><span class="p">)</span>
+    res
+<span class="p">})</span>
+
+<span class="c1">## The code below here just takes the trained vectors and packages</span>
+<span class="c1">## them up into installable R packages.</span>
+makePackageFromVectors <span class="o">&lt;-</span>
+    <span class="kr">function</span> <span class="p">(</span>vecs<span class="p">,</span> <span class="kp">version</span><span class="p">,</span> maintainer<span class="p">,</span> species<span class="p">,</span> annotation<span class="p">,</span>
+              packageName<span class="p">,</span> file.dir <span class="o">=</span> <span class="s">&quot;.&quot;</span><span class="p">,</span>
+              output.dir <span class="o">=</span> <span class="s">&quot;.&quot;</span><span class="p">,</span> unlink <span class="o">=</span> <span class="kc">TRUE</span><span class="p">)</span>
+<span class="p">{</span>
+    platform <span class="o">&lt;-</span> <span class="kp">gsub</span><span class="p">(</span><span class="s">&quot;cdf$&quot;</span><span class="p">,</span> <span class="s">&quot;&quot;</span><span class="p">,</span> annotation<span class="p">)</span>
+    <span class="c1">## type &lt;- match.arg(type, c(&quot;AffyBatch&quot;, &quot;FeatureSet&quot;))</span>
+    <span class="c1">## if (type == &quot;AffyBatch&quot;)</span>
+    <span class="c1">##     platform &lt;- gsub(&quot;cdf&quot;, &quot;&quot;, annotation)</span>
+    <span class="c1">## if (type == &quot;FeatureSet&quot;) {</span>
+    <span class="c1">##     platform &lt;- annotation</span>
+    <span class="c1">##     require(oligo)</span>
+    <span class="c1">## }</span>
+    thispkg <span class="o">&lt;-</span> <span class="s">&quot;frmaTools&quot;</span>
+    desc <span class="o">&lt;-</span> packageDescription<span class="p">(</span>thispkg<span class="p">)</span>
+    thispkgVers <span class="o">&lt;-</span> desc<span class="o">$</span>Version
+    symbolValues <span class="o">&lt;-</span> <span class="kt">list</span><span class="p">(</span>ARRAYTYPE <span class="o">=</span> platform<span class="p">,</span> VERSION <span class="o">=</span> <span class="kp">version</span><span class="p">,</span>
+        CREATOR <span class="o">=</span> <span class="kp">paste</span><span class="p">(</span><span class="s">&quot;package&quot;</span><span class="p">,</span> thispkg<span class="p">,</span> <span class="s">&quot;version&quot;</span><span class="p">,</span> thispkgVers<span class="p">),</span>
+        FRMATOOLSVERSION <span class="o">=</span> thispkgVers<span class="p">,</span> MAINTAINER <span class="o">=</span> maintainer<span class="p">,</span>
+        SPECIES <span class="o">=</span> species<span class="p">)</span>
+    createdPkg <span class="o">&lt;-</span> createPackage<span class="p">(</span>packageName<span class="p">,</span> destinationDir <span class="o">=</span> output.dir<span class="p">,</span>
+        originDir <span class="o">=</span> <span class="kp">system.file</span><span class="p">(</span><span class="s">&quot;VectorPkg-template&quot;</span><span class="p">,</span> package <span class="o">=</span> <span class="s">&quot;frmaTools&quot;</span><span class="p">),</span>
+        symbolValues <span class="o">=</span> symbolValues<span class="p">,</span> unlink <span class="o">=</span> <span class="kp">unlink</span><span class="p">)</span>
+    <span class="c1">## if (type == &quot;AffyBatch&quot;)</span>
+    <span class="c1">##     vecs &lt;- makeVectorsAffyBatch(files, batch.id, background,</span>
+    <span class="c1">##         normalize, normVec, annotation, file.dir, verbose)</span>
+    <span class="c1">## if (type == &quot;FeatureSet&quot;)</span>
+    <span class="c1">##     vecs &lt;- makeVectorsFeatureSet(files, batch.id, annotation,</span>
+    <span class="c1">##         background, normalize, normVec, file.dir, verbose)</span>
+    <span class="kp">assign</span><span class="p">(</span>packageName<span class="p">,</span> vecs<span class="p">)</span>
+    <span class="kp">save</span><span class="p">(</span><span class="kt">list</span> <span class="o">=</span> <span class="kp">eval</span><span class="p">(</span>packageName<span class="p">),</span> file <span class="o">=</span> <span class="kp">file.path</span><span class="p">(</span>createdPkg<span class="o">$</span>pkgdir<span class="p">,</span>
+        <span class="s">&quot;data&quot;</span><span class="p">,</span> <span class="kp">paste</span><span class="p">(</span>packageName<span class="p">,</span> <span class="s">&quot;.rda&quot;</span><span class="p">,</span> sep <span class="o">=</span> <span class="s">&quot;&quot;</span><span class="p">)),</span> compress <span class="o">=</span> <span class="kc">TRUE</span><span class="p">)</span>
+<span class="p">}</span>
+
+annotation <span class="o">&lt;-</span> cleancdfname<span class="p">(</span>affyio<span class="o">:::</span>read.celfile.header<span class="p">(</span>sample.tables<span class="p">[[</span><span class="m">1</span><span class="p">]]</span><span class="o">$</span>Filename<span class="p">[</span><span class="m">1</span><span class="p">])</span><span class="o">$</span>cdfName<span class="p">,</span> <span class="kc">FALSE</span><span class="p">)</span>
+
+<span class="kp">dir.create</span><span class="p">(</span><span class="s">&quot;pkgs&quot;</span><span class="p">,</span> <span class="kc">FALSE</span><span class="p">,</span> <span class="kc">TRUE</span><span class="p">,</span> mode<span class="o">=</span><span class="s">&quot;755&quot;</span><span class="p">)</span>
+
+<span class="kr">for</span> <span class="p">(</span>i <span class="kr">in</span> <span class="kp">names</span><span class="p">(</span>vectors<span class="p">))</span> <span class="p">{</span>
+    vecs <span class="o">&lt;-</span> vectors<span class="p">[[</span>i<span class="p">]]</span>
+    pkgname <span class="o">&lt;-</span> <span class="kp">sprintf</span><span class="p">(</span><span class="s">&quot;DSalomon.%s.%sfrmavecs&quot;</span><span class="p">,</span> i<span class="p">,</span> annotation<span class="p">)</span>
+    <span class="kp">message</span><span class="p">(</span><span class="s">&quot;Making &quot;</span><span class="p">,</span> pkgname<span class="p">)</span>
+    makePackageFromVectors<span class="p">(</span>
+        vecs<span class="p">,</span>
+        version<span class="o">=</span><span class="s">&quot;0.1&quot;</span><span class="p">,</span>
+        maintainer<span class="o">=</span><span class="s">&quot;Ryan C. Thompson &lt;rcthomps@scripps.edu&gt;&quot;</span><span class="p">,</span>
+        species<span class="o">=</span><span class="s">&quot;Homo_sapiens&quot;</span><span class="p">,</span>
+        annotation<span class="o">=</span>annotation<span class="p">,</span>
+        packageName<span class="o">=</span>pkgname<span class="p">,</span>
+        output.dir <span class="o">=</span> <span class="s">&quot;pkgs&quot;</span><span class="p">)</span>
+<span class="p">}</span>
+
+<span class="kp">save.image</span><span class="p">(</span><span class="s">&quot;train-data.rda&quot;</span><span class="p">)</span>
+</pre></div>
+</body>
+</html>

+ 206 - 0
examples/UVa/probe-selection.R.html

@@ -0,0 +1,206 @@
+<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01//EN"
+   "http://www.w3.org/TR/html4/strict.dtd">
+
+<html>
+<head>
+  <title></title>
+  <meta http-equiv="content-type" content="text/html; charset=utf-8">
+  <style type="text/css">
+td.linenos { background-color: #f0f0f0; padding-right: 10px; }
+span.lineno { background-color: #f0f0f0; padding: 0 5px 0 5px; }
+pre { line-height: 125%; }
+body .hll { background-color: #ffffcc }
+body  { background: #f8f8f8; }
+body .c { color: #408080; font-style: italic } /* Comment */
+body .err { border: 1px solid #FF0000 } /* Error */
+body .k { color: #008000; font-weight: bold } /* Keyword */
+body .o { color: #666666 } /* Operator */
+body .ch { color: #408080; font-style: italic } /* Comment.Hashbang */
+body .cm { color: #408080; font-style: italic } /* Comment.Multiline */
+body .cp { color: #BC7A00 } /* Comment.Preproc */
+body .cpf { color: #408080; font-style: italic } /* Comment.PreprocFile */
+body .c1 { color: #408080; font-style: italic } /* Comment.Single */
+body .cs { color: #408080; font-style: italic } /* Comment.Special */
+body .gd { color: #A00000 } /* Generic.Deleted */
+body .ge { font-style: italic } /* Generic.Emph */
+body .gr { color: #FF0000 } /* Generic.Error */
+body .gh { color: #000080; font-weight: bold } /* Generic.Heading */
+body .gi { color: #00A000 } /* Generic.Inserted */
+body .go { color: #888888 } /* Generic.Output */
+body .gp { color: #000080; font-weight: bold } /* Generic.Prompt */
+body .gs { font-weight: bold } /* Generic.Strong */
+body .gu { color: #800080; font-weight: bold } /* Generic.Subheading */
+body .gt { color: #0044DD } /* Generic.Traceback */
+body .kc { color: #008000; font-weight: bold } /* Keyword.Constant */
+body .kd { color: #008000; font-weight: bold } /* Keyword.Declaration */
+body .kn { color: #008000; font-weight: bold } /* Keyword.Namespace */
+body .kp { color: #008000 } /* Keyword.Pseudo */
+body .kr { color: #008000; font-weight: bold } /* Keyword.Reserved */
+body .kt { color: #B00040 } /* Keyword.Type */
+body .m { color: #666666 } /* Literal.Number */
+body .s { color: #BA2121 } /* Literal.String */
+body .na { color: #7D9029 } /* Name.Attribute */
+body .nb { color: #008000 } /* Name.Builtin */
+body .nc { color: #0000FF; font-weight: bold } /* Name.Class */
+body .no { color: #880000 } /* Name.Constant */
+body .nd { color: #AA22FF } /* Name.Decorator */
+body .ni { color: #999999; font-weight: bold } /* Name.Entity */
+body .ne { color: #D2413A; font-weight: bold } /* Name.Exception */
+body .nf { color: #0000FF } /* Name.Function */
+body .nl { color: #A0A000 } /* Name.Label */
+body .nn { color: #0000FF; font-weight: bold } /* Name.Namespace */
+body .nt { color: #008000; font-weight: bold } /* Name.Tag */
+body .nv { color: #19177C } /* Name.Variable */
+body .ow { color: #AA22FF; font-weight: bold } /* Operator.Word */
+body .w { color: #bbbbbb } /* Text.Whitespace */
+body .mb { color: #666666 } /* Literal.Number.Bin */
+body .mf { color: #666666 } /* Literal.Number.Float */
+body .mh { color: #666666 } /* Literal.Number.Hex */
+body .mi { color: #666666 } /* Literal.Number.Integer */
+body .mo { color: #666666 } /* Literal.Number.Oct */
+body .sa { color: #BA2121 } /* Literal.String.Affix */
+body .sb { color: #BA2121 } /* Literal.String.Backtick */
+body .sc { color: #BA2121 } /* Literal.String.Char */
+body .dl { color: #BA2121 } /* Literal.String.Delimiter */
+body .sd { color: #BA2121; font-style: italic } /* Literal.String.Doc */
+body .s2 { color: #BA2121 } /* Literal.String.Double */
+body .se { color: #BB6622; font-weight: bold } /* Literal.String.Escape */
+body .sh { color: #BA2121 } /* Literal.String.Heredoc */
+body .si { color: #BB6688; font-weight: bold } /* Literal.String.Interpol */
+body .sx { color: #008000 } /* Literal.String.Other */
+body .sr { color: #BB6688 } /* Literal.String.Regex */
+body .s1 { color: #BA2121 } /* Literal.String.Single */
+body .ss { color: #19177C } /* Literal.String.Symbol */
+body .bp { color: #008000 } /* Name.Builtin.Pseudo */
+body .fm { color: #0000FF } /* Name.Function.Magic */
+body .vc { color: #19177C } /* Name.Variable.Class */
+body .vg { color: #19177C } /* Name.Variable.Global */
+body .vi { color: #19177C } /* Name.Variable.Instance */
+body .vm { color: #19177C } /* Name.Variable.Magic */
+body .il { color: #666666 } /* Literal.Number.Integer.Long */
+
+  </style>
+</head>
+<body>
+<h2></h2>
+
+<div class="highlight"><pre><span></span><span class="c1"># We will use base 2 logarithms for now</span>
+log.base <span class="o">=</span> <span class="m">2</span>
+
+<span class="c1"># Read the data in from the file</span>
+filename.data <span class="o">=</span> <span class="s">&quot;Processed_Data_Files/Normalized_Pair_Files/All_norm_pair.txt&quot;</span>
+<span class="c1">#filename.data = &quot;Raw_Data_Files/Pair_Files/All_pair.txt&quot;</span>
+intensity.lin <span class="o">&lt;-</span> read.table<span class="p">(</span>file<span class="o">=</span>filename.data<span class="p">,</span> header<span class="o">=</span><span class="kc">TRUE</span><span class="p">,</span>row.names<span class="o">=</span><span class="s">&quot;PROBE_ID&quot;</span><span class="p">)</span>
+
+<span class="c1"># Column names and numbers used to categorize the data</span>
+intensity.maincols <span class="o">=</span> <span class="kt">c</span><span class="p">(</span><span class="s">&quot;GENE_EXPR_OPTION&quot;</span><span class="p">,</span> <span class="s">&quot;SEQ_ID&quot;</span><span class="p">,</span> <span class="s">&quot;POSITION&quot;</span><span class="p">)</span>
+intensity.maincolnums <span class="o">=</span> <span class="kp">which</span><span class="p">(</span><span class="kp">names</span><span class="p">(</span>intensity.lin<span class="p">)</span> <span class="o">%in%</span> intensity.maincols<span class="p">)</span>
+<span class="c1"># Column names and numbers containing intensity data</span>
+intensity.datacolnums <span class="o">=</span> <span class="p">(</span><span class="m">1</span><span class="o">:</span><span class="kp">dim</span><span class="p">(</span>intensity.lin<span class="p">)[</span><span class="m">2</span><span class="p">])[</span><span class="o">-</span>intensity.maincolnums<span class="p">]</span> <span class="c1"># i.e. &quot;the rest&quot;</span>
+intensity.datacols <span class="o">=</span> <span class="kp">names</span><span class="p">(</span>intensity.lin<span class="p">)[</span>intensity.datacolnums<span class="p">]</span>
+
+<span class="c1"># Take the logarithm of the data</span>
+intensity.log <span class="o">&lt;-</span> <span class="kt">data.frame</span><span class="p">(</span>intensity.lin<span class="p">[</span>intensity.maincols<span class="p">],</span><span class="kp">log</span><span class="p">(</span>intensity.lin<span class="p">[</span>intensity.datacols<span class="p">],</span> base <span class="o">=</span> log.base<span class="p">))</span>
+
+<span class="c1"># Separate into random (i.e. background) and data probes</span>
+<span class="c1"># Split into &quot;intensity$rand&quot; and &quot;intensity$data&quot;</span>
+intensity <span class="o">&lt;-</span> <span class="kp">split</span><span class="p">(</span>x <span class="o">=</span> intensity.log<span class="p">,</span> f <span class="o">=</span> <span class="p">(</span><span class="kp">ifelse</span><span class="p">(</span>intensity.log<span class="o">$</span>GENE_EXPR_OPTION <span class="o">==</span> <span class="s">&quot;RANDOM&quot;</span><span class="p">,</span><span class="s">&quot;rand&quot;</span><span class="p">,</span><span class="s">&quot;data&quot;</span><span class="p">)))</span>
+<span class="kp">rm</span><span class="p">(</span>intensity.lin<span class="p">,</span> intensity.log<span class="p">)</span> <span class="c1"># Discard unused stuff to free memory</span>
+
+<span class="c1"># This gives a QQ plot of the data against the noise. I used it to select my cutoffs.</span>
+<span class="c1">#qqplot( y=as.vector(as.matrix(intensity$data[sample(1:dim(intensity$data)[1],5000),intensity.datacols])), x=as.vector(as.matrix(intensity$rand[intensity.datacols])), ylab=&quot;Data&quot;, xlab=&quot;Random Control&quot;,main = &quot;Log10 QQ plot of Specific Probe Intensities vs. Random Controls&quot;)</span>
+
+<span class="c1"># Detection (low) threshold is 2 sd above mean random background</span>
+intensity.rand.vector <span class="o">=</span> <span class="kp">as.vector</span><span class="p">(</span><span class="kp">as.matrix</span><span class="p">(</span>intensity<span class="o">$</span>rand<span class="p">[</span>intensity.datacols<span class="p">]))</span>
+threshold.low <span class="o">=</span> <span class="kp">mean</span><span class="p">(</span>intensity.rand.vector<span class="p">)</span> <span class="o">+</span> <span class="m">2</span><span class="o">*</span>sd<span class="p">(</span>intensity.rand.vector<span class="p">)</span>
+<span class="kp">rm</span><span class="p">(</span>intensity.rand.vector<span class="p">)</span>
+<span class="c1"># Saturation (high) threshold is 2-fold down from max possible</span>
+threshold.high <span class="o">=</span> <span class="kp">log</span><span class="p">(</span><span class="m">65535</span><span class="o">/</span><span class="m">2</span><span class="p">,</span> base<span class="o">=</span>log.base<span class="p">)</span>
+<span class="kr">if</span> <span class="p">(</span>threshold.high <span class="o">&lt;=</span> threshold.low<span class="p">)</span> <span class="kp">print</span><span class="p">(</span><span class="s">&quot;Error: low threshold is too high&quot;</span><span class="p">)</span>
+
+<span class="c1"># Compute needed row statistics</span>
+intensity<span class="o">$</span>data<span class="o">$</span>MAX <span class="o">&lt;-</span> <span class="kp">apply</span><span class="p">(</span>intensity<span class="o">$</span>data<span class="p">[</span>intensity.datacols<span class="p">],</span><span class="m">1</span><span class="p">,</span><span class="kp">max</span><span class="p">)</span>
+<span class="c1"># Actually, we only need the max, it seems. Uncomment these if needed.</span>
+<span class="c1">#intensity$data$MIN &lt;- apply(intensity$data[intensity.datacols],1,min)</span>
+<span class="c1">#intensity$data$MEAN &lt;- rowMeans(intensity$data[intensity.datacols]) # Don&#39;t need that one</span>
+<span class="c1">#intensity$data$SD &lt;- apply(intensity$data[intensity.datacols],1,sd) # Don&#39;t need that one</span>
+
+<span class="c1"># Sort probes into three bins: absent, present, and saturated</span>
+<span class="c1"># The integer value of BIN also serves as a rank:</span>
+<span class="c1"># present &lt; saturated &lt; absent; lower is better</span>
+intensity<span class="o">$</span>data<span class="o">$</span>BIN <span class="o">&lt;-</span> <span class="kp">factor</span><span class="p">(</span><span class="m">1</span> <span class="o">+</span> <span class="p">(</span>intensity<span class="o">$</span>data<span class="o">$</span>MAX <span class="o">&gt;</span> threshold.high<span class="p">)</span> <span class="o">+</span> <span class="p">(</span>intensity<span class="o">$</span>data<span class="o">$</span>MAX <span class="o">&lt;</span> threshold.low<span class="p">)</span> <span class="o">*</span> <span class="m">2</span><span class="p">,</span> labels <span class="o">=</span> <span class="kt">c</span><span class="p">(</span><span class="s">&quot;present&quot;</span><span class="p">,</span> <span class="s">&quot;saturated&quot;</span><span class="p">,</span> <span class="s">&quot;absent&quot;</span><span class="p">))</span>
+
+<span class="c1"># Count how many probes from each probeset (SEQ_ID) went into each bin</span>
+<span class="c1"># Also count total probes per set</span>
+<span class="c1"># Counting is done by measuring length of data aggregated by SEQ_ID</span>
+num.probes <span class="o">&lt;-</span> <span class="kp">lapply</span><span class="p">(</span><span class="kt">c</span><span class="p">(</span><span class="kt">list</span><span class="p">(</span>total<span class="o">=</span>intensity<span class="o">$</span>data<span class="p">),</span><span class="kp">split</span><span class="p">(</span>x <span class="o">=</span> intensity<span class="o">$</span>data<span class="p">,</span> f <span class="o">=</span> intensity<span class="o">$</span>data<span class="o">$</span>BIN<span class="p">)),</span> FUN <span class="o">=</span> <span class="kr">function</span> <span class="p">(</span>y<span class="p">)</span> aggregate<span class="p">(</span>x<span class="o">=</span><span class="kp">rep</span><span class="p">(</span><span class="kc">NA</span><span class="p">,</span><span class="kp">dim</span><span class="p">(</span>y<span class="p">)[</span><span class="m">1</span><span class="p">]),</span>by<span class="o">=</span><span class="kt">list</span><span class="p">(</span>SEQ_ID<span class="o">=</span>y<span class="o">$</span>SEQ_ID<span class="p">),</span>FUN<span class="o">=</span><span class="kp">length</span><span class="p">))</span>
+<span class="kr">if</span><span class="p">(</span><span class="kp">mean</span><span class="p">(</span>num.probes<span class="o">$</span>present<span class="o">$</span>x<span class="p">)</span> <span class="o">&lt;</span> <span class="m">3</span><span class="p">)</span> <span class="p">{</span> <span class="kp">print</span><span class="p">(</span><span class="s">&quot;Error: Not enough present probes.&quot;</span><span class="p">)</span> <span class="p">}</span>
+
+
+<span class="c1"># Something below this needs updating</span>
+
+
+
+<span class="c1"># 2 rankings: Nimblegen&#39;s and correlation with probeset mean</span>
+<span class="c1"># Nimblegen&#39;s rank is read from the design file</span>
+<span class="c1"># Probeinfo file</span>
+<span class="c1"># This is information parsed from the design file.</span>
+<span class="c1"># The design file can&#39;t be used directly because info on probe</span>
+<span class="c1"># selection is not in separate columns.</span>
+filename.probeinfo <span class="o">=</span> <span class="s">&quot;Design_Files/071031_U_Va_Tobacco_Expr.probeinfo&quot;</span>
+probeinfo <span class="o">&lt;-</span> read.table<span class="p">(</span>filename.probeinfo<span class="p">,</span>header<span class="o">=</span><span class="kc">TRUE</span><span class="p">,</span>row.names<span class="o">=</span><span class="s">&quot;PROBE_ID&quot;</span><span class="p">,</span>as.is<span class="o">=</span><span class="s">&quot;SEQ&quot;</span><span class="p">)</span>
+<span class="c1"># Use probe names as row names for indexing</span>
+<span class="c1">#row.names(probeinfo) &lt;- as.character(probeinfo$PROBE_ID)</span>
+
+<span class="c1"># Add Nimblegen rank to the main data frame</span>
+intensity<span class="o">$</span>data<span class="p">[</span><span class="kt">c</span><span class="p">(</span><span class="s">&quot;RANK&quot;</span><span class="p">,</span><span class="s">&quot;SEQ&quot;</span><span class="p">)]</span> <span class="o">&lt;-</span> probeinfo<span class="p">[</span><span class="kp">row.names</span><span class="p">(</span>intensity<span class="o">$</span>data<span class="p">),</span><span class="kt">c</span><span class="p">(</span><span class="s">&quot;RANK&quot;</span><span class="p">,</span><span class="s">&quot;SEQ&quot;</span><span class="p">)]</span>
+
+
+<span class="c1"># For present probes, rank is based on correlation to probeset mean</span>
+
+<span class="c1"># Read probeset means from calls file</span>
+filename.calls <span class="o">=</span> <span class="s">&quot;Processed_Data_Files/Normalized_Calls_Files/All_norm_calls.txt&quot;</span>
+calls.lin <span class="o">&lt;-</span> read.table<span class="p">(</span>filename.calls<span class="p">,</span>header<span class="o">=</span><span class="kc">TRUE</span><span class="p">,</span>row.names<span class="o">=</span><span class="s">&quot;SEQ_ID&quot;</span><span class="p">)</span>
+<span class="c1"># This line simultaneously logs the data and gives the same column order as the intensity table</span>
+probeset.means <span class="o">=</span> <span class="kp">log</span><span class="p">(</span>calls.lin<span class="p">[</span>intensity.datacols<span class="p">],</span> base<span class="o">=</span>log.base<span class="p">)</span>
+<span class="kp">rm</span><span class="p">(</span>calls.lin<span class="p">)</span>
+
+<span class="c1"># Collect the relevant data into matrices for efficiency</span>
+probes.present.data <span class="o">=</span> <span class="kp">t</span><span class="p">(</span>intensity<span class="o">$</span>data<span class="p">[</span>intensity<span class="o">$</span>data<span class="o">$</span>BIN <span class="o">==</span> <span class="s">&quot;present&quot;</span><span class="p">,</span>intensity.datacols<span class="p">])</span>
+probeset.means.data <span class="o">=</span> <span class="kp">t</span><span class="p">(</span>probeset.means<span class="p">[</span>intensity<span class="o">$</span>data<span class="o">$</span>SEQ_ID<span class="p">[</span>intensity<span class="o">$</span>data<span class="o">$</span>BIN <span class="o">==</span> <span class="s">&quot;present&quot;</span><span class="p">],])</span>
+
+<span class="c1"># We invert the correlation so that lower is better</span>
+intensity<span class="o">$</span>data<span class="o">$</span>RANK<span class="p">[</span>intensity<span class="o">$</span>data<span class="o">$</span>BIN <span class="o">==</span> <span class="s">&quot;present&quot;</span><span class="p">]</span> <span class="o">&lt;-</span> <span class="kp">sapply</span><span class="p">(</span><span class="m">1</span><span class="o">:</span><span class="kp">dim</span><span class="p">(</span>probes.present.data<span class="p">)[</span><span class="m">2</span><span class="p">],</span><span class="kr">function</span> <span class="p">(</span>x<span class="p">)</span> <span class="p">{</span> <span class="o">-</span>cor<span class="p">(</span>probes.present.data<span class="p">[,</span>x<span class="p">],</span>probeset.means.data<span class="p">[,</span>x<span class="p">])</span> <span class="p">})</span>
+
+<span class="c1"># Done with these</span>
+<span class="kp">rm</span><span class="p">(</span><span class="s">&quot;probes.present.data&quot;</span><span class="p">,</span><span class="s">&quot;probeset.means.data&quot;</span><span class="p">)</span>
+
+<span class="c1"># Sort by bin, then rank</span>
+intensity<span class="o">$</span>data.ranked <span class="o">=</span> intensity<span class="o">$</span>data<span class="p">[</span><span class="kp">order</span><span class="p">(</span>intensity<span class="o">$</span>data<span class="o">$</span>BIN<span class="p">,</span>intensity<span class="o">$</span>data<span class="o">$</span>RANK<span class="p">),]</span>
+
+<span class="c1"># Split by SEQ_ID</span>
+probes.ranked <span class="o">&lt;-</span> <span class="kp">split</span><span class="p">(</span>x<span class="o">=</span><span class="kp">row.names</span><span class="p">(</span>intensity<span class="o">$</span>data.ranked<span class="p">),</span>
+                       f<span class="o">=</span>intensity<span class="o">$</span>data.ranked<span class="o">$</span>SEQ_ID<span class="p">,</span>
+                       drop<span class="o">=</span><span class="kc">TRUE</span><span class="p">)</span>
+
+<span class="c1"># Set the desired number of probes per sequence</span>
+num.probes.desired <span class="o">&lt;-</span> <span class="m">3</span>
+
+<span class="c1"># A function to make any vector have length n, by truncating longer ones and padding shorter ones with NA</span>
+firstN <span class="o">&lt;-</span> <span class="kr">function</span> <span class="p">(</span>v<span class="p">,</span>n<span class="p">)</span> <span class="kt">c</span><span class="p">(</span>v<span class="p">,</span><span class="kp">rep</span><span class="p">(</span><span class="kc">NA</span><span class="p">,</span>n<span class="p">))[</span><span class="m">1</span><span class="o">:</span>n<span class="p">]</span>
+<span class="c1"># A function to take a string and append P1, P2, P3, etc. up to PN.</span>
+probenamesN <span class="o">&lt;-</span> <span class="kr">function</span> <span class="p">(</span>s<span class="p">,</span>n<span class="p">)</span> <span class="p">(</span><span class="kp">paste</span><span class="p">(</span>s<span class="p">,</span><span class="s">&quot;P&quot;</span><span class="p">,</span><span class="m">1</span><span class="o">:</span>n<span class="p">,</span>sep<span class="o">=</span><span class="s">&quot;&quot;</span><span class="p">))[</span><span class="m">1</span><span class="o">:</span>n<span class="p">]</span>
+
+probes.selected <span class="o">&lt;-</span> <span class="kt">c</span><span class="p">(</span><span class="kp">sapply</span><span class="p">(</span>probes.ranked<span class="p">,</span>firstN<span class="p">,</span>num.probes.desired<span class="p">))</span>
+<span class="kp">names</span><span class="p">(</span>probes.selected<span class="p">)</span> <span class="o">&lt;-</span> <span class="kt">c</span><span class="p">(</span><span class="kp">sapply</span><span class="p">(</span><span class="kp">names</span><span class="p">(</span>probes.ranked<span class="p">),</span>probenamesN<span class="p">,</span>num.probes.desired<span class="p">))</span>
+<span class="c1"># Filter NA</span>
+probes.selected <span class="o">&lt;-</span> probes.selected<span class="p">[</span><span class="o">!</span><span class="kp">is.na</span><span class="p">(</span>probes.selected<span class="p">)]</span>
+
+probes.selected.fasta <span class="o">&lt;-</span> <span class="kp">paste</span><span class="p">(</span>sep<span class="o">=</span><span class="s">&quot;&quot;</span><span class="p">,</span><span class="s">&quot;&gt;&quot;</span><span class="p">,</span><span class="kp">names</span><span class="p">(</span>probes.selected<span class="p">),</span><span class="s">&quot;\n&quot;</span><span class="p">,</span>intensity<span class="o">$</span>data<span class="p">[</span>probes.selected<span class="p">,</span><span class="s">&quot;SEQ&quot;</span><span class="p">])</span>
+
+<span class="kp">cat</span><span class="p">(</span>sep<span class="o">=</span><span class="s">&quot;\n&quot;</span><span class="p">,</span>probes.selected.fasta<span class="p">,</span>file<span class="o">=</span><span class="s">&quot;selected_probes.fasta&quot;</span><span class="p">)</span>
+
+<span class="kp">save.image</span><span class="p">(</span>file<span class="o">=</span><span class="s">&quot;probesel.rda&quot;</span><span class="p">)</span>
+</pre></div>
+</body>
+</html>

+ 336 - 0
examples/edger-pipeline.R.html

@@ -0,0 +1,336 @@
+<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01//EN"
+   "http://www.w3.org/TR/html4/strict.dtd">
+
+<html>
+<head>
+  <title></title>
+  <meta http-equiv="content-type" content="text/html; charset=utf-8">
+  <style type="text/css">
+td.linenos { background-color: #f0f0f0; padding-right: 10px; }
+span.lineno { background-color: #f0f0f0; padding: 0 5px 0 5px; }
+pre { line-height: 125%; }
+body .hll { background-color: #ffffcc }
+body  { background: #f8f8f8; }
+body .c { color: #408080; font-style: italic } /* Comment */
+body .err { border: 1px solid #FF0000 } /* Error */
+body .k { color: #008000; font-weight: bold } /* Keyword */
+body .o { color: #666666 } /* Operator */
+body .ch { color: #408080; font-style: italic } /* Comment.Hashbang */
+body .cm { color: #408080; font-style: italic } /* Comment.Multiline */
+body .cp { color: #BC7A00 } /* Comment.Preproc */
+body .cpf { color: #408080; font-style: italic } /* Comment.PreprocFile */
+body .c1 { color: #408080; font-style: italic } /* Comment.Single */
+body .cs { color: #408080; font-style: italic } /* Comment.Special */
+body .gd { color: #A00000 } /* Generic.Deleted */
+body .ge { font-style: italic } /* Generic.Emph */
+body .gr { color: #FF0000 } /* Generic.Error */
+body .gh { color: #000080; font-weight: bold } /* Generic.Heading */
+body .gi { color: #00A000 } /* Generic.Inserted */
+body .go { color: #888888 } /* Generic.Output */
+body .gp { color: #000080; font-weight: bold } /* Generic.Prompt */
+body .gs { font-weight: bold } /* Generic.Strong */
+body .gu { color: #800080; font-weight: bold } /* Generic.Subheading */
+body .gt { color: #0044DD } /* Generic.Traceback */
+body .kc { color: #008000; font-weight: bold } /* Keyword.Constant */
+body .kd { color: #008000; font-weight: bold } /* Keyword.Declaration */
+body .kn { color: #008000; font-weight: bold } /* Keyword.Namespace */
+body .kp { color: #008000 } /* Keyword.Pseudo */
+body .kr { color: #008000; font-weight: bold } /* Keyword.Reserved */
+body .kt { color: #B00040 } /* Keyword.Type */
+body .m { color: #666666 } /* Literal.Number */
+body .s { color: #BA2121 } /* Literal.String */
+body .na { color: #7D9029 } /* Name.Attribute */
+body .nb { color: #008000 } /* Name.Builtin */
+body .nc { color: #0000FF; font-weight: bold } /* Name.Class */
+body .no { color: #880000 } /* Name.Constant */
+body .nd { color: #AA22FF } /* Name.Decorator */
+body .ni { color: #999999; font-weight: bold } /* Name.Entity */
+body .ne { color: #D2413A; font-weight: bold } /* Name.Exception */
+body .nf { color: #0000FF } /* Name.Function */
+body .nl { color: #A0A000 } /* Name.Label */
+body .nn { color: #0000FF; font-weight: bold } /* Name.Namespace */
+body .nt { color: #008000; font-weight: bold } /* Name.Tag */
+body .nv { color: #19177C } /* Name.Variable */
+body .ow { color: #AA22FF; font-weight: bold } /* Operator.Word */
+body .w { color: #bbbbbb } /* Text.Whitespace */
+body .mb { color: #666666 } /* Literal.Number.Bin */
+body .mf { color: #666666 } /* Literal.Number.Float */
+body .mh { color: #666666 } /* Literal.Number.Hex */
+body .mi { color: #666666 } /* Literal.Number.Integer */
+body .mo { color: #666666 } /* Literal.Number.Oct */
+body .sa { color: #BA2121 } /* Literal.String.Affix */
+body .sb { color: #BA2121 } /* Literal.String.Backtick */
+body .sc { color: #BA2121 } /* Literal.String.Char */
+body .dl { color: #BA2121 } /* Literal.String.Delimiter */
+body .sd { color: #BA2121; font-style: italic } /* Literal.String.Doc */
+body .s2 { color: #BA2121 } /* Literal.String.Double */
+body .se { color: #BB6622; font-weight: bold } /* Literal.String.Escape */
+body .sh { color: #BA2121 } /* Literal.String.Heredoc */
+body .si { color: #BB6688; font-weight: bold } /* Literal.String.Interpol */
+body .sx { color: #008000 } /* Literal.String.Other */
+body .sr { color: #BB6688 } /* Literal.String.Regex */
+body .s1 { color: #BA2121 } /* Literal.String.Single */
+body .ss { color: #19177C } /* Literal.String.Symbol */
+body .bp { color: #008000 } /* Name.Builtin.Pseudo */
+body .fm { color: #0000FF } /* Name.Function.Magic */
+body .vc { color: #19177C } /* Name.Variable.Class */
+body .vg { color: #19177C } /* Name.Variable.Global */
+body .vi { color: #19177C } /* Name.Variable.Instance */
+body .vm { color: #19177C } /* Name.Variable.Magic */
+body .il { color: #666666 } /* Literal.Number.Integer.Long */
+
+  </style>
+</head>
+<body>
+<h2></h2>
+
+<div class="highlight"><pre><span></span><span class="c1">#!/usr/bin/Rscript</span>
+
+<span class="kn">source</span><span class="p">(</span><span class="s">&quot;common.R&quot;</span><span class="p">)</span>
+
+<span class="kn">library</span><span class="p">(</span>rtracklayer<span class="p">)</span>
+<span class="kn">library</span><span class="p">(</span>DiffBind<span class="p">)</span>
+<span class="kn">library</span><span class="p">(</span>plyr<span class="p">)</span>
+<span class="kn">library</span><span class="p">(</span>doMC<span class="p">)</span>
+registerDoMC<span class="p">()</span>
+<span class="kp">options</span><span class="p">(</span>cores<span class="o">=</span>multicore<span class="o">:::</span>detectCores<span class="p">())</span>
+<span class="kn">library</span><span class="p">(</span>snow<span class="p">)</span>
+<span class="kn">library</span><span class="p">(</span>stringr<span class="p">)</span>
+<span class="kn">library</span><span class="p">(</span>RColorBrewer<span class="p">)</span>
+<span class="kn">library</span><span class="p">(</span>xlsx<span class="p">)</span>
+<span class="kn">library</span><span class="p">(</span>edgeR<span class="p">)</span>
+
+tsmsg <span class="o">&lt;-</span> <span class="kr">function</span><span class="p">(</span><span class="kc">...</span><span class="p">)</span> <span class="p">{</span>
+  <span class="kp">message</span><span class="p">(</span><span class="kp">date</span><span class="p">(),</span> <span class="s">&quot;: &quot;</span><span class="p">,</span> <span class="kc">...</span><span class="p">)</span>
+<span class="p">}</span>
+
+tsmsgf <span class="o">&lt;-</span> <span class="kr">function</span><span class="p">(</span><span class="kc">...</span><span class="p">)</span> <span class="p">{</span>
+  tsmsg<span class="p">(</span><span class="kp">sprintf</span><span class="p">(</span><span class="kc">...</span><span class="p">))</span>
+<span class="p">}</span>
+
+<span class="c1">## Additional args are passed to every call of addDataFrame</span>
+write.xlsx.multisheet <span class="o">&lt;-</span> <span class="kr">function</span><span class="p">(</span>data.frames<span class="p">,</span> <span class="kp">file</span><span class="p">,</span> sheetNames<span class="o">=</span><span class="kp">names</span><span class="p">(</span>data.frames<span class="p">),</span> <span class="kc">...</span><span class="p">)</span> <span class="p">{</span>
+  <span class="kr">if</span> <span class="p">(</span><span class="kp">is.null</span><span class="p">(</span>sheetNames<span class="p">))</span> <span class="p">{</span>
+    sheetNames <span class="o">&lt;-</span> str_c<span class="p">(</span><span class="s">&quot;Sheet&quot;</span><span class="p">,</span> <span class="kp">seq_along</span><span class="p">(</span>data.frames<span class="p">))</span>
+  <span class="p">}</span>
+  <span class="c1">## Ensure correct number of sheetNames</span>
+  <span class="kp">stopifnot</span><span class="p">(</span><span class="kp">length</span><span class="p">(</span>sheetNames<span class="p">)</span> <span class="o">==</span> <span class="kp">length</span><span class="p">(</span>data.frames<span class="p">))</span>
+  <span class="c1">## Fill in missing names if needed</span>
+  sheetNames<span class="p">[</span><span class="kp">is.na</span><span class="p">(</span>sheetNames<span class="p">)]</span> <span class="o">&lt;-</span> str_c<span class="p">(</span><span class="s">&quot;Sheet&quot;</span><span class="p">,</span> <span class="kp">seq_along</span><span class="p">(</span>data.frames<span class="p">))[</span><span class="kp">is.na</span><span class="p">(</span>sheetNames<span class="p">)]</span>
+  wb <span class="o">&lt;-</span> createWorkbook<span class="p">()</span>
+  sheets <span class="o">&lt;-</span> llply<span class="p">(</span>sheetNames<span class="p">,</span> <span class="kr">function</span><span class="p">(</span>x<span class="p">)</span> createSheet<span class="p">(</span>wb<span class="p">,</span> sheetName<span class="o">=</span>x<span class="p">))</span>
+  mlply<span class="p">(</span><span class="kp">cbind</span><span class="p">(</span>sheet<span class="o">=</span>sheets<span class="p">,</span> x<span class="o">=</span>data.frames<span class="p">),</span> <span class="m">.</span>fun<span class="o">=</span>addDataFrame<span class="p">,</span> <span class="m">.</span>parallel<span class="o">=</span><span class="kc">FALSE</span><span class="p">,</span> <span class="kc">...</span><span class="p">)</span>
+  saveWorkbook<span class="p">(</span>wb<span class="p">,</span> <span class="kp">file</span><span class="p">)</span>
+<span class="p">}</span>
+
+renice.self <span class="o">&lt;-</span> <span class="kr">function</span><span class="p">(</span>niceness<span class="o">=</span><span class="m">19</span><span class="p">)</span>  <span class="p">{</span>
+  <span class="kp">system2</span><span class="p">(</span><span class="s">&quot;renice&quot;</span><span class="p">,</span> args<span class="o">=</span><span class="kt">c</span><span class="p">(</span><span class="s">&quot;-n&quot;</span><span class="p">,</span> <span class="kp">as.character</span><span class="p">(</span>niceness<span class="p">),</span> <span class="kp">Sys.getpid</span><span class="p">()))</span>
+<span class="p">}</span>
+
+makeNiceCluster <span class="o">&lt;-</span> <span class="kr">function</span><span class="p">(</span><span class="kc">...</span><span class="p">)</span> <span class="p">{</span>
+  cl <span class="o">&lt;-</span> makeCluster<span class="p">(</span><span class="kc">...</span><span class="p">)</span>
+  clusterCall<span class="p">(</span>cl<span class="p">,</span> renice.self<span class="p">)</span>
+  cl
+<span class="p">}</span>
+
+select.nearest <span class="o">&lt;-</span> <span class="kr">function</span><span class="p">(</span>x<span class="p">,</span> y<span class="p">)</span> <span class="p">{</span>
+  y<span class="p">[</span>nearest<span class="p">(</span>x<span class="p">,</span>y<span class="p">)]</span>
+<span class="p">}</span>
+
+kgxref <span class="o">&lt;-</span> get.ucsc.table<span class="p">(</span><span class="s">&quot;knownGene&quot;</span><span class="p">,</span><span class="s">&quot;kgXref&quot;</span><span class="p">)</span>
+get.kgxref <span class="o">&lt;-</span> <span class="kr">function</span><span class="p">(</span>kgID<span class="p">)</span> <span class="p">{</span>
+  x <span class="o">&lt;-</span> <span class="kp">merge</span><span class="p">(</span>x<span class="o">=</span>DataFrame<span class="p">(</span>kgID<span class="o">=</span>kgID<span class="p">),</span> y<span class="o">=</span>kgxref<span class="p">,</span>
+             all.x<span class="o">=</span><span class="kc">TRUE</span><span class="p">,</span> all.y<span class="o">=</span><span class="kc">FALSE</span><span class="p">)</span>
+  x<span class="p">[</span><span class="kp">names</span><span class="p">(</span>x<span class="p">)</span> <span class="o">!=</span> <span class="s">&quot;kgID&quot;</span><span class="p">]</span>
+<span class="p">}</span>
+
+read.htseq.counts <span class="o">&lt;-</span> <span class="kr">function</span><span class="p">(</span>f<span class="p">)</span> <span class="p">{</span>
+  x <span class="o">&lt;-</span> read.table<span class="p">(</span>f<span class="p">,</span> header<span class="o">=</span><span class="kc">FALSE</span><span class="p">,</span> sep<span class="o">=</span><span class="s">&quot;\t&quot;</span><span class="p">)</span>
+  <span class="kp">names</span><span class="p">(</span>x<span class="p">)</span> <span class="o">&lt;-</span> <span class="kt">c</span><span class="p">(</span><span class="s">&quot;ID&quot;</span><span class="p">,</span> <span class="s">&quot;count&quot;</span><span class="p">)</span>
+  <span class="c1">## Discard the last 5 lines</span>
+  exception.rows <span class="o">&lt;-</span> <span class="p">(</span><span class="kp">nrow</span><span class="p">(</span>x<span class="p">)</span><span class="m">-4</span><span class="p">)</span><span class="o">:</span><span class="kp">nrow</span><span class="p">(</span>x<span class="p">)</span>
+  <span class="kp">attr</span><span class="p">(</span>x<span class="p">,</span> <span class="s">&quot;exception_counts&quot;</span><span class="p">)</span> <span class="o">&lt;-</span> x<span class="p">[</span>exception.rows<span class="p">,]</span>
+  x <span class="o">&lt;-</span> x<span class="p">[</span><span class="o">-</span>exception.rows<span class="p">,]</span>
+  x
+<span class="p">}</span>
+
+get.transcript.lengths <span class="o">&lt;-</span> <span class="kr">function</span><span class="p">(</span>transcript.ids<span class="p">,</span> exon.lengths<span class="p">)</span> <span class="p">{</span>
+  exons.by.transcript <span class="o">&lt;-</span> <span class="kp">split</span><span class="p">(</span>exon.lengths<span class="p">,</span> transcript.ids<span class="p">)</span>
+  <span class="kp">sapply</span><span class="p">(</span>exons.by.transcript<span class="p">,</span> <span class="kp">sum</span><span class="p">)</span>
+<span class="p">}</span>
+
+get.gene.lengths <span class="o">&lt;-</span> <span class="kr">function</span><span class="p">(</span>gene.ids<span class="p">,</span> transcript.lengths<span class="p">,</span> method<span class="o">=</span><span class="s">&quot;max&quot;</span><span class="p">)</span> <span class="p">{</span>
+  <span class="kr">if</span> <span class="p">(</span><span class="kp">is.character</span><span class="p">(</span>method<span class="p">))</span> <span class="p">{</span>
+    method <span class="o">&lt;-</span> <span class="kp">get</span><span class="p">(</span>method<span class="p">)</span>
+  <span class="p">}</span>
+  transcripts.by.genes <span class="o">&lt;-</span> <span class="kp">split</span><span class="p">(</span>transcript.lengths<span class="p">,</span> gene.ids<span class="p">)</span>
+  <span class="kp">sapply</span><span class="p">(</span>transcripts.by.genes<span class="p">,</span> method<span class="p">)</span>
+<span class="p">}</span>
+
+str_matches <span class="o">&lt;-</span> <span class="kr">function</span><span class="p">(</span>string<span class="p">,</span> pattern<span class="p">)</span> <span class="p">{</span>
+  <span class="o">!</span><span class="kp">is.na</span><span class="p">(</span>str_match<span class="p">(</span>string<span class="p">,</span> pattern<span class="p">)[,</span><span class="m">1</span><span class="p">])</span>
+<span class="p">}</span>
+
+annotate.ensp.in.hsap.or.mmul <span class="o">&lt;-</span> <span class="kr">function</span><span class="p">(</span>ensp.ids<span class="p">)</span> <span class="p">{</span>
+  ensembl<span class="o">=</span>useMart<span class="p">(</span><span class="s">&quot;ensembl&quot;</span><span class="p">)</span>
+  datasets <span class="o">&lt;-</span> <span class="kt">c</span><span class="p">(</span><span class="s">&quot;mmulatta_gene_ensembl&quot;</span><span class="p">,</span> <span class="s">&quot;hsapiens_gene_ensembl&quot;</span><span class="p">)</span>
+  x <span class="o">&lt;-</span> <span class="kp">Reduce</span><span class="p">(</span><span class="kp">rbind</span><span class="p">,</span> llply<span class="p">(</span>datasets<span class="p">,</span> <span class="kr">function</span><span class="p">(</span>x<span class="p">)</span> <span class="p">{</span>
+    ensembl <span class="o">&lt;-</span> useDataset<span class="p">(</span>x<span class="p">,</span> mart<span class="o">=</span>ensembl<span class="p">)</span>
+    getBM<span class="p">(</span>filters<span class="o">=</span><span class="s">&quot;ensembl_peptide_id&quot;</span><span class="p">,</span>
+          values<span class="o">=</span><span class="kp">unique</span><span class="p">(</span>ensp.ids<span class="p">),</span>
+          attributes<span class="o">=</span><span class="kt">c</span><span class="p">(</span><span class="s">&quot;hgnc_symbol&quot;</span><span class="p">,</span> <span class="s">&quot;wikigene_name&quot;</span><span class="p">,</span> <span class="s">&quot;ensembl_gene_id&quot;</span><span class="p">,</span> <span class="s">&quot;ensembl_transcript_id&quot;</span><span class="p">,</span> <span class="s">&quot;ensembl_peptide_id&quot;</span><span class="p">,</span> <span class="s">&quot;description&quot;</span><span class="p">,</span> <span class="s">&quot;wikigene_description&quot;</span><span class="p">),</span>
+          mart<span class="o">=</span>ensembl<span class="p">,</span>
+          uniqueRows<span class="o">=</span><span class="kc">TRUE</span><span class="p">)</span>
+  <span class="p">}))</span>
+  <span class="c1">## Convert empty string to NA in all columns</span>
+  x <span class="o">&lt;-</span> <span class="kt">data.frame</span><span class="p">(</span>llply<span class="p">(</span>x<span class="p">,</span> <span class="kr">function</span><span class="p">(</span>column<span class="p">)</span> <span class="kp">ifelse</span><span class="p">(</span>column <span class="o">==</span> <span class="s">&quot;&quot;</span><span class="p">,</span> <span class="kc">NA</span><span class="p">,</span> column<span class="p">)))</span>
+
+  <span class="c1">## Unify symbols &amp; descriptions</span>
+  unified.symbol <span class="o">&lt;-</span> <span class="kp">ifelse</span><span class="p">(</span><span class="kp">is.na</span><span class="p">(</span>x<span class="o">$</span>hgnc_symbol<span class="p">),</span> <span class="kp">as.character</span><span class="p">(</span>x<span class="o">$</span>wikigene_name<span class="p">),</span> <span class="kp">as.character</span><span class="p">(</span>x<span class="o">$</span>hgnc_symbol<span class="p">))</span>
+  unified.desc <span class="o">&lt;-</span> <span class="kp">ifelse</span><span class="p">(</span><span class="kp">is.na</span><span class="p">(</span>x<span class="o">$</span>description<span class="p">),</span> <span class="kp">as.character</span><span class="p">(</span>x<span class="o">$</span>wikigene_description<span class="p">),</span> <span class="kp">as.character</span><span class="p">(</span>x<span class="o">$</span>description<span class="p">))</span>
+  x <span class="o">&lt;-</span> <span class="kt">data.frame</span><span class="p">(</span>symbol<span class="o">=</span>unified.symbol<span class="p">,</span> x<span class="p">[</span><span class="o">!</span> <span class="kp">names</span><span class="p">(</span>x<span class="p">)</span> <span class="o">%in%</span> <span class="kt">c</span><span class="p">(</span><span class="s">&quot;hgnc_symbol&quot;</span><span class="p">,</span> <span class="s">&quot;wikigene_name&quot;</span><span class="p">,</span> <span class="s">&quot;description&quot;</span><span class="p">,</span> <span class="s">&quot;wikigene_description&quot;</span><span class="p">)],</span> description<span class="o">=</span>unified.desc<span class="p">)</span>
+
+  <span class="c1">## Reorder rows so that more-preferred rows for the same input are on top</span>
+  x <span class="o">&lt;-</span> x<span class="p">[</span><span class="kp">order</span><span class="p">(</span><span class="kp">is.na</span><span class="p">(</span>x<span class="o">$</span>symbol<span class="p">),</span> str_matches<span class="p">(</span>x<span class="o">$</span>symbol<span class="p">,</span> <span class="s">&quot;^LOC\\d+$&quot;</span><span class="p">),</span> <span class="kp">is.na</span><span class="p">(</span>x<span class="o">$</span>description<span class="p">)),]</span>
+
+  <span class="c1">## Keep only the first row for each input</span>
+  x <span class="o">&lt;-</span> x<span class="p">[</span><span class="o">!</span><span class="kp">duplicated</span><span class="p">(</span>x<span class="o">$</span>ensembl_peptide_id<span class="p">),]</span>
+
+  <span class="kt">data.frame</span><span class="p">(</span>x<span class="p">,</span> row.names<span class="o">=</span>x<span class="o">$</span>ensembl_peptide_id<span class="p">)</span>
+<span class="p">}</span>
+
+<span class="c1">## TODO: Replace by biomart</span>
+
+CE.name.annot <span class="o">&lt;-</span> <span class="p">{</span>
+  x <span class="o">&lt;-</span> setNames<span class="p">(</span>nm<span class="o">=</span><span class="kt">c</span><span class="p">(</span><span class="s">&quot;CE_ensembl_peptide_id&quot;</span><span class="p">,</span> <span class="s">&quot;symbol&quot;</span><span class="p">,</span> <span class="s">&quot;description&quot;</span><span class="p">),</span>
+                read.table<span class="p">(</span><span class="s">&quot;annotation/CE.name&quot;</span><span class="p">,</span> sep<span class="o">=</span><span class="s">&quot;\t&quot;</span><span class="p">,</span> quote<span class="o">=</span><span class="s">&quot;&quot;</span><span class="p">))</span>
+  x<span class="o">$</span>ensembl_peptide_id <span class="o">&lt;-</span> str_replace<span class="p">(</span>x<span class="o">$</span>CE_ensembl_peptide_id<span class="p">,</span> <span class="s">&quot;^CE_&quot;</span><span class="p">,</span> <span class="s">&quot;&quot;</span><span class="p">)</span>
+  <span class="kp">row.names</span><span class="p">(</span>x<span class="p">)</span> <span class="o">&lt;-</span> x<span class="o">$</span>ensembl_peptide_id
+  x
+<span class="p">}</span>
+
+annotation.gff <span class="o">&lt;-</span> import<span class="p">(</span><span class="s">&quot;cuffmerge_results/merged.gff&quot;</span><span class="p">)</span>
+
+transcripts <span class="o">&lt;-</span> annotation.gff<span class="p">[</span>annotation.gff<span class="o">$</span>type <span class="o">==</span> <span class="s">&quot;transcript&quot;</span><span class="p">,</span> <span class="p">]</span>
+exons <span class="o">&lt;-</span> annotation.gff<span class="p">[</span>annotation.gff<span class="o">$</span>type <span class="o">==</span> <span class="s">&quot;exon&quot;</span><span class="p">,</span> <span class="p">]</span>
+transcript.lengths <span class="o">&lt;-</span> get.transcript.lengths<span class="p">(</span><span class="kp">unlist</span><span class="p">(</span>exons<span class="o">$</span>Parent<span class="p">),</span> width<span class="p">(</span>exons<span class="p">))</span>
+gene.lengths <span class="o">&lt;-</span> get.gene.lengths<span class="p">(</span>transcripts<span class="o">$</span>geneID<span class="p">,</span> transcript.lengths<span class="p">[</span>transcripts<span class="o">$</span>ID<span class="p">])</span>
+
+gene.ref.lists <span class="o">&lt;-</span> <span class="kp">sapply</span><span class="p">(</span><span class="kp">split</span><span class="p">(</span>transcripts<span class="o">$</span>nearest_ref<span class="p">,</span> transcripts<span class="o">$</span>geneID<span class="p">),</span>
+                         <span class="kr">function</span><span class="p">(</span>x<span class="p">)</span> <span class="kp">unique</span><span class="p">(</span>x<span class="p">[</span><span class="o">!</span><span class="kp">is.na</span><span class="p">(</span>x<span class="p">)]))</span>
+
+<span class="c1">## For genes with multiple ref IDs, join them with commas</span>
+gene.refs <span class="o">&lt;-</span> <span class="kp">sapply</span><span class="p">(</span>gene.ref.lists<span class="p">,</span> <span class="kr">function</span> <span class="p">(</span>x<span class="p">)</span> <span class="p">{</span>
+  <span class="kr">if</span> <span class="p">(</span><span class="kp">length</span><span class="p">(</span>x<span class="p">)</span> <span class="o">==</span> <span class="m">0</span><span class="p">)</span> <span class="p">{</span>
+    <span class="kc">NA</span>
+  <span class="p">}</span> <span class="kr">else</span> <span class="p">{</span>
+    str_c<span class="p">(</span>x<span class="p">,</span> collapse<span class="o">=</span><span class="s">&quot;,&quot;</span><span class="p">)</span>
+  <span class="p">}</span>
+<span class="p">})</span>
+
+<span class="c1">## For genes with multiple ref IDs, just pick the first one</span>
+gene.first.refs <span class="o">&lt;-</span> <span class="kp">sapply</span><span class="p">(</span>gene.ref.lists<span class="p">,</span> <span class="kr">function</span><span class="p">(</span>x<span class="p">)</span> <span class="p">{</span>
+  <span class="kr">if</span> <span class="p">(</span><span class="kp">length</span><span class="p">(</span>x<span class="p">)</span> <span class="o">==</span> <span class="m">0</span><span class="p">)</span> <span class="p">{</span>
+    <span class="kc">NA</span>
+  <span class="p">}</span> <span class="kr">else</span> <span class="p">{</span>
+    x<span class="p">[[</span><span class="m">1</span><span class="p">]]</span>
+  <span class="p">}</span>
+<span class="p">})</span>
+
+<span class="c1">## Use delayed assignment so that the cluster won&#39;t be created until it is actually needed</span>
+<span class="kp">delayedAssign</span><span class="p">(</span><span class="s">&quot;cl&quot;</span><span class="p">,</span> makeNiceCluster<span class="p">(</span><span class="kp">rep</span><span class="p">(</span><span class="kt">c</span><span class="p">(</span><span class="s">&quot;salomon14&quot;</span><span class="p">,</span> <span class="s">&quot;salomon18&quot;</span><span class="p">,</span> <span class="s">&quot;salomon19&quot;</span><span class="p">),</span> <span class="m">8</span><span class="p">)))</span>
+
+mart.raw.annot <span class="o">&lt;-</span> annotate.ensp.in.hsap.or.mmul<span class="p">(</span>na.omit<span class="p">(</span>gene.first.refs<span class="p">))</span>
+gene.annot <span class="o">&lt;-</span> <span class="kt">data.frame</span><span class="p">(</span>mart.raw.annot<span class="p">[</span>gene.first.refs<span class="p">,],</span> row.names<span class="o">=</span><span class="kp">names</span><span class="p">(</span>gene.first.refs<span class="p">))</span>
+
+<span class="c1">## Fill in missing values that are available from the CE.name file</span>
+<span class="c1">## provided by the cyno genome paper authors</span>
+suppl.gene.annot <span class="o">&lt;-</span> <span class="kt">data.frame</span><span class="p">(</span>CE.name.annot<span class="p">[</span>gene.first.refs<span class="p">,],</span> row.names<span class="o">=</span><span class="kp">names</span><span class="p">(</span>gene.first.refs<span class="p">))</span>
+
+suppl.ensp <span class="o">&lt;-</span> <span class="kp">is.na</span><span class="p">(</span>gene.annot<span class="o">$</span>ensembl_peptide_id<span class="p">)</span> <span class="o">&amp;</span> <span class="o">!</span><span class="kp">is.na</span><span class="p">(</span>suppl.gene.annot<span class="o">$</span>ensembl_peptide_id<span class="p">)</span>
+suppl.symbols <span class="o">&lt;-</span> <span class="kp">is.na</span><span class="p">(</span>gene.annot<span class="o">$</span>symbol<span class="p">)</span> <span class="o">&amp;</span> <span class="o">!</span><span class="kp">is.na</span><span class="p">(</span>suppl.gene.annot<span class="o">$</span>symbol<span class="p">)</span>
+gene.annot<span class="o">$</span>ensembl_peptide_id <span class="o">&lt;-</span> <span class="kp">ifelse</span><span class="p">(</span>suppl.ensp<span class="p">,</span>
+                                        <span class="kp">as.character</span><span class="p">(</span>suppl.gene.annot<span class="o">$</span>ensembl_peptide_id<span class="p">),</span>
+                                        <span class="kp">as.character</span><span class="p">(</span>gene.annot<span class="o">$</span>ensembl_peptide_id<span class="p">))</span>
+gene.annot<span class="o">$</span>symbol <span class="o">&lt;-</span> <span class="kp">ifelse</span><span class="p">(</span>suppl.symbols<span class="p">,</span>
+                            <span class="kp">as.character</span><span class="p">(</span>suppl.gene.annot<span class="o">$</span>symbol<span class="p">),</span>
+                            <span class="kp">as.character</span><span class="p">(</span>gene.annot<span class="o">$</span>symbol<span class="p">))</span>
+<span class="c1">## Replace description whenever we replace the symbol to keep things consistent</span>
+gene.annot<span class="o">$</span>description <span class="o">&lt;-</span> <span class="kp">ifelse</span><span class="p">(</span>suppl.symbols<span class="p">,</span>
+                            <span class="kp">as.character</span><span class="p">(</span>suppl.gene.annot<span class="o">$</span>description<span class="p">),</span>
+                            <span class="kp">as.character</span><span class="p">(</span>gene.annot<span class="o">$</span>description<span class="p">))</span>
+gene.annot<span class="o">$</span>symbol<span class="p">[</span><span class="kp">is.na</span><span class="p">(</span>gene.annot<span class="o">$</span>ensembl_peptide_id<span class="p">)]</span> <span class="o">&lt;-</span> <span class="s">&quot;[No annotation]&quot;</span>
+gene.annot<span class="o">$</span>symbol<span class="p">[</span><span class="kp">is.na</span><span class="p">(</span>gene.annot<span class="o">$</span>symbol<span class="p">)]</span> <span class="o">&lt;-</span> <span class="s">&quot;[No symbol for ENSP]&quot;</span>
+
+expdata <span class="o">&lt;-</span> <span class="p">{</span>
+  x <span class="o">&lt;-</span> read.xlsx<span class="p">(</span><span class="s">&quot;kenyon-cyno-expdata.xls&quot;</span><span class="p">,</span> <span class="m">1</span><span class="p">)</span>
+  x<span class="o">$</span>Animal.ID <span class="o">&lt;-</span> str_trim<span class="p">(</span>x<span class="o">$</span>Animal.ID<span class="p">)</span>
+  x<span class="o">$</span>Condition <span class="o">&lt;-</span> str_replace_all<span class="p">(</span>x<span class="o">$</span>Condition<span class="p">,</span> <span class="s">&quot;γ&quot;</span><span class="p">,</span> <span class="s">&quot;g&quot;</span><span class="p">)</span>
+  x<span class="o">$</span>Sample.ID <span class="o">&lt;-</span>
+    <span class="kp">sprintf</span><span class="p">(</span><span class="s">&quot;%s_%s_%s&quot;</span><span class="p">,</span>
+            str_replace_all<span class="p">(</span>x<span class="o">$</span>Animal.ID<span class="p">,</span> <span class="s">&quot;[ -]&quot;</span><span class="p">,</span> <span class="s">&quot;_&quot;</span><span class="p">),</span>
+            <span class="kp">ifelse</span><span class="p">(</span>x<span class="o">$</span>Condition <span class="o">==</span> <span class="s">&quot;Control&quot;</span><span class="p">,</span> <span class="s">&quot;CTRL&quot;</span><span class="p">,</span> <span class="s">&quot;IFNg&quot;</span><span class="p">),</span>
+            x<span class="o">$</span>Passage<span class="p">)</span>
+  x<span class="o">$</span>Sample.Name <span class="o">&lt;-</span>
+    <span class="kp">sprintf</span><span class="p">(</span><span class="s">&quot;%s_%s_%s_L%03i&quot;</span><span class="p">,</span>
+            str_replace_all<span class="p">(</span>x<span class="o">$</span>Animal.ID<span class="p">,</span> <span class="s">&quot;[ -]&quot;</span><span class="p">,</span> <span class="s">&quot;_&quot;</span><span class="p">),</span>
+            <span class="kp">ifelse</span><span class="p">(</span>x<span class="o">$</span>Condition <span class="o">==</span> <span class="s">&quot;Control&quot;</span><span class="p">,</span> <span class="m">1</span><span class="p">,</span> <span class="m">2</span><span class="p">),</span>
+            x<span class="o">$</span>Index.Seq<span class="p">,</span>
+            x<span class="o">$</span>Lane<span class="p">)</span>
+  x<span class="o">$</span>FASTQ.Read1.File <span class="o">&lt;-</span> <span class="kp">sprintf</span><span class="p">(</span><span class="s">&quot;seqprep_results/%s/%s_R1_001.fastq&quot;</span><span class="p">,</span> x<span class="o">$</span>Sample.Name<span class="p">,</span> x<span class="o">$</span>Sample.Name<span class="p">)</span>
+  x<span class="o">$</span>FASTQ.Read2.File <span class="o">&lt;-</span> <span class="kp">sprintf</span><span class="p">(</span><span class="s">&quot;seqprep_results/%s/%s_R2_001.fastq&quot;</span><span class="p">,</span> x<span class="o">$</span>Sample.Name<span class="p">,</span> x<span class="o">$</span>Sample.Name<span class="p">)</span>
+  x<span class="o">$</span>BAM.File <span class="o">&lt;-</span> <span class="kp">sprintf</span><span class="p">(</span><span class="s">&quot;tophat_results/%s/accepted_hits.bam&quot;</span><span class="p">,</span> x<span class="o">$</span>Sample.Name<span class="p">)</span>
+  x<span class="o">$</span>GFF.File <span class="o">&lt;-</span> <span class="kp">sprintf</span><span class="p">(</span><span class="s">&quot;cufflinks_quantification/%s/transcripts.gff&quot;</span><span class="p">,</span> x<span class="o">$</span>Sample.Name<span class="p">)</span>
+  x<span class="o">$</span>Counts.File <span class="o">&lt;-</span> <span class="kp">sprintf</span><span class="p">(</span><span class="s">&quot;htseq_counts/%s/counts.txt&quot;</span><span class="p">,</span> x<span class="o">$</span>Sample.Name<span class="p">)</span>
+  mcparallel<span class="p">(</span>write.xlsx<span class="p">(</span>x<span class="p">,</span> <span class="s">&quot;expdata.xlsx&quot;</span><span class="p">))</span>
+  <span class="kp">rownames</span><span class="p">(</span>x<span class="p">)</span> <span class="o">&lt;-</span> x<span class="o">$</span>Sample.ID
+  x
+<span class="p">}</span>
+
+counts.vectors <span class="o">&lt;-</span> setNames<span class="p">(</span>llply<span class="p">(</span>expdata<span class="o">$</span>Counts.File<span class="p">,</span> <span class="m">.</span>parallel<span class="o">=</span><span class="kc">TRUE</span><span class="p">,</span> <span class="kr">function</span><span class="p">(</span>f<span class="p">)</span> <span class="p">{</span>
+  x <span class="o">&lt;-</span> read.htseq.counts<span class="p">(</span>f<span class="p">)</span>
+  setNames<span class="p">(</span>x<span class="o">$</span>count<span class="p">,</span> x<span class="o">$</span>ID<span class="p">)</span>
+<span class="p">}),</span> expdata<span class="o">$</span>Sample.ID<span class="p">)</span>
+
+<span class="c1">## Put the data into a matrix, making sure we account for the</span>
+<span class="c1">## possibility that not all genes are listed in all samples.</span>
+all.geneIDs <span class="o">&lt;-</span> <span class="kp">sort</span><span class="p">(</span><span class="kp">unique</span><span class="p">(</span><span class="kp">unlist</span><span class="p">(</span>llply<span class="p">(</span>counts.vectors<span class="p">,</span> <span class="kp">names</span><span class="p">))))</span>
+
+counts <span class="o">&lt;-</span> <span class="p">{</span>
+  x <span class="o">&lt;-</span> <span class="kt">matrix</span><span class="p">(</span>data<span class="o">=</span><span class="m">0</span><span class="p">,</span> ncol<span class="o">=</span><span class="kp">length</span><span class="p">(</span>counts.vectors<span class="p">),</span> nrow<span class="o">=</span><span class="kp">length</span><span class="p">(</span>all.geneIDs<span class="p">),</span>
+              dimnames<span class="o">=</span><span class="kt">list</span><span class="p">(</span><span class="sb">`geneID`</span><span class="o">=</span>all.geneIDs<span class="p">,</span> <span class="sb">`sample`</span><span class="o">=</span><span class="kp">names</span><span class="p">(</span>counts.vectors<span class="p">)))</span>
+  <span class="kr">for</span> <span class="p">(</span>i <span class="kr">in</span> expdata<span class="o">$</span>Sample.ID<span class="p">)</span> <span class="p">{</span>
+    cvec <span class="o">&lt;-</span> counts.vectors<span class="p">[[</span>i<span class="p">]]</span>
+    x<span class="p">[</span><span class="kp">names</span><span class="p">(</span>cvec<span class="p">),</span>i<span class="p">]</span> <span class="o">&lt;-</span> <span class="kp">cbind</span><span class="p">(</span>cvec<span class="p">)</span>
+  <span class="p">}</span>
+  x
+<span class="p">}</span>
+
+blocked.design <span class="o">&lt;-</span> model.matrix<span class="p">(</span><span class="o">~</span>Animal.ID<span class="o">+</span>Passage<span class="o">+</span>Condition<span class="p">,</span> data<span class="o">=</span>expdata<span class="p">)</span>
+unblocked.design <span class="o">&lt;-</span> model.matrix<span class="p">(</span><span class="o">~</span>Condition<span class="p">,</span> data<span class="o">=</span>expdata<span class="p">)</span>
+dge <span class="o">&lt;-</span> DGEList<span class="p">(</span>counts<span class="o">=</span>counts<span class="p">,</span>
+               group<span class="o">=</span>expdata<span class="o">$</span>Condition<span class="p">,</span>
+               genes<span class="o">=</span>gene.annot<span class="p">[</span><span class="kp">rownames</span><span class="p">(</span>counts<span class="p">),])</span>
+dge <span class="o">&lt;-</span> calcNormFactors<span class="p">(</span>dge<span class="p">)</span>
+
+blocked.dge <span class="o">&lt;-</span> estimateGLMCommonDisp<span class="p">(</span>dge<span class="p">,</span> blocked.design<span class="p">,</span> verbose<span class="o">=</span><span class="kc">TRUE</span><span class="p">)</span>
+blocked.dge <span class="o">&lt;-</span> estimateGLMTrendedDisp<span class="p">(</span>blocked.dge<span class="p">,</span> blocked.design<span class="p">)</span>
+blocked.dge <span class="o">&lt;-</span> estimateGLMTagwiseDisp<span class="p">(</span>blocked.dge<span class="p">,</span> blocked.design<span class="p">)</span>
+blocked.fit <span class="o">&lt;-</span> glmFit<span class="p">(</span>blocked.dge<span class="p">,</span> blocked.design<span class="p">)</span>
+blocked.lrt <span class="o">&lt;-</span> glmLRT<span class="p">(</span>blocked.dge<span class="p">,</span> blocked.fit<span class="p">,</span> coef<span class="o">=</span><span class="s">&quot;ConditionIFNg activated&quot;</span><span class="p">)</span>
+blocked.tt <span class="o">&lt;-</span> topTags<span class="p">(</span>blocked.lrt<span class="p">,</span> n<span class="o">=</span><span class="kp">nrow</span><span class="p">(</span>counts<span class="p">))</span>
+
+unblocked.dge <span class="o">&lt;-</span> estimateGLMCommonDisp<span class="p">(</span>dge<span class="p">,</span> unblocked.design<span class="p">,</span> verbose<span class="o">=</span><span class="kc">TRUE</span><span class="p">)</span>
+unblocked.dge <span class="o">&lt;-</span> estimateGLMTrendedDisp<span class="p">(</span>unblocked.dge<span class="p">,</span> unblocked.design<span class="p">)</span>
+unblocked.dge <span class="o">&lt;-</span> estimateGLMTagwiseDisp<span class="p">(</span>unblocked.dge<span class="p">,</span> unblocked.design<span class="p">)</span>
+unblocked.fit <span class="o">&lt;-</span> glmFit<span class="p">(</span>unblocked.dge<span class="p">,</span> unblocked.design<span class="p">)</span>
+unblocked.lrt <span class="o">&lt;-</span> glmLRT<span class="p">(</span>unblocked.dge<span class="p">,</span> unblocked.fit<span class="p">,</span> coef<span class="o">=</span><span class="s">&quot;ConditionIFNg activated&quot;</span><span class="p">)</span>
+unblocked.tt <span class="o">&lt;-</span> topTags<span class="p">(</span>unblocked.lrt<span class="p">,</span> n<span class="o">=</span><span class="kp">nrow</span><span class="p">(</span>counts<span class="p">))</span>
+
+write.csv<span class="p">(</span>blocked.tt<span class="o">$</span><span class="kp">table</span><span class="p">,</span> <span class="s">&quot;edgeR-genes-prelim&quot;</span><span class="p">)</span>
+write.xlsx.multisheet<span class="p">(</span><span class="kt">list</span><span class="p">(</span>blocked<span class="o">=</span>blocked.tt<span class="o">$</span><span class="kp">table</span><span class="p">,</span>
+                           unblocked<span class="o">=</span>unblocked.tt<span class="o">$</span><span class="kp">table</span><span class="p">),</span>
+                      <span class="s">&quot;edgeR-genes.xlsx&quot;</span><span class="p">)</span>
+</pre></div>
+</body>
+</html>

+ 18 - 18
ryan_thompson_resume.html

@@ -143,36 +143,36 @@ Echols Scholar</dd>
 <ul class="itemize"><a id='magicparlabel-23' /><li class="itemize_item">Created an open source, reproducible workflow to analyze a large multi-omics next-gen sequencing dataset of 220 RNA-seq and ChIP-seq samples to reveal interactions between differential histone methylation and differential gene expression during T-cell activation, as well as key differences in activation between naïve and memory cells [<a href='#LyXCite-lamere2016'><span class="bib-label">2</span></a>, <a href='#LyXCite-lamere2016_JMJD3'>#<span class="bib-key">lamere2016-JMJD3</span></a>].
 <br />
 
-Links: <a href="https://github.com/DarwinAwardWinner/CD4-csaw#re-analysis-of-a-combined-chip-seq--rna-seq-data-set">Reproducible workflow</a>, <a href="http://mneme.homenet.org/~ryan/resume/examples/Salomon/CD4/ChIP-Seq%20presentation.pdf">Slides</a>, <a href="http://mneme.homenet.org/~ryan/resume/examples/Salomon/CD4/">Example results and visualizations</a></li>
+Links: <a href="https://github.com/DarwinAwardWinner/CD4-csaw#re-analysis-of-a-combined-chip-seq--rna-seq-data-set">Reproducible workflow</a>, <a href="https://darwinawardwinner.github.io/resume/examples/Salomon/CD4/ChIP-Seq%20presentation.pdf">Slides</a>, <a href="https://darwinawardwinner.github.io/resume/examples/Salomon/CD4/">Example results and visualizations</a></li>
 <li class="itemize_item">Investigated effects of life-span-extending drug on worm gene expression over time revealing that the drug retards age-related &ldquo;transcriptional drift&rdquo;, preserving a youthful phenotype at the molecular level. [<a href='#LyXCite-Rangarajue08833'><span class="bib-label">3</span></a>] 
 <br />
 
-Link: <a href="http://mneme.homenet.org/~ryan/resume/examples/Salomon/mdsplots-multidim.pdf">PCoA Plot</a> </li>
+Link: <a href="https://darwinawardwinner.github.io/resume/examples/Salomon/mdsplots-multidim.pdf">PCoA Plot</a> </li>
 <li class="itemize_item">Significantly improved performance of machine learning classifier for identifying transplant rejection by developing appropriate single-sample microarray normalization procedures[<a href='#LyXCite-kurian2014molecular'><span class="bib-label">1</span></a>], including training a custom set of frozen RMA normalization vectors. Classifier is currently being developed into a clinical test for transplant dysfunction.
 <br />
 
-Links: <a href="http://mneme.homenet.org/~ryan/resume/examples/Salomon/fRMA/">fRMA example code &amp; plots</a>, <a href="http://mneme.homenet.org/~ryan/resume/examples/Salomon/Classifier%20Math%20Write-up.pdf">Classifier Method Write-up</a></li>
+Links: <a href="https://darwinawardwinner.github.io/resume/examples/Salomon/fRMA/">fRMA example code &amp; plots</a>, <a href="https://darwinawardwinner.github.io/resume/examples/Salomon/Classifier%20Math%20Write-up.pdf">Classifier Method Write-up</a></li>
 <li class="itemize_item">Implemented a systems biology tool to analyze and efficiently present and summarize differential expression for multiple gene set &amp; pathway methods run on multiple pathway databases, as well as differential expression of individual genes within each pathway.
 <br />
 
-Links: <a href="http://mneme.homenet.org/~ryan/resume/examples/Salomon/Pathways/Pathway%20Presentation.pdf">Presentation</a>, <a href="http://mneme.homenet.org/~ryan/resume/examples/Salomon/Pathways/Pathway%20Analysis%20Example.html">Example Results</a>, <a href="http://mneme.homenet.org/~ryan/resume/examples/Salomon/Pathways/Cyno%20Pathway%20Summary.pdf">Summary</a></li>
+Links: <a href="https://darwinawardwinner.github.io/resume/examples/Salomon/Pathways/Pathway%20Presentation.pdf">Presentation</a>, <a href="https://darwinawardwinner.github.io/resume/examples/Salomon/Pathways/Pathway%20Analysis%20Example.html">Example Results</a>, <a href="https://darwinawardwinner.github.io/resume/examples/Salomon/Pathways/Cyno%20Pathway%20Summary.pdf">Summary</a></li>
 <li class="itemize_item">Performed comparative analysis of multiple differential expression statistical models to define best practice for optimal sensitivity while maintaining false positive control. Presented on theoretical and practical similarities and differences between methods.
 <br />
 
-Links: <a href="http://mneme.homenet.org/~ryan/resume/examples/Salomon/DGE%20Presentation.pdf">RNA-seq Presentation</a>, <a href="http://mneme.homenet.org/~ryan/resume/examples/Salomon/Advanced%20RNA-seq%20Analysis.pdf">Advanced RNA-seq Presentation</a>, <a href="http://mneme.homenet.org/~ryan/resume/examples/Salomon/globin/pval-comparisons.pdf">Example plot </a></li>
+Links: <a href="https://darwinawardwinner.github.io/resume/examples/Salomon/DGE%20Presentation.pdf">RNA-seq Presentation</a>, <a href="https://darwinawardwinner.github.io/resume/examples/Salomon/Advanced%20RNA-seq%20Analysis.pdf">Advanced RNA-seq Presentation</a>, <a href="https://darwinawardwinner.github.io/resume/examples/Salomon/globin/pval-comparisons.pdf">Example plot </a></li>
 <li class="itemize_item">Taught basic RNA-seq theory and practical analysis for the graduate-level introductory bioinformatics course.
 <br />
 
-Links: <a href="http://mneme.homenet.org/~ryan/resume/examples/Salomon/Teaching/RNA-Seq Lecture.pdf">Lecture Slides</a>, <a href="http://mneme.homenet.org/~ryan/resume/examples/Salomon/Teaching/RNA-Seq Lab.html">Hands-on lab section</a></li>
+Links: <a href="https://darwinawardwinner.github.io/resume/examples/Salomon/Teaching/RNA-Seq%20Lecture.pdf">Lecture Slides</a>, <a href="https://darwinawardwinner.github.io/resume/examples/Salomon/Teaching/RNA-Seq%20Lab.html">Hands-on lab section</a></li>
 <li class="itemize_item">Evaluated and optimized cost and performance of custom protocol for RNA-seq of human and primate blood samples while minimizing nuisance globin reads. Increased yield of useful reads nearly 2-fold. [<a href='#LyXCite-globin_reduction'><span class="bib-label">6</span></a>]
 <br />
 
-Link: <a href="http://mneme.homenet.org/~ryan/resume/examples/Salomon/globin/">Example results</a></li>
+Link: <a href="https://darwinawardwinner.github.io/resume/examples/Salomon/globin/">Example results</a></li>
 <li class="itemize_item">Adapted common normalization methods from RNA-seq to improve performance in analysis of RASL-seq experiments. [<a href='#LyXCite-Scott036061'><span class="bib-label">4</span></a>]</li>
 <li class="itemize_item">Performed a comprehensive comparative evaluation of over 20 subtly different statistical models for differential methylation in Illumina 450k arrays, selecting a model that best explained the observed sources and trends of variation in the data, including cross-domain application of a method originally designed for RNA-seq data.
 <br />
 
-Link: <a href="http://mneme.homenet.org/~ryan/resume/examples/Salomon/450k/">Example diagnostic plots</a></li>
+Link: <a href="https://darwinawardwinner.github.io/resume/examples/Salomon/450k/">Example diagnostic plots</a></li>
 <li class="itemize_item">Active member of the Bioconductor community and contributing developer for several Bioconductor packages. 
 <br />
 
@@ -188,7 +188,7 @@ Links: <a href="http://bioconductor.org/packages/release/bioc/html/BiocParallel.
 <ul class="itemize"><a id='magicparlabel-36' /><li class="itemize_item">Built a transcriptome assembly and quantification pipeline using Cufflinks, including fully-automated cluster job control for high-throughput reproducible analysis, and presented a conceptual overview of Cufflinks' assembly and quantification algorithms to help the team understand Cufflinks.
 <br />
 
-Link: <a href="http://mneme.homenet.org/~ryan/resume/examples/cufflinks-presentation.pdf">Presentation Slides</a></li>
+Link: <a href="https://darwinawardwinner.github.io/resume/examples/cufflinks-presentation.pdf">Presentation Slides</a></li>
 <li class="itemize_item">Assisted in a molecular genetics study to evaluate performance of two variant calling algorithms in detection of causal mutations in antibiotic-resistant bacterial genomes.</li>
 </ul>
 <div class="subsection"><a id='magicparlabel-38' /><span class="subsection_label"></span> Gaasterland Lab, UCSD Bioinformatics La Jolla, CA&emsp;&emsp; 2010 - 2012</div>
@@ -200,24 +200,24 @@ Link: <a href="http://mneme.homenet.org/~ryan/resume/examples/cufflinks-presenta
 <ul class="itemize"><a id='magicparlabel-40' /><li class="itemize_item">Designed and implemented Deloxer, a critical software step in a new Illumina mate-pair sequencing protocol using Cre recombination. Deloxer is published[<a href='#LyXCite-van2011illumina'><span class="bib-label">7</span></a>] and now in use in several labs around the world. 
 <br />
 
-Links: <a href="http://genomes.sdsc.edu/downloads/deloxer/index.html">Documentation</a>; <a href="http://genomes.sdsc.edu/downloads/deloxer/delox.R">Code</a></li>
+Links: <a href="http://genomes.sdsc.edu/downloads/deloxer/index.html">Documentation</a>; <a href="https://darwinawardwinner.github.io/resume/examples/Gaasterland/delox.R.html">Code</a></li>
 <li class="itemize_item">Performed a molecular genetics study to find potential causal mutations for <a href="https://en.wikipedia.org/wiki/Black_rhinoceros#Threats">fatal iron overload disease</a> in critically endangered black rhinoceros by <em>de novo</em> assembly of black rhino transcriptome using Trinity and comparison to closely-related white rhino transcriptome. Developed a custom pipeline to match up ortholog gene pairs, discover single-nucleotide differences between them, and functionally annotate these differences, and delivered a list of potential causal variants to collaborators for follow-up. 
 <br />
 
-Link: <a href="http://mneme.homenet.org/~ryan/resume/examples/Gaasterland/ipi-results-small.txt">Example results</a></li>
+Link: <a href="https://darwinawardwinner.github.io/resume/examples/Gaasterland/ipi-results-small.txt">Example results</a></li>
 <li class="itemize_item">Helped design &amp; implement a large-scale high-throughput exome sequencing pipeline for SNP discovery and functional annotation, including QC and validation of on-target coverage depth and reproducibility of coverage. 
 <br />
 
-Links: <a href="http://mneme.homenet.org/~ryan/resume/examples/Gaasterland/neartarget.pdf">Example 1</a>; <a href="http://mneme.homenet.org/~ryan/resume/examples/Gaasterland/on-off-coverage.pdf">Example 2</a>; <a href="http://mneme.homenet.org/~ryan/resume/examples/Gaasterland/depth-consistency.pdf">Example 3</a></li>
+Links: <a href="https://darwinawardwinner.github.io/resume/examples/Gaasterland/neartarget.pdf">Example 1</a>; <a href="https://darwinawardwinner.github.io/resume/examples/Gaasterland/on-off-coverage.pdf">Example 2</a>; <a href="https://darwinawardwinner.github.io/resume/examples/Gaasterland/depth-consistency.pdf">Example 3</a></li>
 <li class="itemize_item">Created a fully-automated pipeline to produce quality-control metrics and plots for Illumina high-throughput sequencing data for early identification of failed runs or samples. 
 <br />
 
-Link: <a href="http://mneme.homenet.org/~ryan/resume/examples/Gaasterland/illumina-qc.html">Example results</a></li>
+Link: <a href="https://darwinawardwinner.github.io/resume/examples/Gaasterland/illumina-qc.html">Example results</a></li>
 <li class="itemize_item">Investigated the binding motif specificity of ZASC1 transcription factor in mouse T-cells using Affymetrix expression microarrays in ZASC1 siRNA knockdown experiment.</li>
 <li class="itemize_item">Analyzed miRNA target predictions using GO &amp; KEGG grouping to identify target pathways of autophagy-related miRNAs for biological validation. 
 <br />
 
-Link: <a href="http://mneme.homenet.org/~ryan/resume/examples/Gaasterland/mirna-results.html">Example results</a></li>
+Link: <a href="https://darwinawardwinner.github.io/resume/examples/Gaasterland/mirna-results.html">Example results</a></li>
 </ul>
 <div class="subsection"><a id='magicparlabel-46' /><span class="subsection_label"></span> Timko Lab, U. of Virginia Biology Charlottesville, VA&emsp;&emsp; 2007 - 2009</div>
 
@@ -229,11 +229,11 @@ Link: <a href="http://mneme.homenet.org/~ryan/resume/examples/Gaasterland/mirna-
 <li class="itemize_item">Investigated transcription factors mediating plant stress response using expression microarray time-course, and refined the custom microarray design using data from previous runs to identify and eliminate uninformative probes, yielding an improved design for future studies. 
 <br />
 
-Link: <a href="http://mneme.homenet.org/~ryan/resume/examples/UVa/probe-selection.R">Code</a></li>
+Link: <a href="https://darwinawardwinner.github.io/resume/examples/UVa/probe-selection.R.html">Code</a></li>
 <li class="itemize_item">Acted as interpreter to explain complex biological concepts to programmers and explain complex computational problems to biologists.
 <br />
 
-Link: <a href="http://mneme.homenet.org/~ryan/resume/examples/UVa/blast-slides.pdf">Presentation Slides</a> </li>
+Link: <a href="https://darwinawardwinner.github.io/resume/examples/UVa/blast-slides.pdf">Presentation Slides</a> </li>
 </ul>
 <div class="section"><a id='magicparlabel-51' /><span class="section_label"></span> Skills</div>
 
@@ -255,7 +255,7 @@ StackOverflow Profile: <a href="http://stackoverflow.com/users/125921">http://st
 <div class="section"><a id='magicparlabel-56' /><span class="section_label"></span> Other Work Experience</div>
 <div class="subsection"><a id='magicparlabel-57' /><span class="subsection_label"></span> Computing Advisor &amp; Help Desk, U.&thinsp;Va. IT Dept.  Charlottesville, VA &emsp;&emsp; 2005 - 2007 </div>
 
-<ul class="itemize"><a id='magicparlabel-58' /><li class="itemize_item">Provided support via phone and in person for students, faculty, and staff having problems with computers, phone system, network access, malware, hardware setup, and university web services</li>
+<ul class="itemize"><a id='magicparlabel-58' /><li class="itemize_item">Provided support via phone and on-site for students, faculty, and staff having problems with computers, phone system, network access, malware, hardware setup, and university web services</li>
 <li class="itemize_item">Tasks included support for university-provided software, virus removal, iPod recovery, printer setup and repair, diagnosis of hardware malfunctions, and data recovery from failing hard disks</li>
 </ul>
 <div class="subsection"><a id='magicparlabel-60' /><span class="subsection_label"></span> Summer Sailing Instructor, Raritan Yacht Club  Perth Amboy, NJ &emsp;&emsp; 2006 - 2009 </div>
@@ -284,6 +284,6 @@ StackOverflow Profile: <a href="http://stackoverflow.com/users/125921">http://st
 
 <div class="standard"><a id='magicparlabel-80' /><div style='height:1em'></div></div>
 
-<div class="standard"><a id='magicparlabel-81' />Online version (with links): <a href="http://mneme.homenet.org/~ryan/resume/ryan_thompson_resume.pdf">http://mneme.homenet.org/~ryan/resume/ryan_thompson_resume.pdf</a></div>
+<div class="standard"><a id='magicparlabel-81' />Online version (with links): <a href="https://darwinawardwinner.github.io/resume/ryan_thompson_resume.pdf">https://darwinawardwinner.github.io/resume/ryan_thompson_resume.pdf</a></div>
 </body>
 </html>

+ 26 - 26
ryan_thompson_resume.lyx

@@ -263,7 +263,7 @@ target "https://github.com/DarwinAwardWinner/CD4-csaw#re-analysis-of-a-combined-
 \begin_inset CommandInset href
 LatexCommand href
 name "Slides"
-target "http://mneme.homenet.org/~ryan/resume/examples/Salomon/CD4/ChIP-Seq%20presentation.pdf"
+target "https://darwinawardwinner.github.io/resume/examples/Salomon/CD4/ChIP-Seq%20presentation.pdf"
 
 \end_inset
 
@@ -271,7 +271,7 @@ target "http://mneme.homenet.org/~ryan/resume/examples/Salomon/CD4/ChIP-Seq%20pr
 \begin_inset CommandInset href
 LatexCommand href
 name "Example results and visualizations"
-target "http://mneme.homenet.org/~ryan/resume/examples/Salomon/CD4/"
+target "https://darwinawardwinner.github.io/resume/examples/Salomon/CD4/"
 
 \end_inset
 
@@ -304,7 +304,7 @@ Link:
 \begin_inset CommandInset href
 LatexCommand href
 name "PCoA Plot"
-target "http://mneme.homenet.org/~ryan/resume/examples/Salomon/mdsplots-multidim.pdf"
+target "https://darwinawardwinner.github.io/resume/examples/Salomon/mdsplots-multidim.pdf"
 
 \end_inset
 
@@ -331,7 +331,7 @@ Links:
 \begin_inset CommandInset href
 LatexCommand href
 name "fRMA example code & plots"
-target "http://mneme.homenet.org/~ryan/resume/examples/Salomon/fRMA/"
+target "https://darwinawardwinner.github.io/resume/examples/Salomon/fRMA/"
 
 \end_inset
 
@@ -339,7 +339,7 @@ target "http://mneme.homenet.org/~ryan/resume/examples/Salomon/fRMA/"
 \begin_inset CommandInset href
 LatexCommand href
 name "Classifier Method Write-up"
-target "http://mneme.homenet.org/~ryan/resume/examples/Salomon/Classifier%20Math%20Write-up.pdf"
+target "https://darwinawardwinner.github.io/resume/examples/Salomon/Classifier%20Math%20Write-up.pdf"
 
 \end_inset
 
@@ -358,7 +358,7 @@ Links:
 \begin_inset CommandInset href
 LatexCommand href
 name "Presentation"
-target "http://mneme.homenet.org/~ryan/resume/examples/Salomon/Pathways/Pathway%20Presentation.pdf"
+target "https://darwinawardwinner.github.io/resume/examples/Salomon/Pathways/Pathway%20Presentation.pdf"
 
 \end_inset
 
@@ -366,7 +366,7 @@ target "http://mneme.homenet.org/~ryan/resume/examples/Salomon/Pathways/Pathway%
 \begin_inset CommandInset href
 LatexCommand href
 name "Example Results"
-target "http://mneme.homenet.org/~ryan/resume/examples/Salomon/Pathways/Pathway%20Analysis%20Example.html"
+target "https://darwinawardwinner.github.io/resume/examples/Salomon/Pathways/Pathway%20Analysis%20Example.html"
 
 \end_inset
 
@@ -374,7 +374,7 @@ target "http://mneme.homenet.org/~ryan/resume/examples/Salomon/Pathways/Pathway%
 \begin_inset CommandInset href
 LatexCommand href
 name "Summary"
-target "http://mneme.homenet.org/~ryan/resume/examples/Salomon/Pathways/Cyno%20Pathway%20Summary.pdf"
+target "https://darwinawardwinner.github.io/resume/examples/Salomon/Pathways/Cyno%20Pathway%20Summary.pdf"
 
 \end_inset
 
@@ -394,7 +394,7 @@ Links:
 \begin_inset CommandInset href
 LatexCommand href
 name "RNA-seq Presentation"
-target "http://mneme.homenet.org/~ryan/resume/examples/Salomon/DGE%20Presentation.pdf"
+target "https://darwinawardwinner.github.io/resume/examples/Salomon/DGE%20Presentation.pdf"
 
 \end_inset
 
@@ -402,7 +402,7 @@ target "http://mneme.homenet.org/~ryan/resume/examples/Salomon/DGE%20Presentatio
 \begin_inset CommandInset href
 LatexCommand href
 name "Advanced RNA-seq Presentation"
-target "http://mneme.homenet.org/~ryan/resume/examples/Salomon/Advanced%20RNA-seq%20Analysis.pdf"
+target "https://darwinawardwinner.github.io/resume/examples/Salomon/Advanced%20RNA-seq%20Analysis.pdf"
 
 \end_inset
 
@@ -410,7 +410,7 @@ target "http://mneme.homenet.org/~ryan/resume/examples/Salomon/Advanced%20RNA-se
 \begin_inset CommandInset href
 LatexCommand href
 name "Example plot "
-target "http://mneme.homenet.org/~ryan/resume/examples/Salomon/globin/pval-comparisons.pdf"
+target "https://darwinawardwinner.github.io/resume/examples/Salomon/globin/pval-comparisons.pdf"
 
 \end_inset
 
@@ -427,7 +427,7 @@ Links:
 \begin_inset CommandInset href
 LatexCommand href
 name "Lecture Slides"
-target "http://mneme.homenet.org/~ryan/resume/examples/Salomon/Teaching/RNA-Seq Lecture.pdf"
+target "https://darwinawardwinner.github.io/resume/examples/Salomon/Teaching/RNA-Seq%20Lecture.pdf"
 
 \end_inset
 
@@ -435,7 +435,7 @@ target "http://mneme.homenet.org/~ryan/resume/examples/Salomon/Teaching/RNA-Seq
 \begin_inset CommandInset href
 LatexCommand href
 name "Hands-on lab section"
-target "http://mneme.homenet.org/~ryan/resume/examples/Salomon/Teaching/RNA-Seq Lab.html"
+target "https://darwinawardwinner.github.io/resume/examples/Salomon/Teaching/RNA-Seq%20Lab.html"
 
 \end_inset
 
@@ -461,7 +461,7 @@ Link:
 \begin_inset CommandInset href
 LatexCommand href
 name "Example results"
-target "http://mneme.homenet.org/~ryan/resume/examples/Salomon/globin/"
+target "https://darwinawardwinner.github.io/resume/examples/Salomon/globin/"
 
 \end_inset
 
@@ -494,7 +494,7 @@ Link:
 \begin_inset CommandInset href
 LatexCommand href
 name "Example diagnostic plots"
-target "http://mneme.homenet.org/~ryan/resume/examples/Salomon/450k/"
+target "https://darwinawardwinner.github.io/resume/examples/Salomon/450k/"
 
 \end_inset
 
@@ -585,7 +585,7 @@ Link:
 \begin_inset CommandInset href
 LatexCommand href
 name "Presentation Slides"
-target "http://mneme.homenet.org/~ryan/resume/examples/cufflinks-presentation.pdf"
+target "https://darwinawardwinner.github.io/resume/examples/cufflinks-presentation.pdf"
 
 \end_inset
 
@@ -650,7 +650,7 @@ target "http://genomes.sdsc.edu/downloads/deloxer/index.html"
 \begin_inset CommandInset href
 LatexCommand href
 name "Code"
-target "http://genomes.sdsc.edu/downloads/deloxer/delox.R"
+target "https://darwinawardwinner.github.io/resume/examples/Gaasterland/delox.R.html"
 
 \end_inset
 
@@ -685,7 +685,7 @@ Link:
 \begin_inset CommandInset href
 LatexCommand href
 name "Example results"
-target "http://mneme.homenet.org/~ryan/resume/examples/Gaasterland/ipi-results-small.txt"
+target "https://darwinawardwinner.github.io/resume/examples/Gaasterland/ipi-results-small.txt"
 
 \end_inset
 
@@ -704,7 +704,7 @@ Links:
 \begin_inset CommandInset href
 LatexCommand href
 name "Example 1"
-target "http://mneme.homenet.org/~ryan/resume/examples/Gaasterland/neartarget.pdf"
+target "https://darwinawardwinner.github.io/resume/examples/Gaasterland/neartarget.pdf"
 
 \end_inset
 
@@ -712,7 +712,7 @@ target "http://mneme.homenet.org/~ryan/resume/examples/Gaasterland/neartarget.pd
 \begin_inset CommandInset href
 LatexCommand href
 name "Example 2"
-target "http://mneme.homenet.org/~ryan/resume/examples/Gaasterland/on-off-coverage.pdf"
+target "https://darwinawardwinner.github.io/resume/examples/Gaasterland/on-off-coverage.pdf"
 
 \end_inset
 
@@ -720,7 +720,7 @@ target "http://mneme.homenet.org/~ryan/resume/examples/Gaasterland/on-off-covera
 \begin_inset CommandInset href
 LatexCommand href
 name "Example 3"
-target "http://mneme.homenet.org/~ryan/resume/examples/Gaasterland/depth-consistency.pdf"
+target "https://darwinawardwinner.github.io/resume/examples/Gaasterland/depth-consistency.pdf"
 
 \end_inset
 
@@ -739,7 +739,7 @@ Link:
 \begin_inset CommandInset href
 LatexCommand href
 name "Example results"
-target "http://mneme.homenet.org/~ryan/resume/examples/Gaasterland/illumina-qc.html"
+target "https://darwinawardwinner.github.io/resume/examples/Gaasterland/illumina-qc.html"
 
 \end_inset
 
@@ -763,7 +763,7 @@ Link:
 \begin_inset CommandInset href
 LatexCommand href
 name "Example results"
-target "http://mneme.homenet.org/~ryan/resume/examples/Gaasterland/mirna-results.html"
+target "https://darwinawardwinner.github.io/resume/examples/Gaasterland/mirna-results.html"
 
 \end_inset
 
@@ -824,7 +824,7 @@ Link:
 \begin_inset CommandInset href
 LatexCommand href
 name "Code"
-target "http://mneme.homenet.org/~ryan/resume/examples/UVa/probe-selection.R"
+target "https://darwinawardwinner.github.io/resume/examples/UVa/probe-selection.R.html"
 
 \end_inset
 
@@ -841,7 +841,7 @@ Link:
 \begin_inset CommandInset href
 LatexCommand href
 name "Presentation Slides"
-target "http://mneme.homenet.org/~ryan/resume/examples/UVa/blast-slides.pdf"
+target "https://darwinawardwinner.github.io/resume/examples/UVa/blast-slides.pdf"
 
 \end_inset
 
@@ -1060,7 +1060,7 @@ The following spacing hackery puts the URL right-justified on the bottom
 Online version (with links): 
 \begin_inset CommandInset href
 LatexCommand href
-target "http://mneme.homenet.org/~ryan/resume/ryan_thompson_resume.pdf"
+target "https://darwinawardwinner.github.io/resume/ryan_thompson_resume.pdf"
 
 \end_inset
 

二进制
ryan_thompson_resume.pdf