Bladeren bron

Initial setup for nomenclature handling

Ryan C. Thompson 5 jaren geleden
bovenliggende
commit
c1418f00de
4 gewijzigde bestanden met toevoegingen van 897 en 140 verwijderingen
  1. 12 0
      Snakefile
  2. 54 0
      abbrevs.tex
  3. 1 1
      code-refs.bib
  4. 830 139
      thesis.lyx

+ 12 - 0
Snakefile

@@ -128,6 +128,17 @@ account.
             if include_dirs or s.group(1) == '-':
             if include_dirs or s.group(1) == '-':
                 yield s.group(2)
                 yield s.group(2)
 
 
+def lyx_input_deps(lyxfile):
+    '''Return an iterator over all tex files included by a Lyx file.'''
+    with open(lyxfile) as f:
+        lyx_text = f.read()
+    tex_names = regex.search('\\\\input{(.*?[.]tex)}', lyx_text).group(1).split(',')
+    # Unfortunately LyX doesn't indicate which input names refer to
+    # files in the current directory and which don't. Currently that's
+    # not a problem for me since all my included tex files are in the
+    # current directory.
+    yield from tex_names
+
 def lyx_bib_deps(lyxfile):
 def lyx_bib_deps(lyxfile):
     '''Return an iterator over all bib files referenced by a Lyx file.
     '''Return an iterator over all bib files referenced by a Lyx file.
 
 
@@ -188,6 +199,7 @@ rule lyx_to_pdf:
     input: lyxfile = '{basename}.lyx',
     input: lyxfile = '{basename}.lyx',
            gfx_deps = lambda wildcards: lyx_gfx_deps(wildcards.basename + '.lyx'),
            gfx_deps = lambda wildcards: lyx_gfx_deps(wildcards.basename + '.lyx'),
            bib_deps = lambda wildcards: lyx_bib_deps(wildcards.basename + '.lyx'),
            bib_deps = lambda wildcards: lyx_bib_deps(wildcards.basename + '.lyx'),
+           tex_deps = lambda wildcards: lyx_input_deps(wildcards.basename + '.lyx'),
     # Need to exclude pdfs in graphics/
     # Need to exclude pdfs in graphics/
     output: pdf='{basename,(?!graphics/).*}.pdf'
     output: pdf='{basename,(?!graphics/).*}.pdf'
     run:
     run:

+ 54 - 0
abbrevs.tex

@@ -0,0 +1,54 @@
+\newabbreviation{RNA-seq}{RNA-seq}{high-throughput RNA sequencing}
+
+% TODO
+\newabbreviation{ChIP-seq}{ChIP-seq}{chromatin immunoprecipitation followed by high-throughput DNA sequencing}
+\newabbreviation{GLM}{GLM}{generalized linear model}
+\newabbreviation{IDR}{IDR}{irreproducible discovery rate}
+\newabbreviation{FDR}{FDR}{false discovery rate}
+\newabbreviation{RMA}{RMA}{robust multichip average}
+\newabbreviation{fRMA}{fRMA}{frozen robust multichip average}
+\newabbreviation{GRSN}{GRSN}{global rank-invariant set normalization}
+\newabbreviation{SCAN}{SCAN}{single-channel array normalization}
+\newabbreviation{CPM}{CPM}{counts per million}
+\newabbreviation{logCPM}{logCPM}{logarithm of counts per million}
+\newabbreviation{SVD}{SVD}{singular value decomposition}
+\newabbreviation{SVA}{SVA}{surrogate variable analysis}
+\newabbreviation{PCA}{PCA}{principal component analysis}
+\newabbreviation{PCoA}{PCoA}{principal coordinate analysis} % AKA MDS?
+\newabbreviation{MOFA}{MOFA}{multi-omics factor analysis}
+\newabbreviation{LF}{LF}{latent factor}
+\newabbreviation{TSS}{TSS}{transcription start site}
+\newabbreviation{MSC}{MSC}{mesenchymal stem cell}
+% Figure out the exactly correct way to write interferon gamma
+\newabbreviation{IFNg}{IFN-g}{interferon gamma}
+\newabbreviation{TSS}{TSS}{transcription start site}
+\newabbreviation{SRA}{SRA}{Sequence Read Archive}
+\newabbreviation{GEO}{GEO}{Gene Expression Omnibus}
+\newabbreviation{TMM}{TMM}{trimmed mean of M-values}
+\newabbreviation{FPKM}{FPKM}{fragments per kilobase per million fragments}
+\newabbreviation{CpGi}{CpGi}{CpG island}
+\newabbreviation{AUC}{AUC}{area under ROC curve}
+% ROC
+
+% effective promoter radius?
+% DNA? RNA?
+\newabbreviation{TX}{TX}{healthy transplant}
+\newabbreviation{AR}{AR}{acute rejection}
+\newabbreviation{ADNR}{ADNR}{acute dysfunction with no rejection}
+\newabbreviation{CAN}{CAN}{chronic allograft nephropathy}
+\newabbreviation{T1D}{T1D}{Type 1 diabetes}
+\newabbreviation{T2D}{T2D}{Type 2 diabetes}
+\newabbreviation{SWAN}{SWAN}{subset-quantile within array normalization}
+\newabbreviation{BH}{BH}{Benjamini-Hochberg}
+% MA plot
+\newabbreviation{mRNA}{mRNA}{messenger RNA}
+% oligo?
+% HBA/B?
+% cDNA
+% GB = globin block
+\newabbreviation{BCV}{BCV}{biological coefficient of variation}
+
+
+% These are just here as examples
+\newabbreviation{XML}{XML}{eXtensible Markup Language}
+\newabbreviation{HTML}{HTML}{Hyper-Text Markup Language}

+ 1 - 1
code-refs.bib

@@ -1,7 +1,7 @@
 %% This BibTeX bibliography file was created using BibDesk.
 %% This BibTeX bibliography file was created using BibDesk.
 %% http://bibdesk.sourceforge.net/
 %% http://bibdesk.sourceforge.net/
 
 
-%% Created for Ryan C. Thompson at 2019-10-01 18:06:24 -0700 
+%% Created for Ryan C. Thompson at 2019-10-02 00:31:58 -0700 
 
 
 
 
 %% Saved with string encoding Unicode (UTF-8) 
 %% Saved with string encoding Unicode (UTF-8) 

+ 830 - 139
thesis.lyx

@@ -40,6 +40,23 @@
 
 
 % This one breaks subfigs so it's disabled
 % This one breaks subfigs so it's disabled
 % https://tex.stackexchange.com/questions/65680/automatically-bold-first-sentence-of-a-floats-caption
 % https://tex.stackexchange.com/questions/65680/automatically-bold-first-sentence-of-a-floats-caption
+
+% Bold all nomenclature entries
+\renewcommand{\nomlabel}[1]{\textsf{\textbf{#1}}}
+
+% https://tex.stackexchange.com/a/31083/5654
+%\let\nomenclOrig\nomenclature
+%\renewcommand*{\nomenclature}[3][]{#2\nomenclOrig[#1]{#2}{#3}}
+
+\usepackage[nohypertypes={abbreviation}]{glossaries-extra}
+\setabbreviationstyle{long-short}
+\input{abbrevs.tex}
+\makeglossaries
+
+% arara: pdflatex
+% arara: biblatex
+% arara: makeglossaries
+% arara: pdflatex
 \end_preamble
 \end_preamble
 \use_default_options true
 \use_default_options true
 \begin_modules
 \begin_modules
@@ -47,6 +64,26 @@ todonotes
 logicalmkup
 logicalmkup
 \end_modules
 \end_modules
 \maintain_unincluded_children false
 \maintain_unincluded_children false
+\begin_local_layout
+Format 66
+InsetLayout "Flex:Glossary Term"
+        LyxType               custom
+        LabelString           gls
+        LatexType             command
+        LatexName             gls*
+        InToc                 true
+        CustomPars            false
+End
+
+InsetLayout "Flex:Glossary Term (Capital)"
+        LyxType               custom
+        LabelString           Gls
+        LatexType             command
+        LatexName             Gls*
+        InToc                 true
+        CustomPars            false
+End
+\end_local_layout
 \language english
 \language english
 \language_package default
 \language_package default
 \inputencoding utf8
 \inputencoding utf8
@@ -224,7 +261,67 @@ LatexCommand tableofcontents
 \end_layout
 \end_layout
 
 
 \begin_layout Standard
 \begin_layout Standard
-[List of Abbreviations]
+\begin_inset Note Note
+status open
+
+\begin_layout Plain Layout
+To create a new nomenclature entry:
+\end_layout
+
+\begin_layout Enumerate
+Add an entry to abbrevs.tex
+\end_layout
+
+\begin_layout Enumerate
+Find the first instance of the term, and wrap it in Insert -> Custom Insets
+ -> Glossary Term (use Capital if starting a sentence)
+\end_layout
+
+\begin_layout Enumerate
+Add a nomenclature entry after the first instance
+\end_layout
+
+\begin_layout Enumerate
+Replace every relevant instance throughout the document with the Glossary
+ Term wrapped version, using Edit -> Find & Replace (Advanced).
+ Skip section headers and floats.
+\end_layout
+
+\begin_layout Plain Layout
+\begin_inset CommandInset href
+LatexCommand href
+target "https://ctan.org/pkg/glossaries?lang=en"
+literal "false"
+
+\end_inset
+
+
+\end_layout
+
+\begin_layout Plain Layout
+\begin_inset CommandInset href
+LatexCommand href
+target "https://wiki.lyx.org/Tips/Nomenclature"
+literal "false"
+
+\end_inset
+
+
+\end_layout
+
+\end_inset
+
+
+\end_layout
+
+\begin_layout Standard
+\begin_inset CommandInset nomencl_print
+LatexCommand printnomenclature
+set_width "auto"
+
+\end_inset
+
+
 \end_layout
 \end_layout
 
 
 \begin_layout List of TODOs
 \begin_layout List of TODOs
@@ -808,8 +905,27 @@ literal "false"
  there is one height measurement per person.
  there is one height measurement per person.
  However, when analyzing genomic data, each sample consists of observations
  However, when analyzing genomic data, each sample consists of observations
  of thousands of dependent variables.
  of thousands of dependent variables.
- For example, in an RNA-seq experiment, the dependent variables may be the
- count of RNA-seq reads for each annotated gene.
+ For example, in a 
+\begin_inset Flex Glossary Term
+status open
+
+\begin_layout Plain Layout
+RNA-seq
+\end_layout
+
+\end_inset
+
+ experiment, the dependent variables may be the count of 
+\begin_inset Flex Glossary Term
+status open
+
+\begin_layout Plain Layout
+RNA-seq
+\end_layout
+
+\end_inset
+
+ reads for each annotated gene.
  In abstract terms, each dependent variable being measured is referred to
  In abstract terms, each dependent variable being measured is referred to
  as a feature.
  as a feature.
  The simplest approach to analyzing such data would be to fit the same model
  The simplest approach to analyzing such data would be to fit the same model
@@ -846,8 +962,18 @@ Limma
 \end_inset
 \end_inset
 
 
  is typically used to analyze expression microarray data, and more recently
  is typically used to analyze expression microarray data, and more recently
- RNA-seq data, but it can also be used to analyze any other data for which
- linear modeling is appropriate.
+ 
+\begin_inset Flex Glossary Term
+status open
+
+\begin_layout Plain Layout
+RNA-seq
+\end_layout
+
+\end_inset
+
+ data, but it can also be used to analyze any other data for which linear
+ modeling is appropriate.
 \end_layout
 \end_layout
 
 
 \begin_layout Standard
 \begin_layout Standard
@@ -861,6 +987,7 @@ The central challenge when fitting a linear model is to estimate the variance
  variance estimates.
  variance estimates.
  However, this would require the assumption that every feature is equally
  However, this would require the assumption that every feature is equally
  variable, which is known to be false for most genomic data sets.
  variable, which is known to be false for most genomic data sets.
+ 
 \begin_inset Flex Code
 \begin_inset Flex Code
 status open
 status open
 
 
@@ -897,7 +1024,7 @@ on of the two yields a variance estimate for each feature with greater precision
  toward the common value introduces some bias – the variance will be underestima
  toward the common value introduces some bias – the variance will be underestima
 ted for features with high variance and overestimated for features with
 ted for features with high variance and overestimated for features with
  low variance.
  low variance.
- Essentially,
+ Essentially, 
 \begin_inset Flex Code
 \begin_inset Flex Code
 status open
 status open
 
 
@@ -915,7 +1042,7 @@ y to yield greater statistical power than either the individual feature
 \end_layout
 \end_layout
 
 
 \begin_layout Standard
 \begin_layout Standard
-On top of this core framework,
+On top of this core framework, 
 \begin_inset Flex Code
 \begin_inset Flex Code
 status open
 status open
 
 
@@ -927,7 +1054,7 @@ limma
 
 
  also implements many other enhancements that, further relax the assumptions
  also implements many other enhancements that, further relax the assumptions
  of the model and extend the scope of what kinds of data it can analyze.
  of the model and extend the scope of what kinds of data it can analyze.
- Instead of squeezing toward a single common variance value,
+ Instead of squeezing toward a single common variance value, 
 \begin_inset Flex Code
 \begin_inset Flex Code
 status open
 status open
 
 
@@ -947,10 +1074,20 @@ literal "false"
 \end_inset
 \end_inset
 
 
 .
 .
- This is essential for RNA-seq data, where higher gene counts yield more
- precise expression measurements and therefore smaller variances than low-count
- genes.
+ This is essential for 
+\begin_inset Flex Glossary Term
+status open
+
+\begin_layout Plain Layout
+RNA-seq
+\end_layout
+
+\end_inset
+
+ data, where higher gene counts yield more precise expression measurements
+ and therefore smaller variances than low-count genes.
  While linear models typically assume that all samples have equal variance,
  While linear models typically assume that all samples have equal variance,
+ 
 \begin_inset Flex Code
 \begin_inset Flex Code
 status open
 status open
 
 
@@ -970,7 +1107,7 @@ literal "false"
 \end_inset
 \end_inset
 
 
 .
 .
- In addition,
+ In addition, 
 \begin_inset Flex Code
 \begin_inset Flex Code
 status open
 status open
 
 
@@ -991,7 +1128,7 @@ literal "false"
 \end_inset
 \end_inset
 
 
 .
 .
- Once again,
+ Once again, 
 \begin_inset Flex Code
 \begin_inset Flex Code
 status open
 status open
 
 
@@ -1006,7 +1143,16 @@ limma
 \end_layout
 \end_layout
 
 
 \begin_layout Subsubsection
 \begin_layout Subsubsection
-edgeR provides
+\begin_inset Flex Code
+status open
+
+\begin_layout Plain Layout
+edgeR
+\end_layout
+
+\end_inset
+
+ provides 
 \begin_inset Flex Code
 \begin_inset Flex Code
 status open
 status open
 
 
@@ -1020,7 +1166,7 @@ limma
 \end_layout
 \end_layout
 
 
 \begin_layout Standard
 \begin_layout Standard
-Although
+Although 
 \begin_inset Flex Code
 \begin_inset Flex Code
 status open
 status open
 
 
@@ -1030,10 +1176,29 @@ limma
 
 
 \end_inset
 \end_inset
 
 
- can be applied to read counts from RNA-seq data, it is less suitable for
- counts from ChIP-seq data, which tend to be much smaller and therefore
- violate the assumption of a normal distribution more severely.
- For all count-based data, the
+ can be applied to read counts from 
+\begin_inset Flex Glossary Term
+status open
+
+\begin_layout Plain Layout
+RNA-seq
+\end_layout
+
+\end_inset
+
+ data, it is less suitable for counts from 
+\begin_inset Flex Glossary Term
+status open
+
+\begin_layout Plain Layout
+ChIP-seq
+\end_layout
+
+\end_inset
+
+ data, which tend to be much smaller and therefore violate the assumption
+ of a normal distribution more severely.
+ For all count-based data, the 
 \begin_inset Flex Code
 \begin_inset Flex Code
 status open
 status open
 
 
@@ -1043,7 +1208,7 @@ edgeR
 
 
 \end_inset
 \end_inset
 
 
- package works similarly to
+ package works similarly to 
 \begin_inset Flex Code
 \begin_inset Flex Code
 status open
 status open
 
 
@@ -1054,7 +1219,7 @@ limma
 \end_inset
 \end_inset
 
 
 , but uses a generalized linear model instead of a linear model.
 , but uses a generalized linear model instead of a linear model.
- The most important difference is that the GLM in
+ The most important difference is that the GLM in 
 \begin_inset Flex Code
 \begin_inset Flex Code
 status open
 status open
 
 
@@ -1095,7 +1260,7 @@ noise
  The choice of a gamma distribution is arbitrary and motivated by mathematical
  The choice of a gamma distribution is arbitrary and motivated by mathematical
  convenience, since a gamma-Poisson mixture yields the numerically tractable
  convenience, since a gamma-Poisson mixture yields the numerically tractable
  negative binomial distribution.
  negative binomial distribution.
- Thus,
+ Thus, 
 \begin_inset Flex Code
 \begin_inset Flex Code
 status open
 status open
 
 
@@ -1110,7 +1275,7 @@ edgeR
a priori 
a priori 
 \emph default
 \emph default
 that the variation in abundances between replicates follows a gamma distribution.
 that the variation in abundances between replicates follows a gamma distribution.
- For differential abundance testing,
+ For differential abundance testing, 
 \begin_inset Flex Code
 \begin_inset Flex Code
 status open
 status open
 
 
@@ -1138,9 +1303,19 @@ ChIP-seq Peak calling
 \end_layout
 \end_layout
 
 
 \begin_layout Standard
 \begin_layout Standard
-Unlike RNA-seq data, in which gene annotations provide a well-defined set
- of discrete genomic regions in which to count reads, ChIP-seq reads can
- potentially occur anywhere in the genome.
+Unlike 
+\begin_inset Flex Glossary Term
+status open
+
+\begin_layout Plain Layout
+RNA-seq
+\end_layout
+
+\end_inset
+
+ data, in which gene annotations provide a well-defined set of discrete
+ genomic regions in which to count reads, ChIP-seq reads can potentially
+ occur anywhere in the genome.
  However, most genome regions will not contain significant ChIP-seq read
  However, most genome regions will not contain significant ChIP-seq read
  coverage, and analyzing every position in the entire genome is statistically
  coverage, and analyzing every position in the entire genome is statistically
  and computationally infeasible, so it is necessary to identify regions
  and computationally infeasible, so it is necessary to identify regions
@@ -1270,7 +1445,7 @@ In addition to other considerations, if called peaks are to be used as regions
  to call peaks in a way that is blind to differential abundance between
  to call peaks in a way that is blind to differential abundance between
  experimental conditions, or else the statistical significance calculations
  experimental conditions, or else the statistical significance calculations
  for differential abundance will overstate their confidence in the results.
  for differential abundance will overstate their confidence in the results.
- The
+ The 
 \begin_inset Flex Code
 \begin_inset Flex Code
 status open
 status open
 
 
@@ -1338,21 +1513,59 @@ frozen
 \begin_layout Standard
 \begin_layout Standard
 In contrast, high-throughput sequencing data present very different normalizatio
 In contrast, high-throughput sequencing data present very different normalizatio
 n challenges.
 n challenges.
- The simplest case is RNA-seq in which read counts are obtained for a set
- of gene annotations, yielding a matrix of counts with rows representing
- genes and columns representing samples.
- Because RNA-seq approximates a process of sampling from a population with
- replacement, each gene's count is only interpretable as a fraction of the
- total reads for that sample.
- For that reason, RNA-seq abundances are often reported as counts per million
- (CPM).
+ The simplest case is 
+\begin_inset Flex Glossary Term
+status open
+
+\begin_layout Plain Layout
+RNA-seq
+\end_layout
+
+\end_inset
+
+ in which read counts are obtained for a set of gene annotations, yielding
+ a matrix of counts with rows representing genes and columns representing
+ samples.
+ Because 
+\begin_inset Flex Glossary Term
+status open
+
+\begin_layout Plain Layout
+RNA-seq
+\end_layout
+
+\end_inset
+
+ approximates a process of sampling from a population with replacement,
+ each gene's count is only interpretable as a fraction of the total reads
+ for that sample.
+ For that reason, 
+\begin_inset Flex Glossary Term
+status open
+
+\begin_layout Plain Layout
+RNA-seq
+\end_layout
+
+\end_inset
+
+ abundances are often reported as counts per million (CPM).
  Furthermore, if the abundance of a single gene increases, then in order
  Furthermore, if the abundance of a single gene increases, then in order
  for its fraction of the total reads to increase, all other genes' fractions
  for its fraction of the total reads to increase, all other genes' fractions
  must decrease to accommodate it.
  must decrease to accommodate it.
  This effect is known as composition bias, and it is an artifact of the
  This effect is known as composition bias, and it is an artifact of the
  read sampling process that has nothing to do with the biology of the samples
  read sampling process that has nothing to do with the biology of the samples
  and must therefore be normalized out.
  and must therefore be normalized out.
- The most commonly used methods to normalize for composition bias in RNA-seq
+ The most commonly used methods to normalize for composition bias in 
+\begin_inset Flex Glossary Term
+status open
+
+\begin_layout Plain Layout
+RNA-seq
+\end_layout
+
+\end_inset
+
  data seek to equalize the average gene abundance across samples, under
  data seek to equalize the average gene abundance across samples, under
  the assumption that the average gene is likely not changing 
  the assumption that the average gene is likely not changing 
 \begin_inset CommandInset citation
 \begin_inset CommandInset citation
@@ -1367,7 +1580,7 @@ literal "false"
 
 
 \begin_layout Standard
 \begin_layout Standard
 In ChIP-seq data, normalization is not as straightforward.
 In ChIP-seq data, normalization is not as straightforward.
- The
+ The 
 \begin_inset Flex Code
 \begin_inset Flex Code
 status open
 status open
 
 
@@ -1396,8 +1609,18 @@ literal "false"
  consistent across all samples, then normalizing the background coverage
  consistent across all samples, then normalizing the background coverage
  to be equal across all samples is a reasonable strategy.
  to be equal across all samples is a reasonable strategy.
  If this is not a safe assumption, then the preferred strategy is to normalize
  If this is not a safe assumption, then the preferred strategy is to normalize
- the signal regions in a way similar to RNA-seq data by assuming that the
- average signal region is not changing abundance between samples.
+ the signal regions in a way similar to 
+\begin_inset Flex Glossary Term
+status open
+
+\begin_layout Plain Layout
+RNA-seq
+\end_layout
+
+\end_inset
+
+ data by assuming that the average signal region is not changing abundance
+ between samples.
  Beyond this, if a ChIP-seq experiment has a more complicated structure
  Beyond this, if a ChIP-seq experiment has a more complicated structure
  that doesn't show the typical bimodal count distribution, it may be necessary
  that doesn't show the typical bimodal count distribution, it may be necessary
  to implement a normalization as a smooth function of abundance.
  to implement a normalization as a smooth function of abundance.
@@ -1424,7 +1647,7 @@ In addition to well-understood effects that can be easily normalized out,
  However, as with variance estimation, estimating the differences in batch
  However, as with variance estimation, estimating the differences in batch
  means is not necessarily robust at the feature level, so the ComBat method
  means is not necessarily robust at the feature level, so the ComBat method
  adds empirical Bayes squeezing of the batch mean differences toward a common
  adds empirical Bayes squeezing of the batch mean differences toward a common
- value, analogous to
+ value, analogous to 
 \begin_inset Flex Code
 \begin_inset Flex Code
 status open
 status open
 
 
@@ -1570,7 +1793,17 @@ Test IFN-g treated MSC infusion as a therapy to delay graft rejection in
 \end_layout
 \end_layout
 
 
 \begin_layout Itemize
 \begin_layout Itemize
-Monitor animals post-transplant using blood RNA-seq at serial time points
+Monitor animals post-transplant using blood 
+\begin_inset Flex Glossary Term
+status open
+
+\begin_layout Plain Layout
+RNA-seq
+\end_layout
+
+\end_inset
+
+ at serial time points
 \end_layout
 \end_layout
 
 
 \begin_layout Subsection
 \begin_layout Subsection
@@ -1614,6 +1847,22 @@ Chapter author list: Me, Sarah, Dan
 \end_inset
 \end_inset
 
 
 
 
+\end_layout
+
+\begin_layout Standard
+\begin_inset ERT
+status collapsed
+
+\begin_layout Plain Layout
+
+
+\backslash
+glsresetall
+\end_layout
+
+\end_inset
+
+
 \end_layout
 \end_layout
 
 
 \begin_layout Standard
 \begin_layout Standard
@@ -1733,9 +1982,18 @@ deactivating
 \begin_layout Standard
 \begin_layout Standard
 In order to investigate the relationship between gene expression and these
 In order to investigate the relationship between gene expression and these
  histone modifications in the context of naïve and memory CD4 T-cell activation,
  histone modifications in the context of naïve and memory CD4 T-cell activation,
- a previously published data set of combined RNA-seq and ChIP-seq data was
- re-analyzed using up-to-date methods designed to address the specific analysis
- challenges posed by this data set.
+ a previously published data set of 
+\begin_inset Flex Glossary Term
+status open
+
+\begin_layout Plain Layout
+RNA-seq
+\end_layout
+
+\end_inset
+
+ data and ChIP-seq data was re-analyzed using up-to-date methods designed
+ to address the specific analysis challenges posed by this data set.
  The data set contains naïve and memory CD4 T-cell samples in a time course
  The data set contains naïve and memory CD4 T-cell samples in a time course
  before and after activation.
  before and after activation.
  Like the original analysis, this analysis looks at the dynamics of these
  Like the original analysis, this analysis looks at the dynamics of these
@@ -1775,7 +2033,16 @@ Look up some more details from the papers (e.g.
 \end_layout
 \end_layout
 
 
 \begin_layout Standard
 \begin_layout Standard
-A reproducible workflow was written to analyze the raw ChIP-seq and RNA-seq
+A reproducible workflow was written to analyze the raw ChIP-seq and 
+\begin_inset Flex Glossary Term
+status open
+
+\begin_layout Plain Layout
+RNA-seq
+\end_layout
+
+\end_inset
+
  data from previous studies 
  data from previous studies 
 \begin_inset CommandInset citation
 \begin_inset CommandInset citation
 LatexCommand cite
 LatexCommand cite
@@ -1785,8 +2052,17 @@ literal "true"
 \end_inset
 \end_inset
 
 
 .
 .
- Briefly, this data consists of RNA-seq and ChIP-seq from CD4 T-cells cultured
- from 4 donors.
+ Briefly, this data consists of 
+\begin_inset Flex Glossary Term
+status open
+
+\begin_layout Plain Layout
+RNA-seq
+\end_layout
+
+\end_inset
+
+ and ChIP-seq from CD4 T-cells cultured from 4 donors.
  From each donor, naïve and memory CD4 T-cells were isolated separately.
  From each donor, naïve and memory CD4 T-cells were isolated separately.
  Then cultures of both cells were activated [how?], and samples were taken
  Then cultures of both cells were activated [how?], and samples were taken
  at 4 time points: Day 0 (pre-activation), Day 1 (early activation), Day
  at 4 time points: Day 0 (pre-activation), Day 1 (early activation), Day
@@ -2073,7 +2349,17 @@ literal "false"
 
 
 .
 .
  Five different alignment and quantification methods were tested for the
  Five different alignment and quantification methods were tested for the
- RNA-seq data 
+ 
+\begin_inset Flex Glossary Term
+status open
+
+\begin_layout Plain Layout
+RNA-seq
+\end_layout
+
+\end_inset
+
+ data 
 \begin_inset CommandInset citation
 \begin_inset CommandInset citation
 LatexCommand cite
 LatexCommand cite
 key "Dobin2012,Kim2019,Liao2014,Pimentel2016,Patro2017,gh-shoal,gh-hg38-ref"
 key "Dobin2012,Kim2019,Liao2014,Pimentel2016,Patro2017,gh-shoal,gh-hg38-ref"
@@ -2320,7 +2606,7 @@ However, removing the systematic component of the batch effect still leaves
  the noise component.
  the noise component.
  The gene quantifications from the first batch are substantially noisier
  The gene quantifications from the first batch are substantially noisier
  than those in the second batch.
  than those in the second batch.
- This analysis corrected for this by using
+ This analysis corrected for this by using 
 \begin_inset Flex Code
 \begin_inset Flex Code
 status open
 status open
 
 
@@ -2346,8 +2632,17 @@ literal "false"
 \end_layout
 \end_layout
 
 
 \begin_layout Standard
 \begin_layout Standard
-In any case, the RNA-seq counts were first normalized using trimmed mean
- of M-values 
+In any case, the 
+\begin_inset Flex Glossary Term
+status open
+
+\begin_layout Plain Layout
+RNA-seq
+\end_layout
+
+\end_inset
+
+ counts were first normalized using trimmed mean of M-values 
 \begin_inset CommandInset citation
 \begin_inset CommandInset citation
 LatexCommand cite
 LatexCommand cite
 key "Robinson2010"
 key "Robinson2010"
@@ -2375,7 +2670,7 @@ literal "false"
 
 
 , and batch-corrected at this point using ComBat.
 , and batch-corrected at this point using ComBat.
  A linear model was fit to the batch-corrected, quality-weighted data for
  A linear model was fit to the batch-corrected, quality-weighted data for
- each gene using
+ each gene using 
 \begin_inset Flex Code
 \begin_inset Flex Code
 status open
 status open
 
 
@@ -2385,7 +2680,7 @@ limma
 
 
 \end_inset
 \end_inset
 
 
-, and each gene was tested for differential expression using
+, and each gene was tested for differential expression using 
 \begin_inset Flex Code
 \begin_inset Flex Code
 status open
 status open
 
 
@@ -2664,8 +2959,17 @@ literal "false"
 \end_inset
 \end_inset
 
 
 .
 .
- Artifact regions were annotated using a custom implementation of the GreyListCh
-IP algorithm, and these 
+ Artifact regions were annotated using a custom implementation of the 
+\begin_inset Flex Code
+status open
+
+\begin_layout Plain Layout
+GreyListChIP
+\end_layout
+
+\end_inset
+
+ algorithm, and these 
 \begin_inset Quotes eld
 \begin_inset Quotes eld
 \end_inset
 \end_inset
 
 
@@ -3062,7 +3366,7 @@ PCoA plots of ChIP-seq sliding window data, before and after subtracting
 
 
 \begin_layout Standard
 \begin_layout Standard
 Reads in promoters, peaks, and sliding windows across the genome were counted
 Reads in promoters, peaks, and sliding windows across the genome were counted
- and normalized using
+ and normalized using 
 \begin_inset Flex Code
 \begin_inset Flex Code
 status open
 status open
 
 
@@ -3072,7 +3376,7 @@ csaw
 
 
 \end_inset
 \end_inset
 
 
- and analyzed for differential modification using
+ and analyzed for differential modification using 
 \begin_inset Flex Code
 \begin_inset Flex Code
 status open
 status open
 
 
@@ -3339,8 +3643,18 @@ end{landscape}
 
 
 \begin_layout Standard
 \begin_layout Standard
 MOFA was run on all the ChIP-seq windows overlapping consensus peaks for
 MOFA was run on all the ChIP-seq windows overlapping consensus peaks for
- each histone mark, as well as the RNA-seq data, in order to identify patterns
- of coordinated variation across all data sets 
+ each histone mark, as well as the 
+\begin_inset Flex Glossary Term
+status open
+
+\begin_layout Plain Layout
+RNA-seq
+\end_layout
+
+\end_inset
+
+ data, in order to identify patterns of coordinated variation across all
+ data sets 
 \begin_inset CommandInset citation
 \begin_inset CommandInset citation
 LatexCommand cite
 LatexCommand cite
 key "Argelaguet2018"
 key "Argelaguet2018"
@@ -3383,7 +3697,17 @@ noprefix "false"
 \end_inset
 \end_inset
 
 
 ).
 ).
- Latent factor 2 captures the batch effect in the RNA-seq data.
+ Latent factor 2 captures the batch effect in the 
+\begin_inset Flex Glossary Term
+status open
+
+\begin_layout Plain Layout
+RNA-seq
+\end_layout
+
+\end_inset
+
+ data.
  Removing the effect of LF2 using MOFA theoretically yields a batch correction
  Removing the effect of LF2 using MOFA theoretically yields a batch correction
  that does not depend on knowing the experimental factors.
  that does not depend on knowing the experimental factors.
  When this was attempted, the resulting batch correction was comparable
  When this was attempted, the resulting batch correction was comparable
@@ -3968,8 +4292,18 @@ trajectory
 \end_layout
 \end_layout
 
 
 \begin_layout Standard
 \begin_layout Standard
-Genes called present in the RNA-seq data were tested for differential expression
- between all time points and cell types.
+Genes called present in the 
+\begin_inset Flex Glossary Term
+status open
+
+\begin_layout Plain Layout
+RNA-seq
+\end_layout
+
+\end_inset
+
+ data were tested for differential expression between all time points and
+ cell types.
  The counts of differentially expressed genes are shown in Table 
  The counts of differentially expressed genes are shown in Table 
 \begin_inset CommandInset ref
 \begin_inset CommandInset ref
 LatexCommand ref
 LatexCommand ref
@@ -3985,7 +4319,17 @@ noprefix "false"
  called differentially expressed than any of the results for other time
  called differentially expressed than any of the results for other time
  points.
  points.
  This is an unfortunate result of the difference in sample quality between
  This is an unfortunate result of the difference in sample quality between
- the two batches of RNA-seq data.
+ the two batches of 
+\begin_inset Flex Glossary Term
+status open
+
+\begin_layout Plain Layout
+RNA-seq
+\end_layout
+
+\end_inset
+
+ data.
  All the samples in Batch 1, which includes all the samples from Days 0
  All the samples in Batch 1, which includes all the samples from Days 0
  and 5, have substantially more variability than the samples in Batch 2,
  and 5, have substantially more variability than the samples in Batch 2,
  which includes the other time points.
  which includes the other time points.
@@ -5633,8 +5977,17 @@ noprefix "false"
 .
 .
  For all histone marks, evidence of differential modification between naïve
  For all histone marks, evidence of differential modification between naïve
  and memory samples was detected at every time point except day 14.
  and memory samples was detected at every time point except day 14.
- The day 14 convergence pattern is also present in the RNA-seq data (Figure
- 
+ The day 14 convergence pattern is also present in the 
+\begin_inset Flex Glossary Term
+status open
+
+\begin_layout Plain Layout
+RNA-seq
+\end_layout
+
+\end_inset
+
+ data (Figure 
 \begin_inset CommandInset ref
 \begin_inset CommandInset ref
 LatexCommand ref
 LatexCommand ref
 reference "fig:RNA-PCA-group"
 reference "fig:RNA-PCA-group"
@@ -5661,8 +6014,18 @@ noprefix "false"
 \end_inset
 \end_inset
 
 
 ), which accounts for shared variation across all 3 histone marks and the
 ), which accounts for shared variation across all 3 histone marks and the
- RNA-seq data, confirming that this convergence is a coordinated pattern
- across all 4 data sets.
+ 
+\begin_inset Flex Glossary Term
+status open
+
+\begin_layout Plain Layout
+RNA-seq
+\end_layout
+
+\end_inset
+
+ data, confirming that this convergence is a coordinated pattern across
+ all 4 data sets.
  While this observation does not prove that the naïve cells have differentiated
  While this observation does not prove that the naïve cells have differentiated
  into memory cells at Day 14, it is consistent with that hypothesis.
  into memory cells at Day 14, it is consistent with that hypothesis.
 \end_layout
 \end_layout
@@ -7218,9 +7581,19 @@ Reproduced with permission.
 \end_layout
 \end_layout
 
 
 \begin_layout Standard
 \begin_layout Standard
-In H3K4me2, H3K4me3, and RNA-seq, this convergence appears to be in progress
- already by Day 5, shown by the smaller distance between naïve and memory
- cells at day 5 along the 
+In H3K4me2, H3K4me3, and 
+\begin_inset Flex Glossary Term
+status open
+
+\begin_layout Plain Layout
+RNA-seq
+\end_layout
+
+\end_inset
+
+, this convergence appears to be in progress already by Day 5, shown by
+ the smaller distance between naïve and memory cells at day 5 along the
+ 
 \begin_inset Formula $y$
 \begin_inset Formula $y$
 \end_inset
 \end_inset
 
 
@@ -7491,9 +7864,17 @@ end{landscape}
 
 
 \end_layout
 \end_layout
 
 
-\begin_layout Standard
-The analyses described in this chapter were organized into a reproducible
- workflow using the Snakemake workflow management system.
+\begin_layout Standard
+The analyses described in this chapter were organized into a reproducible
+ workflow using the Snakemake workflow management system 
+\begin_inset CommandInset citation
+LatexCommand cite
+key "Koster2012"
+literal "false"
+
+\end_inset
+
+.
  As shown in Figure 
  As shown in Figure 
 \begin_inset CommandInset ref
 \begin_inset CommandInset ref
 LatexCommand ref
 LatexCommand ref
@@ -7540,12 +7921,28 @@ noprefix "false"
 \end_inset
 \end_inset
 
 
 ), named 
 ), named 
-\begin_inset Formula $\texttt{chipseq\_count\_tss\_neighborhoods}$
+\begin_inset Flex Code
+status open
+
+\begin_layout Plain Layout
+chipseq_count_tss_neighborhoods
+\end_layout
+
+\end_inset
+
+, depends on the 
+\begin_inset Flex Glossary Term
+status open
+
+\begin_layout Plain Layout
+RNA-seq
+\end_layout
+
 \end_inset
 \end_inset
 
 
-, depends on the RNA-seq abundance estimates in order to select the most-used
- TSS for each gene, the aligned ChIP-seq reads, the index for those reads,
- and the blacklist of regions to be excluded from ChIP-seq analysis.
+ abundance estimates in order to select the most-used TSS for each gene,
+ the aligned ChIP-seq reads, the index for those reads, and the blacklist
+ of regions to be excluded from ChIP-seq analysis.
  Each step declares its inputs and outputs, and Snakemake uses these to
  Each step declares its inputs and outputs, and Snakemake uses these to
  determine the dependencies between steps.
  determine the dependencies between steps.
  Each step is marked as depending on all the steps whose outputs match its
  Each step is marked as depending on all the steps whose outputs match its
@@ -7568,9 +7965,28 @@ noprefix "false"
 In addition to simply making it easier to organize the steps in the analysis,
 In addition to simply making it easier to organize the steps in the analysis,
  structuring the analysis as a workflow allowed for some analysis strategies
  structuring the analysis as a workflow allowed for some analysis strategies
  that would not have been practical otherwise.
  that would not have been practical otherwise.
- For example, 5 different RNA-seq quantification methods were tested against
- two different reference transcriptome annotations for a total of 10 different
- quantifications of the same RNA-seq data.
+ For example, 5 different 
+\begin_inset Flex Glossary Term
+status open
+
+\begin_layout Plain Layout
+RNA-seq
+\end_layout
+
+\end_inset
+
+ quantification methods were tested against two different reference transcriptom
+e annotations for a total of 10 different quantifications of the same 
+\begin_inset Flex Glossary Term
+status open
+
+\begin_layout Plain Layout
+RNA-seq
+\end_layout
+
+\end_inset
+
+ data.
  These were then compared against each other in the exploratory data analysis
  These were then compared against each other in the exploratory data analysis
  step, to determine that the results were not very sensitive to either the
  step, to determine that the results were not very sensitive to either the
  choice of quantification method or the choice of annotation.
  choice of quantification method or the choice of annotation.
@@ -7609,9 +8025,18 @@ Future Directions
 \end_layout
 \end_layout
 
 
 \begin_layout Standard
 \begin_layout Standard
-The analysis of RNA-seq and ChIP-seq in CD4 T-cells in Chapter 2 is in many
- ways a preliminary study that suggests a multitude of new avenues of investigat
-ion.
+The analysis of 
+\begin_inset Flex Glossary Term
+status open
+
+\begin_layout Plain Layout
+RNA-seq
+\end_layout
+
+\end_inset
+
+ and ChIP-seq in CD4 T-cells in Chapter 2 is in many ways a preliminary
+ study that suggests a multitude of new avenues of investigation.
  Here we consider a selection of such avenues.
  Here we consider a selection of such avenues.
 \end_layout
 \end_layout
 
 
@@ -8042,6 +8467,22 @@ Chapter author list: Me, Sunil, Tom, Padma, Dan
 \end_inset
 \end_inset
 
 
 
 
+\end_layout
+
+\begin_layout Standard
+\begin_inset ERT
+status collapsed
+
+\begin_layout Plain Layout
+
+
+\backslash
+glsresetall
+\end_layout
+
+\end_inset
+
+
 \end_layout
 \end_layout
 
 
 \begin_layout Section
 \begin_layout Section
@@ -8225,8 +8666,8 @@ DNA methylation arrays are a relatively new kind of assay that uses microarrays
  to measure the degree of methylation on cytosines in specific regions arrayed
  to measure the degree of methylation on cytosines in specific regions arrayed
  across the genome.
  across the genome.
  First, bisulfite treatment converts all unmethylated cytosines to uracil
  First, bisulfite treatment converts all unmethylated cytosines to uracil
- (which then become thymine after amplification) while leaving methylated
- cytosines unaffected.
+ (which are read as thymine during amplification and sequencing) while leaving
+ methylated cytosines unaffected.
  Then, each target region is interrogated with two probes: one binds to
  Then, each target region is interrogated with two probes: one binds to
  the original genomic sequence and interrogates the level of methylated
  the original genomic sequence and interrogates the level of methylated
  DNA, and the other binds to the same sequence with all cytosines replaced
  DNA, and the other binds to the same sequence with all cytosines replaced
@@ -8337,8 +8778,17 @@ However, the steep slope of the sigmoid transformation near 0 and 1 tends
 \end_layout
 \end_layout
 
 
 \begin_layout Standard
 \begin_layout Standard
-RNA-seq read count data are also known to show heteroskedasticity, and the
- voom method was introduced for modeling this heteroskedasticity by estimating
+\begin_inset Flex Glossary Term (Capital)
+status open
+
+\begin_layout Plain Layout
+RNA-seq
+\end_layout
+
+\end_inset
+
+ read count data are also known to show heteroskedasticity, and the voom
+ method was introduced for modeling this heteroskedasticity by estimating
  the mean-variance trend in the data and using this trend to assign precision
  the mean-variance trend in the data and using this trend to assign precision
  weights to each observation 
  weights to each observation 
 \begin_inset CommandInset citation
 \begin_inset CommandInset citation
@@ -8350,10 +8800,19 @@ literal "false"
 
 
 .
 .
  While methylation array data are not derived from counts and have a very
  While methylation array data are not derived from counts and have a very
- different mean-variance relationship from that of typical RNA-seq data,
- the voom method makes no specific assumptions on the shape of the mean-variance
- relationship – it only assumes that the relationship can be modeled as
- a smooth curve.
+ different mean-variance relationship from that of typical 
+\begin_inset Flex Glossary Term
+status open
+
+\begin_layout Plain Layout
+RNA-seq
+\end_layout
+
+\end_inset
+
+ data, the voom method makes no specific assumptions on the shape of the
+ mean-variance relationship – it only assumes that the relationship can
+ be modeled as a smooth curve.
  Hence, the method is sufficiently general to model the mean-variance relationsh
  Hence, the method is sufficiently general to model the mean-variance relationsh
 ip in methylation array data.
 ip in methylation array data.
  However, the standard implementation of voom assumes that the input is
  However, the standard implementation of voom assumes that the input is
@@ -12739,7 +13198,16 @@ literal "false"
 
 
 \begin_layout Standard
 \begin_layout Standard
 Fortunately, the requirement for equal-size batches is not inherent to the
 Fortunately, the requirement for equal-size batches is not inherent to the
- fRMA algorithm but rather a limitation of the implementation in the frmaTools
+ fRMA algorithm but rather a limitation of the implementation in the 
+\begin_inset Flex Code
+status open
+
+\begin_layout Plain Layout
+frmaTools
+\end_layout
+
+\end_inset
+
  package.
  package.
  In personal communication, the package's author, Matthew McCall, has indicated
  In personal communication, the package's author, Matthew McCall, has indicated
  that with some work, it should be possible to improve the implementation
  that with some work, it should be possible to improve the implementation
@@ -12834,6 +13302,22 @@ Globin-blocking for more effective blood RNA-seq analysis in primate animal
  model
  model
 \end_layout
 \end_layout
 
 
+\begin_layout Standard
+\begin_inset ERT
+status collapsed
+
+\begin_layout Plain Layout
+
+
+\backslash
+glsresetall
+\end_layout
+
+\end_inset
+
+
+\end_layout
+
 \begin_layout Standard
 \begin_layout Standard
 \begin_inset Flex TODO Note (inline)
 \begin_inset Flex TODO Note (inline)
 status open
 status open
@@ -12895,8 +13379,27 @@ Background
 Primate blood contains high concentrations of globin messenger RNA.
 Primate blood contains high concentrations of globin messenger RNA.
  Globin reduction is a standard technique used to improve the expression
  Globin reduction is a standard technique used to improve the expression
  results obtained by DNA microarrays on RNA from blood samples.
  results obtained by DNA microarrays on RNA from blood samples.
- However, with whole transcriptome RNA-sequencing (RNA-seq) quickly replacing
- microarrays for many applications, the impact of globin reduction for RNA-seq
+ However, with 
+\begin_inset Flex Glossary Term
+status open
+
+\begin_layout Plain Layout
+RNA-seq
+\end_layout
+
+\end_inset
+
+ quickly replacing microarrays for many applications, the impact of globin
+ reduction for 
+\begin_inset Flex Glossary Term
+status open
+
+\begin_layout Plain Layout
+RNA-seq
+\end_layout
+
+\end_inset
+
  has not been previously studied.
  has not been previously studied.
  Moreover, no off-the-shelf kits are available for globin reduction in nonhuman
  Moreover, no off-the-shelf kits are available for globin reduction in nonhuman
  primates.
  primates.
@@ -12908,9 +13411,18 @@ Results
 \end_layout
 \end_layout
 
 
 \begin_layout Standard
 \begin_layout Standard
-Here we report a protocol for RNA-seq in primate blood samples that uses
- complimentary oligonucleotides to block reverse transcription of the alpha
- and beta globin genes.
+Here we report a protocol for 
+\begin_inset Flex Glossary Term
+status open
+
+\begin_layout Plain Layout
+RNA-seq
+\end_layout
+
+\end_inset
+
+ in primate blood samples that uses complementary oligonucleotides to block
+ reverse transcription of the alpha and beta globin genes.
  In test samples from cynomolgus monkeys (Macaca fascicularis), this globin
  In test samples from cynomolgus monkeys (Macaca fascicularis), this globin
  blocking protocol approximately doubles the yield of informative (non-globin)
  blocking protocol approximately doubles the yield of informative (non-globin)
  reads by greatly reducing the fraction of globin reads, while also improving
  reads by greatly reducing the fraction of globin reads, while also improving
@@ -12930,7 +13442,33 @@ eness of mRNA sequencing in primate blood samples by doubling the yield
  of useful reads, allowing detection of more genes, and improving the precision
  of useful reads, allowing detection of more genes, and improving the precision
  of gene expression measurements.
  of gene expression measurements.
  Based on these results, a globin reducing or blocking protocol is recommended
  Based on these results, a globin reducing or blocking protocol is recommended
- for all RNA-seq studies of primate blood samples.
+ for all 
+\begin_inset Flex Glossary Term
+status open
+
+\begin_layout Plain Layout
+RNA-seq
+\end_layout
+
+\end_inset
+
+ studies of primate blood samples.
+\end_layout
+
+\begin_layout Standard
+\begin_inset ERT
+status collapsed
+
+\begin_layout Plain Layout
+
+
+\backslash
+glsresetall
+\end_layout
+
+\end_inset
+
+
 \end_layout
 \end_layout
 
 
 \begin_layout Section
 \begin_layout Section
@@ -12979,9 +13517,18 @@ Existing protocols use a separate globin pulldown step, slowing down processing
 \end_layout
 \end_layout
 
 
 \begin_layout Standard
 \begin_layout Standard
-Increasingly, researchers are turning to high-throughput mRNA sequencing
- technologies (RNA-seq) in preference to expression microarrays for analysis
- of gene expression 
+Increasingly, researchers are turning to 
+\begin_inset Flex Glossary Term
+status open
+
+\begin_layout Plain Layout
+RNA-seq
+\end_layout
+
+\end_inset
+
+ in preference to expression microarrays for analysis of gene expression
+ 
 \begin_inset CommandInset citation
 \begin_inset CommandInset citation
 LatexCommand cite
 LatexCommand cite
 key "Mutz2012"
 key "Mutz2012"
@@ -13004,8 +13551,18 @@ literal "false"
 \end_inset
 \end_inset
 
 
 .
 .
- The importance of globin reduction for RNA-seq of blood has only been evaluated
- for a deepSAGE protocol on human samples 
+ The importance of globin reduction for 
+\begin_inset Flex Glossary Term
+status open
+
+\begin_layout Plain Layout
+RNA-seq
+\end_layout
+
+\end_inset
+
+ of blood has only been evaluated for a deepSAGE protocol on human samples
+ 
 \begin_inset CommandInset citation
 \begin_inset CommandInset citation
 LatexCommand cite
 LatexCommand cite
 key "Mastrokolias2012"
 key "Mastrokolias2012"
@@ -13015,13 +13572,42 @@ literal "false"
 
 
 .
 .
  In the present report, we evaluated globin reduction using custom blocking
  In the present report, we evaluated globin reduction using custom blocking
- oligonucleotides for deep RNA-seq of peripheral blood samples from a nonhuman
- primate, cynomolgus monkey, using the Illumina technology platform.
+ oligonucleotides for deep 
+\begin_inset Flex Glossary Term
+status open
+
+\begin_layout Plain Layout
+RNA-seq
+\end_layout
+
+\end_inset
+
+ of peripheral blood samples from a nonhuman primate, cynomolgus monkey,
+ using the Illumina technology platform.
  We demonstrate that globin reduction significantly improves the cost-effectiven
  We demonstrate that globin reduction significantly improves the cost-effectiven
-ess of RNA-seq in blood samples.
+ess of 
+\begin_inset Flex Glossary Term
+status open
+
+\begin_layout Plain Layout
+RNA-seq
+\end_layout
+
+\end_inset
+
+ in blood samples.
  Thus, our protocol offers a significant advantage to any investigator planning
  Thus, our protocol offers a significant advantage to any investigator planning
- to use RNA-seq for gene expression profiling of nonhuman primate blood
- samples.
+ to use 
+\begin_inset Flex Glossary Term
+status open
+
+\begin_layout Plain Layout
+RNA-seq
+\end_layout
+
+\end_inset
+
+ for gene expression profiling of nonhuman primate blood samples.
  Our method can be generally applied to any species by designing complementary
  Our method can be generally applied to any species by designing complementary
  oligonucleotide blocking probes to the globin gene sequences of that species.
  oligonucleotide blocking probes to the globin gene sequences of that species.
  Indeed, any highly expressed but biologically uninformative transcripts
  Indeed, any highly expressed but biologically uninformative transcripts
@@ -13240,8 +13826,8 @@ literal "false"
  First, no ortholog is annotated for alpha globin in the cynomolgus genome,
  First, no ortholog is annotated for alpha globin in the cynomolgus genome,
  presumably because the human genome has two alpha globin genes with nearly
  presumably because the human genome has two alpha globin genes with nearly
  identical sequences, making the orthology relationship ambiguous.
  identical sequences, making the orthology relationship ambiguous.
- However, two loci in the cynomolgus genome are as “hemoglobin subunit alpha-lik
-e” (LOC102136192 and LOC102136846).
+ However, two loci in the cynomolgus genome are annotated as “hemoglobin
+ subunit alpha-like” (LOC102136192 and LOC102136846).
  LOC102136192 is annotated as a pseudogene while LOC102136846 is annotated
  LOC102136192 is annotated as a pseudogene while LOC102136846 is annotated
  as protein-coding.
  as protein-coding.
  Our globin reduction protocol was designed to include blocking of these
  Our globin reduction protocol was designed to include blocking of these
@@ -13261,8 +13847,17 @@ e” (LOC102136192 and LOC102136846).
  Therefore, stranded sense counts were used for all further analysis in
  Therefore, stranded sense counts were used for all further analysis in
  the present study to ensure that we accurately accounted for globin transcript
  the present study to ensure that we accurately accounted for globin transcript
  reduction.
  reduction.
- However, we note that stranded reads are not necessary for RNA-seq using
- our protocol in standard practice.
+ However, we note that stranded reads are not necessary for 
+\begin_inset Flex Glossary Term
+status open
+
+\begin_layout Plain Layout
+RNA-seq
+\end_layout
+
+\end_inset
+
+ using our protocol in standard practice.
  
  
 \end_layout
 \end_layout
 
 
@@ -13291,7 +13886,7 @@ literal "false"
 
 
 .
 .
  Log2 counts per million values (logCPM) were calculated using the cpm function
  Log2 counts per million values (logCPM) were calculated using the cpm function
- in
+ in 
 \begin_inset Flex Code
 \begin_inset Flex Code
 status open
 status open
 
 
@@ -13301,11 +13896,24 @@ edgeR
 
 
 \end_inset
 \end_inset
 
 
- for individual samples and aveLogCPM function for averages across groups
- of samples, using those functions’ default prior count values to avoid
- taking the logarithm of 0.
+ for individual samples and 
+\begin_inset Flex Code
+status open
+
+\begin_layout Plain Layout
+aveLogCPM
+\end_layout
+
+\end_inset
+
+ function for averages across groups of samples, using those functions’
+ default prior count values to avoid taking the logarithm of 0.
  Genes were considered “present” if their average normalized logCPM values
  Genes were considered “present” if their average normalized logCPM values
- across all libraries were at least -1.
+ across all libraries were at least 
+\begin_inset Formula $-1$
+\end_inset
+
+.
  Normalizing for gene length was unnecessary because the sequencing protocol
  Normalizing for gene length was unnecessary because the sequencing protocol
  is 3’-biased and hence the expected read count for each gene is related
  is 3’-biased and hence the expected read count for each gene is related
  to the transcript’s copy number but not its length.
  to the transcript’s copy number but not its length.
@@ -13352,7 +13960,7 @@ Differential Expression Analysis
 \end_layout
 \end_layout
 
 
 \begin_layout Standard
 \begin_layout Standard
-All tests for differential gene expression were performed using
+All tests for differential gene expression were performed using 
 \begin_inset Flex Code
 \begin_inset Flex Code
 status open
 status open
 
 
@@ -14080,9 +14688,19 @@ end{landscape}
 
 
 \begin_layout Standard
 \begin_layout Standard
 The objective of the present study was to validate a new protocol for deep
 The objective of the present study was to validate a new protocol for deep
- RNA-seq of whole blood drawn into PaxGene tubes from cynomolgus monkeys
- undergoing islet transplantation, with particular focus on minimizing the
- loss of useful sequencing space to uninformative globin reads.
+ 
+\begin_inset Flex Glossary Term
+status open
+
+\begin_layout Plain Layout
+RNA-seq
+\end_layout
+
+\end_inset
+
+ of whole blood drawn into PaxGene tubes from cynomolgus monkeys undergoing
+ islet transplantation, with particular focus on minimizing the loss of
+ useful sequencing space to uninformative globin reads.
  The details of the analysis with respect to transplant outcomes and the
  The details of the analysis with respect to transplant outcomes and the
  impact of mesenchymal stem cell treatment will be reported in a separate
  impact of mesenchymal stem cell treatment will be reported in a separate
  manuscript (in preparation).
  manuscript (in preparation).
@@ -14442,9 +15060,12 @@ noprefix "false"
 \end_layout
 \end_layout
 
 
 \begin_layout Standard
 \begin_layout Standard
-Based on these distributions, we selected a detection threshold of -1, which
- is approximately the leftmost edge of the trough between the signal and
- noise peaks.
+Based on these distributions, we selected a detection threshold of 
+\begin_inset Formula $-1$
+\end_inset
+
+, which is approximately the leftmost edge of the trough between the signal
+ and noise peaks.
  This represents the most liberal possible detection threshold that doesn't
  This represents the most liberal possible detection threshold that doesn't
  call substantial numbers of noise genes as detected.
  call substantial numbers of noise genes as detected.
  Among the full dataset, 13429 genes were detected at this threshold, and
  Among the full dataset, 13429 genes were detected at this threshold, and
@@ -14543,7 +15164,7 @@ noprefix "false"
 
 
 , and genes with an average logCPM below -1 were filtered out.
 , and genes with an average logCPM below -1 were filtered out.
  Each remaining gene was tested for differential abundance with respect
  Each remaining gene was tested for differential abundance with respect
- to globin blocking (GB) using
+ to globin blocking (GB) using 
 \begin_inset Flex Code
 \begin_inset Flex Code
 status open
 status open
 
 
@@ -14555,7 +15176,7 @@ edgeR
 
 
 ’s quasi-likelihood F-test, fitting a negative binomial generalized linear
 ’s quasi-likelihood F-test, fitting a negative binomial generalized linear
  model to the table of read counts in each library.
  model to the table of read counts in each library.
- For each gene,
+ For each gene, 
 \begin_inset Flex Code
 \begin_inset Flex Code
 status open
 status open
 
 
@@ -14690,7 +15311,7 @@ Comparison of inter-sample gene abundance correlations with and without
  All libraries were normalized together as described in Figure 2, and genes
  All libraries were normalized together as described in Figure 2, and genes
  with an average abundance (logCPM, log2 counts per million reads counted)
  with an average abundance (logCPM, log2 counts per million reads counted)
  less than -1 were filtered out.
  less than -1 were filtered out.
- Each gene’s logCPM was computed in each library using the
+ Each gene’s logCPM was computed in each library using the 
 \begin_inset Flex Code
 \begin_inset Flex Code
 status open
 status open
 
 
@@ -14723,6 +15344,19 @@ edgeR
 \end_inset
 \end_inset
 
 
 
 
+\end_layout
+
+\begin_layout Standard
+\begin_inset Flex TODO Note (inline)
+status open
+
+\begin_layout Plain Layout
+Give these numbers the LaTeX math treatment
+\end_layout
+
+\end_inset
+
+
 \end_layout
 \end_layout
 
 
 \begin_layout Standard
 \begin_layout Standard
@@ -14748,7 +15382,7 @@ ons than the non-GB libraries.
  sign-rank test: V = 2195, P ≪ 2.2e-16).
  sign-rank test: V = 2195, P ≪ 2.2e-16).
  Performing the same tests on the Spearman correlations gave the same conclusion
  Performing the same tests on the Spearman correlations gave the same conclusion
  (t-test: t = 26.8, df = 665, P ≪ 2.2e-16; sign-rank test: V = 8781, P ≪ 2.2e-16).
  (t-test: t = 26.8, df = 665, P ≪ 2.2e-16; sign-rank test: V = 8781, P ≪ 2.2e-16).
- The
+ The 
 \begin_inset Flex Code
 \begin_inset Flex Code
 status open
 status open
 
 
@@ -15318,8 +15952,18 @@ The challenge of doing global gene expression profiling in cynomolgus monkeys
  cover this genome and have not been updated since the first assemblies
  cover this genome and have not been updated since the first assemblies
  of the cynomolgus genome were published.
  of the cynomolgus genome were published.
  Therefore, we determined that the best strategy for peripheral blood profiling
  Therefore, we determined that the best strategy for peripheral blood profiling
- was to do deep RNA-seq and inform the workflow using the latest available
- genome assembly and annotation 
+ was to do deep 
+\begin_inset Flex Glossary Term
+status open
+
+\begin_layout Plain Layout
+RNA-seq
+\end_layout
+
+\end_inset
+
+ and inform the workflow using the latest available genome assembly and
+ annotation 
 \begin_inset CommandInset citation
 \begin_inset CommandInset citation
 LatexCommand cite
 LatexCommand cite
 key "Wilson2013"
 key "Wilson2013"
@@ -15329,8 +15973,18 @@ literal "false"
 
 
 .
 .
  However, it was not immediately clear whether globin reduction was necessary
  However, it was not immediately clear whether globin reduction was necessary
- for RNA-seq or how much improvement in efficiency or sensitivity to detect
- differential gene expression would be achieved for the added cost and work.
+ for 
+\begin_inset Flex Glossary Term
+status open
+
+\begin_layout Plain Layout
+RNA-seq
+\end_layout
+
+\end_inset
+
+ or how much improvement in efficiency or sensitivity to detect differential
+ gene expression would be achieved for the added cost and work.
  
  
 \end_layout
 \end_layout
 
 
@@ -15351,7 +16005,17 @@ literal "false"
  and thus, significantly reduces the complexity of the transcriptome.
  and thus, significantly reduces the complexity of the transcriptome.
  Therefore, we could not determine how DeepSAGE results would translate
  Therefore, we could not determine how DeepSAGE results would translate
  to the common strategy in the field for assaying the entire transcript
  to the common strategy in the field for assaying the entire transcript
- population by whole-transcriptome 3’-end RNA-seq.
+ population by whole-transcriptome 3’-end 
+\begin_inset Flex Glossary Term
+status open
+
+\begin_layout Plain Layout
+RNA-seq
+\end_layout
+
+\end_inset
+
+.
  Furthermore, if globin reduction is necessary, we also needed a globin
  Furthermore, if globin reduction is necessary, we also needed a globin
  reduction method specific to cynomolgus globin sequences that would work
  reduction method specific to cynomolgus globin sequences that would work
  in an organism for which no kit is available off the shelf.
  in an organism for which no kit is available off the shelf.
@@ -15379,11 +16043,29 @@ More importantly, globin blocking not only nearly doubles the yield of usable
  Globin blocking thus represents a cost-effective way to squeeze more data
  Globin blocking thus represents a cost-effective way to squeeze more data
  and statistical power out of the same blood samples and the same amount
  and statistical power out of the same blood samples and the same amount
  of sequencing.
  of sequencing.
- In conclusion, globin reduction greatly increases the yield of useful RNA-seq
+ In conclusion, globin reduction greatly increases the yield of useful 
+\begin_inset Flex Glossary Term
+status open
+
+\begin_layout Plain Layout
+RNA-seq
+\end_layout
+
+\end_inset
+
  reads mapping to the rest of the genome, with minimal perturbations in
  reads mapping to the rest of the genome, with minimal perturbations in
  the relative levels of non-globin genes.
  the relative levels of non-globin genes.
  Based on these results, globin transcript reduction using sequence-specific,
  Based on these results, globin transcript reduction using sequence-specific,
- complementary blocking oligonucleotides is recommended for all deep RNA-seq
+ complementary blocking oligonucleotides is recommended for all deep 
+\begin_inset Flex Glossary Term
+status open
+
+\begin_layout Plain Layout
+RNA-seq
+\end_layout
+
+\end_inset
+
  of cynomolgus and other nonhuman primate blood samples.
  of cynomolgus and other nonhuman primate blood samples.
 \end_layout
 \end_layout
 
 
@@ -15405,10 +16087,19 @@ te the effectiveness of the method in reducing globin reads while preserving
 
 
 \begin_layout Standard
 \begin_layout Standard
 The motivation for developing a fast practical way to enrich for non-globin
 The motivation for developing a fast practical way to enrich for non-globin
- reads in cyno blood samples was to enable a large-scale RNA-seq experiment
- investigating the effects of mesenchymal stem cell infusion on blood gene
- expression in cynomologus transplant recipients in a time course after
- transplantation.
+ reads in cyno blood samples was to enable a large-scale 
+\begin_inset Flex Glossary Term
+status open
+
+\begin_layout Plain Layout
+RNA-seq
+\end_layout
+
+\end_inset
+
+ experiment investigating the effects of mesenchymal stem cell infusion
+ on blood gene expression in cynomolgus transplant recipients in a time
+ course after transplantation.
  With the globin blocking method in place, the way is now clear for this
  With the globin blocking method in place, the way is now clear for this
  experiment to proceed.
  experiment to proceed.
 \end_layout
 \end_layout