thesis.lyx 74 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177
71778177917801781178217831784178517861787178817891790179117921793179417951796179717981799180018011802180318041805180618071808180918101811181218131814181518161817181818191820182118221823182418251826182718281829183018311832183318341835183618371838183918401841184218431844184518461847184818491850185118521853185418551856185718581859186018611862186318641865186618671868186918701871187218731874187518761877187818791880188118821883188418851886188718881889189018911892189318941895189618971898189919001901190219031904190519061907190819091910191119121913191419151916191719181919192019211922192319241925192619271928192919301931193219331934193519361937193819391940194119421943194419451946194719481949195019511952195319541955195619571958195919601961196219631964196519661967196819691970197119721973197419751976197719781979198019811982198319841985198619871988198919901991199219931994199519961997199819992000200120022003200420052006200720082009201020112012201320142015201620172018201920202021202220232024202520262027202820292030203120322033203420352036203720382039204020412042204320442045204620472048204920502051205220532054205520562057205820592060206120622063206420652066206720682069207020712072207320742075207620772078207920802081208220832084208520862087208820892090209120922093209420952096209720982099210021012102210321042105210621072108210921102111211221132114211521162117211821192120212121222123212421252126212721282129213021312132213321342135213621372138213921402141214221432144214521462147214821492150215121522153215421552156215721582159216021612162216321642165216621672168216921702171217221732174217521762177217821792180218121822183218421852186218721882189219021912192219321942195219621972198219922002201220222032204220522062207220822092210221122122213221422152216221722182219222022212222222322242225222622272228222922302231223222332234223522362237223822392240224122422243224422452246224722482249225022512252225322542255225622572258225922602261226222632264226522662267226822692270227122722273227422752276227
72278227922802281228222832284228522862287228822892290229122922293229422952296229722982299230023012302230323042305230623072308230923102311231223132314231523162317231823192320232123222323232423252326232723282329233023312332233323342335233623372338233923402341234223432344234523462347234823492350235123522353235423552356235723582359236023612362236323642365236623672368236923702371237223732374237523762377237823792380238123822383238423852386238723882389239023912392239323942395239623972398239924002401240224032404240524062407240824092410241124122413241424152416241724182419242024212422242324242425242624272428242924302431243224332434243524362437243824392440244124422443244424452446244724482449245024512452245324542455245624572458245924602461246224632464246524662467246824692470247124722473247424752476247724782479248024812482248324842485248624872488248924902491249224932494249524962497249824992500250125022503250425052506250725082509251025112512251325142515251625172518251925202521252225232524252525262527252825292530253125322533253425352536253725382539254025412542254325442545254625472548254925502551255225532554255525562557255825592560256125622563256425652566256725682569257025712572257325742575257625772578257925802581258225832584258525862587258825892590259125922593259425952596259725982599260026012602260326042605260626072608260926102611261226132614261526162617261826192620262126222623262426252626262726282629263026312632263326342635263626372638263926402641264226432644264526462647264826492650265126522653265426552656265726582659266026612662266326642665266626672668266926702671267226732674267526762677267826792680268126822683268426852686268726882689269026912692269326942695269626972698269927002701270227032704270527062707270827092710271127122713271427152716271727182719272027212722272327242725272627272728272927302731273227332734273527362737273827392740274127422743274427452746274727482749275027512752275327542755275627572758275927602761276227632764276527662767276827692770277127722773277427752776277
7277827792780278127822783278427852786278727882789279027912792279327942795279627972798279928002801280228032804280528062807280828092810281128122813281428152816281728182819282028212822282328242825282628272828282928302831283228332834283528362837283828392840284128422843284428452846284728482849285028512852285328542855285628572858285928602861286228632864286528662867286828692870287128722873287428752876287728782879288028812882288328842885288628872888288928902891289228932894289528962897289828992900290129022903290429052906290729082909291029112912291329142915291629172918291929202921292229232924292529262927292829292930293129322933293429352936293729382939294029412942294329442945294629472948294929502951295229532954295529562957295829592960296129622963296429652966296729682969297029712972297329742975297629772978297929802981298229832984298529862987298829892990299129922993299429952996299729982999300030013002300330043005300630073008300930103011301230133014301530163017301830193020302130223023302430253026302730283029303030313032303330343035303630373038303930403041304230433044304530463047304830493050305130523053305430553056305730583059306030613062306330643065306630673068306930703071307230733074307530763077307830793080308130823083308430853086308730883089309030913092309330943095309630973098309931003101310231033104310531063107310831093110311131123113311431153116311731183119312031213122312331243125312631273128312931303131313231333134313531363137313831393140314131423143314431453146314731483149315031513152315331543155315631573158315931603161316231633164316531663167316831693170317131723173317431753176317731783179318031813182318331843185318631873188318931903191
  1. #LyX 2.3 created this file. For more info see http://www.lyx.org/
  2. \lyxformat 544
  3. \begin_document
  4. \begin_header
  5. \save_transient_properties true
  6. \origin unavailable
  7. \textclass extbook
  8. \begin_preamble
  9. % Add a DRAFT watermark
  10. \usepackage{draftwatermark}
  11. \SetWatermarkLightness{0.97}
  12. \SetWatermarkScale{1}
  13. % Set up required header format
  14. \usepackage{fancyhdr}
  15. \pagestyle{fancy}
  16. \renewcommand{\headrulewidth}{0pt}
  17. \rhead{}
  18. \lhead{}
  19. \rfoot{}
  20. \lfoot{}
  21. \cfoot{\thepage} % Page number bottom center
  22. % https://tex.stackexchange.com/questions/65680/automatically-bold-first-sentence-of-a-floats-caption
  23. \usepackage{xstring}
  24. \usepackage{etoolbox}
  25. \usepackage{caption}
  26. \captionsetup{labelfont=bf,tableposition=top}
  27. \makeatletter
  28. \newcommand\formatlabel[1]{%
  29. \noexpandarg
  30. \IfSubStr{#1}{.}{%
  31. \StrBefore{#1}{.}[\firstcaption]%
  32. \StrBehind{#1}{.}[\secondcaption]%
  33. \textbf{\firstcaption.} \secondcaption}{%
  34. #1}%
  35. }
  36. \patchcmd{\@caption}{#3}{\formatlabel{#3}}{}{}
  37. \makeatother
  38. \end_preamble
  39. \use_default_options true
  40. \maintain_unincluded_children false
  41. \language english
  42. \language_package default
  43. \inputencoding auto
  44. \fontencoding global
  45. \font_roman "default" "default"
  46. \font_sans "default" "default"
  47. \font_typewriter "default" "default"
  48. \font_math "auto" "auto"
  49. \font_default_family default
  50. \use_non_tex_fonts false
  51. \font_sc false
  52. \font_osf false
  53. \font_sf_scale 100 100
  54. \font_tt_scale 100 100
  55. \use_microtype false
  56. \use_dash_ligatures true
  57. \graphics default
  58. \default_output_format pdf4
  59. \output_sync 0
  60. \bibtex_command default
  61. \index_command default
  62. \paperfontsize 12
  63. \spacing double
  64. \use_hyperref true
  65. \pdf_bookmarks true
  66. \pdf_bookmarksnumbered false
  67. \pdf_bookmarksopen false
  68. \pdf_bookmarksopenlevel 1
  69. \pdf_breaklinks false
  70. \pdf_pdfborder false
  71. \pdf_colorlinks false
  72. \pdf_backref false
  73. \pdf_pdfusetitle true
  74. \papersize letterpaper
  75. \use_geometry true
  76. \use_package amsmath 1
  77. \use_package amssymb 1
  78. \use_package cancel 1
  79. \use_package esint 1
  80. \use_package mathdots 1
  81. \use_package mathtools 1
  82. \use_package mhchem 1
  83. \use_package stackrel 1
  84. \use_package stmaryrd 1
  85. \use_package undertilde 1
  86. \cite_engine basic
  87. \cite_engine_type default
  88. \biblio_style plain
  89. \use_bibtopic false
  90. \use_indices false
  91. \paperorientation portrait
  92. \suppress_date false
  93. \justification true
  94. \use_refstyle 1
  95. \use_minted 0
  96. \index Index
  97. \shortcut idx
  98. \color #008000
  99. \end_index
  100. \leftmargin 1.5in
  101. \topmargin 1in
  102. \rightmargin 1in
  103. \bottommargin 1in
  104. \secnumdepth 3
  105. \tocdepth 3
  106. \paragraph_separation indent
  107. \paragraph_indentation default
  108. \is_math_indent 0
  109. \math_numbering_side default
  110. \quotes_style english
  111. \dynamic_quotes 0
  112. \papercolumns 1
  113. \papersides 2
  114. \paperpagestyle default
  115. \tracking_changes false
  116. \output_changes false
  117. \html_math_output 0
  118. \html_css_as_file 0
  119. \html_be_strict false
  120. \end_header
  121. \begin_body
  122. \begin_layout Title
  123. Bioinformatic analysis of complex, high-throughput genomic and epigenomic
  124. data in the context of immunology and transplant rejection
  125. \end_layout
  126. \begin_layout Author
  127. A thesis presented
  128. \begin_inset Newline newline
  129. \end_inset
  130. by
  131. \begin_inset Newline newline
  132. \end_inset
  133. Ryan C.
  134. Thompson
  135. \begin_inset Newline newline
  136. \end_inset
  137. to
  138. \begin_inset Newline newline
  139. \end_inset
  140. The Scripps Research Institute Graduate Program
  141. \begin_inset Newline newline
  142. \end_inset
  143. in partial fulfillment of the requirements for the degree of
  144. \begin_inset Newline newline
  145. \end_inset
  146. Doctor of Philosophy in the subject of Biology
  147. \begin_inset Newline newline
  148. \end_inset
  149. for
  150. \begin_inset Newline newline
  151. \end_inset
  152. The Scripps Research Institute
  153. \begin_inset Newline newline
  154. \end_inset
  155. La Jolla, California
  156. \end_layout
  157. \begin_layout Date
  158. May 2019
  159. \end_layout
  160. \begin_layout Standard
  161. [Copyright notice]
  162. \end_layout
  163. \begin_layout Standard
  164. [Thesis acceptance form]
  165. \end_layout
  166. \begin_layout Standard
  167. [Dedication]
  168. \end_layout
  169. \begin_layout Standard
  170. [Acknowledgements]
  171. \end_layout
  172. \begin_layout Standard
  173. \begin_inset CommandInset toc
  174. LatexCommand tableofcontents
  175. \end_inset
  176. \end_layout
  177. \begin_layout Standard
  178. \begin_inset FloatList table
  179. \end_inset
  180. \end_layout
  181. \begin_layout Standard
  182. \begin_inset FloatList figure
  183. \end_inset
  184. \end_layout
  185. \begin_layout Standard
  186. [List of Abbreviations]
  187. \begin_inset Note Note
  188. status open
  189. \begin_layout Plain Layout
  190. https://wiki.lyx.org/Tips/Nomenclature
  191. \end_layout
  192. \end_inset
  193. \end_layout
  194. \begin_layout Standard
  195. [Abstract]
  196. \end_layout
  197. \begin_layout Chapter*
  198. Abstract
  199. \end_layout
  200. \begin_layout Chapter
  201. Introduction
  202. \end_layout
  203. \begin_layout Section
  204. Background & Significance
  205. \end_layout
  206. \begin_layout Subsection
  207. Biological motivation
  208. \end_layout
  209. \begin_layout Itemize
  210. Rejection is the major long-term threat to organ and tissue grafts
  211. \end_layout
  212. \begin_deeper
  213. \begin_layout Itemize
  214. Common mechanisms of rejection
  215. \end_layout
  216. \begin_layout Itemize
  217. Effective immune suppression requires monitoring for rejection and tuning
  218. \end_layout
  219. \begin_layout Itemize
  220. Current tests for rejection (tissue biopsy) are invasive and biased
  221. \end_layout
  222. \begin_layout Itemize
  223. A blood test based on microarrays would be less biased and invasive
  224. \end_layout
  225. \end_deeper
  226. \begin_layout Itemize
  227. Memory cells are resistant to immune suppression
  228. \end_layout
  229. \begin_deeper
  230. \begin_layout Itemize
  231. Mechanisms of resistance in memory cells are poorly understood
  232. \end_layout
  233. \begin_layout Itemize
  234. A better understanding of immune memory formation is needed
  235. \end_layout
  236. \end_deeper
  237. \begin_layout Itemize
  238. Mesenchymal stem cell infusion is a promising new treatment to prevent/delay
  239. rejection
  240. \end_layout
  241. \begin_deeper
  242. \begin_layout Itemize
  243. Demonstrated in mice, but not yet in primates
  244. \end_layout
  245. \begin_layout Itemize
  246. Mechanism currently unknown, but MSC are known to be immune modulatory
  247. \end_layout
  248. \end_deeper
  249. \begin_layout Subsection
  250. Overview of bioinformatic analysis methods
  251. \end_layout
  252. \begin_layout Standard
  253. An overview of all the methods used, including what problem they solve,
  254. what assumptions they make, and a basic description of how they work.
  255. \end_layout
  256. \begin_layout Itemize
  257. ChIP-seq Peak calling
  258. \end_layout
  259. \begin_deeper
  260. \begin_layout Itemize
  261. Cross-correlation analysis to determine fragment size
  262. \end_layout
  263. \begin_layout Itemize
  264. Broad vs narrow peaks
  265. \end_layout
  266. \begin_layout Itemize
  267. SICER for broad peaks
  268. \end_layout
  269. \begin_layout Itemize
  270. IDR for biologically reproducible peaks
  271. \end_layout
  272. \begin_layout Itemize
  273. csaw peak filtering guidelines for unbiased downstream analysis
  274. \end_layout
  275. \end_deeper
  276. \begin_layout Itemize
  277. Normalization is non-trivial and application-dependent
  278. \end_layout
  279. \begin_deeper
  280. \begin_layout Itemize
  281. Expression arrays: RMA & fRMA; why fRMA is needed
  282. \end_layout
  283. \begin_layout Itemize
  284. Methylation arrays: M-value transformation approximates normal data but
  285. induces heteroskedasticity
  286. \end_layout
  287. \begin_layout Itemize
  288. RNA-seq: normalize based on assumption that the average gene is not changing
  289. \end_layout
  290. \begin_layout Itemize
  291. ChIP-seq: complex with many considerations, dependent on experimental methods,
  292. biological system, and analysis goals
  293. \end_layout
  294. \end_deeper
  295. \begin_layout Itemize
  296. Limma: The standard linear modeling framework for genomics
  297. \end_layout
  298. \begin_deeper
  299. \begin_layout Itemize
  300. empirical Bayes variance modeling: limma's core feature
  301. \end_layout
  302. \begin_layout Itemize
  303. edgeR & DESeq2: Extend with negative binomial GLM for RNA-seq and other
  304. count data
  305. \end_layout
  306. \begin_layout Itemize
  307. voom: Extend with precision weights to model mean-variance trend
  308. \end_layout
  309. \begin_layout Itemize
  310. arrayWeights and duplicateCorrelation to handle complex variance structures
  311. \end_layout
  312. \end_deeper
  313. \begin_layout Itemize
  314. sva and ComBat for batch correction
  315. \end_layout
  316. \begin_layout Itemize
  317. Factor analysis: PCA, MDS, MOFA
  318. \end_layout
  319. \begin_deeper
  320. \begin_layout Itemize
  321. Batch-corrected PCA is informative, but careful application is required
  322. to avoid bias
  323. \end_layout
  324. \end_deeper
  325. \begin_layout Itemize
  326. Gene set analysis: camera and SPIA
  327. \end_layout
  328. \begin_layout Section
  329. Innovation
  330. \end_layout
  331. \begin_layout Itemize
  332. MSC infusion to improve transplant outcomes (prevent/delay rejection)
  333. \end_layout
  334. \begin_deeper
  335. \begin_layout Itemize
  336. Characterize MSC response to interferon gamma
  337. \end_layout
  338. \begin_layout Itemize
  339. IFN-g is thought to stimulate their function
  340. \end_layout
  341. \begin_layout Itemize
  342. Test IFN-g treated MSC infusion as a therapy to delay graft rejection in
  343. cynomolgus monkeys
  344. \end_layout
  345. \begin_layout Itemize
  346. Monitor animals post-transplant using blood RNA-seq at serial time points
  347. \end_layout
  348. \end_deeper
  349. \begin_layout Itemize
  350. Investigate dynamics of histone marks in CD4 T-cell activation and memory
  351. \end_layout
  352. \begin_deeper
  353. \begin_layout Itemize
  354. Previous studies have looked at single snapshots of histone marks
  355. \end_layout
  356. \begin_layout Itemize
  357. Instead, look at changes in histone marks across activation and memory
  358. \end_layout
  359. \end_deeper
  360. \begin_layout Itemize
  361. High-throughput sequencing and microarray technologies
  362. \end_layout
  363. \begin_deeper
  364. \begin_layout Itemize
  365. Powerful methods for assaying gene expression and epigenetics across entire
  366. genomes
  367. \end_layout
  368. \begin_layout Itemize
  369. Proper analysis requires finding and exploiting systematic genome-wide trends
  370. \end_layout
  371. \end_deeper
  372. \begin_layout Chapter
  373. Reproducible genome-wide epigenetic analysis of H3K4 and H3K27 methylation
  374. in naive and memory CD4 T-cell activation
  375. \end_layout
  376. \begin_layout Section
  377. Approach
  378. \end_layout
  379. \begin_layout Itemize
  380. CD4 T-cells are central to all adaptive immune responses and memory
  381. \end_layout
  382. \begin_layout Itemize
  383. H3K4 and H3K27 methylation are major epigenetic regulators of gene expression
  384. \end_layout
  385. \begin_layout Itemize
  386. Canonically, H3K4 is activating and H3K27 is inhibitory, but the reality
  387. is complex
  388. \end_layout
  389. \begin_layout Itemize
  390. Looking at these marks during CD4 activation and memory should reveal new
  391. mechanistic details
  392. \end_layout
  393. \begin_layout Itemize
  394. Test
  395. \begin_inset Quotes eld
  396. \end_inset
  397. poised promoter
  398. \begin_inset Quotes erd
  399. \end_inset
  400. hypothesis in which H3K4 and H3K27 are both methylated
  401. \end_layout
  402. \begin_layout Itemize
  403. Expand scope of analysis beyond simple promoter counts
  404. \end_layout
  405. \begin_deeper
  406. \begin_layout Itemize
  407. Analyze peaks genome-wide, including in intergenic regions
  408. \end_layout
  409. \begin_layout Itemize
  410. Analysis of coverage distribution shape within promoters, e.g.
  411. upstream vs downstream coverage
  412. \end_layout
  413. \end_deeper
  414. \begin_layout Section
  415. Methods
  416. \end_layout
  417. \begin_layout Itemize
  418. Re-analyze previously published CD4 ChIP-seq & RNA-seq data
  419. \begin_inset CommandInset citation
  420. LatexCommand cite
  421. key "LaMere2016,Lamere2017"
  422. literal "true"
  423. \end_inset
  424. \end_layout
  425. \begin_deeper
  426. \begin_layout Itemize
  427. Completely reimplement analysis from scratch as a reproducible workflow
  428. \end_layout
  429. \begin_layout Itemize
  430. Use newly published methods & algorithms not available during the original
  431. analysis: SICER, csaw, MOFA, ComBat, sva, GREAT, and more
  432. \end_layout
  433. \end_deeper
  434. \begin_layout Itemize
  435. SICER, IDR, csaw, & GREAT to call ChIP-seq peaks genome-wide, perform differenti
  436. al abundance analysis, and relate those peaks to gene expression
  437. \end_layout
  438. \begin_layout Itemize
  439. Promoter counts in sliding windows around each gene's highest-expressed
  440. TSS to investigate coverage distribution within promoters
  441. \end_layout
  442. \begin_layout Section
  443. Results
  444. \end_layout
  445. \begin_layout Itemize
  446. Different histone marks have different effective promoter radii
  447. \end_layout
  448. \begin_layout Itemize
  449. H3K4 and RNA-seq data show clear evidence of naive convergence with memory
  450. between days 1 and 5
  451. \end_layout
  452. \begin_layout Itemize
  453. Promoter coverage distribution affects gene expression independent of total
  454. promoter count
  455. \end_layout
  456. \begin_layout Itemize
  457. Remaining analyses to complete:
  458. \end_layout
  459. \begin_deeper
  460. \begin_layout Itemize
  461. Look for naive-to-memory convergence in H3K27 data
  462. \end_layout
  463. \begin_layout Itemize
  464. Look at enriched pathways for day 0 to day 1 (activation) compared to day
  465. 1 to day 5 (putative naive-to-memory differentiation)
  466. \end_layout
  467. \begin_layout Itemize
  468. Find genes with different expression patterns in naive vs.
  469. memory and try to explain the difference with the Day 0 histone mark data
  470. \end_layout
  471. \begin_deeper
  472. \begin_layout Itemize
  473. Determine whether co-occurrence of H3K4me3 and H3K27me3 (proposed
  474. \begin_inset Quotes eld
  475. \end_inset
  476. poised
  477. \begin_inset Quotes erd
  478. \end_inset
  479. state) has effects on post-activation expression dynamics
  480. \end_layout
  481. \begin_layout Itemize
  482. Promoter coverage distribution dynamics throughout activation for interesting
  483. subsets of genes
  484. \end_layout
  485. \end_deeper
  486. \begin_layout Itemize
  487. (Backup) Compare and contrast behavior of promoter peaks vs intergenic (putative
  488. enhancer) peaks (GREAT analysis)
  489. \end_layout
  490. \begin_deeper
  491. \begin_layout Itemize
  492. Put results in context of important T-cell pathways & gene expression data
  493. \end_layout
  494. \end_deeper
  495. \end_deeper
  496. \begin_layout Section
  497. Discussion
  498. \end_layout
  499. \begin_layout Itemize
  500. "Promoter radius" is not constant and must be defined empirically for a
  501. given data set
  502. \end_layout
  503. \begin_layout Itemize
  504. Evaluate evidence for poised promoters and enhancer effects on gene expression
  505. dynamics of naive-to-memory differentiation
  506. \end_layout
  507. \begin_layout Itemize
  508. Compare to published work on other epigenetic marks (e.g.
  509. chromatin accessibility)
  510. \end_layout
  511. \begin_layout Chapter
  512. Improving array-based analyses of transplant rejection by optimizing data
  513. preprocessing
  514. \end_layout
  515. \begin_layout Section
  516. Approach
  517. \end_layout
  518. \begin_layout Itemize
  519. Machine-learning applications demand a "single-channel" normalization method
  520. \end_layout
  521. \begin_layout Itemize
  522. frozen RMA is a good solution, but not trivial to apply
  523. \end_layout
  524. \begin_layout Itemize
  525. Methylation array data preprocessing induces heteroskedasticity
  526. \end_layout
  527. \begin_layout Itemize
  528. Need to account for this mean-variance dependency in analysis
  529. \end_layout
  530. \begin_layout Section
  531. Methods
  532. \end_layout
  533. \begin_layout Itemize
  534. Expression array normalization for detecting acute rejection
  535. \end_layout
  536. \begin_layout Itemize
  537. Use frozen RMA, a single-channel variant of RMA
  538. \end_layout
  539. \begin_layout Itemize
  540. Generate custom fRMA normalization vectors for each tissue (biopsy, blood)
  541. \end_layout
  542. \begin_layout Itemize
  543. Methylation arrays for differential methylation in rejection vs.
  544. healthy transplant
  545. \end_layout
  546. \begin_layout Itemize
  547. Adapt voom method originally designed for RNA-seq to model mean-variance
  548. dependence
  549. \end_layout
  550. \begin_layout Itemize
  551. Use sample precision weighting and sva to adjust for other confounding factors
  552. \end_layout
  553. \begin_layout Section
  554. Results
  555. \end_layout
  556. \begin_layout Itemize
  557. custom fRMA normalization improved cross-validated classifier performance
  558. \begin_inset CommandInset citation
  559. LatexCommand cite
  560. key "Kurian2014"
  561. literal "true"
  562. \end_inset
  563. \end_layout
  564. \begin_layout Itemize
  565. voom, precision weights, and sva improved model fit
  566. \end_layout
  567. \begin_deeper
  568. \begin_layout Itemize
  569. Also increased sensitivity for detecting differential methylation
  570. \end_layout
  571. \end_deeper
  572. \begin_layout Section
  573. Discussion
  574. \end_layout
  575. \begin_layout Itemize
  576. fRMA enables classifying new samples without re-normalizing the entire data
  577. set
  578. \end_layout
  579. \begin_deeper
  580. \begin_layout Itemize
  581. Critical for translating a classifier into clinical practice
  582. \end_layout
  583. \end_deeper
  584. \begin_layout Itemize
  585. Methods like voom designed for RNA-seq can also help with array analysis
  586. \end_layout
  587. \begin_layout Itemize
  588. Extracting and modeling confounders common to many features improves model
  589. correspondence to known biology
  590. \end_layout
  591. \begin_layout Chapter
  592. Globin-blocking for more effective blood RNA-seq analysis in primate animal
  593. model
  594. \end_layout
  595. \begin_layout Standard
  596. \begin_inset Note Note
  597. status open
  598. \begin_layout Plain Layout
  599. TODO Choose between above and the paper title: Optimizing yield of deep
  600. RNA sequencing for gene expression profiling by globin reduction of peripheral
  601. blood samples from cynomolgus monkeys (Macaca fascicularis).
  602. \end_layout
  603. \end_inset
  604. \end_layout
  605. \begin_layout Standard
  606. \begin_inset Note Note
  607. status open
  608. \begin_layout Plain Layout
  609. How to integrate/credit sections written by others (e.g.
  610. wetlab methods)? (Majority of paper text is written by me.) Preprint the
  611. paper, then cite it.
  612. Every chapter has an author list, which may or may not be part of a citation
  613. to a published/preprinted paper.
  614. \end_layout
  615. \begin_layout Plain Layout
  616. TODO: Preprint the paper, then cite it.
  617. \end_layout
  618. \begin_layout Plain Layout
  619. TODO: Chapter author list: https://tex.stackexchange.com/questions/156862/displayi
  620. ng-author-for-each-chapter-in-book
  621. \end_layout
  622. \end_inset
  623. \end_layout
  624. \begin_layout Section*
  625. Abstract
  626. \end_layout
  627. \begin_layout Paragraph
  628. Background
  629. \end_layout
  630. \begin_layout Standard
  631. Primate blood contains high concentrations of globin messenger RNA.
  632. Globin reduction is a standard technique used to improve the expression
  633. results obtained by DNA microarrays on RNA from blood samples.
  634. However, with whole transcriptome RNA-sequencing (RNA-seq) quickly replacing
  635. microarrays for many applications, the impact of globin reduction for RNA-seq
  636. has not been previously studied.
  637. Moreover, no off-the-shelf kits are available for globin reduction in nonhuman
  638. primates.
  639. \end_layout
  640. \begin_layout Paragraph
  641. Results
  642. \end_layout
  643. \begin_layout Standard
  644. Here we report a protocol for RNA-seq in primate blood samples that uses
  645. complementary oligonucleotides to block reverse transcription of the alpha
  646. and beta globin genes.
  647. In test samples from cynomolgus monkeys (Macaca fascicularis), this globin
  648. blocking protocol approximately doubles the yield of informative (non-globin)
  649. reads by greatly reducing the fraction of globin reads, while also improving
  650. the consistency in sequencing depth between samples.
  651. The increased yield enables detection of about 2000 more genes, significantly
  652. increases the correlation in measured gene expression levels between samples,
  653. and increases the sensitivity of differential gene expression tests.
  654. \end_layout
  655. \begin_layout Paragraph
  656. Conclusions
  657. \end_layout
  658. \begin_layout Standard
  659. These results show that globin blocking significantly improves the cost-effectiv
  660. eness of mRNA sequencing in primate blood samples by doubling the yield
  661. of useful reads, allowing detection of more genes, and improving the precision
  662. of gene expression measurements.
  663. Based on these results, a globin reducing or blocking protocol is recommended
  664. for all RNA-seq studies of primate blood samples.
  665. \end_layout
  666. \begin_layout Section
  667. Approach
  668. \end_layout
  669. \begin_layout Itemize
  670. Cynomolgus monkeys as a model organism
  671. \end_layout
  672. \begin_deeper
  673. \begin_layout Itemize
  674. Highly related to humans
  675. \end_layout
  676. \begin_layout Itemize
  677. Small size and short life cycle - good research animal
  678. \end_layout
  679. \begin_layout Itemize
  680. Genomics resources still in development
  681. \end_layout
  682. \end_deeper
  683. \begin_layout Itemize
  684. Inadequacy of existing blood RNA-seq protocols
  685. \end_layout
  686. \begin_deeper
  687. \begin_layout Itemize
  688. Existing protocols use a separate globin pulldown step, slowing down processing
  689. \end_layout
  690. \end_deeper
  691. \begin_layout Standard
  692. Increasingly, researchers are turning to high-throughput mRNA sequencing
  693. technologies (RNA-seq) in preference to expression microarrays for analysis
  694. of gene expression
  695. \begin_inset CommandInset citation
  696. LatexCommand cite
  697. key "Mutz2012"
  698. literal "false"
  699. \end_inset
  700. .
  701. The advantages are even greater for study of model organisms with no well-estab
  702. lished array platforms available, such as the cynomolgus monkey (Macaca
  703. fascicularis).
  704. High fractions of globin mRNA are naturally present in mammalian peripheral
  705. blood samples (up to 70% of total mRNA) and these are known to interfere
  706. with the results of array-based expression profiling
  707. \begin_inset CommandInset citation
  708. LatexCommand cite
  709. key "Winn2010"
  710. literal "false"
  711. \end_inset
  712. .
  713. The importance of globin reduction for RNA-seq of blood has only been evaluated
  714. for a DeepSAGE protocol on human samples
  715. \begin_inset CommandInset citation
  716. LatexCommand cite
  717. key "Mastrokolias2012"
  718. literal "false"
  719. \end_inset
  720. .
  721. In the present report, we evaluated globin reduction using custom blocking
  722. oligonucleotides for deep RNA-seq of peripheral blood samples from a nonhuman
  723. primate, cynomolgus monkey, using the Illumina technology platform.
  724. We demonstrate that globin reduction significantly improves the cost-effectiven
  725. ess of RNA-seq in blood samples.
  726. Thus, our protocol offers a significant advantage to any investigator planning
  727. to use RNA-seq for gene expression profiling of nonhuman primate blood
  728. samples.
  729. Our method can be generally applied to any species by designing complementary
  730. oligonucleotide blocking probes to the globin gene sequences of that species.
  731. Indeed, any highly expressed but biologically uninformative transcripts
  732. can also be blocked to further increase sequencing efficiency and value
  733. \begin_inset CommandInset citation
  734. LatexCommand cite
  735. key "Arnaud2016"
  736. literal "false"
  737. \end_inset
  738. .
  739. \end_layout
  740. \begin_layout Section
  741. Methods
  742. \end_layout
  743. \begin_layout Subsection*
  744. Sample collection
  745. \end_layout
  746. \begin_layout Standard
  747. All research reported here was done under IACUC-approved protocols at the
  748. University of Miami and complied with all applicable federal and state
  749. regulations and ethical principles for nonhuman primate research.
  750. Blood draws occurred between 16 April 2012 and 18 June 2015.
  751. The experimental system involved intrahepatic pancreatic islet transplantation
  752. into Cynomolgus monkeys with induced diabetes mellitus with or without
  753. concomitant infusion of mesenchymal stem cells.
  754. Blood was collected at serial time points before and after transplantation
  755. into PAXgene Blood RNA tubes (PreAnalytiX/Qiagen, Valencia, CA) at the
  756. precise volume:volume ratio of 2.5 ml whole blood into 6.9 ml of PAXgene
  757. additive.
  758. \end_layout
  759. \begin_layout Subsection*
  760. Globin Blocking
  761. \end_layout
  762. \begin_layout Standard
  763. Four oligonucleotides were designed to hybridize to the 3’ end of the transcript
  764. s for Cynomolgus HBA1, HBA2 and HBB, with two hybridization sites for HBB
  765. and two sites for HBA (the chosen sites were identical in both HBA genes).
  766. All oligos were purchased from Sigma and were entirely composed of 2’O-Me
  767. bases with a C3 spacer positioned at the 3’ ends to prevent any polymerase
  768. mediated primer extension.
  769. \end_layout
  770. \begin_layout Quote
  771. HBA1/2 site 1: GCCCACUCAGACUUUAUUCAAAG-C3spacer
  772. \end_layout
  773. \begin_layout Quote
  774. HBA1/2 site 2: GGUGCAAGGAGGGGAGGAG-C3spacer
  775. \end_layout
  776. \begin_layout Quote
  777. HBB site 1: AAUGAAAAUAAAUGUUUUUUAUUAG-C3spacer
  778. \end_layout
  779. \begin_layout Quote
  780. HBB site 2: CUCAAGGCCCUUCAUAAUAUCCC-C3spacer
  781. \end_layout
  782. \begin_layout Subsection*
  783. RNA-seq Library Preparation
  784. \end_layout
  785. \begin_layout Standard
  786. Sequencing libraries were prepared with 200ng total RNA from each sample.
  787. Polyadenylated mRNA was selected from 200 ng aliquots of cynomolgus blood-deri
  788. ved total RNA using Ambion Dynabeads Oligo(dT)25 beads (Invitrogen) following
  789. manufacturer’s recommended protocol.
  790. PolyA selected RNA was then combined with 8 pmol of HBA1/2 (site 1), 8
  791. pmol of HBA1/2 (site 2), 12 pmol of HBB (site 1) and 12 pmol of HBB (site
  792. 2) oligonucleotides.
  793. In addition, 20 pmol of RT primer containing a portion of the Illumina
  794. adapter sequence (B-oligo-dTV: GAGTTCCTTGGCACCCGAGAATTCCATTTTTTTTTTTTTTTTTTTV)
  795. and 4 µL of 5X First Strand buffer (250 mM Tris-HCl pH 8.3, 375 mM KCl,
  796. 15mM MgCl2) were added in a total volume of 15 µL.
  797. The RNA was fragmented by heating this cocktail for 3 minutes at 95°C and
  798. then placed on ice.
  799. This was followed by the addition of 2 µL 0.1 M DTT, 1 µL RNaseOUT, 1 µL
  800. 10mM dNTPs 10% biotin-16 aminoallyl-2’- dUTP and 10% biotin-16 aminoallyl-2’-
  801. dCTP (TriLink Biotech, San Diego, CA), 1 µL Superscript II (200U/ µL, Thermo-Fi
  802. sher).
  803. A second “unblocked” library was prepared in the same way for each sample
  804. but replacing the blocking oligos with an equivalent volume of water.
  805. The reaction was carried out at 25°C for 15 minutes and 42°C for 40 minutes,
  806. followed by incubation at 75°C for 10 minutes to inactivate the reverse
  807. transcriptase.
  808. \end_layout
  809. \begin_layout Standard
  810. The cDNA/RNA hybrid molecules were purified using 1.8X Ampure XP beads (Agencourt
  811. ) following supplier’s recommended protocol.
  812. The cDNA/RNA hybrid was eluted in 25 µL of 10 mM Tris-HCl pH 8.0, and then
  813. bound to 25 µL of M280 Magnetic Streptavidin beads washed per recommended
  814. protocol (Thermo-Fisher).
  815. After 30 minutes of binding, beads were washed one time in 100 µL 0.1N NaOH
  816. to denature and remove the bound RNA, followed by two 100 µL washes with
  817. 1X TE buffer.
  818. \end_layout
  819. \begin_layout Standard
  820. Subsequent attachment of the 5-prime Illumina A adapter was performed by
  821. on-bead random primer extension of the following sequence (A-N8 primer:
  822. TTCAGAGTTCTACAGTCCGACGATCNNNNNNNN).
  823. Briefly, beads were resuspended in a 20 µL reaction containing 5 µM A-N8
  824. primer, 40mM Tris-HCl pH 7.5, 20mM MgCl2, 50mM NaCl, 0.325U/µL Sequenase
  825. 2.0 (Affymetrix, Santa Clara, CA), 0.0025U/µL inorganic pyrophosphatase (Affymetr
  826. ix) and 300 µM each dNTP.
  827. Reaction was incubated at 22°C for 30 minutes, then beads were washed 2
  828. times with 1X TE buffer (200µL).
  829. \end_layout
  830. \begin_layout Standard
  831. The magnetic streptavidin beads were resuspended in 34 µL nuclease-free
  832. water and added directly to a PCR tube.
  833. The two Illumina protocol-specified PCR primers were added at 0.53 µM (Illumina
  834. TruSeq Universal Primer 1 and Illumina TruSeq barcoded PCR primer 2), along
  835. with 40 µL 2X KAPA HiFi Hotstart ReadyMix (KAPA, Wilmington, MA) and thermocycl
  836. ed as follows: starting with 98°C (2 min-hold); 15 cycles of 98°C, 20sec;
  837. 60°C, 30sec; 72°C, 30sec; and finished with a 72°C (2 min-hold).
  838. \end_layout
  839. \begin_layout Standard
  840. PCR products were purified with 1X Ampure Beads following manufacturer’s
  841. recommended protocol.
  842. Libraries were then analyzed using the Agilent TapeStation and quantitation
  843. of desired size range was performed by “smear analysis”.
  844. Samples were pooled in equimolar batches of 16 samples.
  845. Pooled libraries were size selected on 2% agarose gels (E-Gel EX Agarose
  846. Gels; Thermo-Fisher).
  847. Products were cut between 250 and 350 bp (corresponding to insert sizes
  848. of 130 to 230 bp).
  849. Finished library pools were then sequenced on the Illumina NextSeq500 instrumen
  850. t with 75 base read lengths.
  851. \end_layout
  852. \begin_layout Subsection*
  853. Read alignment and counting
  854. \end_layout
  855. \begin_layout Standard
  856. Reads were aligned to the cynomolgus genome using STAR
  857. \begin_inset CommandInset citation
  858. LatexCommand cite
  859. key "Dobin2013,Wilson2013"
  860. literal "false"
  861. \end_inset
  862. .
  863. Counts of uniquely mapped reads were obtained for every gene in each sample
  864. with the “featureCounts” function from the Rsubread package, using each
  865. of the three possibilities for the “strandSpecific” option: sense, antisense,
  866. and unstranded
  867. \begin_inset CommandInset citation
  868. LatexCommand cite
  869. key "Liao2014"
  870. literal "false"
  871. \end_inset
  872. .
  873. A few artifacts in the cynomolgus genome annotation complicated read counting.
  874. First, no ortholog is annotated for alpha globin in the cynomolgus genome,
  875. presumably because the human genome has two alpha globin genes with nearly
  876. identical sequences, making the orthology relationship ambiguous.
  877. However, two loci in the cynomolgus genome are annotated as “hemoglobin subunit alpha-lik
  878. e” (LOC102136192 and LOC102136846).
  879. LOC102136192 is annotated as a pseudogene while LOC102136846 is annotated
  880. as protein-coding.
  881. Our globin reduction protocol was designed to include blocking of these
  882. two genes.
  883. Indeed, these two genes have almost the same read counts in each library
  884. as the properly-annotated HBB gene and much larger counts than any other
  885. gene in the unblocked libraries, giving confidence that reads derived from
  886. the real alpha globin are mapping to both genes.
  887. Thus, reads from both of these loci were counted as alpha globin reads
  888. in all further analyses.
  889. The second artifact is a small, uncharacterized non-coding RNA gene (LOC1021365
  890. 91), which overlaps the HBA-like gene (LOC102136192) on the opposite strand.
  891. If counting is not performed in stranded mode (or if a non-strand-specific
  892. sequencing protocol is used), many reads mapping to the globin gene will
  893. be discarded as ambiguous due to their overlap with this ncRNA gene, resulting
  894. in significant undercounting of globin reads.
  895. Therefore, stranded sense counts were used for all further analysis in
  896. the present study to ensure that we accurately accounted for globin transcript
  897. reduction.
  898. However, we note that stranded reads are not necessary for RNA-seq using
  899. our protocol in standard practice.
  900. \end_layout
  901. \begin_layout Subsection*
  902. Normalization and Exploratory Data Analysis
  903. \end_layout
  904. \begin_layout Standard
  905. Libraries were normalized by computing scaling factors using the edgeR package’s
  906. Trimmed Mean of M-values method
  907. \begin_inset CommandInset citation
  908. LatexCommand cite
  909. key "Robinson2010"
  910. literal "false"
  911. \end_inset
  912. .
  913. Log2 counts per million values (logCPM) were calculated using the cpm function
  914. in edgeR for individual samples and aveLogCPM function for averages across
  915. groups of samples, using those functions’ default prior count values to
  916. avoid taking the logarithm of 0.
  917. Genes were considered “present” if their average normalized logCPM values
  918. across all libraries were at least -1.
  919. Normalizing for gene length was unnecessary because the sequencing protocol
  920. is 3’-biased and hence the expected read count for each gene is related
  921. to the transcript’s copy number but not its length.
  922. \end_layout
  923. \begin_layout Standard
  924. In order to assess the effect of blocking on reproducibility, Pearson and
  925. Spearman correlation coefficients were computed between the logCPM values
  926. for every pair of libraries within the globin-blocked (GB) and unblocked
  927. (non-GB) groups, and edgeR's “estimateDisp” function was used to compute
  928. negative binomial dispersions separately for the two groups
  929. \begin_inset CommandInset citation
  930. LatexCommand cite
  931. key "Chen2014"
  932. literal "false"
  933. \end_inset
  934. .
  935. \end_layout
  936. \begin_layout Subsection*
  937. Differential Expression Analysis
  938. \end_layout
  939. \begin_layout Standard
  940. All tests for differential gene expression were performed using edgeR, by
  941. first fitting a negative binomial generalized linear model to the counts
  942. and normalization factors and then performing a quasi-likelihood F-test
  943. with robust estimation of outlier gene dispersions
  944. \begin_inset CommandInset citation
  945. LatexCommand cite
  946. key "Lund2012,Phipson2016"
  947. literal "false"
  948. \end_inset
  949. .
  950. To investigate the effects of globin blocking on each gene, an additive
  951. model was fit to the full data with coefficients for globin blocking and
  952. SampleID.
  953. To test the effect of globin blocking on detection of differentially expressed
  954. genes, the GB samples and non-GB samples were each analyzed independently
  955. as follows: for each animal with both a pre-transplant and a post-transplant
  956. time point in the data set, the pre-transplant sample and the earliest
  957. post-transplant sample were selected, and all others were excluded, yielding
  958. a pre-/post-transplant pair of samples for each animal (N=7 animals with
  959. paired samples).
  960. These samples were analyzed for pre-transplant vs.
  961. post-transplant differential gene expression while controlling for inter-animal
  962. variation using an additive model with coefficients for transplant and
  963. animal ID.
  964. In all analyses, p-values were adjusted using the Benjamini-Hochberg procedure
  965. for FDR correction
  966. \begin_inset CommandInset citation
  967. LatexCommand cite
  968. key "Benjamini1995"
  969. literal "false"
  970. \end_inset
  971. .
  972. \end_layout
  973. \begin_layout Standard
  974. \begin_inset Note Note
  975. status open
  976. \begin_layout Itemize
  977. New blood RNA-seq protocol to block reverse transcription of globin genes
  978. \end_layout
  979. \begin_layout Itemize
  980. Blood RNA-seq time course after transplants with/without MSC infusion
  981. \end_layout
  982. \end_inset
  983. \end_layout
  984. \begin_layout Section
  985. Results
  986. \end_layout
  987. \begin_layout Subsection*
  988. Globin blocking yields a larger and more consistent fraction of useful reads
  989. \end_layout
  990. \begin_layout Standard
  991. The objective of the present study was to validate a new protocol for deep
  992. RNA-seq of whole blood drawn into PAXgene tubes from cynomolgus monkeys
  993. undergoing islet transplantation, with particular focus on minimizing the
  994. loss of useful sequencing space to uninformative globin reads.
  995. The details of the analysis with respect to transplant outcomes and the
  996. impact of mesenchymal stem cell treatment will be reported in a separate
  997. manuscript (in preparation).
  998. To focus on the efficacy of our globin blocking protocol, 37 blood samples,
  999. 16 from pre-transplant and 21 from post-transplant time points, were each
  1000. prepped once with and once without globin blocking oligos, and were then
  1001. sequenced on an Illumina NextSeq500 instrument.
  1002. The number of reads aligning to each gene in the cynomolgus genome was
  1003. counted.
  1004. Table 1 summarizes the distribution of read fractions among the GB and
  1005. non-GB libraries.
  1006. In the libraries with no globin blocking, globin reads made up an average
  1007. of 44.6% of total input reads, while reads assigned to all other genes made
  1008. up an average of 26.3%.
  1009. The remaining reads either aligned to intergenic regions (that include
  1010. long non-coding RNAs) or did not align with any annotated transcripts in
  1011. the current build of the cynomolgus genome.
  1012. In the GB libraries, globin reads made up only 3.48% and reads assigned
  1013. to all other genes increased to 50.4%.
  1014. Thus, globin blocking resulted in a 92.2% reduction in globin reads and
  1015. a 91.6% increase in yield of useful non-globin reads.
  1016. \end_layout
  1017. \begin_layout Standard
  1018. This reduction is not quite as efficient as the previous analysis showed
  1019. for human samples by DeepSAGE (<0.4% globin reads after globin reduction)
  1020. \begin_inset CommandInset citation
  1021. LatexCommand cite
  1022. key "Mastrokolias2012"
  1023. literal "false"
  1024. \end_inset
  1025. .
  1026. Nonetheless, this degree of globin reduction is sufficient to nearly double
  1027. the yield of useful reads.
  1028. Thus, globin blocking cuts the required sequencing effort (and costs) to
  1029. achieve a target coverage depth by almost 50%.
  1030. Consistent with this near doubling of yield, the average difference in
  1031. un-normalized logCPM across all genes between the GB libraries and non-GB
  1032. libraries is approximately 1 (mean = 1.01, median = 1.08), an overall 2-fold
  1033. increase.
  1034. Un-normalized values are used here because the TMM normalization correctly
  1035. identifies this 2-fold difference as biologically irrelevant and removes
  1036. it.
  1037. \end_layout
  1038. \begin_layout Standard
  1039. \begin_inset Float figure
  1040. wide false
  1041. sideways false
  1042. status open
  1043. \begin_layout Plain Layout
  1044. \align center
  1045. \begin_inset Graphics
  1046. filename graphics/Globin Paper/figure1 - globin-fractions.pdf
  1047. \end_inset
  1048. \end_layout
  1049. \begin_layout Plain Layout
  1050. \begin_inset Caption Standard
  1051. \begin_layout Plain Layout
  1052. \series bold
  1053. \begin_inset Argument 1
  1054. status collapsed
  1055. \begin_layout Plain Layout
  1056. Fraction of genic reads in each sample aligned to non-globin genes, with
  1057. and without globin blocking (GB).
  1058. \end_layout
  1059. \end_inset
  1060. \begin_inset CommandInset label
  1061. LatexCommand label
  1062. name "fig:Fraction-of-genic-reads"
  1063. \end_inset
  1064. Fraction of genic reads in each sample aligned to non-globin genes, with
  1065. and without globin blocking (GB).
  1066. \series default
  1067. All reads in each sequencing library were aligned to the cyno genome, and
  1068. the number of reads uniquely aligning to each gene was counted.
  1069. For each sample, counts were summed separately for all globin genes and
  1070. for the remainder of the genes (non-globin genes), and the fraction of
  1071. genic reads aligned to non-globin genes was computed.
  1072. Each point represents an individual sample.
  1073. Gray + signs indicate the means for globin-blocked libraries and unblocked
  1074. libraries.
  1075. The overall distribution for each group is represented as a notched box
  1076. plot.
  1077. Points are randomly spread vertically to avoid excessive overlapping.
  1078. \end_layout
  1079. \end_inset
  1080. \end_layout
  1081. \begin_layout Plain Layout
  1082. \end_layout
  1083. \end_inset
  1084. \end_layout
  1085. \begin_layout Standard
  1086. \begin_inset Float table
  1087. placement p
  1088. wide false
  1089. sideways true
  1090. status open
  1091. \begin_layout Plain Layout
  1092. \align center
  1093. \begin_inset Tabular
  1094. <lyxtabular version="3" rows="4" columns="7">
  1095. <features tabularvalignment="middle">
  1096. <column alignment="center" valignment="top">
  1097. <column alignment="center" valignment="top">
  1098. <column alignment="center" valignment="top">
  1099. <column alignment="center" valignment="top">
  1100. <column alignment="center" valignment="top">
  1101. <column alignment="center" valignment="top">
  1102. <column alignment="center" valignment="top">
  1103. <row>
  1104. <cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
  1105. \begin_inset Text
  1106. \begin_layout Plain Layout
  1107. \end_layout
  1108. \end_inset
  1109. </cell>
  1110. <cell multicolumn="1" alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
  1111. \begin_inset Text
  1112. \begin_layout Plain Layout
  1113. \family roman
  1114. \series medium
  1115. \shape up
  1116. \size normal
  1117. \emph off
  1118. \bar no
  1119. \strikeout off
  1120. \xout off
  1121. \uuline off
  1122. \uwave off
  1123. \noun off
  1124. \color none
  1125. Percent of Total Reads
  1126. \end_layout
  1127. \end_inset
  1128. </cell>
  1129. <cell multicolumn="2" alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
  1130. \begin_inset Text
  1131. \begin_layout Plain Layout
  1132. \end_layout
  1133. \end_inset
  1134. </cell>
  1135. <cell multicolumn="2" alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
  1136. \begin_inset Text
  1137. \begin_layout Plain Layout
  1138. \end_layout
  1139. \end_inset
  1140. </cell>
  1141. <cell multicolumn="2" alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
  1142. \begin_inset Text
  1143. \begin_layout Plain Layout
  1144. \end_layout
  1145. \end_inset
  1146. </cell>
  1147. <cell multicolumn="1" alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
  1148. \begin_inset Text
  1149. \begin_layout Plain Layout
  1150. \family roman
  1151. \series medium
  1152. \shape up
  1153. \size normal
  1154. \emph off
  1155. \bar no
  1156. \strikeout off
  1157. \xout off
  1158. \uuline off
  1159. \uwave off
  1160. \noun off
  1161. \color none
  1162. Percent of Genic Reads
  1163. \end_layout
  1164. \end_inset
  1165. </cell>
  1166. <cell multicolumn="2" alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
  1167. \begin_inset Text
  1168. \begin_layout Plain Layout
  1169. \end_layout
  1170. \end_inset
  1171. </cell>
  1172. </row>
  1173. <row>
  1174. <cell alignment="center" valignment="top" bottomline="true" leftline="true" usebox="none">
  1175. \begin_inset Text
  1176. \begin_layout Plain Layout
  1177. GB
  1178. \end_layout
  1179. \end_inset
  1180. </cell>
  1181. <cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" usebox="none">
  1182. \begin_inset Text
  1183. \begin_layout Plain Layout
  1184. \family roman
  1185. \series medium
  1186. \shape up
  1187. \size normal
  1188. \emph off
  1189. \bar no
  1190. \strikeout off
  1191. \xout off
  1192. \uuline off
  1193. \uwave off
  1194. \noun off
  1195. \color none
  1196. Non-globin Reads
  1197. \end_layout
  1198. \end_inset
  1199. </cell>
  1200. <cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" usebox="none">
  1201. \begin_inset Text
  1202. \begin_layout Plain Layout
  1203. \family roman
  1204. \series medium
  1205. \shape up
  1206. \size normal
  1207. \emph off
  1208. \bar no
  1209. \strikeout off
  1210. \xout off
  1211. \uuline off
  1212. \uwave off
  1213. \noun off
  1214. \color none
  1215. Globin Reads
  1216. \end_layout
  1217. \end_inset
  1218. </cell>
  1219. <cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" usebox="none">
  1220. \begin_inset Text
  1221. \begin_layout Plain Layout
  1222. \family roman
  1223. \series medium
  1224. \shape up
  1225. \size normal
  1226. \emph off
  1227. \bar no
  1228. \strikeout off
  1229. \xout off
  1230. \uuline off
  1231. \uwave off
  1232. \noun off
  1233. \color none
  1234. All Genic Reads
  1235. \end_layout
  1236. \end_inset
  1237. </cell>
  1238. <cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" usebox="none">
  1239. \begin_inset Text
  1240. \begin_layout Plain Layout
  1241. \family roman
  1242. \series medium
  1243. \shape up
  1244. \size normal
  1245. \emph off
  1246. \bar no
  1247. \strikeout off
  1248. \xout off
  1249. \uuline off
  1250. \uwave off
  1251. \noun off
  1252. \color none
  1253. All Aligned Reads
  1254. \end_layout
  1255. \end_inset
  1256. </cell>
  1257. <cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" usebox="none">
  1258. \begin_inset Text
  1259. \begin_layout Plain Layout
  1260. \family roman
  1261. \series medium
  1262. \shape up
  1263. \size normal
  1264. \emph off
  1265. \bar no
  1266. \strikeout off
  1267. \xout off
  1268. \uuline off
  1269. \uwave off
  1270. \noun off
  1271. \color none
  1272. Non-globin Reads
  1273. \end_layout
  1274. \end_inset
  1275. </cell>
  1276. <cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" rightline="true" usebox="none">
  1277. \begin_inset Text
  1278. \begin_layout Plain Layout
  1279. \family roman
  1280. \series medium
  1281. \shape up
  1282. \size normal
  1283. \emph off
  1284. \bar no
  1285. \strikeout off
  1286. \xout off
  1287. \uuline off
  1288. \uwave off
  1289. \noun off
  1290. \color none
  1291. Globin Reads
  1292. \end_layout
  1293. \end_inset
  1294. </cell>
  1295. </row>
  1296. <row>
  1297. <cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
  1298. \begin_inset Text
  1299. \begin_layout Plain Layout
  1300. \family roman
  1301. \series medium
  1302. \shape up
  1303. \size normal
  1304. \emph off
  1305. \bar no
  1306. \strikeout off
  1307. \xout off
  1308. \uuline off
  1309. \uwave off
  1310. \noun off
  1311. \color none
  1312. Yes
  1313. \end_layout
  1314. \end_inset
  1315. </cell>
  1316. <cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
  1317. \begin_inset Text
  1318. \begin_layout Plain Layout
  1319. \family roman
  1320. \series medium
  1321. \shape up
  1322. \size normal
  1323. \emph off
  1324. \bar no
  1325. \strikeout off
  1326. \xout off
  1327. \uuline off
  1328. \uwave off
  1329. \noun off
  1330. \color none
  1331. 50.4% ± 6.82
  1332. \end_layout
  1333. \end_inset
  1334. </cell>
  1335. <cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
  1336. \begin_inset Text
  1337. \begin_layout Plain Layout
  1338. \family roman
  1339. \series medium
  1340. \shape up
  1341. \size normal
  1342. \emph off
  1343. \bar no
  1344. \strikeout off
  1345. \xout off
  1346. \uuline off
  1347. \uwave off
  1348. \noun off
  1349. \color none
  1350. 3.48% ± 2.94
  1351. \end_layout
  1352. \end_inset
  1353. </cell>
  1354. <cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
  1355. \begin_inset Text
  1356. \begin_layout Plain Layout
  1357. \family roman
  1358. \series medium
  1359. \shape up
  1360. \size normal
  1361. \emph off
  1362. \bar no
  1363. \strikeout off
  1364. \xout off
  1365. \uuline off
  1366. \uwave off
  1367. \noun off
  1368. \color none
  1369. 53.9% ± 6.81
  1370. \end_layout
  1371. \end_inset
  1372. </cell>
  1373. <cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
  1374. \begin_inset Text
  1375. \begin_layout Plain Layout
  1376. \family roman
  1377. \series medium
  1378. \shape up
  1379. \size normal
  1380. \emph off
  1381. \bar no
  1382. \strikeout off
  1383. \xout off
  1384. \uuline off
  1385. \uwave off
  1386. \noun off
  1387. \color none
  1388. 89.7% ± 2.40
  1389. \end_layout
  1390. \end_inset
  1391. </cell>
  1392. <cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
  1393. \begin_inset Text
  1394. \begin_layout Plain Layout
  1395. \family roman
  1396. \series medium
  1397. \shape up
  1398. \size normal
  1399. \emph off
  1400. \bar no
  1401. \strikeout off
  1402. \xout off
  1403. \uuline off
  1404. \uwave off
  1405. \noun off
  1406. \color none
  1407. 93.5% ± 5.25
  1408. \end_layout
  1409. \end_inset
  1410. </cell>
  1411. <cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
  1412. \begin_inset Text
  1413. \begin_layout Plain Layout
  1414. \family roman
  1415. \series medium
  1416. \shape up
  1417. \size normal
  1418. \emph off
  1419. \bar no
  1420. \strikeout off
  1421. \xout off
  1422. \uuline off
  1423. \uwave off
  1424. \noun off
  1425. \color none
  1426. 6.49% ± 5.25
  1427. \end_layout
  1428. \end_inset
  1429. </cell>
  1430. </row>
  1431. <row>
  1432. <cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" usebox="none">
  1433. \begin_inset Text
  1434. \begin_layout Plain Layout
  1435. \family roman
  1436. \series medium
  1437. \shape up
  1438. \size normal
  1439. \emph off
  1440. \bar no
  1441. \strikeout off
  1442. \xout off
  1443. \uuline off
  1444. \uwave off
  1445. \noun off
  1446. \color none
  1447. No
  1448. \end_layout
  1449. \end_inset
  1450. </cell>
  1451. <cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" usebox="none">
  1452. \begin_inset Text
  1453. \begin_layout Plain Layout
  1454. \family roman
  1455. \series medium
  1456. \shape up
  1457. \size normal
  1458. \emph off
  1459. \bar no
  1460. \strikeout off
  1461. \xout off
  1462. \uuline off
  1463. \uwave off
  1464. \noun off
  1465. \color none
  1466. 26.3% ± 8.95
  1467. \end_layout
  1468. \end_inset
  1469. </cell>
  1470. <cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" usebox="none">
  1471. \begin_inset Text
  1472. \begin_layout Plain Layout
  1473. \family roman
  1474. \series medium
  1475. \shape up
  1476. \size normal
  1477. \emph off
  1478. \bar no
  1479. \strikeout off
  1480. \xout off
  1481. \uuline off
  1482. \uwave off
  1483. \noun off
  1484. \color none
  1485. 44.6% ± 16.6
  1486. \end_layout
  1487. \end_inset
  1488. </cell>
  1489. <cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" usebox="none">
  1490. \begin_inset Text
  1491. \begin_layout Plain Layout
  1492. \family roman
  1493. \series medium
  1494. \shape up
  1495. \size normal
  1496. \emph off
  1497. \bar no
  1498. \strikeout off
  1499. \xout off
  1500. \uuline off
  1501. \uwave off
  1502. \noun off
  1503. \color none
  1504. 70.1% ± 9.38
  1505. \end_layout
  1506. \end_inset
  1507. </cell>
  1508. <cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" usebox="none">
  1509. \begin_inset Text
  1510. \begin_layout Plain Layout
  1511. \family roman
  1512. \series medium
  1513. \shape up
  1514. \size normal
  1515. \emph off
  1516. \bar no
  1517. \strikeout off
  1518. \xout off
  1519. \uuline off
  1520. \uwave off
  1521. \noun off
  1522. \color none
  1523. 90.7% ± 5.16
  1524. \end_layout
  1525. \end_inset
  1526. </cell>
  1527. <cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" usebox="none">
  1528. \begin_inset Text
  1529. \begin_layout Plain Layout
  1530. \family roman
  1531. \series medium
  1532. \shape up
  1533. \size normal
  1534. \emph off
  1535. \bar no
  1536. \strikeout off
  1537. \xout off
  1538. \uuline off
  1539. \uwave off
  1540. \noun off
  1541. \color none
  1542. 38.8% ± 17.1
  1543. \end_layout
  1544. \end_inset
  1545. </cell>
  1546. <cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" rightline="true" usebox="none">
  1547. \begin_inset Text
  1548. \begin_layout Plain Layout
  1549. \family roman
  1550. \series medium
  1551. \shape up
  1552. \size normal
  1553. \emph off
  1554. \bar no
  1555. \strikeout off
  1556. \xout off
  1557. \uuline off
  1558. \uwave off
  1559. \noun off
  1560. \color none
  1561. 61.2% ± 17.1
  1562. \end_layout
  1563. \end_inset
  1564. </cell>
  1565. </row>
  1566. </lyxtabular>
  1567. \end_inset
  1568. \end_layout
  1569. \begin_layout Plain Layout
  1570. \begin_inset Caption Standard
  1571. \begin_layout Plain Layout
  1572. \series bold
  1573. \begin_inset Argument 1
  1574. status collapsed
  1575. \begin_layout Plain Layout
  1576. Fractions of reads mapping to genomic features in GB and non-GB samples.
  1577. \end_layout
  1578. \end_inset
  1579. \begin_inset CommandInset label
  1580. LatexCommand label
  1581. name "tab:Fractions-of-reads"
  1582. \end_inset
  1583. Fractions of reads mapping to genomic features in GB and non-GB samples.
  1584. \series default
  1585. All values are given as mean ± standard deviation.
  1586. \end_layout
  1587. \end_inset
  1588. \end_layout
  1589. \begin_layout Plain Layout
  1590. \end_layout
  1591. \end_inset
  1592. \end_layout
  1593. \begin_layout Standard
  1594. Another important aspect is that the standard deviations in Table
  1595. \begin_inset CommandInset ref
  1596. LatexCommand ref
  1597. reference "tab:Fractions-of-reads"
  1598. plural "false"
  1599. caps "false"
  1600. noprefix "false"
  1601. \end_inset
  1602. are uniformly smaller in the GB samples than the non-GB ones, indicating
  1603. much greater consistency of yield.
  1604. This is best seen in the percentage of non-globin reads as a fraction of
  1605. total reads aligned to annotated genes (genic reads).
  1606. For the non-GB samples, this measure ranges from 10.9% to 80.9%, while for
  1607. the GB samples it ranges from 81.9% to 99.9% (Figure
  1608. \begin_inset CommandInset ref
  1609. LatexCommand ref
  1610. reference "fig:Fraction-of-genic-reads"
  1611. plural "false"
  1612. caps "false"
  1613. noprefix "false"
  1614. \end_inset
  1615. ).
  1616. This means that for applications where it is critical that each sample
  1617. achieve a specified minimum coverage in order to provide useful information,
  1618. it would be necessary to budget up to 10 times the sequencing depth per
  1619. sample without globin blocking, even though the average yield improvement
  1620. for globin blocking is only 2-fold, because every sample has a chance of
  1621. being 90% globin and 10% useful reads.
  1622. Hence, the more consistent behavior of GB samples makes planning an experiment
  1623. easier and more efficient because it eliminates the need to over-sequence
  1624. every sample in order to guard against the worst case of a high-globin
  1625. fraction.
  1626. \end_layout
  1627. \begin_layout Subsection*
  1628. Globin blocking lowers the noise floor and allows detection of about 2000
  1629. more genes
  1630. \end_layout
  1631. \begin_layout Standard
  1632. \begin_inset Note Note
  1633. status collapsed
  1634. \begin_layout Plain Layout
  1635. TODO Remove extraneous titles from figures
  1636. \end_layout
  1637. \end_inset
  1638. \end_layout
  1639. \begin_layout Standard
  1640. \begin_inset Float figure
  1641. wide false
  1642. sideways false
  1643. status collapsed
  1644. \begin_layout Plain Layout
  1645. \align center
  1646. \begin_inset Graphics
  1647. filename graphics/Globin Paper/figure2 - aveLogCPM-colored.pdf
  1648. \end_inset
  1649. \end_layout
  1650. \begin_layout Plain Layout
  1651. \begin_inset Caption Standard
  1652. \begin_layout Plain Layout
  1653. \series bold
  1654. \begin_inset Argument 1
  1655. status collapsed
  1656. \begin_layout Plain Layout
  1657. Distributions of average group gene abundances when normalized separately
  1658. or together.
  1659. \end_layout
  1660. \end_inset
  1661. \begin_inset CommandInset label
  1662. LatexCommand label
  1663. name "fig:logcpm-dists"
  1664. \end_inset
  1665. Distributions of average group gene abundances when normalized separately
  1666. or together.
  1667. \series default
  1668. All reads in each sequencing library were aligned to the cyno genome, and
  1669. the number of reads uniquely aligning to each gene was counted.
  1670. Genes with zero counts in all libraries were discarded.
  1671. Libraries were normalized using the TMM method.
  1672. Libraries were split into globin-blocked (GB) and non-GB groups and the
  1673. average abundance for each gene in both groups, measured in log2 counts
  1674. per million reads counted, was computed using the aveLogCPM function.
  1675. The distribution of average gene logCPM values was plotted for both groups
  1676. using a kernel density plot to approximate a continuous distribution.
  1677. The logCPM GB distributions are marked in red, non-GB in blue.
  1678. The black vertical line denotes the chosen detection threshold of -1.
  1679. Top panel: Libraries were split into GB and non-GB groups first and normalized
  1680. separately.
  1681. Bottom panel: Libraries were all normalized together first and then split
  1682. into groups.
  1683. \end_layout
  1684. \end_inset
  1685. \end_layout
  1686. \begin_layout Plain Layout
  1687. \end_layout
  1688. \end_inset
  1689. \end_layout
  1690. \begin_layout Standard
  1691. Since globin blocking yields more usable sequencing depth, it should also
  1692. allow detection of more genes at any given threshold.
  1693. When we looked at the distribution of average normalized logCPM values
  1694. across all libraries for genes with at least one read assigned to them,
  1695. we observed the expected bimodal distribution, with a high-abundance "signal"
  1696. peak representing detected genes and a low-abundance "noise" peak representing
  1697. genes whose read count did not rise above the noise floor (Figure
  1698. \begin_inset CommandInset ref
  1699. LatexCommand ref
  1700. reference "fig:logcpm-dists"
  1701. plural "false"
  1702. caps "false"
  1703. noprefix "false"
  1704. \end_inset
  1705. ).
  1706. Consistent with the 2-fold increase in raw counts assigned to non-globin
  1707. genes, the signal peak for GB samples is shifted to the right relative
  1708. to the non-GB signal peak.
  1709. When all the samples are normalized together, this difference is normalized
  1710. out, lining up the signal peaks, and this reveals that, as expected, the
  1711. noise floor for the GB samples is about 2-fold lower.
  1712. This greater separation between signal and noise peaks in the GB samples
  1713. means that low-expression genes should be more easily detected and more
  1714. precisely quantified than in the non-GB samples.
  1715. \end_layout
  1716. \begin_layout Standard
  1717. \begin_inset Float figure
  1718. wide false
  1719. sideways false
  1720. status open
  1721. \begin_layout Plain Layout
  1722. \align center
  1723. \begin_inset Graphics
  1724. filename graphics/Globin Paper/figure3 - detection.pdf
  1725. \end_inset
  1726. \end_layout
  1727. \begin_layout Plain Layout
  1728. \begin_inset Caption Standard
  1729. \begin_layout Plain Layout
  1730. \series bold
  1731. \begin_inset Argument 1
  1732. status collapsed
  1733. \begin_layout Plain Layout
  1734. Gene detections as a function of abundance thresholds in globin-blocked
  1735. (GB) and non-GB samples.
  1736. \end_layout
  1737. \end_inset
  1738. \begin_inset CommandInset label
  1739. LatexCommand label
  1740. name "fig:Gene-detections"
  1741. \end_inset
  1742. Gene detections as a function of abundance thresholds in globin-blocked
  1743. (GB) and non-GB samples.
  1744. \series default
  1745. Average abundance (logCPM,
  1746. \begin_inset Formula $\log_{2}$
  1747. \end_inset
  1748. counts per million reads counted) was computed by separate group normalization
  1749. as described in Figure
  1750. \begin_inset CommandInset ref
  1751. LatexCommand ref
  1752. reference "fig:logcpm-dists"
  1753. plural "false"
  1754. caps "false"
  1755. noprefix "false"
  1756. \end_inset
  1757. for both the GB and non-GB groups, as well as for all samples considered
  1758. as one large group.
  1759. For every integer threshold from -2 to 3, the number of genes detected
  1760. at or above that logCPM threshold was plotted for each group.
  1761. \end_layout
  1762. \end_inset
  1763. \end_layout
  1764. \begin_layout Plain Layout
  1765. \end_layout
  1766. \end_inset
  1767. \end_layout
  1768. \begin_layout Standard
  1769. Based on these distributions, we selected a detection threshold of -1, which
  1770. is approximately the leftmost edge of the trough between the signal and
  1771. noise peaks.
  1772. This represents the most liberal possible detection threshold that doesn't
  1773. call substantial numbers of noise genes as detected.
  1774. Among the full dataset, 13429 genes were detected at this threshold, and
  1775. 22276 were not.
  1776. When considering the GB libraries and non-GB libraries separately and re-comput
  1777. ing normalization factors independently within each group, 14535 genes were
  1778. detected in the GB libraries while only 12460 were detected in the non-GB
  1779. libraries.
  1780. Thus, GB allowed the detection of about 2000 extra genes that were buried under
  1781. the noise floor without GB.
  1782. This pattern of at least 2000 additional genes detected with GB was also
  1783. consistent across a wide range of possible detection thresholds, from -2
  1784. to 3 (see Figure
  1785. \begin_inset CommandInset ref
  1786. LatexCommand ref
  1787. reference "fig:Gene-detections"
  1788. plural "false"
  1789. caps "false"
  1790. noprefix "false"
  1791. \end_inset
  1792. ).
  1793. \end_layout
  1794. \begin_layout Subsection*
  1795. Globin blocking does not add significant additional noise or decrease sample
  1796. quality
  1797. \end_layout
  1798. \begin_layout Standard
  1799. One potential worry is that the globin blocking protocol could perturb the
  1800. levels of non-globin genes.
  1801. There are two kinds of possible perturbations: systematic and random.
  1802. The former is not a major concern for detection of differential expression,
  1803. since a 2-fold change in every sample has no effect on the relative fold
  1804. change between samples.
  1805. In contrast, random perturbations would increase the noise and obscure
  1806. the signal in the dataset, reducing the capacity to detect differential
  1807. expression.
  1808. \end_layout
  1809. \begin_layout Standard
  1810. \begin_inset Float figure
  1811. wide false
  1812. sideways false
  1813. status open
  1814. \begin_layout Plain Layout
  1815. \align center
  1816. \begin_inset Graphics
  1817. filename graphics/Globin Paper/figure4 - maplot-colored.pdf
  1818. \end_inset
  1819. \end_layout
  1820. \begin_layout Plain Layout
  1821. \begin_inset Caption Standard
  1822. \begin_layout Plain Layout
  1823. \begin_inset Argument 1
  1824. status collapsed
  1825. \begin_layout Plain Layout
  1826. MA plot showing effects of globin blocking on each gene's abundance.
  1827. \end_layout
  1828. \end_inset
  1829. \begin_inset CommandInset label
  1830. LatexCommand label
  1831. name "fig:MA-plot"
  1832. \end_inset
  1833. \series bold
  1834. MA plot showing effects of globin blocking on each gene's abundance.
  1835. \series default
  1836. All libraries were normalized together as described in Figure
  1837. \begin_inset CommandInset ref
  1838. LatexCommand ref
  1839. reference "fig:logcpm-dists"
  1840. plural "false"
  1841. caps "false"
  1842. noprefix "false"
  1843. \end_inset
  1844. , and genes with an average logCPM below -1 were filtered out.
  1845. Each remaining gene was tested for differential abundance with respect
  1846. to globin blocking (GB) using edgeR’s quasi-likelihood F-test, fitting a
  1847. negative binomial generalized linear model to the table of read counts in each
  1848. library.
  1849. For each gene, edgeR reported average abundance (logCPM),
  1850. \begin_inset Formula $\log_{2}$
  1851. \end_inset
  1852. fold change (logFC), p-value, and Benjamini-Hochberg adjusted false discovery
  1853. rate (FDR).
  1854. Each gene's logFC was plotted against its logCPM, colored by FDR.
  1855. Red points are significant at ≤10% FDR, and blue are not significant at
  1856. that threshold.
  1857. The alpha and beta globin genes targeted for blocking are marked with large
  1858. triangles, while all other genes are represented as small points.
  1859. \end_layout
  1860. \end_inset
  1861. \end_layout
  1862. \begin_layout Plain Layout
  1863. \end_layout
  1864. \end_inset
  1865. \end_layout
  1866. \begin_layout Standard
  1867. \begin_inset Note Note
  1868. status open
  1869. \begin_layout Plain Layout
  1870. TODO Standardize on
  1871. \begin_inset Quotes eld
  1872. \end_inset
  1873. log2
  1874. \begin_inset Quotes erd
  1875. \end_inset
  1876. notation
  1877. \end_layout
  1878. \end_inset
  1879. \end_layout
  1880. \begin_layout Standard
  1881. The data do indeed show small systematic perturbations in gene levels (Figure
  1882. \begin_inset CommandInset ref
  1883. LatexCommand ref
  1884. reference "fig:MA-plot"
  1885. plural "false"
  1886. caps "false"
  1887. noprefix "false"
  1888. \end_inset
  1889. ).
  1890. Other than the 3 designated alpha and beta globin genes, two other genes
  1891. stand out as having especially large negative log fold changes: HBD and
  1892. LOC1021365.
  1893. HBD, delta globin, is most likely targeted by the blocking oligos due to
  1894. high sequence homology with the other globin genes.
  1895. LOC1021365 is the aforementioned ncRNA that is reverse-complementary to
  1896. one of the alpha-like genes and that would be expected to be removed during
  1897. the globin blocking step.
  1898. All other genes appear in a cluster centered vertically at 0, and the vast
  1899. majority of genes in this cluster show an absolute log2(FC) of 0.5 or less.
  1900. Nevertheless, many of these small perturbations are still statistically
  1901. significant, indicating that the globin blocking oligos likely cause very
  1902. small but non-zero systematic perturbations in measured gene expression
  1903. levels.
  1904. \end_layout
  1905. \begin_layout Standard
  1906. \begin_inset Float figure
  1907. wide false
  1908. sideways false
  1909. status open
  1910. \begin_layout Plain Layout
  1911. \align center
  1912. \begin_inset Graphics
  1913. filename graphics/Globin Paper/figure5 - corrplot.pdf
  1914. \end_inset
  1915. \end_layout
  1916. \begin_layout Plain Layout
  1917. \begin_inset Caption Standard
  1918. \begin_layout Plain Layout
  1919. \series bold
  1920. \begin_inset Argument 1
  1921. status collapsed
  1922. \begin_layout Plain Layout
  1923. Comparison of inter-sample gene abundance correlations with and without
  1924. globin blocking.
  1925. \end_layout
  1926. \end_inset
  1927. \begin_inset CommandInset label
  1928. LatexCommand label
  1929. name "fig:gene-abundance-correlations"
  1930. \end_inset
  1931. Comparison of inter-sample gene abundance correlations with and without
  1932. globin blocking (GB).
  1933. \series default
  1934. All libraries were normalized together as described in Figure 2, and genes
  1935. with an average abundance (logCPM, log2 counts per million reads counted)
  1936. less than -1 were filtered out.
  1937. Each gene’s logCPM was computed in each library using the edgeR cpm function.
  1938. For each pair of biological samples, the Pearson correlation between those
  1939. samples' GB libraries was plotted against the correlation between the same
  1940. samples’ non-GB libraries.
  1941. Each point represents a unique pair of samples.
  1942. The solid gray line shows a quantile-quantile plot of the distribution of GB
  1943. correlations vs.
  1944. that of non-GB correlations.
  1945. The thin dashed line is the identity line, provided for reference.
  1946. \end_layout
  1947. \end_inset
  1948. \end_layout
  1949. \begin_layout Plain Layout
  1950. \end_layout
  1951. \end_inset
  1952. \end_layout
  1953. \begin_layout Standard
  1954. To evaluate the possibility of globin blocking causing random perturbations
  1955. and reducing sample quality, we computed the Pearson correlation between
  1956. logCPM values for every pair of samples with and without GB and plotted
  1957. them against each other (Figure
  1958. \begin_inset CommandInset ref
  1959. LatexCommand ref
  1960. reference "fig:gene-abundance-correlations"
  1961. plural "false"
  1962. caps "false"
  1963. noprefix "false"
  1964. \end_inset
  1965. ).
  1966. The plot indicated that the GB libraries have higher sample-to-sample correlati
  1967. ons than the non-GB libraries.
  1968. Parametric and nonparametric tests for differences between the correlations
  1969. with and without GB both confirmed that this difference was highly significant
  1970. (2-sided paired t-test: t = 37.2, df = 665, P ≪ 2.2e-16; 2-sided Wilcoxon
  1971. signed-rank test: V = 2195, P ≪ 2.2e-16).
  1972. Performing the same tests on the Spearman correlations gave the same conclusion
  1973. (t-test: t = 26.8, df = 665, P ≪ 2.2e-16; signed-rank test: V = 8781, P ≪ 2.2e-16).
  1974. The edgeR package was used to compute the overall biological coefficient
  1975. of variation (BCV) for GB and non-GB libraries, and we found that globin blocking
  1976. resulted in a negligible increase in the BCV (0.417 with GB vs.
  1977. 0.400 without).
  1978. The near equality of the BCVs for both sets indicates that the higher correlati
  1979. ons in the GB libraries are most likely a result of the increased yield
  1980. of useful reads, which reduces the contribution of Poisson counting uncertainty
  1981. to the overall variance of the logCPM values
  1982. \begin_inset CommandInset citation
  1983. LatexCommand cite
  1984. key "McCarthy2012"
  1985. literal "false"
  1986. \end_inset
  1987. .
  1988. This improves the precision of expression measurements and more than offsets
  1989. the negligible increase in BCV.
  1990. \end_layout
  1991. \begin_layout Subsection*
  1992. More differentially expressed genes are detected with globin blocking
  1993. \end_layout
  1994. \begin_layout Standard
  1995. \begin_inset Float table
  1996. wide false
  1997. sideways false
  1998. status open
  1999. \begin_layout Plain Layout
  2000. \align center
  2001. \begin_inset Tabular
  2002. <lyxtabular version="3" rows="5" columns="5">
  2003. <features tabularvalignment="middle">
  2004. <column alignment="center" valignment="top">
  2005. <column alignment="center" valignment="top">
  2006. <column alignment="center" valignment="top">
  2007. <column alignment="center" valignment="top">
  2008. <column alignment="center" valignment="top">
  2009. <row>
  2010. <cell alignment="center" valignment="top" usebox="none">
  2011. \begin_inset Text
  2012. \begin_layout Plain Layout
  2013. \end_layout
  2014. \end_inset
  2015. </cell>
  2016. <cell alignment="center" valignment="top" usebox="none">
  2017. \begin_inset Text
  2018. \begin_layout Plain Layout
  2019. \end_layout
  2020. \end_inset
  2021. </cell>
  2022. <cell multicolumn="1" alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
  2023. \begin_inset Text
  2024. \begin_layout Plain Layout
  2025. \series bold
  2026. No Globin Blocking
  2027. \end_layout
  2028. \end_inset
  2029. </cell>
  2030. <cell multicolumn="2" alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" usebox="none">
  2031. \begin_inset Text
  2032. \begin_layout Plain Layout
  2033. \end_layout
  2034. \end_inset
  2035. </cell>
  2036. <cell multicolumn="2" alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" rightline="true" usebox="none">
  2037. \begin_inset Text
  2038. \begin_layout Plain Layout
  2039. \end_layout
  2040. \end_inset
  2041. </cell>
  2042. </row>
  2043. <row>
  2044. <cell alignment="center" valignment="top" usebox="none">
  2045. \begin_inset Text
  2046. \begin_layout Plain Layout
  2047. \end_layout
  2048. \end_inset
  2049. </cell>
  2050. <cell alignment="center" valignment="top" usebox="none">
  2051. \begin_inset Text
  2052. \begin_layout Plain Layout
  2053. \end_layout
  2054. \end_inset
  2055. </cell>
  2056. <cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
  2057. \begin_inset Text
  2058. \begin_layout Plain Layout
  2059. \series bold
  2060. Up
  2061. \end_layout
  2062. \end_inset
  2063. </cell>
  2064. <cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
  2065. \begin_inset Text
  2066. \begin_layout Plain Layout
  2067. \series bold
  2068. NS
  2069. \end_layout
  2070. \end_inset
  2071. </cell>
  2072. <cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
  2073. \begin_inset Text
  2074. \begin_layout Plain Layout
  2075. \series bold
  2076. Down
  2077. \end_layout
  2078. \end_inset
  2079. </cell>
  2080. </row>
  2081. <row>
  2082. <cell multirow="3" alignment="center" valignment="middle" topline="true" bottomline="true" leftline="true" usebox="none">
  2083. \begin_inset Text
  2084. \begin_layout Plain Layout
  2085. \series bold
  2086. Globin Blocking
  2087. \end_layout
  2088. \end_inset
  2089. </cell>
  2090. <cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
  2091. \begin_inset Text
  2092. \begin_layout Plain Layout
  2093. \series bold
  2094. Up
  2095. \end_layout
  2096. \end_inset
  2097. </cell>
  2098. <cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
  2099. \begin_inset Text
  2100. \begin_layout Plain Layout
  2101. \family roman
  2102. \series medium
  2103. \shape up
  2104. \size normal
  2105. \emph off
  2106. \bar no
  2107. \strikeout off
  2108. \xout off
  2109. \uuline off
  2110. \uwave off
  2111. \noun off
  2112. \color none
  2113. 231
  2114. \end_layout
  2115. \end_inset
  2116. </cell>
  2117. <cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
  2118. \begin_inset Text
  2119. \begin_layout Plain Layout
  2120. \family roman
  2121. \series medium
  2122. \shape up
  2123. \size normal
  2124. \emph off
  2125. \bar no
  2126. \strikeout off
  2127. \xout off
  2128. \uuline off
  2129. \uwave off
  2130. \noun off
  2131. \color none
  2132. 515
  2133. \end_layout
  2134. \end_inset
  2135. </cell>
  2136. <cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
  2137. \begin_inset Text
  2138. \begin_layout Plain Layout
  2139. \family roman
  2140. \series medium
  2141. \shape up
  2142. \size normal
  2143. \emph off
  2144. \bar no
  2145. \strikeout off
  2146. \xout off
  2147. \uuline off
  2148. \uwave off
  2149. \noun off
  2150. \color none
  2151. 2
  2152. \end_layout
  2153. \end_inset
  2154. </cell>
  2155. </row>
  2156. <row>
  2157. <cell multirow="4" alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
  2158. \begin_inset Text
  2159. \begin_layout Plain Layout
  2160. \end_layout
  2161. \end_inset
  2162. </cell>
  2163. <cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
  2164. \begin_inset Text
  2165. \begin_layout Plain Layout
  2166. \series bold
  2167. NS
  2168. \end_layout
  2169. \end_inset
  2170. </cell>
  2171. <cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
  2172. \begin_inset Text
  2173. \begin_layout Plain Layout
  2174. \family roman
  2175. \series medium
  2176. \shape up
  2177. \size normal
  2178. \emph off
  2179. \bar no
  2180. \strikeout off
  2181. \xout off
  2182. \uuline off
  2183. \uwave off
  2184. \noun off
  2185. \color none
  2186. 160
  2187. \end_layout
  2188. \end_inset
  2189. </cell>
  2190. <cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
  2191. \begin_inset Text
  2192. \begin_layout Plain Layout
  2193. \family roman
  2194. \series medium
  2195. \shape up
  2196. \size normal
  2197. \emph off
  2198. \bar no
  2199. \strikeout off
  2200. \xout off
  2201. \uuline off
  2202. \uwave off
  2203. \noun off
  2204. \color none
  2205. 11235
  2206. \end_layout
  2207. \end_inset
  2208. </cell>
  2209. <cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
  2210. \begin_inset Text
  2211. \begin_layout Plain Layout
  2212. \family roman
  2213. \series medium
  2214. \shape up
  2215. \size normal
  2216. \emph off
  2217. \bar no
  2218. \strikeout off
  2219. \xout off
  2220. \uuline off
  2221. \uwave off
  2222. \noun off
  2223. \color none
  2224. 136
  2225. \end_layout
  2226. \end_inset
  2227. </cell>
  2228. </row>
  2229. <row>
  2230. <cell multirow="4" alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" usebox="none">
  2231. \begin_inset Text
  2232. \begin_layout Plain Layout
  2233. \end_layout
  2234. \end_inset
  2235. </cell>
  2236. <cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" usebox="none">
  2237. \begin_inset Text
  2238. \begin_layout Plain Layout
  2239. \series bold
  2240. Down
  2241. \end_layout
  2242. \end_inset
  2243. </cell>
  2244. <cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" usebox="none">
  2245. \begin_inset Text
  2246. \begin_layout Plain Layout
  2247. \family roman
  2248. \series medium
  2249. \shape up
  2250. \size normal
  2251. \emph off
  2252. \bar no
  2253. \strikeout off
  2254. \xout off
  2255. \uuline off
  2256. \uwave off
  2257. \noun off
  2258. \color none
  2259. 0
  2260. \end_layout
  2261. \end_inset
  2262. </cell>
  2263. <cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" usebox="none">
  2264. \begin_inset Text
  2265. \begin_layout Plain Layout
  2266. \family roman
  2267. \series medium
  2268. \shape up
  2269. \size normal
  2270. \emph off
  2271. \bar no
  2272. \strikeout off
  2273. \xout off
  2274. \uuline off
  2275. \uwave off
  2276. \noun off
  2277. \color none
  2278. 548
  2279. \end_layout
  2280. \end_inset
  2281. </cell>
  2282. <cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" rightline="true" usebox="none">
  2283. \begin_inset Text
  2284. \begin_layout Plain Layout
  2285. \family roman
  2286. \series medium
  2287. \shape up
  2288. \size normal
  2289. \emph off
  2290. \bar no
  2291. \strikeout off
  2292. \xout off
  2293. \uuline off
  2294. \uwave off
  2295. \noun off
  2296. \color none
  2297. 127
  2298. \end_layout
  2299. \end_inset
  2300. </cell>
  2301. </row>
  2302. </lyxtabular>
  2303. \end_inset
  2304. \end_layout
  2305. \begin_layout Plain Layout
  2306. \begin_inset Caption Standard
  2307. \begin_layout Plain Layout
  2308. \series bold
  2309. \begin_inset Argument 1
  2310. status open
  2311. \begin_layout Plain Layout
  2312. Comparison of significantly differentially expressed genes with and without
  2313. globin blocking.
  2314. \end_layout
  2315. \end_inset
  2316. \begin_inset CommandInset label
  2317. LatexCommand label
  2318. name "tab:Comparison-of-significant"
  2319. \end_inset
  2320. Comparison of significantly differentially expressed genes with and without
  2321. globin blocking.
  2322. \series default
  2323. Up, Down: Genes significantly up/down-regulated in post-transplant samples
  2324. relative to pre-transplant samples, with a false discovery rate of 10%
  2325. or less.
  2326. NS: Non-significant genes (false discovery rate greater than 10%).
  2327. \end_layout
  2328. \end_inset
  2329. \end_layout
  2330. \begin_layout Plain Layout
  2331. \end_layout
  2332. \end_inset
  2333. \end_layout
  2334. \begin_layout Standard
  2335. To compare performance on differential gene expression tests, we took subsets
  2336. of both the GB and non-GB libraries with exactly one pre-transplant and
  2337. one post-transplant sample for each animal that had paired samples available
  2338. for analysis (N=7 animals, N=14 samples in each subset).
  2339. The same test for pre- vs.
  2340. post-transplant differential gene expression was performed on the same
  2341. 7 pairs of samples from GB libraries and non-GB libraries, in each case
  2342. using an FDR of 10% as the threshold of significance.
  2343. Out of 12954 genes that passed the detection threshold in both subsets,
  2344. 358 were called significantly differentially expressed in the same direction
  2345. in both sets; 1063 were differentially expressed in the GB set only; 296
  2346. were differentially expressed in the non-GB set only; 2 genes were called
  2347. significantly up in the GB set but significantly down in the non-GB set;
  2348. and the remaining 11235 were not called differentially expressed in either
  2349. set.
  2350. These data are summarized in Table
  2351. \begin_inset CommandInset ref
  2352. LatexCommand ref
  2353. reference "tab:Comparison-of-significant"
  2354. plural "false"
  2355. caps "false"
  2356. noprefix "false"
  2357. \end_inset
  2358. .
  2359. The differences in BCV calculated by EdgeR for these subsets of samples
  2360. were negligible (BCV = 0.302 for GB and 0.297 for non-GB).
  2361. \end_layout
  2362. \begin_layout Standard
  2363. The key point is that the GB data result in substantially more differential
  2364. expression calls than the non-GB data.
  2365. Since there is no gold standard for this dataset, it is impossible to be
  2366. certain whether this is due to under-calling of differential expression
  2367. in the non-GB samples or over-calling in the GB samples.
  2368. However, given that both datasets are derived from the same biological
  2369. samples and have nearly equal BCVs, it is more likely that the larger number
  2370. of DE calls in the GB samples are genuine detections that were enabled
  2371. by the higher sequencing depth and measurement precision of the GB samples.
  2372. Note that the same set of genes was considered in both subsets, so the
  2373. larger number of differentially expressed gene calls in the GB data set
  2374. reflects a greater sensitivity to detect significant differential gene
  2375. expression and not simply the larger total number of detected genes in
  2376. GB samples described earlier.
  2377. \end_layout
  2378. \begin_layout Section
  2379. Discussion
  2380. \end_layout
  2381. \begin_layout Standard
  2382. The original experience with whole blood gene expression profiling on DNA
  2383. microarrays demonstrated that the high concentration of globin transcripts
  2384. significantly reduced the sensitivity to detect genes with relatively low
  2385. expression levels.
  2386. To address this limitation, commercial protocols for globin reduction were
  2387. developed based on strategies to block globin transcript amplification
  2388. during labeling or physically removing globin transcripts by affinity bead
  2389. methods
  2390. \begin_inset CommandInset citation
  2391. LatexCommand cite
  2392. key "Winn2010"
  2393. literal "false"
  2394. \end_inset
  2395. .
  2396. More recently, using the latest generation of labeling protocols and arrays,
  2397. it was determined that globin reduction was no longer necessary to obtain
  2398. sufficient sensitivity to detect differential transcript expression
  2399. \begin_inset CommandInset citation
  2400. LatexCommand cite
  2401. key "NuGEN2010"
  2402. literal "false"
  2403. \end_inset
  2404. .
  2405. However, we are not aware of any publications using these currently available
  2406. protocols with the latest generation of microarrays that actually compare
  2407. the detection sensitivity with and without globin reduction.
  2408. Nevertheless, in practice, omitting globin reduction has now been generally
  2409. adopted, driven primarily by concerns for cost control.
  2410. The main objective of our work was to directly test the impact of globin
  2411. gene transcripts and a new globin blocking protocol for application to
  2412. the newest generation of differential gene expression profiling determined
  2413. using next generation sequencing.
  2414. \end_layout
  2415. \begin_layout Standard
  2416. The challenge of doing global gene expression profiling in cynomolgus monkeys
  2417. is that the currently available arrays were never designed to comprehensively
  2418. cover this genome and have not been updated since the first assemblies
  2419. of the cynomolgus genome were published.
  2420. Therefore, we determined that the best strategy for peripheral blood profiling
  2421. was to do deep RNA-seq and inform the workflow using the latest available
  2422. genome assembly and annotation
  2423. \begin_inset CommandInset citation
  2424. LatexCommand cite
  2425. key "Wilson2013"
  2426. literal "false"
  2427. \end_inset
  2428. .
  2429. However, it was not immediately clear whether globin reduction was necessary
  2430. for RNA-seq or how much improvement in efficiency or sensitivity to detect
  2431. differential gene expression would be achieved for the added cost and work.
  2432. \end_layout
  2433. \begin_layout Standard
  2434. We found only one report that demonstrated that globin reduction significantly
  2435. improved the effective read yields for sequencing of human peripheral blood
  2436. cell RNA using a DeepSAGE protocol
  2437. \begin_inset CommandInset citation
  2438. LatexCommand cite
  2439. key "Mastrokolias2012"
  2440. literal "false"
  2441. \end_inset
  2442. .
  2443. The approach to DeepSAGE involves two different restriction enzymes that
  2444. purify and then tag small fragments of transcripts at specific locations
  2445. and thus significantly reduces the complexity of the transcriptome.
  2446. Therefore, we could not determine how DeepSAGE results would translate
  2447. to the common strategy in the field for assaying the entire transcript
  2448. population by whole-transcriptome 3’-end RNA-seq.
  2449. Furthermore, if globin reduction was necessary, we also needed a globin
  2450. reduction method specific to cynomolgus globin sequences that would work
  2451. in an organism for which no kit is available off the shelf.
  2452. \end_layout
  2453. \begin_layout Standard
  2454. As mentioned above, the addition of globin blocking oligos has a very small
  2455. impact on measured gene expression levels.
  2456. However, this is a non-issue for the purposes of differential expression
  2457. testing, since a systematic change in a gene in all samples does not affect
  2458. relative expression levels between samples.
  2459. Nevertheless, we must acknowledge that simple comparisons of gene expression
  2460. data obtained by GB and non-GB protocols are not possible without additional
  2461. normalization.
  2462. \end_layout
  2463. \begin_layout Standard
  2464. More importantly, globin blocking not only nearly doubles the yield of usable
  2465. reads, it also increases inter-sample correlation and sensitivity to detect
  2466. differential gene expression relative to the same set of samples profiled
  2467. without blocking.
  2468. In addition, globin blocking does not add a significant amount of random
  2469. noise to the data.
  2470. Globin blocking thus represents a cost-effective way to squeeze more data
  2471. and statistical power out of the same blood samples and the same amount
  2472. of sequencing.
  2473. In conclusion, globin reduction greatly increases the yield of useful RNA-seq
  2474. reads mapping to the rest of the genome, with minimal perturbations in
  2475. the relative levels of non-globin genes.
  2476. Based on these results, globin transcript reduction using sequence-specific,
  2477. complementary blocking oligonucleotides is recommended for all deep RNA-seq
  2478. of cynomolgus and other nonhuman primate blood samples.
  2479. \end_layout
  2480. \begin_layout Chapter
  2481. Future Directions
  2482. \end_layout
  2483. \begin_layout Itemize
  2484. Study other epigenetic marks in more contexts
  2485. \end_layout
  2486. \begin_deeper
  2487. \begin_layout Itemize
  2488. DNA methylation, histone marks, chromatin accessibility & conformation in
  2489. CD4 T-cells
  2490. \end_layout
  2491. \begin_layout Itemize
  2492. Also look at other types of lymphocytes: CD8 T-cells, B-cells, NK cells
  2493. \end_layout
  2494. \end_deeper
  2495. \begin_layout Itemize
  2496. Investigate epigenetic regulation of lifespan extension in
  2497. \emph on
  2498. C.
  2499. elegans
  2500. \end_layout
  2501. \begin_deeper
  2502. \begin_layout Itemize
  2503. ChIP-seq of important transcriptional regulators to see how transcriptional
  2504. drift is prevented
  2505. \end_layout
  2506. \end_deeper
  2507. \begin_layout Standard
  2508. \begin_inset ERT
  2509. status open
  2510. \begin_layout Plain Layout
  2511. % Use "References" instead of "Bibliography"
  2512. \end_layout
  2513. \begin_layout Plain Layout
  2514. \backslash
  2515. renewcommand{
  2516. \backslash
  2517. bibname}{References}
  2518. \end_layout
  2519. \end_inset
  2520. \end_layout
  2521. \begin_layout Standard
  2522. \begin_inset Note Note
  2523. status open
  2524. \begin_layout Plain Layout
  2525. TODO: Check bib entry formatting
  2526. \end_layout
  2527. \end_inset
  2528. \end_layout
  2529. \begin_layout Standard
  2530. \begin_inset CommandInset bibtex
  2531. LatexCommand bibtex
  2532. bibfiles "refs"
  2533. options "plain"
  2534. \end_inset
  2535. \end_layout
  2536. \end_body
  2537. \end_document