thesis.lyx 349 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177
71778177917801781178217831784178517861787178817891790179117921793179417951796179717981799180018011802180318041805180618071808180918101811181218131814181518161817181818191820182118221823182418251826182718281829183018311832183318341835183618371838183918401841184218431844184518461847184818491850185118521853185418551856185718581859186018611862186318641865186618671868186918701871187218731874187518761877187818791880188118821883188418851886188718881889189018911892189318941895189618971898189919001901190219031904190519061907190819091910191119121913191419151916191719181919192019211922192319241925192619271928192919301931193219331934193519361937193819391940194119421943194419451946194719481949195019511952195319541955195619571958195919601961196219631964196519661967196819691970197119721973197419751976197719781979198019811982198319841985198619871988198919901991199219931994199519961997199819992000200120022003200420052006200720082009201020112012201320142015201620172018201920202021202220232024202520262027202820292030203120322033203420352036203720382039204020412042204320442045204620472048204920502051205220532054205520562057205820592060206120622063206420652066206720682069207020712072207320742075207620772078207920802081208220832084208520862087208820892090209120922093209420952096209720982099210021012102210321042105210621072108210921102111211221132114211521162117211821192120212121222123212421252126212721282129213021312132213321342135213621372138213921402141214221432144214521462147214821492150215121522153215421552156215721582159216021612162216321642165216621672168216921702171217221732174217521762177217821792180218121822183218421852186218721882189219021912192219321942195219621972198219922002201220222032204220522062207220822092210221122122213221422152216221722182219222022212222222322242225222622272228222922302231223222332234223522362237223822392240224122422243224422452246224722482249225022512252225322542255225622572258225922602261226222632264226522662267226822692270227122722273227422752276227
72278227922802281228222832284228522862287228822892290229122922293229422952296229722982299230023012302230323042305230623072308230923102311231223132314231523162317231823192320232123222323232423252326232723282329233023312332233323342335233623372338233923402341234223432344234523462347234823492350235123522353235423552356235723582359236023612362236323642365236623672368236923702371237223732374237523762377237823792380238123822383238423852386238723882389239023912392239323942395239623972398239924002401240224032404240524062407240824092410241124122413241424152416241724182419242024212422242324242425242624272428242924302431243224332434243524362437243824392440244124422443244424452446244724482449245024512452245324542455245624572458245924602461246224632464246524662467246824692470247124722473247424752476247724782479248024812482248324842485248624872488248924902491249224932494249524962497249824992500250125022503250425052506250725082509251025112512251325142515251625172518251925202521252225232524252525262527252825292530253125322533253425352536253725382539254025412542254325442545254625472548254925502551255225532554255525562557255825592560256125622563256425652566256725682569257025712572257325742575257625772578257925802581258225832584258525862587258825892590259125922593259425952596259725982599260026012602260326042605260626072608260926102611261226132614261526162617261826192620262126222623262426252626262726282629263026312632263326342635263626372638263926402641264226432644264526462647264826492650265126522653265426552656265726582659266026612662266326642665266626672668266926702671267226732674267526762677267826792680268126822683268426852686268726882689269026912692269326942695269626972698269927002701270227032704270527062707270827092710271127122713271427152716271727182719272027212722272327242725272627272728272927302731273227332734273527362737273827392740274127422743274427452746274727482749275027512752275327542755275627572758275927602761276227632764276527662767276827692770277127722773277427752776277
72778277927802781278227832784278527862787278827892790279127922793279427952796279727982799280028012802280328042805280628072808280928102811281228132814281528162817281828192820282128222823282428252826282728282829283028312832283328342835283628372838283928402841284228432844284528462847284828492850285128522853285428552856285728582859286028612862286328642865286628672868286928702871287228732874287528762877287828792880288128822883288428852886288728882889289028912892289328942895289628972898289929002901290229032904290529062907290829092910291129122913291429152916291729182919292029212922292329242925292629272928292929302931293229332934293529362937293829392940294129422943294429452946294729482949295029512952295329542955295629572958295929602961296229632964296529662967296829692970297129722973297429752976297729782979298029812982298329842985298629872988298929902991299229932994299529962997299829993000300130023003300430053006300730083009301030113012301330143015301630173018301930203021302230233024302530263027302830293030303130323033303430353036303730383039304030413042304330443045304630473048304930503051305230533054305530563057305830593060306130623063306430653066306730683069307030713072307330743075307630773078307930803081308230833084308530863087308830893090309130923093309430953096309730983099310031013102310331043105310631073108310931103111311231133114311531163117311831193120312131223123312431253126312731283129313031313132313331343135313631373138313931403141314231433144314531463147314831493150315131523153315431553156315731583159316031613162316331643165316631673168316931703171317231733174317531763177317831793180318131823183318431853186318731883189319031913192319331943195319631973198319932003201320232033204320532063207320832093210321132123213321432153216321732183219322032213222322332243225322632273228322932303231323232333234323532363237323832393240324132423243324432453246324732483249325032513252325332543255325632573258325932603261326232633264326532663267326832693270327132723273327432753276327
73278327932803281328232833284328532863287328832893290329132923293329432953296329732983299330033013302330333043305330633073308330933103311331233133314331533163317331833193320332133223323332433253326332733283329333033313332333333343335333633373338333933403341334233433344334533463347334833493350335133523353335433553356335733583359336033613362336333643365336633673368336933703371337233733374337533763377337833793380338133823383338433853386338733883389339033913392339333943395339633973398339934003401340234033404340534063407340834093410341134123413341434153416341734183419342034213422342334243425342634273428342934303431343234333434343534363437343834393440344134423443344434453446344734483449345034513452345334543455345634573458345934603461346234633464346534663467346834693470347134723473347434753476347734783479348034813482348334843485348634873488348934903491349234933494349534963497349834993500350135023503350435053506350735083509351035113512351335143515351635173518351935203521352235233524352535263527352835293530353135323533353435353536353735383539354035413542354335443545354635473548354935503551355235533554355535563557355835593560356135623563356435653566356735683569357035713572357335743575357635773578357935803581358235833584358535863587358835893590359135923593359435953596359735983599360036013602360336043605360636073608360936103611361236133614361536163617361836193620362136223623362436253626362736283629363036313632363336343635363636373638363936403641364236433644364536463647364836493650365136523653365436553656365736583659366036613662366336643665366636673668366936703671367236733674367536763677367836793680368136823683368436853686368736883689369036913692369336943695369636973698369937003701370237033704370537063707370837093710371137123713371437153716371737183719372037213722372337243725372637273728372937303731373237333734373537363737373837393740374137423743374437453746374737483749375037513752375337543755375637573758375937603761376237633764376537663767376837693770377137723773377437753776377
73778377937803781378237833784378537863787378837893790379137923793379437953796379737983799380038013802380338043805380638073808380938103811381238133814381538163817381838193820382138223823382438253826382738283829383038313832383338343835383638373838383938403841384238433844384538463847384838493850385138523853385438553856385738583859386038613862386338643865386638673868386938703871387238733874387538763877387838793880388138823883388438853886388738883889389038913892389338943895389638973898389939003901390239033904390539063907390839093910391139123913391439153916391739183919392039213922392339243925392639273928392939303931393239333934393539363937393839393940394139423943394439453946394739483949395039513952395339543955395639573958395939603961396239633964396539663967396839693970397139723973397439753976397739783979398039813982398339843985398639873988398939903991399239933994399539963997399839994000400140024003400440054006400740084009401040114012401340144015401640174018401940204021402240234024402540264027402840294030403140324033403440354036403740384039404040414042404340444045404640474048404940504051405240534054405540564057405840594060406140624063406440654066406740684069407040714072407340744075407640774078407940804081408240834084408540864087408840894090409140924093409440954096409740984099410041014102410341044105410641074108410941104111411241134114411541164117411841194120412141224123412441254126412741284129413041314132413341344135413641374138413941404141414241434144414541464147414841494150415141524153415441554156415741584159416041614162416341644165416641674168416941704171417241734174417541764177417841794180418141824183418441854186418741884189419041914192419341944195419641974198419942004201420242034204420542064207420842094210421142124213421442154216421742184219422042214222422342244225422642274228422942304231423242334234423542364237423842394240424142424243424442454246424742484249425042514252425342544255425642574258425942604261426242634264426542664267426842694270427142724273427442754276427
74278427942804281428242834284428542864287428842894290429142924293429442954296429742984299430043014302430343044305430643074308430943104311431243134314431543164317431843194320432143224323432443254326432743284329433043314332433343344335433643374338433943404341434243434344434543464347434843494350435143524353435443554356435743584359436043614362436343644365436643674368436943704371437243734374437543764377437843794380438143824383438443854386438743884389439043914392439343944395439643974398439944004401440244034404440544064407440844094410441144124413441444154416441744184419442044214422442344244425442644274428442944304431443244334434443544364437443844394440444144424443444444454446444744484449445044514452445344544455445644574458445944604461446244634464446544664467446844694470447144724473447444754476447744784479448044814482448344844485448644874488448944904491449244934494449544964497449844994500450145024503450445054506450745084509451045114512451345144515451645174518451945204521452245234524452545264527452845294530453145324533453445354536453745384539454045414542454345444545454645474548454945504551455245534554455545564557455845594560456145624563456445654566456745684569457045714572457345744575457645774578457945804581458245834584458545864587458845894590459145924593459445954596459745984599460046014602460346044605460646074608460946104611461246134614461546164617461846194620462146224623462446254626462746284629463046314632463346344635463646374638463946404641464246434644464546464647464846494650465146524653465446554656465746584659466046614662466346644665466646674668466946704671467246734674467546764677467846794680468146824683468446854686468746884689469046914692469346944695469646974698469947004701470247034704470547064707470847094710471147124713471447154716471747184719472047214722472347244725472647274728472947304731473247334734473547364737473847394740474147424743474447454746474747484749475047514752475347544755475647574758475947604761476247634764476547664767476847694770477147724773477447754776477
74778477947804781478247834784478547864787478847894790479147924793479447954796479747984799480048014802480348044805480648074808480948104811481248134814481548164817481848194820482148224823482448254826482748284829483048314832483348344835483648374838483948404841484248434844484548464847484848494850485148524853485448554856485748584859486048614862486348644865486648674868486948704871487248734874487548764877487848794880488148824883488448854886488748884889489048914892489348944895489648974898489949004901490249034904490549064907490849094910491149124913491449154916491749184919492049214922492349244925492649274928492949304931493249334934493549364937493849394940494149424943494449454946494749484949495049514952495349544955495649574958495949604961496249634964496549664967496849694970497149724973497449754976497749784979498049814982498349844985498649874988498949904991499249934994499549964997499849995000500150025003500450055006500750085009501050115012501350145015501650175018501950205021502250235024502550265027502850295030503150325033503450355036503750385039504050415042504350445045504650475048504950505051505250535054505550565057505850595060506150625063506450655066506750685069507050715072507350745075507650775078507950805081508250835084508550865087508850895090509150925093509450955096509750985099510051015102510351045105510651075108510951105111511251135114511551165117511851195120512151225123512451255126512751285129513051315132513351345135513651375138513951405141514251435144514551465147514851495150515151525153515451555156515751585159516051615162516351645165516651675168516951705171517251735174517551765177517851795180518151825183518451855186518751885189519051915192519351945195519651975198519952005201520252035204520552065207520852095210521152125213521452155216521752185219522052215222522352245225522652275228522952305231523252335234523552365237523852395240524152425243524452455246524752485249525052515252525352545255525652575258525952605261526252635264526552665267526852695270527152725273527452755276527
75278527952805281528252835284528552865287528852895290529152925293529452955296529752985299530053015302530353045305530653075308530953105311531253135314531553165317531853195320532153225323532453255326532753285329533053315332533353345335533653375338533953405341534253435344534553465347534853495350535153525353535453555356535753585359536053615362536353645365536653675368536953705371537253735374537553765377537853795380538153825383538453855386538753885389539053915392539353945395539653975398539954005401540254035404540554065407540854095410541154125413541454155416541754185419542054215422542354245425542654275428542954305431543254335434543554365437543854395440544154425443544454455446544754485449545054515452545354545455545654575458545954605461546254635464546554665467546854695470547154725473547454755476547754785479548054815482548354845485548654875488548954905491549254935494549554965497549854995500550155025503550455055506550755085509551055115512551355145515551655175518551955205521552255235524552555265527552855295530553155325533553455355536553755385539554055415542554355445545554655475548554955505551555255535554555555565557555855595560556155625563556455655566556755685569557055715572557355745575557655775578557955805581558255835584558555865587558855895590559155925593559455955596559755985599560056015602560356045605560656075608560956105611561256135614561556165617561856195620562156225623562456255626562756285629563056315632563356345635563656375638563956405641564256435644564556465647564856495650565156525653565456555656565756585659566056615662566356645665566656675668566956705671567256735674567556765677567856795680568156825683568456855686568756885689569056915692569356945695569656975698569957005701570257035704570557065707570857095710571157125713571457155716571757185719572057215722572357245725572657275728572957305731573257335734573557365737573857395740574157425743574457455746574757485749575057515752575357545755575657575758575957605761576257635764576557665767576857695770577157725773577457755776577
75778577957805781578257835784578557865787578857895790579157925793579457955796579757985799580058015802580358045805580658075808580958105811581258135814581558165817581858195820582158225823582458255826582758285829583058315832583358345835583658375838583958405841584258435844584558465847584858495850585158525853585458555856585758585859586058615862586358645865586658675868586958705871587258735874587558765877587858795880588158825883588458855886588758885889589058915892589358945895589658975898589959005901590259035904590559065907590859095910591159125913591459155916591759185919592059215922592359245925592659275928592959305931593259335934593559365937593859395940594159425943594459455946594759485949595059515952595359545955595659575958595959605961596259635964596559665967596859695970597159725973597459755976597759785979598059815982598359845985598659875988598959905991599259935994599559965997599859996000600160026003600460056006600760086009601060116012601360146015601660176018601960206021602260236024602560266027602860296030603160326033603460356036603760386039604060416042604360446045604660476048604960506051605260536054605560566057605860596060606160626063606460656066606760686069607060716072607360746075607660776078607960806081608260836084608560866087608860896090609160926093609460956096609760986099610061016102610361046105610661076108610961106111611261136114611561166117611861196120612161226123612461256126612761286129613061316132613361346135613661376138613961406141614261436144614561466147614861496150615161526153615461556156615761586159616061616162616361646165616661676168616961706171617261736174617561766177617861796180618161826183618461856186618761886189619061916192619361946195619661976198619962006201620262036204620562066207620862096210621162126213621462156216621762186219622062216222622362246225622662276228622962306231623262336234623562366237623862396240624162426243624462456246624762486249625062516252625362546255625662576258625962606261626262636264626562666267626862696270627162726273627462756276627
76278627962806281628262836284628562866287628862896290629162926293629462956296629762986299630063016302630363046305630663076308630963106311631263136314631563166317631863196320632163226323632463256326632763286329633063316332633363346335633663376338633963406341634263436344634563466347634863496350635163526353635463556356635763586359636063616362636363646365636663676368636963706371637263736374637563766377637863796380638163826383638463856386638763886389639063916392639363946395639663976398639964006401640264036404640564066407640864096410641164126413641464156416641764186419642064216422642364246425642664276428642964306431643264336434643564366437643864396440644164426443644464456446644764486449645064516452645364546455645664576458645964606461646264636464646564666467646864696470647164726473647464756476647764786479648064816482648364846485648664876488648964906491649264936494649564966497649864996500650165026503650465056506650765086509651065116512651365146515651665176518651965206521652265236524652565266527652865296530653165326533653465356536653765386539654065416542654365446545654665476548654965506551655265536554655565566557655865596560656165626563656465656566656765686569657065716572657365746575657665776578657965806581658265836584658565866587658865896590659165926593659465956596659765986599660066016602660366046605660666076608660966106611661266136614661566166617661866196620662166226623662466256626662766286629663066316632663366346635663666376638663966406641664266436644664566466647664866496650665166526653665466556656665766586659666066616662666366646665666666676668666966706671667266736674667566766677667866796680668166826683668466856686668766886689669066916692669366946695669666976698669967006701670267036704670567066707670867096710671167126713671467156716671767186719672067216722672367246725672667276728672967306731673267336734673567366737673867396740674167426743674467456746674767486749675067516752675367546755675667576758675967606761676267636764676567666767676867696770677167726773677467756776677
76778677967806781678267836784678567866787678867896790679167926793679467956796679767986799680068016802680368046805680668076808680968106811681268136814681568166817681868196820682168226823682468256826682768286829683068316832683368346835683668376838683968406841684268436844684568466847684868496850685168526853685468556856685768586859686068616862686368646865686668676868686968706871687268736874687568766877687868796880688168826883688468856886688768886889689068916892689368946895689668976898689969006901690269036904690569066907690869096910691169126913691469156916691769186919692069216922692369246925692669276928692969306931693269336934693569366937693869396940694169426943694469456946694769486949695069516952695369546955695669576958695969606961696269636964696569666967696869696970697169726973697469756976697769786979698069816982698369846985698669876988698969906991699269936994699569966997699869997000700170027003700470057006700770087009701070117012701370147015701670177018701970207021702270237024702570267027702870297030703170327033703470357036703770387039704070417042704370447045704670477048704970507051705270537054705570567057705870597060706170627063706470657066706770687069707070717072707370747075707670777078707970807081708270837084708570867087708870897090709170927093709470957096709770987099710071017102710371047105710671077108710971107111711271137114711571167117711871197120712171227123712471257126712771287129713071317132713371347135713671377138713971407141714271437144714571467147714871497150715171527153715471557156715771587159716071617162716371647165716671677168716971707171717271737174717571767177717871797180718171827183718471857186718771887189719071917192719371947195719671977198719972007201720272037204720572067207720872097210721172127213721472157216721772187219722072217222722372247225722672277228722972307231723272337234723572367237723872397240724172427243724472457246724772487249725072517252725372547255725672577258725972607261726272637264726572667267726872697270727172727273727472757276727
77278727972807281728272837284728572867287728872897290729172927293729472957296729772987299730073017302730373047305730673077308730973107311731273137314731573167317731873197320732173227323732473257326732773287329733073317332733373347335733673377338733973407341734273437344734573467347734873497350735173527353735473557356735773587359736073617362736373647365736673677368736973707371737273737374737573767377737873797380738173827383738473857386738773887389739073917392739373947395739673977398739974007401740274037404740574067407740874097410741174127413741474157416741774187419742074217422742374247425742674277428742974307431743274337434743574367437743874397440744174427443744474457446744774487449745074517452745374547455745674577458745974607461746274637464746574667467746874697470747174727473747474757476747774787479748074817482748374847485748674877488748974907491749274937494749574967497749874997500750175027503750475057506750775087509751075117512751375147515751675177518751975207521752275237524752575267527752875297530753175327533753475357536753775387539754075417542754375447545754675477548754975507551755275537554755575567557755875597560756175627563756475657566756775687569757075717572757375747575757675777578757975807581758275837584758575867587758875897590759175927593759475957596759775987599760076017602760376047605760676077608760976107611761276137614761576167617761876197620762176227623762476257626762776287629763076317632763376347635763676377638763976407641764276437644764576467647764876497650765176527653765476557656765776587659766076617662766376647665766676677668766976707671767276737674767576767677767876797680768176827683768476857686768776887689769076917692769376947695769676977698769977007701770277037704770577067707770877097710771177127713771477157716771777187719772077217722772377247725772677277728772977307731773277337734773577367737773877397740774177427743774477457746774777487749775077517752775377547755775677577758775977607761776277637764776577667767776877697770777177727773777477757776777
77778777977807781778277837784778577867787778877897790779177927793779477957796779777987799780078017802780378047805780678077808780978107811781278137814781578167817781878197820782178227823782478257826782778287829783078317832783378347835783678377838783978407841784278437844784578467847784878497850785178527853785478557856785778587859786078617862786378647865786678677868786978707871787278737874787578767877787878797880788178827883788478857886788778887889789078917892789378947895789678977898789979007901790279037904790579067907790879097910791179127913791479157916791779187919792079217922792379247925792679277928792979307931793279337934793579367937793879397940794179427943794479457946794779487949795079517952795379547955795679577958795979607961796279637964796579667967796879697970797179727973797479757976797779787979798079817982798379847985798679877988798979907991799279937994799579967997799879998000800180028003800480058006800780088009801080118012801380148015801680178018801980208021802280238024802580268027802880298030803180328033803480358036803780388039804080418042804380448045804680478048804980508051805280538054805580568057805880598060806180628063806480658066806780688069807080718072807380748075807680778078807980808081808280838084808580868087808880898090809180928093809480958096809780988099810081018102810381048105810681078108810981108111811281138114811581168117811881198120812181228123812481258126812781288129813081318132813381348135813681378138813981408141814281438144814581468147814881498150815181528153815481558156815781588159816081618162816381648165816681678168816981708171817281738174817581768177817881798180818181828183818481858186818781888189819081918192819381948195819681978198819982008201820282038204820582068207820882098210821182128213821482158216821782188219822082218222822382248225822682278228822982308231823282338234823582368237823882398240824182428243824482458246824782488249825082518252825382548255825682578258825982608261826282638264826582668267826882698270827182728273827482758276827
78278827982808281828282838284828582868287828882898290829182928293829482958296829782988299830083018302830383048305830683078308830983108311831283138314831583168317831883198320832183228323832483258326832783288329833083318332833383348335833683378338833983408341834283438344834583468347834883498350835183528353835483558356835783588359836083618362836383648365836683678368836983708371837283738374837583768377837883798380838183828383838483858386838783888389839083918392839383948395839683978398839984008401840284038404840584068407840884098410841184128413841484158416841784188419842084218422842384248425842684278428842984308431843284338434843584368437843884398440844184428443844484458446844784488449845084518452845384548455845684578458845984608461846284638464846584668467846884698470847184728473847484758476847784788479848084818482848384848485848684878488848984908491849284938494849584968497849884998500850185028503850485058506850785088509851085118512851385148515851685178518851985208521852285238524852585268527852885298530853185328533853485358536853785388539854085418542854385448545854685478548854985508551855285538554855585568557855885598560856185628563856485658566856785688569857085718572857385748575857685778578857985808581858285838584858585868587858885898590859185928593859485958596859785988599860086018602860386048605860686078608860986108611861286138614861586168617861886198620862186228623862486258626862786288629863086318632863386348635863686378638863986408641864286438644864586468647864886498650865186528653865486558656865786588659866086618662866386648665866686678668866986708671867286738674867586768677867886798680868186828683868486858686868786888689869086918692869386948695869686978698869987008701870287038704870587068707870887098710871187128713871487158716871787188719872087218722872387248725872687278728872987308731873287338734873587368737873887398740874187428743874487458746874787488749875087518752875387548755875687578758875987608761876287638764876587668767876887698770877187728773877487758776877
78778877987808781878287838784878587868787878887898790879187928793879487958796879787988799880088018802880388048805880688078808880988108811881288138814881588168817881888198820882188228823882488258826882788288829883088318832883388348835883688378838883988408841884288438844884588468847884888498850885188528853885488558856885788588859886088618862886388648865886688678868886988708871887288738874887588768877887888798880888188828883888488858886888788888889889088918892889388948895889688978898889989008901890289038904890589068907890889098910891189128913891489158916891789188919892089218922892389248925892689278928892989308931893289338934893589368937893889398940894189428943894489458946894789488949895089518952895389548955895689578958895989608961896289638964896589668967896889698970897189728973897489758976897789788979898089818982898389848985898689878988898989908991899289938994899589968997899889999000900190029003900490059006900790089009901090119012901390149015901690179018901990209021902290239024902590269027902890299030903190329033903490359036903790389039904090419042904390449045904690479048904990509051905290539054905590569057905890599060906190629063906490659066906790689069907090719072907390749075907690779078907990809081908290839084908590869087908890899090909190929093909490959096909790989099910091019102910391049105910691079108910991109111911291139114911591169117911891199120912191229123912491259126912791289129913091319132913391349135913691379138913991409141914291439144914591469147914891499150915191529153915491559156915791589159916091619162916391649165916691679168916991709171917291739174917591769177917891799180918191829183918491859186918791889189919091919192919391949195919691979198919992009201920292039204920592069207920892099210921192129213921492159216921792189219922092219222922392249225922692279228922992309231923292339234923592369237923892399240924192429243924492459246924792489249925092519252925392549255925692579258925992609261926292639264926592669267926892699270927192729273927492759276927
79278927992809281928292839284928592869287928892899290929192929293929492959296929792989299930093019302930393049305930693079308930993109311931293139314931593169317931893199320932193229323932493259326932793289329933093319332933393349335933693379338933993409341934293439344934593469347934893499350935193529353935493559356935793589359936093619362936393649365936693679368936993709371937293739374937593769377937893799380938193829383938493859386938793889389939093919392939393949395939693979398939994009401940294039404940594069407940894099410941194129413941494159416941794189419942094219422942394249425942694279428942994309431943294339434943594369437943894399440944194429443944494459446944794489449945094519452945394549455945694579458945994609461946294639464946594669467946894699470947194729473947494759476947794789479948094819482948394849485948694879488948994909491949294939494949594969497949894999500950195029503950495059506950795089509951095119512951395149515951695179518951995209521952295239524952595269527952895299530953195329533953495359536953795389539954095419542954395449545954695479548954995509551955295539554955595569557955895599560956195629563956495659566956795689569957095719572957395749575957695779578957995809581958295839584958595869587958895899590959195929593959495959596959795989599960096019602960396049605960696079608960996109611961296139614961596169617961896199620962196229623962496259626962796289629963096319632963396349635963696379638963996409641964296439644964596469647964896499650965196529653965496559656965796589659966096619662966396649665966696679668966996709671967296739674967596769677967896799680968196829683968496859686968796889689969096919692969396949695969696979698969997009701970297039704970597069707970897099710971197129713971497159716971797189719972097219722972397249725972697279728972997309731973297339734973597369737973897399740974197429743974497459746974797489749975097519752975397549755975697579758975997609761976297639764976597669767976897699770977197729773977497759776977
79778977997809781978297839784978597869787978897899790979197929793979497959796979797989799980098019802980398049805980698079808980998109811981298139814981598169817981898199820982198229823982498259826982798289829983098319832983398349835983698379838983998409841984298439844984598469847984898499850985198529853985498559856985798589859986098619862986398649865986698679868986998709871987298739874987598769877987898799880988198829883988498859886988798889889989098919892989398949895989698979898989999009901990299039904990599069907990899099910991199129913991499159916991799189919992099219922992399249925992699279928992999309931993299339934993599369937993899399940994199429943994499459946994799489949995099519952995399549955995699579958995999609961996299639964996599669967996899699970997199729973997499759976997799789979998099819982998399849985998699879988998999909991999299939994999599969997999899991000010001100021000310004100051000610007100081000910010100111001210013100141001510016100171001810019100201002110022100231002410025100261002710028100291003010031100321003310034100351003610037100381003910040100411004210043100441004510046100471004810049100501005110052100531005410055100561005710058100591006010061100621006310064100651006610067100681006910070100711007210073100741007510076100771007810079100801008110082100831008410085100861008710088100891009010091100921009310094100951009610097100981009910100101011010210103101041010510106101071010810109101101011110112101131011410115101161011710118101191012010121101221012310124101251012610127101281012910130101311013210133101341013510136101371013810139101401014110142101431014410145101461014710148101491015010151101521015310154101551015610157101581015910160101611016210163101641016510166101671016810169101701017110172101731017410175101761017710178101791018010181101821018310184101851018610187101881018910190101911019210193101941019510196101971019810199102001020110202102031020410205102061020710208102091021010211102121021310214102151021610217102181021910220102211
02221022310224102251022610227102281022910230102311023210233102341023510236102371023810239102401024110242102431024410245102461024710248102491025010251102521025310254102551025610257102581025910260102611026210263102641026510266102671026810269102701027110272102731027410275102761027710278102791028010281102821028310284102851028610287102881028910290102911029210293102941029510296102971029810299103001030110302103031030410305103061030710308103091031010311103121031310314103151031610317103181031910320103211032210323103241032510326103271032810329103301033110332103331033410335103361033710338103391034010341103421034310344103451034610347103481034910350103511035210353103541035510356103571035810359103601036110362103631036410365103661036710368103691037010371103721037310374103751037610377103781037910380103811038210383103841038510386103871038810389103901039110392103931039410395103961039710398103991040010401104021040310404104051040610407104081040910410104111041210413104141041510416104171041810419104201042110422104231042410425104261042710428104291043010431104321043310434104351043610437104381043910440104411044210443104441044510446104471044810449104501045110452104531045410455104561045710458104591046010461104621046310464104651046610467104681046910470104711047210473104741047510476104771047810479104801048110482104831048410485104861048710488104891049010491104921049310494104951049610497104981049910500105011050210503105041050510506105071050810509105101051110512105131051410515105161051710518105191052010521105221052310524105251052610527105281052910530105311053210533105341053510536105371053810539105401054110542105431054410545105461054710548105491055010551105521055310554105551055610557105581055910560105611056210563105641056510566105671056810569105701057110572105731057410575105761057710578105791058010581105821058310584105851058610587105881058910590105911059210593105941059510596105971059810599106001060110602106031060410605106061060710608106091061010611106121061310614106151061610617106181061910620106211
06221062310624106251062610627106281062910630106311063210633106341063510636106371063810639106401064110642106431064410645106461064710648106491065010651106521065310654106551065610657106581065910660106611066210663106641066510666106671066810669106701067110672106731067410675106761067710678106791068010681106821068310684106851068610687106881068910690106911069210693106941069510696106971069810699107001070110702107031070410705107061070710708107091071010711107121071310714107151071610717107181071910720107211072210723107241072510726107271072810729107301073110732107331073410735107361073710738107391074010741107421074310744107451074610747107481074910750107511075210753107541075510756107571075810759107601076110762107631076410765107661076710768107691077010771107721077310774107751077610777107781077910780107811078210783107841078510786107871078810789107901079110792107931079410795107961079710798107991080010801108021080310804108051080610807108081080910810108111081210813108141081510816108171081810819108201082110822108231082410825108261082710828108291083010831108321083310834108351083610837108381083910840108411084210843108441084510846108471084810849108501085110852108531085410855108561085710858108591086010861108621086310864108651086610867108681086910870108711087210873108741087510876108771087810879108801088110882108831088410885108861088710888108891089010891108921089310894108951089610897108981089910900109011090210903109041090510906109071090810909109101091110912109131091410915109161091710918109191092010921109221092310924109251092610927109281092910930109311093210933109341093510936109371093810939109401094110942109431094410945109461094710948109491095010951109521095310954109551095610957109581095910960109611096210963109641096510966109671096810969109701097110972109731097410975109761097710978109791098010981109821098310984109851098610987109881098910990109911099210993109941099510996109971099810999110001100111002110031100411005110061100711008110091101011011110121101311014110151101611017110181101911020110211
10221102311024110251102611027110281102911030110311103211033110341103511036110371103811039110401104111042110431104411045110461104711048110491105011051110521105311054110551105611057110581105911060110611106211063110641106511066110671106811069110701107111072110731107411075110761107711078110791108011081110821108311084110851108611087110881108911090110911109211093110941109511096110971109811099111001110111102111031110411105111061110711108111091111011111111121111311114111151111611117111181111911120111211112211123111241112511126111271112811129111301113111132111331113411135111361113711138111391114011141111421114311144111451114611147111481114911150111511115211153111541115511156111571115811159111601116111162111631116411165111661116711168111691117011171111721117311174111751117611177111781117911180111811118211183111841118511186111871118811189111901119111192111931119411195111961119711198111991120011201112021120311204112051120611207112081120911210112111121211213112141121511216112171121811219112201122111222112231122411225112261122711228112291123011231112321123311234112351123611237112381123911240112411124211243112441124511246112471124811249112501125111252112531125411255112561125711258112591126011261112621126311264112651126611267112681126911270112711127211273112741127511276112771127811279112801128111282112831128411285112861128711288112891129011291112921129311294112951129611297112981129911300113011130211303113041130511306113071130811309113101131111312113131131411315113161131711318113191132011321113221132311324113251132611327113281132911330113311133211333113341133511336113371133811339113401134111342113431134411345113461134711348113491135011351113521135311354113551135611357113581135911360113611136211363113641136511366113671136811369113701137111372113731137411375113761137711378113791138011381113821138311384113851138611387113881138911390113911139211393113941139511396113971139811399114001140111402114031140411405114061140711408114091141011411114121141311414114151141611417114181141911420114211
14221142311424114251142611427114281142911430114311143211433114341143511436114371143811439114401144111442114431144411445114461144711448114491145011451114521145311454114551145611457114581145911460114611146211463114641146511466114671146811469114701147111472114731147411475114761147711478114791148011481114821148311484114851148611487114881148911490114911149211493114941149511496114971149811499115001150111502115031150411505115061150711508115091151011511115121151311514115151151611517115181151911520115211152211523115241152511526115271152811529115301153111532115331153411535115361153711538115391154011541115421154311544115451154611547115481154911550115511155211553115541155511556115571155811559115601156111562115631156411565115661156711568115691157011571115721157311574115751157611577115781157911580115811158211583115841158511586115871158811589115901159111592115931159411595115961159711598115991160011601116021160311604116051160611607116081160911610116111161211613116141161511616116171161811619116201162111622116231162411625116261162711628116291163011631116321163311634116351163611637116381163911640116411164211643116441164511646116471164811649116501165111652116531165411655116561165711658116591166011661116621166311664116651166611667116681166911670116711167211673116741167511676116771167811679116801168111682116831168411685116861168711688116891169011691116921169311694116951169611697116981169911700117011170211703117041170511706117071170811709117101171111712117131171411715117161171711718117191172011721117221172311724117251172611727117281172911730117311173211733117341173511736117371173811739117401174111742117431174411745117461174711748117491175011751117521175311754117551175611757117581175911760117611176211763117641176511766117671176811769117701177111772117731177411775117761177711778117791178011781117821178311784117851178611787117881178911790117911179211793117941179511796117971179811799118001180111802118031180411805118061180711808118091181011811118121181311814118151181611817118181181911820118211
18221182311824118251182611827118281182911830118311183211833118341183511836118371183811839118401184111842118431184411845118461184711848118491185011851118521185311854118551185611857118581185911860118611186211863118641186511866118671186811869118701187111872118731187411875118761187711878118791188011881118821188311884118851188611887118881188911890118911189211893118941189511896118971189811899119001190111902119031190411905119061190711908119091191011911119121191311914119151191611917119181191911920119211192211923119241192511926119271192811929119301193111932119331193411935119361193711938119391194011941119421194311944119451194611947119481194911950119511195211953119541195511956119571195811959119601196111962119631196411965119661196711968119691197011971119721197311974119751197611977119781197911980119811198211983119841198511986119871198811989119901199111992119931199411995119961199711998119991200012001120021200312004120051200612007120081200912010120111201212013120141201512016120171201812019120201202112022120231202412025120261202712028120291203012031120321203312034120351203612037120381203912040120411204212043120441204512046120471204812049120501205112052120531205412055120561205712058120591206012061120621206312064120651206612067120681206912070120711207212073120741207512076120771207812079120801208112082120831208412085120861208712088120891209012091120921209312094120951209612097120981209912100121011210212103121041210512106121071210812109121101211112112121131211412115121161211712118121191212012121121221212312124121251212612127121281212912130121311213212133121341213512136121371213812139121401214112142121431214412145121461214712148121491215012151121521215312154121551215612157121581215912160121611216212163121641216512166121671216812169121701217112172121731217412175121761217712178121791218012181121821218312184121851218612187121881218912190121911219212193121941219512196121971219812199122001220112202122031220412205122061220712208122091221012211122121221312214122151221612217122181221912220122211
22221222312224122251222612227122281222912230122311223212233122341223512236122371223812239122401224112242122431224412245122461224712248122491225012251122521225312254122551225612257122581225912260122611226212263122641226512266122671226812269122701227112272122731227412275122761227712278122791228012281122821228312284122851228612287122881228912290122911229212293122941229512296122971229812299123001230112302123031230412305123061230712308123091231012311123121231312314123151231612317123181231912320123211232212323123241232512326123271232812329123301233112332123331233412335123361233712338123391234012341123421234312344123451234612347123481234912350123511235212353123541235512356123571235812359123601236112362123631236412365123661236712368123691237012371123721237312374123751237612377123781237912380123811238212383123841238512386123871238812389123901239112392123931239412395123961239712398123991240012401124021240312404124051240612407124081240912410124111241212413124141241512416124171241812419124201242112422124231242412425124261242712428124291243012431124321243312434124351243612437124381243912440124411244212443124441244512446124471244812449124501245112452124531245412455124561245712458124591246012461124621246312464124651246612467124681246912470124711247212473124741247512476124771247812479124801248112482124831248412485124861248712488124891249012491124921249312494124951249612497124981249912500125011250212503125041250512506125071250812509125101251112512125131251412515125161251712518125191252012521125221252312524125251252612527125281252912530125311253212533125341253512536125371253812539125401254112542125431254412545125461254712548125491255012551125521255312554125551255612557125581255912560125611256212563125641256512566125671256812569125701257112572125731257412575125761257712578125791258012581125821258312584125851258612587125881258912590125911259212593125941259512596125971259812599126001260112602126031260412605126061260712608126091261012611126121261312614126151261612617126181261912620126211
26221262312624126251262612627126281262912630126311263212633126341263512636126371263812639126401264112642126431264412645126461264712648126491265012651126521265312654126551265612657126581265912660126611266212663126641266512666126671266812669126701267112672126731267412675126761267712678126791268012681126821268312684126851268612687126881268912690126911269212693126941269512696126971269812699127001270112702127031270412705127061270712708127091271012711127121271312714127151271612717127181271912720127211272212723127241272512726127271272812729127301273112732127331273412735127361273712738127391274012741127421274312744127451274612747127481274912750127511275212753127541275512756127571275812759127601276112762127631276412765127661276712768127691277012771127721277312774127751277612777127781277912780127811278212783127841278512786127871278812789127901279112792127931279412795127961279712798127991280012801128021280312804128051280612807128081280912810128111281212813128141281512816128171281812819128201282112822128231282412825128261282712828128291283012831128321283312834128351283612837128381283912840128411284212843128441284512846128471284812849128501285112852128531285412855128561285712858128591286012861128621286312864128651286612867128681286912870128711287212873128741287512876128771287812879128801288112882128831288412885128861288712888128891289012891128921289312894128951289612897128981289912900129011290212903129041290512906129071290812909129101291112912129131291412915129161291712918129191292012921129221292312924129251292612927129281292912930129311293212933129341293512936129371293812939129401294112942129431294412945129461294712948129491295012951129521295312954129551295612957129581295912960129611296212963129641296512966129671296812969129701297112972129731297412975129761297712978129791298012981129821298312984129851298612987129881298912990129911299212993129941299512996129971299812999130001300113002130031300413005130061300713008130091301013011130121301313014130151301613017130181301913020130211
30221302313024130251302613027130281302913030130311303213033130341303513036130371303813039130401304113042130431304413045130461304713048130491305013051130521305313054130551305613057130581305913060130611306213063130641306513066130671306813069130701307113072130731307413075130761307713078130791308013081130821308313084130851308613087130881308913090130911309213093130941309513096130971309813099131001310113102131031310413105131061310713108131091311013111131121311313114131151311613117131181311913120131211312213123131241312513126131271312813129131301313113132131331313413135131361313713138131391314013141131421314313144131451314613147131481314913150131511315213153131541315513156131571315813159131601316113162131631316413165131661316713168131691317013171131721317313174131751317613177131781317913180131811318213183131841318513186131871318813189131901319113192131931319413195131961319713198131991320013201132021320313204132051320613207132081320913210132111321213213132141321513216132171321813219132201322113222132231322413225132261322713228132291323013231132321323313234132351323613237132381323913240132411324213243132441324513246132471324813249132501325113252132531325413255132561325713258132591326013261132621326313264132651326613267132681326913270132711327213273132741327513276132771327813279132801328113282132831328413285132861328713288132891329013291132921329313294132951329613297132981329913300133011330213303133041330513306133071330813309133101331113312133131331413315133161331713318133191332013321133221332313324133251332613327133281332913330133311333213333133341333513336133371333813339133401334113342133431334413345133461334713348133491335013351133521335313354133551335613357133581335913360133611336213363133641336513366133671336813369133701337113372133731337413375133761337713378133791338013381133821338313384133851338613387133881338913390133911339213393133941339513396133971339813399134001340113402134031340413405134061340713408134091341013411134121341313414134151341613417134181341913420134211
34221342313424134251342613427134281342913430134311343213433134341343513436134371343813439134401344113442134431344413445134461344713448134491345013451134521345313454134551345613457134581345913460134611346213463134641346513466134671346813469134701347113472134731347413475134761347713478134791348013481134821348313484134851348613487134881348913490134911349213493134941349513496134971349813499135001350113502135031350413505135061350713508135091351013511135121351313514135151351613517135181351913520135211352213523135241352513526135271352813529135301353113532135331353413535135361353713538135391354013541135421354313544135451354613547135481354913550135511355213553135541355513556135571355813559135601356113562135631356413565135661356713568135691357013571135721357313574135751357613577135781357913580135811358213583135841358513586135871358813589135901359113592135931359413595135961359713598135991360013601136021360313604136051360613607136081360913610136111361213613136141361513616136171361813619136201362113622136231362413625136261362713628136291363013631136321363313634136351363613637136381363913640136411364213643136441364513646136471364813649136501365113652136531365413655136561365713658136591366013661136621366313664136651366613667136681366913670136711367213673136741367513676136771367813679136801368113682136831368413685136861368713688136891369013691136921369313694136951369613697136981369913700137011370213703137041370513706137071370813709137101371113712137131371413715137161371713718137191372013721137221372313724137251372613727137281372913730137311373213733137341373513736137371373813739137401374113742137431374413745137461374713748137491375013751137521375313754137551375613757137581375913760137611376213763137641376513766137671376813769137701377113772137731377413775137761377713778137791378013781137821378313784137851378613787137881378913790137911379213793137941379513796137971379813799138001380113802138031380413805138061380713808138091381013811138121381313814138151381613817138181381913820138211
38221382313824138251382613827138281382913830138311383213833138341383513836138371383813839138401384113842138431384413845138461384713848138491385013851138521385313854138551385613857138581385913860138611386213863138641386513866138671386813869138701387113872138731387413875138761387713878138791388013881138821388313884138851388613887138881388913890138911389213893138941389513896138971389813899139001390113902139031390413905139061390713908139091391013911139121391313914139151391613917139181391913920139211392213923139241392513926139271392813929139301393113932139331393413935139361393713938139391394013941139421394313944139451394613947139481394913950139511395213953139541395513956139571395813959139601396113962139631396413965139661396713968139691397013971139721397313974139751397613977139781397913980139811398213983139841398513986139871398813989139901399113992139931399413995139961399713998139991400014001140021400314004140051400614007140081400914010140111401214013140141401514016140171401814019140201402114022140231402414025140261402714028140291403014031140321403314034140351403614037140381403914040140411404214043140441404514046140471404814049140501405114052140531405414055140561405714058140591406014061140621406314064140651406614067140681406914070140711407214073140741407514076140771407814079140801408114082140831408414085140861408714088140891409014091140921409314094140951409614097140981409914100141011410214103141041410514106141071410814109141101411114112141131411414115141161411714118141191412014121141221412314124141251412614127141281412914130141311413214133141341413514136141371413814139141401414114142141431414414145141461414714148141491415014151141521415314154141551415614157141581415914160141611416214163141641416514166141671416814169141701417114172141731417414175141761417714178141791418014181141821418314184141851418614187141881418914190141911419214193141941419514196141971419814199142001420114202142031420414205142061420714208142091421014211142121421314214142151421614217142181421914220142211
42221422314224142251422614227142281422914230142311423214233142341423514236142371423814239142401424114242142431424414245142461424714248142491425014251142521425314254142551425614257142581425914260142611426214263142641426514266142671426814269142701427114272142731427414275142761427714278142791428014281142821428314284142851428614287142881428914290142911429214293142941429514296142971429814299143001430114302143031430414305143061430714308143091431014311143121431314314143151431614317143181431914320143211432214323143241432514326143271432814329143301433114332143331433414335143361433714338143391434014341143421434314344143451434614347143481434914350143511435214353143541435514356143571435814359143601436114362143631436414365143661436714368143691437014371143721437314374143751437614377143781437914380143811438214383143841438514386143871438814389143901439114392143931439414395143961439714398143991440014401144021440314404144051440614407144081440914410144111441214413144141441514416144171441814419144201442114422144231442414425144261442714428144291443014431144321443314434144351443614437144381443914440144411444214443144441444514446144471444814449144501445114452144531445414455144561445714458144591446014461144621446314464144651446614467144681446914470144711447214473144741447514476144771447814479144801448114482144831448414485144861448714488144891449014491144921449314494144951449614497144981449914500145011450214503145041450514506145071450814509145101451114512145131451414515145161451714518145191452014521145221452314524145251452614527145281452914530145311453214533145341453514536145371453814539145401454114542145431454414545145461454714548145491455014551145521455314554145551455614557145581455914560145611456214563145641456514566145671456814569145701457114572145731457414575145761457714578145791458014581145821458314584145851458614587145881458914590145911459214593145941459514596145971459814599146001460114602146031460414605146061460714608146091461014611146121461314614146151461614617146181461914620146211
46221462314624146251462614627146281462914630146311463214633146341463514636146371463814639146401464114642146431464414645146461464714648146491465014651146521465314654146551465614657146581465914660146611466214663146641466514666146671466814669146701467114672146731467414675146761467714678146791468014681146821468314684146851468614687146881468914690146911469214693146941469514696146971469814699147001470114702147031470414705147061470714708147091471014711147121471314714147151471614717147181471914720147211472214723147241472514726147271472814729147301473114732147331473414735147361473714738147391474014741147421474314744147451474614747147481474914750147511475214753147541475514756147571475814759147601476114762147631476414765147661476714768147691477014771147721477314774147751477614777147781477914780147811478214783147841478514786147871478814789147901479114792147931479414795147961479714798147991480014801148021480314804148051480614807148081480914810148111481214813148141481514816148171481814819148201482114822148231482414825148261482714828148291483014831148321483314834148351483614837148381483914840148411484214843148441484514846148471484814849148501485114852148531485414855148561485714858148591486014861148621486314864148651486614867148681486914870148711487214873148741487514876148771487814879148801488114882148831488414885148861488714888148891489014891148921489314894148951489614897148981489914900149011490214903149041490514906149071490814909149101491114912149131491414915149161491714918149191492014921149221492314924149251492614927149281492914930149311493214933149341493514936149371493814939149401494114942149431494414945149461494714948149491495014951149521495314954149551495614957149581495914960149611496214963149641496514966149671496814969149701497114972149731497414975149761497714978149791498014981149821498314984149851498614987149881498914990149911499214993149941499514996149971499814999150001500115002150031500415005150061500715008150091501015011150121501315014150151501615017150181501915020150211
50221502315024150251502615027150281502915030150311503215033150341503515036150371503815039150401504115042150431504415045150461504715048150491505015051150521505315054150551505615057150581505915060150611506215063150641506515066150671506815069150701507115072150731507415075150761507715078150791508015081150821508315084150851508615087150881508915090150911509215093150941509515096150971509815099151001510115102151031510415105151061510715108151091511015111151121511315114151151511615117151181511915120151211512215123151241512515126151271512815129151301513115132151331513415135151361513715138151391514015141151421514315144151451514615147151481514915150151511515215153151541515515156151571515815159151601516115162151631516415165151661516715168151691517015171151721517315174151751517615177151781517915180151811518215183151841518515186151871518815189151901519115192151931519415195151961519715198151991520015201152021520315204152051520615207152081520915210152111521215213152141521515216152171521815219152201522115222152231522415225152261522715228152291523015231152321523315234152351523615237152381523915240152411524215243152441524515246152471524815249152501525115252152531525415255152561525715258152591526015261152621526315264152651526615267152681526915270152711527215273152741527515276152771527815279152801528115282152831528415285152861528715288152891529015291152921529315294152951529615297152981529915300153011530215303153041530515306153071530815309153101531115312153131531415315153161531715318153191532015321153221532315324153251532615327153281532915330153311533215333153341533515336153371533815339153401534115342153431534415345153461534715348153491535015351153521535315354153551535615357153581535915360153611536215363153641536515366153671536815369153701537115372153731537415375153761537715378153791538015381153821538315384153851538615387153881538915390153911539215393153941539515396153971539815399154001540115402154031540415405154061540715408154091541015411154121541315414154151541615417154181541915420154211
54221542315424154251542615427154281542915430154311543215433154341543515436154371543815439154401544115442154431544415445154461544715448154491545015451154521545315454154551545615457154581545915460154611546215463154641546515466154671546815469154701547115472154731547415475154761547715478154791548015481154821548315484154851548615487154881548915490154911549215493154941549515496154971549815499155001550115502155031550415505155061550715508155091551015511155121551315514155151551615517155181551915520155211552215523155241552515526155271552815529155301553115532155331553415535155361553715538155391554015541155421554315544155451554615547155481554915550155511555215553155541555515556155571555815559155601556115562155631556415565155661556715568155691557015571155721557315574155751557615577155781557915580155811558215583155841558515586155871558815589155901559115592155931559415595155961559715598155991560015601156021560315604156051560615607156081560915610156111561215613156141561515616156171561815619156201562115622156231562415625156261562715628156291563015631156321563315634156351563615637156381563915640156411564215643156441564515646156471564815649156501565115652156531565415655156561565715658156591566015661156621566315664156651566615667156681566915670156711567215673156741567515676156771567815679156801568115682156831568415685156861568715688156891569015691156921569315694156951569615697156981569915700157011570215703157041570515706157071570815709157101571115712157131571415715157161571715718157191572015721157221572315724157251572615727157281572915730157311573215733157341573515736157371573815739157401574115742157431574415745157461574715748157491575015751157521575315754157551575615757157581575915760157611576215763157641576515766157671576815769157701577115772157731577415775157761577715778157791578015781157821578315784157851578615787157881578915790157911579215793157941579515796157971579815799158001580115802158031580415805158061580715808158091581015811158121581315814158151581615817158181581915820158211
58221582315824158251582615827158281582915830158311583215833158341583515836158371583815839158401584115842158431584415845158461584715848158491585015851158521585315854158551585615857158581585915860158611586215863158641586515866158671586815869158701587115872158731587415875158761587715878158791588015881158821588315884158851588615887158881588915890158911589215893158941589515896158971589815899159001590115902159031590415905159061590715908159091591015911159121591315914159151591615917159181591915920159211592215923159241592515926159271592815929159301593115932159331593415935159361593715938159391594015941159421594315944159451594615947159481594915950159511595215953159541595515956159571595815959159601596115962159631596415965159661596715968159691597015971159721597315974159751597615977159781597915980159811598215983159841598515986159871598815989159901599115992159931599415995159961599715998159991600016001160021600316004160051600616007160081600916010160111601216013160141601516016160171601816019160201602116022160231602416025160261602716028160291603016031160321603316034160351603616037160381603916040160411604216043160441604516046160471604816049160501605116052160531605416055160561605716058160591606016061160621606316064160651606616067160681606916070160711607216073160741607516076160771607816079160801608116082160831608416085160861608716088160891609016091160921609316094160951609616097160981609916100161011610216103161041610516106161071610816109161101611116112161131611416115161161611716118161191612016121161221612316124161251612616127161281612916130161311613216133161341613516136161371613816139161401614116142161431614416145161461614716148161491615016151161521615316154161551615616157161581615916160161611616216163161641616516166161671616816169161701617116172161731617416175161761617716178161791618016181161821618316184161851618616187161881618916190161911619216193
  1. #LyX 2.3 created this file. For more info see http://www.lyx.org/
  2. \lyxformat 544
  3. \begin_document
  4. \begin_header
  5. \save_transient_properties true
  6. \origin unavailable
  7. \textclass extbook
  8. \begin_preamble
  9. % List all used files in log output
  10. \listfiles
  11. % Add a DRAFT watermark
  12. \usepackage{draftwatermark}
  13. \usepackage{accsupp}
  14. \SetWatermarkLightness{0.97}
  15. \SetWatermarkScale{1}
  16. % Make watermark not copyable (in Adobe Reader)
  17. \SetWatermarkText{\BeginAccSupp{method=escape,ActualText={}}DRAFT\EndAccSupp{}}
  18. % Set up required header format
  19. \usepackage{fancyhdr}
  20. \pagestyle{fancy}
  21. \renewcommand{\headrulewidth}{0pt}
  22. \rhead{}
  23. \lhead{}
  24. \chead{}
  25. \rfoot{}
  26. \lfoot{}
  27. % Make page number not copyable (in Adobe Reader)
  28. \cfoot{\BeginAccSupp{method=escape,ActualText={}}\thepage\EndAccSupp{}} % Page number bottom center
  29. % Allow FloatBarrier command
  30. \usepackage{placeins}
  31. % Allow landscape pages
  32. \usepackage{pdflscape}
  33. % Allow doing things after the end of the current page
  34. % (to avoid landscape figures breaking up text)
  35. \usepackage{afterpage}
  36. % This one breaks subfigs so it's disabled
  37. % https://tex.stackexchange.com/questions/65680/automatically-bold-first-sentence-of-a-floats-caption
  38. % Bold all nomenclature entries
  39. \renewcommand{\nomlabel}[1]{\textsf{\textbf{#1}}}
  40. % https://tex.stackexchange.com/a/31083/5654
  41. %\let\nomenclOrig\nomenclature
  42. %\renewcommand*{\nomenclature}[3][]{#2\nomenclOrig[#1]{#2}{#3}}
  43. \usepackage[nohypertypes={abbreviation}]{glossaries-extra}
  44. \setabbreviationstyle{long-short}
  45. \input{abbrevs.tex}
  46. \makeglossaries
  47. % arara: pdflatex
  48. % arara: biblatex
  49. % arara: makeglossaries
  50. % arara: pdflatex
  51. \end_preamble
  52. \use_default_options true
  53. \begin_modules
  54. todonotes
  55. logicalmkup
  56. \end_modules
  57. \maintain_unincluded_children false
  58. \begin_local_layout
  59. Format 66
  60. InsetLayout "Flex:Glossary Term"
  61. LyxType custom
  62. LabelString gls
  63. LatexType command
  64. LatexName gls*
  65. InToc true
  66. CustomPars false
  67. End
  68. InsetLayout "Flex:Glossary Term (Capital)"
  69. LyxType custom
  70. LabelString Gls
  71. LatexType command
  72. LatexName Gls*
  73. InToc true
  74. CustomPars false
  75. End
  76. \end_local_layout
  77. \language english
  78. \language_package default
  79. \inputencoding utf8
  80. \fontencoding default
  81. \font_roman "default" "default"
  82. \font_sans "default" "default"
  83. \font_typewriter "default" "default"
  84. \font_math "auto" "auto"
  85. \font_default_family default
  86. \use_non_tex_fonts false
  87. \font_sc false
  88. \font_osf false
  89. \font_sf_scale 100 100
  90. \font_tt_scale 100 100
  91. \use_microtype false
  92. \use_dash_ligatures true
  93. \graphics default
  94. \default_output_format pdf4
  95. \output_sync 0
  96. \bibtex_command biber
  97. \index_command default
  98. \paperfontsize 12
  99. \spacing double
  100. \use_hyperref true
  101. \pdf_bookmarks true
  102. \pdf_bookmarksnumbered false
  103. \pdf_bookmarksopen false
  104. \pdf_bookmarksopenlevel 1
  105. \pdf_breaklinks false
  106. \pdf_pdfborder false
  107. \pdf_colorlinks false
  108. \pdf_backref false
  109. \pdf_pdfusetitle true
  110. \papersize letterpaper
  111. \use_geometry true
  112. \use_package amsmath 1
  113. \use_package amssymb 1
  114. \use_package cancel 1
  115. \use_package esint 1
  116. \use_package mathdots 1
  117. \use_package mathtools 1
  118. \use_package mhchem 1
  119. \use_package stackrel 1
  120. \use_package stmaryrd 1
  121. \use_package undertilde 1
  122. \cite_engine biblatex
  123. \cite_engine_type authoryear
  124. \biblio_style plain
  125. \biblatex_bibstyle authoryear
  126. \biblatex_citestyle numeric
  127. \use_bibtopic false
  128. \use_indices false
  129. \paperorientation portrait
  130. \suppress_date false
  131. \justification true
  132. \use_refstyle 1
  133. \use_minted 0
  134. \index Index
  135. \shortcut idx
  136. \color #008000
  137. \end_index
  138. \leftmargin 1.5in
  139. \topmargin 1in
  140. \rightmargin 1in
  141. \bottommargin 1in
  142. \secnumdepth 3
  143. \tocdepth 3
  144. \paragraph_separation indent
  145. \paragraph_indentation default
  146. \is_math_indent 0
  147. \math_numbering_side default
  148. \quotes_style english
  149. \dynamic_quotes 0
  150. \papercolumns 1
  151. \papersides 1
  152. \paperpagestyle default
  153. \tracking_changes false
  154. \output_changes false
  155. \html_math_output 0
  156. \html_css_as_file 0
  157. \html_be_strict false
  158. \end_header
  159. \begin_body
  160. \begin_layout Title
  161. Bioinformatic analysis of complex, high-throughput genomic and epigenomic
  162. data in the context of immunology and transplant rejection
  163. \end_layout
  164. \begin_layout Author
  165. A thesis presented
  166. \begin_inset Newline newline
  167. \end_inset
  168. by
  169. \begin_inset Newline newline
  170. \end_inset
  171. Ryan C.
  172. Thompson
  173. \begin_inset Newline newline
  174. \end_inset
  175. to
  176. \begin_inset Newline newline
  177. \end_inset
  178. The Scripps Research Institute Graduate Program
  179. \begin_inset Newline newline
  180. \end_inset
  181. in partial fulfillment of the requirements for the degree of
  182. \begin_inset Newline newline
  183. \end_inset
  184. Doctor of Philosophy in the subject of Biology
  185. \begin_inset Newline newline
  186. \end_inset
  187. for
  188. \begin_inset Newline newline
  189. \end_inset
  190. The Scripps Research Institute
  191. \begin_inset Newline newline
  192. \end_inset
  193. La Jolla, California
  194. \end_layout
  195. \begin_layout Date
  196. October 2019
  197. \end_layout
  198. \begin_layout Standard
  199. [Copyright notice]
  200. \end_layout
  201. \begin_layout Standard
  202. [Thesis acceptance form]
  203. \end_layout
  204. \begin_layout Standard
  205. [Dedication]
  206. \end_layout
  207. \begin_layout Standard
  208. [Acknowledgements]
  209. \end_layout
  210. \begin_layout Standard
  211. \begin_inset CommandInset toc
  212. LatexCommand tableofcontents
  213. \end_inset
  214. \end_layout
  215. \begin_layout Standard
  216. \begin_inset FloatList table
  217. \end_inset
  218. \end_layout
  219. \begin_layout Standard
  220. \begin_inset FloatList figure
  221. \end_inset
  222. \end_layout
  223. \begin_layout Standard
  224. \begin_inset Note Note
  225. status open
  226. \begin_layout Plain Layout
  227. To create a new nomenclature entry:
  228. \end_layout
  229. \begin_layout Enumerate
  230. Add an entry to abbrevs.tex
  231. \end_layout
  232. \begin_layout Enumerate
  233. Find the first instance of the term, and wrap it in Insert -> Custom Insets
  234. -> Glossary Term (use Capital if starting a sentence)
  235. \end_layout
  236. \begin_layout Enumerate
  237. Add a nomenclature entry after the first instance
  238. \end_layout
  239. \begin_layout Enumerate
  240. Replace every relevant instance throughout the document with the Glossary
  241. Term wrapped version, using Edit -> Find & Replace (Advanced).
  242. Skip section headers and floats.
  243. \end_layout
  244. \begin_layout Plain Layout
  245. \begin_inset CommandInset href
  246. LatexCommand href
  247. target "https://ctan.org/pkg/glossaries?lang=en"
  248. literal "false"
  249. \end_inset
  250. \end_layout
  251. \begin_layout Plain Layout
  252. \begin_inset CommandInset href
  253. LatexCommand href
  254. target "https://wiki.lyx.org/Tips/Nomenclature"
  255. literal "false"
  256. \end_inset
  257. \end_layout
  258. \end_inset
  259. \end_layout
  260. \begin_layout Standard
  261. \begin_inset CommandInset nomencl_print
  262. LatexCommand printnomenclature
  263. set_width "auto"
  264. \end_inset
  265. \end_layout
  266. \begin_layout List of TODOs
  267. \end_layout
  268. \begin_layout Standard
  269. \begin_inset Flex TODO Note (inline)
  270. status open
  271. \begin_layout Plain Layout
  272. Check all figures to make sure they fit on the page with their legends.
  273. \end_layout
  274. \end_inset
  275. \end_layout
  276. \begin_layout Standard
  277. \begin_inset Flex TODO Note (inline)
  278. status open
  279. \begin_layout Plain Layout
  280. Look into auto-generated nomenclature list:
  281. \begin_inset CommandInset href
  282. LatexCommand href
  283. target "https://wiki.lyx.org/Tips/Nomenclature"
  284. \end_inset
  285. .
  286. Otherwise, do a manual pass for all abbreviations at the end.
  287. Do nomenclature/abbreviations independently for each chapter.
  288. \end_layout
  289. \end_inset
  290. \end_layout
  291. \begin_layout Standard
  292. \begin_inset Flex TODO Note (inline)
  293. status open
  294. \begin_layout Plain Layout
  295. Make all descriptions consistent in terms of
  296. \begin_inset Quotes eld
  297. \end_inset
  298. we did X
  299. \begin_inset Quotes erd
  300. \end_inset
  301. vs
  302. \begin_inset Quotes eld
  303. \end_inset
  304. I did X
  305. \begin_inset Quotes erd
  306. \end_inset
  307. vs
  308. \begin_inset Quotes eld
  309. \end_inset
  310. X was done
  311. \begin_inset Quotes erd
  312. \end_inset
  313. .
  314. \end_layout
  315. \end_inset
  316. \end_layout
  317. \begin_layout Chapter*
  318. Abstract
  319. \end_layout
  320. \begin_layout Standard
  321. \begin_inset Note Note
  322. status open
  323. \begin_layout Plain Layout
  324. It is included as an integral part of the thesis and should immediately
  325. precede the introduction.
  326. \end_layout
  327. \begin_layout Plain Layout
  328. Preparing your Abstract.
  329. Your abstract (a succinct description of your work) is limited to 350 words.
  330. UMI will shorten it if they must; please do not exceed the limit.
  331. \end_layout
  332. \begin_layout Itemize
  333. Include pertinent place names, names of persons (in full), and other proper
  334. nouns.
  335. These are useful in automated retrieval.
  336. \end_layout
  337. \begin_layout Itemize
  338. Display symbols, as well as foreign words and phrases, clearly and accurately.
  339. Include transliterations for characters other than Roman and Greek letters
  340. and Arabic numerals.
  341. Include accents and diacritical marks.
  342. \end_layout
  343. \begin_layout Itemize
  344. Do not include graphs, charts, tables, or illustrations in your abstract.
  345. \end_layout
  346. \end_inset
  347. \end_layout
  348. \begin_layout Standard
  349. \begin_inset Flex TODO Note (inline)
  350. status open
  351. \begin_layout Plain Layout
  352. Obviously the abstract gets written last.
  353. \end_layout
  354. \end_inset
  355. \end_layout
  356. \begin_layout Chapter*
  357. Notes to draft readers
  358. \end_layout
  359. \begin_layout Standard
  360. Thank you so much for agreeing to read my thesis and give me feedback on
  361. it.
  362. What you are currently reading is a rough draft, in need of many revisions.
  363. You can always find the latest version at
  364. \begin_inset CommandInset href
  365. LatexCommand href
  366. target "https://mneme.dedyn.io/~ryan/Thesis/thesis.pdf"
  367. literal "false"
  368. \end_inset
  369. .
  370. The PDF at this link is updated periodically with my latest revisions,
  371. but you can just download the current version and give me feedback on that.
  372. Don't worry about keeping up with the updates.
  373. \end_layout
  374. \begin_layout Standard
  375. As for what feedback I'm looking for, first of all, don't waste your time
  376. marking spelling mistakes and such.
  377. I haven't run a spell checker on it yet, so let me worry about that.
  378. Also, I'm aware that many abbreviations are not properly introduced the
  379. first time they are used, so don't worry about that either.
  380. However, if you see any glaring formatting issues, such as a figure being
  381. too large and getting cut off at the edge of the page, please note them.
  382. In addition, if any of the text in the figures is too small, please note
  383. that as well.
  384. \end_layout
  385. \begin_layout Standard
  386. Beyond that, what I'm mainly interested in is feedback on the content.
  387. For example: does the introduction flow logically, and does it provide
  388. enough background to understand the other chapters? Does each chapter make
  389. it clear what work and analyses I have done? Do the figures clearly communicate
  390. the results I'm trying to show? Do you feel that the claims in the results
  391. and discussion sections are well-supported? There's no need to suggest
  392. improvements; just note areas that you feel need improvement.
  393. Additionally, while I am well aware that Chapter 1 (the introduction) contains
  394. many un-cited claims, all the other chapters (2, 3, and 4)
  395. \emph on
  396. should
  397. \emph default
  398. be fully cited.
  399. So if you notice any un-cited claims in those chapters, please flag them
  400. for my attention.
  401. Similarly, if you discover any factual errors, please note them as well.
  402. \end_layout
  403. \begin_layout Standard
  404. You can provide your feedback in whatever way is most convenient to you.
  405. You could mark up this PDF with highlights and notes, then send it back
  406. to me.
  407. Or you could collect your comments in a separate text file and send that
  408. to me, or whatever else you like.
  409. However, if you send me your feedback in a separate document, please note
  410. a section/figure/table number for each comment, and
  411. \emph on
  412. also
  413. \emph default
  414. send me the exact PDF that you read so I can reference it while reading
  415. your comments, since as mentioned above, the current version I'm working
  416. on will have changed by that point (which might include shuffling sections
  417. and figures around, changing their numbers).
  418. One last thing: you'll see a bunch of text in orange boxes throughout the
  419. PDF.
  420. These are notes to myself about things that need to be fixed later, so
  421. if you see a problem noted in an orange box, that means I'm already aware
  422. of it, and there's no need to comment on it.
  423. \end_layout
  424. \begin_layout Standard
  425. My thesis is due Thursday, October 10th, so in order to be useful to me,
  426. I'll need your feedback at least a few days before that, ideally by Monday,
  427. October 7th.
  428. If you have limited time and are unable to get through the whole thesis,
  429. please focus your efforts on Chapters 1 and 2, since those are the roughest
  430. and most in need of revision.
  431. Chapter 3 is fairly short and straightforward, and Chapter 4 is an adaptation
  432. of a paper that's already been through a few rounds of revision, so they
  433. should be a lot tighter.
  434. If you can't spare any time between now and then, or if something unexpected
  435. comes up, I understand.
  436. Just let me know.
  437. \end_layout
  438. \begin_layout Standard
  439. Thanks again for your help, and happy reading!
  440. \end_layout
  441. \begin_layout Chapter
  442. Introduction
  443. \end_layout
  444. \begin_layout Section
  445. Background & Significance
  446. \end_layout
  447. \begin_layout Subsection
  448. Biological motivation
  449. \end_layout
  450. \begin_layout Standard
  451. \begin_inset Flex TODO Note (inline)
  452. status open
  453. \begin_layout Plain Layout
  454. Rethink the subsection organization after the intro is written.
  455. \end_layout
  456. \end_inset
  457. \end_layout
  458. \begin_layout Standard
  459. \begin_inset Flex TODO Note (inline)
  460. status open
  461. \begin_layout Plain Layout
  462. Citations are needed all over the place.
  463. A lot of this is knowledge I've just absorbed from years of conversation
  464. in the Salomon lab, without ever having seen a citation for it.
  465. \end_layout
  466. \end_inset
  467. \end_layout
  468. \begin_layout Subsubsection
  469. Rejection is the major long-term threat to organ and tissue allografts
  470. \end_layout
  471. \begin_layout Standard
  472. Organ and tissue transplants are a life-saving treatment for people who
  473. have lost the function of an important organ [CITE?].
  474. In some cases, it is possible to transplant a patient's own tissue from
  475. one area of their body to another, referred to as an autograft.
  476. This is common for tissues that are distributed throughout many areas of
  477. the body, such as skin and bone.
  478. However, in cases of organ failure, there is no functional self tissue
  479. remaining, and a transplant from another person – a donor – is required.
  480. This is referred to as an allograft.
  481. \end_layout
  482. \begin_layout Standard
  483. \begin_inset Flex TODO Note (inline)
  484. status open
  485. \begin_layout Plain Layout
  486. Possible citation for degree of genetic variability:
  487. \begin_inset CommandInset href
  488. LatexCommand href
  489. target "https://www.ncbi.nlm.nih.gov/pubmed/22424236?dopt=Abstract"
  490. \end_inset
  491. \end_layout
  492. \end_inset
  493. \end_layout
  494. \begin_layout Standard
  495. \begin_inset Flex TODO Note (inline)
  496. status open
  497. \begin_layout Plain Layout
  498. How much mechanistic detail is needed here? My work doesn't really go into
  499. specific rejection mechanisms, so I think it's best to keep it basic.
  500. \end_layout
  501. \end_inset
  502. \end_layout
  503. \begin_layout Standard
  504. Because an allograft comes from a different person, it is genetically distinct
  505. from the rest of the recipient's body.
  506. Some genetic variants occur in protein coding regions and affect the polypeptid
  507. e sequences encoded by the affected genes, resulting in protein products
  508. that differ from the equivalent proteins produced by the graft recipient's
  509. own tissue.
  510. As a result, without intervention, the recipient's immune system will eventuall
  511. y identify the graft as foreign tissue and begin attacking it, eventually
  512. resulting in failure and death of the graft, a process referred to as transplan
  513. t rejection.
  514. Rejection is the most significant challenge to the long-term health and
  515. survival of an allograft [CITE?].
  516. Like any adaptive immune response, graft rejection generally occurs via
  517. two broad mechanisms: cellular immunity, in which CD8+ T-cells recognizing
  518. graft-specific antigens induce apoptosis in the graft cells; and humoral
  519. immunity, in which B-cells produce antibodies that bind to graft proteins
  520. and direct an immune response against the graft [CITE?].
  521. In either case, rejection shows most of the typical hallmarks of an adaptive
  522. immune response, in particular mediation by CD4+ T-cells and formation
  523. of immune memory.
  524. \end_layout
  525. \begin_layout Subsubsection
  526. Diagnosis and treatment of allograft rejection is a major challenge
  527. \end_layout
  528. \begin_layout Standard
  529. To prevent rejection, allograft recipients are treated with immune suppressive
  530. drugs [CITE?].
  531. The goal is to achieve sufficient suppression of the immune system to prevent
  532. rejection of the graft without compromising the ability of the immune system
  533. to raise a normal response against infection.
  534. As such, a delicate balance must be struck: insufficient immune suppression
  535. may lead to rejection and ultimately loss of the graft; excessive suppression
  536. leaves the patient vulnerable to life-threatening opportunistic infections.
  537. Because every patient is different, immune suppression must be tailored
  538. for each patient.
  539. Furthermore, immune suppression must be tuned over time, as the immune
  540. system's activity is not static, nor is it held in a steady state [CITE?].
  541. In order to properly adjust the dosage of immune suppression drugs, it
  542. is necessary to monitor the health of the transplant and increase the dosage
  543. if evidence of rejection is observed.
  544. \end_layout
  545. \begin_layout Standard
  546. However, diagnosis of rejection is a significant challenge.
  547. Early diagnosis is essential in order to step up immune suppression before
  548. the immune system damages the graft beyond recovery [CITE?].
  549. The current gold standard test for graft rejection is a tissue biopsy,
  550. examined for visible signs of rejection by a trained histologist [CITE?].
  551. When a patient shows symptoms of possible rejection, a
  552. \begin_inset Quotes eld
  553. \end_inset
  554. for cause
  555. \begin_inset Quotes erd
  556. \end_inset
  557. biopsy is performed to confirm the diagnosis, and immune suppression is
  558. adjusted as necessary.
  559. However, in many cases, the early stages of rejection are asymptomatic,
  560. a condition known as
  561. \begin_inset Quotes eld
  562. \end_inset
  563. sub-clinical
  564. \begin_inset Quotes erd
  565. \end_inset
  566. rejection [CITE?].
  567. In light of this, it is now common to perform
  568. \begin_inset Quotes eld
  569. \end_inset
  570. protocol biopsies
  571. \begin_inset Quotes erd
  572. \end_inset
  573. at specific times after transplantation of a graft, even if no symptoms
  574. of rejection are apparent, in addition to
  575. \begin_inset Quotes eld
  576. \end_inset
  577. for cause
  578. \begin_inset Quotes erd
  579. \end_inset
  580. biopsies
  581. \begin_inset CommandInset citation
  582. LatexCommand cite
  583. key "Wilkinson2006"
  584. literal "false"
  585. \end_inset
  586. .
  587. \end_layout
  588. \begin_layout Standard
  589. However, biopsies have a number of downsides that limit their effectiveness
  590. as a diagnostic tool.
  591. First, the need for manual inspection by a histologist means that diagnosis
  592. is subject to the biases of the particular histologist examining the biopsy
  593. [CITE?].
  594. In marginal cases, two different histologists may give two different diagnoses
  595. to the same biopsy.
  596. Second, a biopsy can only evaluate if rejection is occurring in the section
  597. of the graft from which the tissue was extracted.
  598. If rejection is localized to one section of the graft and the tissue is
  599. extracted from a different section, a false negative diagnosis may result.
  600. Most importantly, extraction of tissue from a graft is invasive and is
  601. treated as an injury by the body, which results in inflammation that in
  602. turn promotes increased immune system activity [CITE?].
  603. Hence, the invasiveness of biopsies severely limits the frequency with
  604. which they can safely be performed.
  605. Typically, protocol biopsies are not scheduled more than about once per
  606. month
  607. \begin_inset CommandInset citation
  608. LatexCommand cite
  609. key "Wilkinson2006"
  610. literal "false"
  611. \end_inset
  612. .
  613. A less invasive diagnostic test for rejection would bring manifold benefits.
  614. Such a test would enable more frequent testing and therefore earlier detection
  615. of rejection events.
  616. In addition, having a larger pool of historical data for a given patient
  617. would make it easier to evaluate when a given test is outside the normal
  618. parameters for that specific patient, rather than relying on normal ranges
  619. for the population as a whole.
  620. Lastly, the accumulated data from more frequent tests would be a boon to
  621. the transplant research community.
  622. Beyond simply providing more data overall, the better time granularity
  623. of the tests would enable studying the progression of a rejection event
  624. on the scale of days to weeks, rather than months.
  625. \end_layout
  626. \begin_layout Subsubsection
  627. Memory cells are resistant to immune suppression
  628. \end_layout
  629. \begin_layout Standard
  630. One of the defining features of the adaptive immune system is immune memory:
  631. the ability of the immune system to recognize a previously encountered
  632. foreign antigen and respond more quickly and more strongly to that antigen
  633. in subsequent encounters.
  634. When the immune system first encounters a new antigen, the lymphocytes
  635. that respond are known as naïve cells – T-cells and B-cells that have never
  636. detected their target antigens before.
  637. Once activated by their specific antigen presented by an antigen-presenting
  638. cell in the proper co-stimulatory context, naïve cells differentiate into
  639. effector cells that carry out their respective functions in targeting and
  640. destroying the source of the foreign antigen.
  641. The requirement for co-stimulation is an important feature of naïve cells
  642. that limits
  643. \begin_inset Quotes eld
  644. \end_inset
  645. false positive
  646. \begin_inset Quotes erd
  647. \end_inset
  648. immune responses, because antigen-presenting cells usually only express
  649. the proper co-stimulation after detecting evidence of an infection, such
  650. as the presence of common bacterial cell components or inflamed tissue.
  651. Most effector cells die after the foreign antigen is cleared, since they
  652. are no longer needed, but some remain and differentiate into memory cells.
  653. Like naïve cells, memory cells respond to detection of their specific antigen
  654. by differentiating into effector cells, ready to fight an infection.
  655. However, unlike naïve cells, memory cells do not require the same degree
  656. of co-stimulatory signaling for activation, and once activated, they proliferat
  657. e and differentiate into effector cells more quickly than naïve cells do.
  658. \end_layout
  659. \begin_layout Standard
  660. In the context of a pathogenic infection, immune memory is a major advantage,
  661. allowing an organism to fight off a previously encountered pathogen
  662. much more quickly and effectively than the first time it was encountered.
  663. However, if effector cells that recognize an antigen from an allograft
  664. are allowed to differentiate into memory cells, preventing rejection of
  665. the graft becomes much more difficult.
  666. Many immune suppression drugs work by interfering with the co-stimulation
  667. that naïve cells require in order to mount an immune response [CITE?].
  668. Since memory cells do not require this co-stimulation, these drugs are
  669. not effective at suppressing an immune response that is mediated by memory
  670. cells.
  671. In addition, because memory cells are able to mount a stronger and faster
  672. response to an antigen, all else being equal they require stronger immune
  673. suppression than naïve cells to prevent an immune response.
  674. However, immune suppression affects the entire immune system, not just
  675. cells recognizing a specific antigen, so increasing the dosage of immune
  676. suppression drugs also increases the risk of complications from a compromised
  677. immune system, such as opportunistic infections.
  678. While the differences in cell surface markers between naïve and memory
  679. cells have been fairly well characterized, the internal regulatory mechanisms
  680. that allow memory cells to respond more quickly and without co-stimulation
  681. are still poorly understood.
  682. In order to develop methods of immune suppression that either prevent the
  683. formation of memory cells or work more effectively against memory cells,
  684. the mechanisms of immune memory formation and regulation must be better
  685. understood.
  686. \end_layout
  687. \begin_layout Subsection
  688. Overview of bioinformatic analysis methods
  689. \end_layout
  690. \begin_layout Standard
  691. \begin_inset Flex TODO Note (inline)
  692. status open
  693. \begin_layout Plain Layout
  694. Also cite: R, Bioconductor, snakemake, python, pandas, bedtools, bowtie2,
  695. hisat2, STAR, samtools, sra-toolkit, picard tools
  696. \end_layout
  697. \end_inset
  698. \end_layout
  699. \begin_layout Standard
  700. The studies presented in this work all involve the analysis of high-throughput
  701. genomic and epigenomic data.
  702. These data present many unique analysis challenges, and a wide array of
  703. software tools are available to analyze them.
  704. This section presents an overview of the methods used, including what problems
  705. they solve, what assumptions they make, and a basic description of how
  706. they work.
  707. \end_layout
  708. \begin_layout Subsubsection
  709. \begin_inset Flex Code
  710. status open
  711. \begin_layout Plain Layout
  712. Limma
  713. \end_layout
  714. \end_inset
  715. : The standard linear modeling framework for genomics
  716. \end_layout
  717. \begin_layout Standard
  718. Linear models are a generalization of the
  719. \begin_inset Formula $t$
  720. \end_inset
  721. -test and ANOVA to arbitrarily complex experimental designs
  722. \begin_inset CommandInset citation
  723. LatexCommand cite
  724. key "chambers:1992"
  725. literal "false"
  726. \end_inset
  727. .
  728. In a typical linear model, there is one dependent variable observation
  729. per sample and a large number of samples.
  730. For example, in a linear model of height as a function of age and sex,
  731. there is one height measurement per person.
  732. However, when analyzing genomic data, each sample consists of observations
  733. of thousands of dependent variables.
  734. For example, in an
  735. \begin_inset Flex Glossary Term
  736. status open
  737. \begin_layout Plain Layout
  738. RNA-seq
  739. \end_layout
  740. \end_inset
  741. experiment, the dependent variables may be the count of
  742. \begin_inset Flex Glossary Term
  743. status open
  744. \begin_layout Plain Layout
  745. RNA-seq
  746. \end_layout
  747. \end_inset
  748. reads for each annotated gene.
  749. In abstract terms, each dependent variable being measured is referred to
  750. as a feature.
  751. The simplest approach to analyzing such data would be to fit the same model
  752. independently to each feature.
  753. However, this is undesirable for most genomics data sets.
  754. Genomics assays like high-throughput sequencing are expensive, and often
  755. the process of generating the samples is also quite expensive and time-consumin
  756. g.
  757. This expense limits the sample sizes typically employed in genomics experiments
  758. , and as a result the statistical power of the linear model for each individual
  759. feature is likewise limited.
  760. However, because thousands of features from the same samples are analyzed
  761. together, there is an opportunity to improve the statistical power of the
  762. analysis by exploiting shared patterns of variation across features.
  763. This is the core feature of
  764. \begin_inset Flex Code
  765. status open
  766. \begin_layout Plain Layout
  767. limma
  768. \end_layout
  769. \end_inset
  770. , a linear modeling framework designed for genomic data.
  771. \begin_inset Flex Code
  772. status open
  773. \begin_layout Plain Layout
  774. Limma
  775. \end_layout
  776. \end_inset
  777. is typically used to analyze expression microarray data, and more recently
  778. \begin_inset Flex Glossary Term
  779. status open
  780. \begin_layout Plain Layout
  781. RNA-seq
  782. \end_layout
  783. \end_inset
  784. data, but it can also be used to analyze any other data for which linear
  785. modeling is appropriate.
  786. \end_layout
  787. \begin_layout Standard
  788. The central challenge when fitting a linear model is to estimate the variance
  789. of the data accurately.
  790. Out of all parameters required to evaluate statistical significance of
  791. an effect, the variance is the most difficult to estimate when sample sizes
  792. are small.
  793. A single shared variance could be estimated for all of the features together,
  794. and this estimate would be very stable, in contrast to the individual feature
  795. variance estimates.
  796. However, this would require the assumption that every feature is equally
  797. variable, which is known to be false for most genomic data sets.
  798. \begin_inset Flex Code
  799. status open
  800. \begin_layout Plain Layout
  801. limma
  802. \end_layout
  803. \end_inset
  804. offers a compromise between these two extremes by using a method called
  805. empirical Bayes moderation to
  806. \begin_inset Quotes eld
  807. \end_inset
  808. squeeze
  809. \begin_inset Quotes erd
  810. \end_inset
  811. the distribution of estimated variances toward a single common value that
  812. represents the variance of an average feature in the data
  813. \begin_inset CommandInset citation
  814. LatexCommand cite
  815. key "Smyth2004"
  816. literal "false"
  817. \end_inset
  818. .
  819. While the individual feature variance estimates are not stable, the common
  820. variance estimate for the entire data set is quite stable, so using a combinati
  821. on of the two yields a variance estimate for each feature with greater precision
  822. than the individual feature variances.
  823. The trade-off for this improvement is that squeezing each estimated variance
  824. toward the common value introduces some bias – the variance will be underestima
  825. ted for features with high variance and overestimated for features with
  826. low variance.
  827. Essentially,
  828. \begin_inset Flex Code
  829. status open
  830. \begin_layout Plain Layout
  831. limma
  832. \end_layout
  833. \end_inset
  834. assumes that extreme variances are less common than variances close to
  835. the common value.
  836. The variance estimates from this empirical Bayes procedure are shown empiricall
  837. y to yield greater statistical power than either the individual feature
  838. variances or the single common value.
  839. \end_layout
  840. \begin_layout Standard
  841. On top of this core framework,
  842. \begin_inset Flex Code
  843. status open
  844. \begin_layout Plain Layout
  845. limma
  846. \end_layout
  847. \end_inset
  848. also implements many other enhancements that further relax the assumptions
  849. of the model and extend the scope of what kinds of data it can analyze.
  850. Instead of squeezing toward a single common variance value,
  851. \begin_inset Flex Code
  852. status open
  853. \begin_layout Plain Layout
  854. limma
  855. \end_layout
  856. \end_inset
  857. can model the common variance as a function of a covariate, such as average
  858. expression
  859. \begin_inset CommandInset citation
  860. LatexCommand cite
  861. key "Law2013"
  862. literal "false"
  863. \end_inset
  864. .
  865. This is essential for
  866. \begin_inset Flex Glossary Term
  867. status open
  868. \begin_layout Plain Layout
  869. RNA-seq
  870. \end_layout
  871. \end_inset
  872. data, where higher gene counts yield more precise expression measurements
  873. and therefore smaller variances than low-count genes.
  874. While linear models typically assume that all samples have equal variance,
  875. \begin_inset Flex Code
  876. status open
  877. \begin_layout Plain Layout
  878. limma
  879. \end_layout
  880. \end_inset
  881. is able to relax this assumption by identifying and down-weighting samples
  882. that diverge more strongly from the linear model across many features
  883. \begin_inset CommandInset citation
  884. LatexCommand cite
  885. key "Ritchie2006,Liu2015"
  886. literal "false"
  887. \end_inset
  888. .
  889. In addition,
  890. \begin_inset Flex Code
  891. status open
  892. \begin_layout Plain Layout
  893. limma
  894. \end_layout
  895. \end_inset
  896. is also able to fit simple mixed models incorporating one random effect
  897. in addition to the fixed effects represented by an ordinary linear model
  898. \begin_inset CommandInset citation
  899. LatexCommand cite
  900. key "Smyth2005a"
  901. literal "false"
  902. \end_inset
  903. .
  904. Once again,
  905. \begin_inset Flex Code
  906. status open
  907. \begin_layout Plain Layout
  908. limma
  909. \end_layout
  910. \end_inset
  911. shares information between features to obtain a robust estimate for the
  912. random effect correlation.
  913. \end_layout
  914. \begin_layout Subsubsection
  915. \begin_inset Flex Code
  916. status open
  917. \begin_layout Plain Layout
  918. edgeR
  919. \end_layout
  920. \end_inset
  921. provides
  922. \begin_inset Flex Code
  923. status open
  924. \begin_layout Plain Layout
  925. limma
  926. \end_layout
  927. \end_inset
  928. -like analysis features for count data
  929. \end_layout
  930. \begin_layout Standard
  931. Although
  932. \begin_inset Flex Code
  933. status open
  934. \begin_layout Plain Layout
  935. limma
  936. \end_layout
  937. \end_inset
  938. can be applied to read counts from
  939. \begin_inset Flex Glossary Term
  940. status open
  941. \begin_layout Plain Layout
  942. RNA-seq
  943. \end_layout
  944. \end_inset
  945. data, it is less suitable for counts from
  946. \begin_inset Flex Glossary Term
  947. status open
  948. \begin_layout Plain Layout
  949. ChIP-seq
  950. \end_layout
  951. \end_inset
  952. data, which tend to be much smaller and therefore violate the assumption
  953. of a normal distribution more severely.
  954. For all count-based data, the
  955. \begin_inset Flex Code
  956. status open
  957. \begin_layout Plain Layout
  958. edgeR
  959. \end_layout
  960. \end_inset
  961. package works similarly to
  962. \begin_inset Flex Code
  963. status open
  964. \begin_layout Plain Layout
  965. limma
  966. \end_layout
  967. \end_inset
  968. , but uses a generalized linear model (GLM) instead of a linear model.
  969. The most important difference is that the GLM in
  970. \begin_inset Flex Code
  971. status open
  972. \begin_layout Plain Layout
  973. edgeR
  974. \end_layout
  975. \end_inset
  976. models the counts directly using a negative binomial distribution rather
  977. than modeling the normalized log counts using a normal distribution
  978. \begin_inset CommandInset citation
  979. LatexCommand cite
  980. key "Chen2014,McCarthy2012,Robinson2010a"
  981. literal "false"
  982. \end_inset
  983. .
  984. The negative binomial is a good fit for count data because it can be derived
  985. as a gamma-distributed mixture of Poisson distributions.
  986. The Poisson distribution accurately represents the distribution of counts
  987. expected for a given gene abundance, and the gamma distribution is then
  988. used to represent the variation in gene abundance between biological replicates.
  989. For this reason, the square root of the dispersion parameter of the negative
  990. binomial is sometimes referred to as the biological coefficient of variation,
  991. since it represents the variability that was present in the samples prior
  992. to the Poisson
  993. \begin_inset Quotes eld
  994. \end_inset
  995. noise
  996. \begin_inset Quotes erd
  997. \end_inset
  998. that was generated by the random sampling of reads in proportion to feature
  999. abundances.
  1000. The choice of a gamma distribution is arbitrary and motivated by mathematical
  1001. convenience, since a gamma-Poisson mixture yields the numerically tractable
  1002. negative binomial distribution.
  1003. Thus,
  1004. \begin_inset Flex Code
  1005. status open
  1006. \begin_layout Plain Layout
  1007. edgeR
  1008. \end_layout
  1009. \end_inset
  1010. assumes
  1011. \emph on
  1012. a priori
  1013. \emph default
  1014. that the variation in abundances between replicates follows a gamma distribution.
  1015. For differential abundance testing,
  1016. \begin_inset Flex Code
  1017. status open
  1018. \begin_layout Plain Layout
  1019. edgeR
  1020. \end_layout
  1021. \end_inset
  1022. offers a likelihood ratio test, but more recently recommends a quasi-likelihood
  1023. test that properly factors the uncertainty in variance estimation into
  1024. the statistical significance for each feature
  1025. \begin_inset CommandInset citation
  1026. LatexCommand cite
  1027. key "Lund2012"
  1028. literal "false"
  1029. \end_inset
  1030. .
  1031. \end_layout
  1032. \begin_layout Subsubsection
  1033. ChIP-seq peak calling
  1034. \end_layout
  1035. \begin_layout Standard
  1036. Unlike
  1037. \begin_inset Flex Glossary Term
  1038. status open
  1039. \begin_layout Plain Layout
  1040. RNA-seq
  1041. \end_layout
  1042. \end_inset
  1043. data, in which gene annotations provide a well-defined set of discrete
  1044. genomic regions in which to count reads, ChIP-seq reads can potentially
  1045. occur anywhere in the genome.
  1046. However, most genome regions will not contain significant ChIP-seq read
  1047. coverage, and analyzing every position in the entire genome is statistically
  1048. and computationally infeasible, so it is necessary to identify regions
  1049. of interest inside which ChIP-seq reads will be counted and analyzed.
  1050. One option is to define a set of interesting regions
  1051. \emph on
  1052. a priori
  1053. \emph default
  1054. , for example by defining a promoter region for each annotated gene.
  1055. However, it is also possible to use the ChIP-seq data itself to identify
  1056. regions with ChIP-seq read coverage significantly above the background
  1057. level, known as peaks.
  1058. \end_layout
  1059. \begin_layout Standard
  1060. There are generally two kinds of peaks that can be identified: narrow peaks
  1061. and broadly enriched regions.
  1062. Proteins like transcription factors that bind specific sites in the genome
  1063. typically show most of their ChIP-seq read coverage at these specific sites
  1064. and very little coverage anywhere else.
  1065. Because the footprint of the protein is consistent wherever it binds, each
  1066. peak has a consistent width, typically tens to hundreds of base pairs,
  1067. representing the length of DNA that it binds to.
  1068. Algorithms like MACS exploit this pattern to identify specific loci at
  1069. which such
  1070. \begin_inset Quotes eld
  1071. \end_inset
  1072. narrow peaks
  1073. \begin_inset Quotes erd
  1074. \end_inset
  1075. occur by looking for the characteristic peak shape in the ChIP-seq coverage
  1076. rising above the surrounding background coverage
  1077. \begin_inset CommandInset citation
  1078. LatexCommand cite
  1079. key "Zhang2008"
  1080. literal "false"
  1081. \end_inset
  1082. .
  1083. In contrast, some proteins, chief among them histones, do not bind only
  1084. at a small number of specific sites, but rather bind potentially almost
  1085. everywhere in the entire genome.
  1086. When looking at histone marks, adjacent histones tend to be similarly marked,
  1087. and a given mark may be present on an arbitrary number of consecutive histones
  1088. along the genome.
  1089. Hence, there is no consistent
  1090. \begin_inset Quotes eld
  1091. \end_inset
  1092. footprint size
  1093. \begin_inset Quotes erd
  1094. \end_inset
  1095. for ChIP-seq peaks based on histone marks, and peaks typically span many
  1096. histones.
  1097. As a result, typical peaks span many hundreds or even thousands of base pairs.
  1098. Instead of identifying specific loci of strong enrichment, algorithms like
  1099. SICER assume that peaks are represented in the ChIP-seq data by modest
  1100. enrichment above background occurring across broad regions, and they attempt
  1101. to identify the extent of those regions
  1102. \begin_inset CommandInset citation
  1103. LatexCommand cite
  1104. key "Zang2009"
  1105. literal "false"
  1106. \end_inset
  1107. .
  1108. In all cases, better results are obtained if the local background coverage
  1109. level can be estimated from ChIP-seq input samples, since various biases
  1110. can result in uneven background coverage.
  1111. \end_layout
  1112. \begin_layout Standard
  1113. Regardless of the type of peak identified, it is important to identify peaks
  1114. that occur consistently across biological replicates.
  1115. The ENCODE project has developed a method called irreproducible discovery
  1116. rate (IDR) for this purpose
  1117. \begin_inset CommandInset citation
  1118. LatexCommand cite
  1119. key "Li2006"
  1120. literal "false"
  1121. \end_inset
  1122. .
  1123. The IDR is defined as the probability that a peak identified in one biological
  1124. replicate will
  1125. \emph on
  1126. not
  1127. \emph default
  1128. also be identified in a second replicate.
  1129. Where the more familiar false discovery rate measures the degree of corresponde
  1130. nce between a data-derived ranked list and the true list of significant
  1131. features, IDR instead measures the degree of correspondence between two
  1132. ranked lists derived from different data.
  1133. IDR assumes that the highest-ranked features are
  1134. \begin_inset Quotes eld
  1135. \end_inset
  1136. signal
  1137. \begin_inset Quotes erd
  1138. \end_inset
  1139. peaks that tend to be listed in the same order in both lists, while the
  1140. lowest-ranked features are essentially noise peaks, listed in random order
  1141. with no correspondence between the lists.
  1142. IDR attempts to locate the
  1143. \begin_inset Quotes eld
  1144. \end_inset
  1145. crossover point
  1146. \begin_inset Quotes erd
  1147. \end_inset
  1148. between the signal and the noise by determining how far down the list the
  1149. correspondence between feature ranks breaks down.
  1150. \end_layout
  1151. \begin_layout Standard
  1152. In addition to other considerations, if called peaks are to be used as regions
  1153. of interest for differential abundance analysis, then care must be taken
  1154. to call peaks in a way that is blind to differential abundance between
  1155. experimental conditions, or else the statistical significance calculations
  1156. for differential abundance will overstate their confidence in the results.
  1157. The
  1158. \begin_inset Flex Code
  1159. status open
  1160. \begin_layout Plain Layout
  1161. csaw
  1162. \end_layout
  1163. \end_inset
  1164. package provides guidelines for calling peaks in this way: peaks are called
  1165. based on a combination of all ChIP-seq reads from all experimental conditions,
  1166. so that the identified peaks are based on the average abundance across
  1167. all conditions, which is independent of any differential abundance between
  1168. conditions
  1169. \begin_inset CommandInset citation
  1170. LatexCommand cite
  1171. key "Lun2015a"
  1172. literal "false"
  1173. \end_inset
  1174. .
  1175. \end_layout
  1176. \begin_layout Subsubsection
  1177. Normalization of high-throughput data is non-trivial and application-dependent
  1178. \end_layout
  1179. \begin_layout Standard
  1180. High-throughput data sets invariably require some kind of normalization
  1181. before further analysis can be conducted.
  1182. In general, the goal of normalization is to remove effects in the data
  1183. that are caused by technical factors that have nothing to do with the biology
  1184. being studied.
  1185. \end_layout
  1186. \begin_layout Standard
  1187. For Affymetrix expression arrays, the standard normalization algorithm used
  1188. in most analyses is Robust Multichip Average (RMA) [CITE].
  1189. RMA is designed with the assumption that some fraction of probes on each
  1190. array will be artifactual and takes advantage of the fact that each gene
  1191. is represented by multiple probes by implementing normalization and summarizati
  1192. on steps that are robust against outlier probes.
  1193. However, RMA uses the probe intensities of all arrays in the data set in
  1194. the normalization of each individual array, meaning that the normalized
  1195. expression values in each array depend on every array in the data set,
  1196. and will necessarily change each time an array is added to or removed from
  1197. the data set.
  1198. If this is undesirable, frozen RMA implements a variant of RMA where the
  1199. relevant distributional parameters are learned from a large reference set
  1200. of diverse public array data sets and then
  1201. \begin_inset Quotes eld
  1202. \end_inset
  1203. frozen
  1204. \begin_inset Quotes erd
  1205. \end_inset
  1206. , so that each array is effectively normalized against this frozen reference
  1207. set rather than the other arrays in the data set under study [CITE].
  1208. Other array normalization methods considered include dChip, GRSN, and SCAN
  1209. [CITEx3].
  1210. \end_layout
  1211. \begin_layout Standard
  1212. In contrast, high-throughput sequencing data present very different normalizatio
  1213. n challenges.
  1214. The simplest case is
  1215. \begin_inset Flex Glossary Term
  1216. status open
  1217. \begin_layout Plain Layout
  1218. RNA-seq
  1219. \end_layout
  1220. \end_inset
  1221. in which read counts are obtained for a set of gene annotations, yielding
  1222. a matrix of counts with rows representing genes and columns representing
  1223. samples.
  1224. Because
  1225. \begin_inset Flex Glossary Term
  1226. status open
  1227. \begin_layout Plain Layout
  1228. RNA-seq
  1229. \end_layout
  1230. \end_inset
  1231. approximates a process of sampling from a population with replacement,
  1232. each gene's count is only interpretable as a fraction of the total reads
  1233. for that sample.
  1234. For that reason,
  1235. \begin_inset Flex Glossary Term
  1236. status open
  1237. \begin_layout Plain Layout
  1238. RNA-seq
  1239. \end_layout
  1240. \end_inset
  1241. abundances are often reported as counts per million (CPM).
  1242. Furthermore, if the abundance of a single gene increases, then in order
  1243. for its fraction of the total reads to increase, all other genes' fractions
  1244. must decrease to accommodate it.
  1245. This effect is known as composition bias, and it is an artifact of the
  1246. read sampling process that has nothing to do with the biology of the samples
  1247. and must therefore be normalized out.
  1248. The most commonly used methods to normalize for composition bias in
  1249. \begin_inset Flex Glossary Term
  1250. status open
  1251. \begin_layout Plain Layout
  1252. RNA-seq
  1253. \end_layout
  1254. \end_inset
  1255. data seek to equalize the average gene abundance across samples, under
  1256. the assumption that the average gene is likely not changing
  1257. \begin_inset CommandInset citation
  1258. LatexCommand cite
  1259. key "Robinson2010,Anders2010"
  1260. literal "false"
  1261. \end_inset
  1262. .
  1263. \end_layout
  1264. \begin_layout Standard
  1265. In ChIP-seq data, normalization is not as straightforward.
  1266. The
  1267. \begin_inset Flex Code
  1268. status open
  1269. \begin_layout Plain Layout
  1270. csaw
  1271. \end_layout
  1272. \end_inset
  1273. package implements several different normalization strategies and provides
  1274. guidance on when to use each one
  1275. \begin_inset CommandInset citation
  1276. LatexCommand cite
  1277. key "Lun2015a"
  1278. literal "false"
  1279. \end_inset
  1280. .
  1281. Briefly, a typical ChIP-seq sample has a bimodal distribution of read counts:
  1282. a low-abundance mode representing background regions and a high-abundance
  1283. mode representing signal regions.
  1284. This offers two potential normalization targets: equalizing background
  1285. coverage or equalizing signal coverage.
  1286. If the experiment is well controlled and ChIP efficiency is known to be
  1287. consistent across all samples, then normalizing the background coverage
  1288. to be equal across all samples is a reasonable strategy.
  1289. If this is not a safe assumption, then the preferred strategy is to normalize
  1290. the signal regions in a way similar to
  1291. \begin_inset Flex Glossary Term
  1292. status open
  1293. \begin_layout Plain Layout
  1294. RNA-seq
  1295. \end_layout
  1296. \end_inset
  1297. data by assuming that the average signal region is not changing abundance
  1298. between samples.
  1299. Beyond this, if a ChIP-seq experiment has a more complicated structure
  1300. that doesn't show the typical bimodal count distribution, it may be necessary
  1301. to implement a normalization as a smooth function of abundance.
  1302. However, this strategy makes a much stronger assumption about the data:
  1303. that the average log fold change is zero across all abundance levels.
  1304. Hence, the simpler scaling normalizations based on background or signal
  1305. regions are generally preferred whenever possible.
  1306. \end_layout
  1307. \begin_layout Subsubsection
  1308. ComBat and SVA for correction of known and unknown batch effects
  1309. \end_layout
  1310. \begin_layout Standard
  1311. In addition to well-understood effects that can be easily normalized out,
  1312. a data set often contains confounding biological effects that must be accounted
  1313. for in the modeling step.
  1314. For instance, in an experiment with pre-treatment and post-treatment samples
  1315. of cells from several different donors, donor variability represents a
  1316. known batch effect.
  1317. The most straightforward correction for known batches is to estimate the
  1318. mean for each batch independently and subtract out the differences, so
  1319. that all batches have identical means for each feature.
  1320. However, as with variance estimation, estimating the differences in batch
  1321. means is not necessarily robust at the feature level, so the ComBat method
  1322. adds empirical Bayes squeezing of the batch mean differences toward a common
  1323. value, analogous to
  1324. \begin_inset Flex Code
  1325. status open
  1326. \begin_layout Plain Layout
  1327. limma
  1328. \end_layout
  1329. \end_inset
  1330. 's empirical Bayes squeezing of feature variance estimates
  1331. \begin_inset CommandInset citation
  1332. LatexCommand cite
  1333. key "Johnson2007"
  1334. literal "false"
  1335. \end_inset
  1336. .
  1337. Effectively, ComBat assumes that modest differences between batch means
  1338. are real batch effects, but extreme differences between batch means are
  1339. more likely to be the result of outlier observations that happen to line
  1340. up with the batches rather than a genuine batch effect.
  1341. The result is a batch correction that is more robust against outliers than
  1342. simple subtraction of mean differences.
  1343. \end_layout
  1344. \begin_layout Standard
  1345. In some data sets, unknown batch effects may be present due to inherent
  1346. variability in the data, caused by either technical or biological effects.
  1347. Examples of unknown batch effects include variations in enrichment efficiency
  1348. between ChIP-seq samples, variations in populations of different cell types,
  1349. and the effects of uncontrolled environmental factors on gene expression
  1350. in humans or live animals.
  1351. In an ordinary linear model context, unknown batch effects cannot be inferred
  1352. and must be treated as random noise.
  1353. However, in high-throughput experiments, once again information can be
  1354. shared across features to identify patterns of un-modeled variation that
  1355. are repeated in many features.
  1356. One attractive strategy would be to perform singular value decomposition
  1357. (SVD) on the matrix of linear model residuals (which contain all the un-modeled
  1358. variation in the data) and take the first few singular vectors as batch
  1359. effects.
  1360. While this can be effective, it makes the unreasonable assumption that
  1361. all batch effects are uncorrelated with any of the effects being modeled.
  1362. Surrogate variable analysis (SVA) starts with this approach, but takes
  1363. some additional steps to identify batch effects in the full data that are
  1364. both highly correlated with the singular vectors in the residuals and least
  1365. correlated with the effects of interest
  1366. \begin_inset CommandInset citation
  1367. LatexCommand cite
  1368. key "Leek2007"
  1369. literal "false"
  1370. \end_inset
  1371. .
  1372. Since the final batch effects are estimated from the full data, moderate
  1373. correlations between the batch effects and effects of interest are allowed,
  1374. which gives SVA much more freedom to estimate the true extent of the batch
  1375. effects compared to simple residual SVD.
  1376. Once the surrogate variables are estimated, they can be included as covariate
  1377. s in the linear model in a similar fashion to known batch effects in order
  1378. to subtract out their effects on each feature's abundance.
  1379. \end_layout
  1380. \begin_layout Subsubsection
  1381. Factor analysis: PCA, MDS, MOFA
  1382. \end_layout
  1383. \begin_layout Standard
  1384. \begin_inset Flex TODO Note (inline)
  1385. status open
  1386. \begin_layout Plain Layout
  1387. Not sure if this merits a subsection here.
  1388. \end_layout
  1389. \end_inset
  1390. \end_layout
  1391. \begin_layout Itemize
  1392. Batch-corrected PCA is informative, but careful application is required
  1393. to avoid bias
  1394. \end_layout
  1395. \begin_layout Section
  1396. Innovation
  1397. \end_layout
  1398. \begin_layout Standard
  1399. \begin_inset Flex TODO Note (inline)
  1400. status open
  1401. \begin_layout Plain Layout
  1402. Is this entire section redundant with the Approach sections of each chapter?
  1403. I'm not really sure what to write here.
  1404. \end_layout
  1405. \end_inset
  1406. \end_layout
  1407. \begin_layout Subsection
  1408. MSC infusion to improve transplant outcomes (prevent/delay rejection)
  1409. \end_layout
  1410. \begin_layout Standard
  1411. \begin_inset Flex TODO Note (inline)
  1412. status open
  1413. \begin_layout Plain Layout
  1414. Do I still talk about this? It's the motivation for chapter 4, but I don't
  1415. actually present any work related to MSCs.
  1416. \end_layout
  1417. \end_inset
  1418. \end_layout
  1419. \begin_layout Itemize
  1420. Demonstrated in mice, but not yet in primates
  1421. \end_layout
  1422. \begin_layout Itemize
  1423. Mechanism currently unknown, but MSCs are known to be immunomodulatory
  1424. \end_layout
  1425. \begin_layout Itemize
  1426. Characterize MSC response to interferon gamma
  1427. \end_layout
  1428. \begin_layout Itemize
  1429. IFN-g is thought to stimulate their function
  1430. \end_layout
  1431. \begin_layout Itemize
  1432. Test IFN-g treated MSC infusion as a therapy to delay graft rejection in
  1433. cynomolgus monkeys
  1434. \end_layout
  1435. \begin_layout Itemize
  1436. Monitor animals post-transplant using blood
  1437. \begin_inset Flex Glossary Term
  1438. status open
  1439. \begin_layout Plain Layout
  1440. RNA-seq
  1441. \end_layout
  1442. \end_inset
  1443. at serial time points
  1444. \end_layout
  1445. \begin_layout Subsection
  1446. Investigate dynamics of histone marks in CD4 T-cell activation and memory
  1447. \end_layout
  1448. \begin_layout Itemize
  1449. Previous studies have looked at single snapshots of histone marks
  1450. \end_layout
  1451. \begin_layout Itemize
  1452. Instead, look at changes in histone marks across activation and memory
  1453. \end_layout
  1454. \begin_layout Subsection
  1455. High-throughput sequencing and microarray technologies
  1456. \end_layout
  1457. \begin_layout Itemize
  1458. Powerful methods for assaying gene expression and epigenetics across entire
  1459. genomes
  1460. \end_layout
  1461. \begin_layout Itemize
  1462. Proper analysis requires finding and exploiting systematic genome-wide trends
  1463. \end_layout
  1464. \begin_layout Chapter
  1465. Reproducible genome-wide epigenetic analysis of H3K4 and H3K27 methylation
  1466. in naïve and memory CD4 T-cell activation
  1467. \end_layout
  1468. \begin_layout Standard
  1469. \begin_inset Flex TODO Note (inline)
  1470. status open
  1471. \begin_layout Plain Layout
  1472. Chapter author list: Me, Sarah, Dan
  1473. \end_layout
  1474. \end_inset
  1475. \end_layout
  1476. \begin_layout Standard
  1477. \begin_inset ERT
  1478. status collapsed
  1479. \begin_layout Plain Layout
  1480. \backslash
  1481. glsresetall
  1482. \end_layout
  1483. \end_inset
  1484. \end_layout
  1485. \begin_layout Standard
  1486. \begin_inset Flex TODO Note (inline)
  1487. status open
  1488. \begin_layout Plain Layout
  1489. Need better section titles throughout the entire chapter
  1490. \end_layout
  1491. \end_inset
  1492. \end_layout
  1493. \begin_layout Section
  1494. Approach
  1495. \end_layout
  1496. \begin_layout Standard
  1497. \begin_inset Flex TODO Note (inline)
  1498. status open
  1499. \begin_layout Plain Layout
  1500. Check on the exact correct way to write
  1501. \begin_inset Quotes eld
  1502. \end_inset
  1503. CD4 T-cell
  1504. \begin_inset Quotes erd
  1505. \end_inset
  1506. .
  1507. I think there might be a plus sign somewhere in there now? Also, maybe
  1508. figure out a reasonable way to abbreviate
  1509. \begin_inset Quotes eld
  1510. \end_inset
  1511. naïve CD4 T-cells
  1512. \begin_inset Quotes erd
  1513. \end_inset
  1514. and
  1515. \begin_inset Quotes eld
  1516. \end_inset
  1517. memory CD4 T-cells
  1518. \begin_inset Quotes erd
  1519. \end_inset
  1520. .
  1521. \end_layout
  1522. \end_inset
  1523. \end_layout
  1524. \begin_layout Standard
  1525. \begin_inset Flex TODO Note (inline)
  1526. status open
  1527. \begin_layout Plain Layout
  1528. Is it ok to just copy a bunch of citations from the intros to Sarah's papers?
  1529. That feels like cheating somehow.
  1530. \end_layout
  1531. \end_inset
  1532. \end_layout
  1533. \begin_layout Standard
  1534. CD4 T-cells are central to all adaptive immune responses, as well as immune
  1535. memory [CITE?].
  1536. After an infection is cleared, a subset of the naïve CD4 T-cells that responded
  1537. to that infection differentiate into memory CD4 T-cells, which are responsible
  1538. for responding to the same pathogen in the future.
  1539. Memory CD4 T-cells are functionally distinct, able to respond to an infection
  1540. more quickly and without the co-stimulation required by naïve CD4 T-cells.
  1541. However, the molecular mechanisms underlying this functional distinction
  1542. are not well understood.
  1543. Epigenetic regulation via histone modification is thought to play an important
  1544. role, but while many studies have looked at static snapshots of histone
  1545. methylation in T-cells, few studies have looked at the dynamics of histone
  1546. regulation after T-cell activation or at the differences in histone methylation
  1547. between naïve and memory T-cells.
  1548. H3K4me2, H3K4me3 and H3K27me3 are three histone marks thought to be major
  1549. epigenetic regulators of gene expression.
  1550. The goal of the present study is to investigate the role of these histone
  1551. marks in CD4 T-cell activation kinetics and memory differentiation.
  1552. In static snapshots, H3K4me2 and H3K4me3 are often observed in the promoters
  1553. of highly transcribed genes, while H3K27me3 is more often observed in promoters
  1554. of inactive genes with little to no transcription occurring.
  1555. As a result, the two H3K4 marks have been characterized as
  1556. \begin_inset Quotes eld
  1557. \end_inset
  1558. activating
  1559. \begin_inset Quotes erd
  1560. \end_inset
  1561. marks, while H3K27me3 has been characterized as
  1562. \begin_inset Quotes eld
  1563. \end_inset
  1564. deactivating
  1565. \begin_inset Quotes erd
  1566. \end_inset
  1567. .
  1568. Despite these characterizations, the actual causal relationship between
  1569. these histone modifications and gene transcription is complex and likely
  1570. involves positive and negative feedback loops between the two.
  1571. \end_layout
  1572. \begin_layout Standard
  1573. In order to investigate the relationship between gene expression and these
  1574. histone modifications in the context of naïve and memory CD4 T-cell activation,
  1575. a previously published data set of
  1576. \begin_inset Flex Glossary Term
  1577. status open
  1578. \begin_layout Plain Layout
  1579. RNA-seq
  1580. \end_layout
  1581. \end_inset
  1582. data and ChIP-seq data was re-analyzed using up-to-date methods designed
  1583. to address the specific analysis challenges posed by this data set.
  1584. The data set contains naïve and memory CD4 T-cell samples in a time course
  1585. before and after activation.
  1586. Like the original analysis, this analysis looks at the dynamics of these
  1587. histone marks and compares them to gene expression dynamics at the
  1588. same time points during activation, as well as compares them between naïve
  1589. and memory cells, in hope of discovering evidence of new mechanistic details
  1590. in the interplay between them.
  1591. The original analysis of this data treated each gene promoter as a monolithic
  1592. unit and mostly assumed that ChIP-seq reads or peaks occurring anywhere
  1593. within a promoter were equivalent, regardless of where they occurred relative
  1594. to the gene structure.
  1595. For an initial analysis of the data, this was a necessary simplifying assumptio
  1596. n.
  1597. The current analysis aims to relax this assumption, first by directly analyzing
  1598. ChIP-seq peaks for differential modification, and second by taking a more
  1599. granular look at the ChIP-seq read coverage within promoter regions to
  1600. ask whether the location of histone modifications relative to the gene's
  1601. TSS is an important factor, as opposed to simple proximity.
  1602. \end_layout
  1603. \begin_layout Section
  1604. Methods
  1605. \end_layout
  1606. \begin_layout Standard
  1607. \begin_inset Flex TODO Note (inline)
  1608. status open
  1609. \begin_layout Plain Layout
  1610. Look up some more details from the papers (e.g.
  1611. activation method).
  1612. \end_layout
  1613. \end_inset
  1614. \end_layout
  1615. \begin_layout Standard
  1616. A reproducible workflow was written to analyze the raw ChIP-seq and
  1617. \begin_inset Flex Glossary Term
  1618. status open
  1619. \begin_layout Plain Layout
  1620. RNA-seq
  1621. \end_layout
  1622. \end_inset
  1623. data from previous studies
  1624. \begin_inset CommandInset citation
  1625. LatexCommand cite
  1626. key "gh-cd4-csaw,LaMere2016,LaMere2017"
  1627. literal "true"
  1628. \end_inset
  1629. .
  1630. Briefly, this data consists of
  1631. \begin_inset Flex Glossary Term
  1632. status open
  1633. \begin_layout Plain Layout
  1634. RNA-seq
  1635. \end_layout
  1636. \end_inset
  1637. and ChIP-seq from CD4 T-cells cultured from 4 donors.
  1638. From each donor, naïve and memory CD4 T-cells were isolated separately.
  1639. Then cultures of both cell types were activated [how?], and samples were taken
  1640. at 4 time points: Day 0 (pre-activation), Day 1 (early activation), Day
  1641. 5 (peak activation), and Day 14 (post-activation).
  1642. For each combination of cell type and time point, RNA was isolated and
  1643. sequenced, and ChIP-seq was performed for each of 3 histone marks: H3K4me2,
  1644. H3K4me3, and H3K27me3.
  1645. The ChIP-seq input DNA was also sequenced for each sample.
  1646. The result was 32 samples for each assay.
  1647. \end_layout
  1648. \begin_layout Subsection
  1649. RNA-seq differential expression analysis
  1650. \end_layout
  1651. \begin_layout Standard
  1652. \begin_inset Note Note
  1653. status collapsed
  1654. \begin_layout Plain Layout
  1655. \begin_inset Float figure
  1656. wide false
  1657. sideways false
  1658. status open
  1659. \begin_layout Plain Layout
  1660. \align center
  1661. \begin_inset Float figure
  1662. wide false
  1663. sideways false
  1664. status collapsed
  1665. \begin_layout Plain Layout
  1666. \align center
  1667. \begin_inset Graphics
  1668. filename graphics/CD4-csaw/rnaseq-compare/ensmebl-vs-entrez-star-CROP.png
  1669. lyxscale 25
  1670. width 35col%
  1671. groupId rna-comp-subfig
  1672. \end_inset
  1673. \end_layout
  1674. \begin_layout Plain Layout
  1675. \begin_inset Caption Standard
  1676. \begin_layout Plain Layout
  1677. STAR quantification, Entrez vs Ensembl gene annotation
  1678. \end_layout
  1679. \end_inset
  1680. \end_layout
  1681. \end_inset
  1682. \begin_inset space \qquad{}
  1683. \end_inset
  1684. \begin_inset Float figure
  1685. wide false
  1686. sideways false
  1687. status collapsed
  1688. \begin_layout Plain Layout
  1689. \align center
  1690. \begin_inset Graphics
  1691. filename graphics/CD4-csaw/rnaseq-compare/ensmebl-vs-entrez-shoal-CROP.png
  1692. lyxscale 25
  1693. width 35col%
  1694. groupId rna-comp-subfig
  1695. \end_inset
  1696. \end_layout
  1697. \begin_layout Plain Layout
  1698. \begin_inset Caption Standard
  1699. \begin_layout Plain Layout
  1700. Salmon+Shoal quantification, Entrez vs Ensembl gene annotation
  1701. \end_layout
  1702. \end_inset
  1703. \end_layout
  1704. \end_inset
  1705. \end_layout
  1706. \begin_layout Plain Layout
  1707. \align center
  1708. \begin_inset Float figure
  1709. wide false
  1710. sideways false
  1711. status collapsed
  1712. \begin_layout Plain Layout
  1713. \align center
  1714. \begin_inset Graphics
  1715. filename graphics/CD4-csaw/rnaseq-compare/star-vs-hisat2-CROP.png
  1716. lyxscale 25
  1717. width 35col%
  1718. groupId rna-comp-subfig
  1719. \end_inset
  1720. \end_layout
  1721. \begin_layout Plain Layout
  1722. \begin_inset Caption Standard
  1723. \begin_layout Plain Layout
  1724. STAR vs HISAT2 quantification, Ensembl gene annotation
  1725. \end_layout
  1726. \end_inset
  1727. \end_layout
  1728. \end_inset
  1729. \begin_inset space \qquad{}
  1730. \end_inset
  1731. \begin_inset Float figure
  1732. wide false
  1733. sideways false
  1734. status collapsed
  1735. \begin_layout Plain Layout
  1736. \align center
  1737. \begin_inset Graphics
  1738. filename graphics/CD4-csaw/rnaseq-compare/star-vs-salmon-CROP.png
  1739. lyxscale 25
  1740. width 35col%
  1741. groupId rna-comp-subfig
  1742. \end_inset
  1743. \end_layout
  1744. \begin_layout Plain Layout
  1745. \begin_inset Caption Standard
  1746. \begin_layout Plain Layout
  1747. Salmon vs STAR quantification, Ensembl gene annotation
  1748. \end_layout
  1749. \end_inset
  1750. \end_layout
  1751. \end_inset
  1752. \end_layout
  1753. \begin_layout Plain Layout
  1754. \align center
  1755. \begin_inset Float figure
  1756. wide false
  1757. sideways false
  1758. status collapsed
  1759. \begin_layout Plain Layout
  1760. \align center
  1761. \begin_inset Graphics
  1762. filename graphics/CD4-csaw/rnaseq-compare/salmon-vs-kallisto-CROP.png
  1763. lyxscale 25
  1764. width 35col%
  1765. groupId rna-comp-subfig
  1766. \end_inset
  1767. \end_layout
  1768. \begin_layout Plain Layout
  1769. \begin_inset Caption Standard
  1770. \begin_layout Plain Layout
  1771. Salmon vs Kallisto quantification, Ensembl gene annotation
  1772. \end_layout
  1773. \end_inset
  1774. \end_layout
  1775. \end_inset
  1776. \begin_inset space \qquad{}
  1777. \end_inset
  1778. \begin_inset Float figure
  1779. wide false
  1780. sideways false
  1781. status collapsed
  1782. \begin_layout Plain Layout
  1783. \align center
  1784. \begin_inset Graphics
  1785. filename graphics/CD4-csaw/rnaseq-compare/salmon-vs-shoal-CROP.png
  1786. lyxscale 25
  1787. width 35col%
  1788. groupId rna-comp-subfig
  1789. \end_inset
  1790. \end_layout
  1791. \begin_layout Plain Layout
  1792. \begin_inset Caption Standard
  1793. \begin_layout Plain Layout
  1794. Salmon+Shoal vs Salmon alone, Ensembl gene annotation
  1795. \end_layout
  1796. \end_inset
  1797. \end_layout
  1798. \end_inset
  1799. \end_layout
  1800. \begin_layout Plain Layout
  1801. \begin_inset Caption Standard
  1802. \begin_layout Plain Layout
  1803. \begin_inset CommandInset label
  1804. LatexCommand label
  1805. name "fig:RNA-norm-comp"
  1806. \end_inset
  1807. RNA-seq comparisons
  1808. \end_layout
  1809. \end_inset
  1810. \end_layout
  1811. \end_inset
  1812. \end_layout
  1813. \end_inset
  1814. \end_layout
  1815. \begin_layout Standard
  1816. Sequence reads were retrieved from the Sequence Read Archive (SRA)
  1817. \begin_inset CommandInset citation
  1818. LatexCommand cite
  1819. key "Leinonen2011"
  1820. literal "false"
  1821. \end_inset
  1822. .
  1823. Five different alignment and quantification methods were tested for the
  1824. \begin_inset Flex Glossary Term
  1825. status open
  1826. \begin_layout Plain Layout
  1827. RNA-seq
  1828. \end_layout
  1829. \end_inset
  1830. data
  1831. \begin_inset CommandInset citation
  1832. LatexCommand cite
  1833. key "Dobin2012,Kim2019,Liao2014,Pimentel2016,Patro2017,gh-shoal,gh-hg38-ref"
  1834. literal "false"
  1835. \end_inset
  1836. .
  1837. Each quantification was tested with both Ensembl transcripts and UCSC known
  1838. gene annotations [CITE? Also which versions of each?].
  1839. Comparisons of downstream results from each combination of quantification
  1840. method and reference revealed that all quantifications gave broadly similar
  1841. results for most genes, so shoal with the Ensembl annotation was chosen
  1842. as the method theoretically most likely to partially mitigate some of the
  1843. batch effect in the data.
  1844. \end_layout
  1845. \begin_layout Standard
  1846. \begin_inset Float figure
  1847. wide false
  1848. sideways false
  1849. status collapsed
  1850. \begin_layout Plain Layout
  1851. \align center
  1852. \begin_inset Float figure
  1853. wide false
  1854. sideways false
  1855. status open
  1856. \begin_layout Plain Layout
  1857. \align center
  1858. \begin_inset Graphics
  1859. filename graphics/CD4-csaw/RNA-seq/PCA-no-batchsub-CROP.png
  1860. lyxscale 25
  1861. width 75col%
  1862. groupId rna-pca-subfig
  1863. \end_inset
  1864. \end_layout
  1865. \begin_layout Plain Layout
  1866. \begin_inset Caption Standard
  1867. \begin_layout Plain Layout
  1868. \series bold
  1869. \begin_inset CommandInset label
  1870. LatexCommand label
  1871. name "fig:RNA-PCA-no-batchsub"
  1872. \end_inset
  1873. Before batch correction
  1874. \end_layout
  1875. \end_inset
  1876. \end_layout
  1877. \end_inset
  1878. \end_layout
  1879. \begin_layout Plain Layout
  1880. \align center
  1881. \begin_inset Float figure
  1882. wide false
  1883. sideways false
  1884. status open
  1885. \begin_layout Plain Layout
  1886. \align center
  1887. \begin_inset Graphics
  1888. filename graphics/CD4-csaw/RNA-seq/PCA-combat-batchsub-CROP.png
  1889. lyxscale 25
  1890. width 75col%
  1891. groupId rna-pca-subfig
  1892. \end_inset
  1893. \end_layout
  1894. \begin_layout Plain Layout
  1895. \begin_inset Caption Standard
  1896. \begin_layout Plain Layout
  1897. \series bold
  1898. \begin_inset CommandInset label
  1899. LatexCommand label
  1900. name "fig:RNA-PCA-ComBat-batchsub"
  1901. \end_inset
  1902. After batch correction with ComBat
  1903. \end_layout
  1904. \end_inset
  1905. \end_layout
  1906. \end_inset
  1907. \end_layout
  1908. \begin_layout Plain Layout
  1909. \begin_inset Caption Standard
  1910. \begin_layout Plain Layout
  1911. \series bold
  1912. \begin_inset CommandInset label
  1913. LatexCommand label
  1914. name "fig:RNA-PCA"
  1915. \end_inset
  1916. PCoA plots of RNA-seq data showing effect of batch correction.
  1917. \end_layout
  1918. \end_inset
  1919. \end_layout
  1920. \end_inset
  1921. \end_layout
  1922. \begin_layout Standard
  1923. Due to an error in sample preparation, the RNA from the samples for days
  1924. 0 and 5 was sequenced using a different kit than that for days 1 and
  1925. 14.
  1926. This induced a substantial batch effect in the data due to differences
  1927. in sequencing biases between the two kits, and this batch effect is unfortunate
  1928. ly confounded with the time point variable (Figure
  1929. \begin_inset CommandInset ref
  1930. LatexCommand ref
  1931. reference "fig:RNA-PCA-no-batchsub"
  1932. plural "false"
  1933. caps "false"
  1934. noprefix "false"
  1935. \end_inset
  1936. ).
  1937. To do the best possible analysis with this data, this batch effect was
  1938. subtracted out from the data using ComBat
  1939. \begin_inset CommandInset citation
  1940. LatexCommand cite
  1941. key "Johnson2007"
  1942. literal "false"
  1943. \end_inset
  1944. , ignoring the time point variable due to the confounding with the batch
  1945. variable.
  1946. The result is a marked improvement, but the unavoidable confounding with
  1947. time point means that certain real patterns of gene expression will be
  1948. indistinguishable from the batch effect and subtracted out as a result.
  1949. Specifically, any
  1950. \begin_inset Quotes eld
  1951. \end_inset
  1952. zig-zag
  1953. \begin_inset Quotes erd
  1954. \end_inset
  1955. pattern, such as a gene whose expression goes up on day 1, down on day
  1956. 5, and back up again on day 14, will be attenuated or eliminated entirely.
  1957. In the context of a T-cell activation time course, it is unlikely that
  1958. many genes of interest will follow such an expression pattern, so this
  1959. loss was deemed an acceptable cost for correcting the batch effect.
  1960. \end_layout
  1961. \begin_layout Standard
  1962. \begin_inset Float figure
  1963. wide false
  1964. sideways false
  1965. status collapsed
  1966. \begin_layout Plain Layout
  1967. \begin_inset Flex TODO Note (inline)
  1968. status open
  1969. \begin_layout Plain Layout
  1970. Just take the top row
  1971. \end_layout
  1972. \end_inset
  1973. \end_layout
  1974. \begin_layout Plain Layout
  1975. \align center
  1976. \begin_inset Graphics
  1977. filename graphics/CD4-csaw/RNA-seq/weights-vs-covars-CROP.png
  1978. lyxscale 25
  1979. width 100col%
  1980. groupId colwidth-raster
  1981. \end_inset
  1982. \end_layout
  1983. \begin_layout Plain Layout
  1984. \begin_inset Caption Standard
  1985. \begin_layout Plain Layout
  1986. \series bold
  1987. \begin_inset CommandInset label
  1988. LatexCommand label
  1989. name "fig:RNA-seq-weights-vs-covars"
  1990. \end_inset
  1991. RNA-seq sample weights, grouped by experimental and technical covariates.
  1992. \end_layout
  1993. \end_inset
  1994. \end_layout
  1995. \end_inset
  1996. \end_layout
  1997. \begin_layout Standard
  1998. However, removing the systematic component of the batch effect still leaves
  1999. the noise component.
  2000. The gene quantifications from the first batch are substantially noisier
  2001. than those in the second batch.
  2002. This analysis corrected for this by using
  2003. \begin_inset Flex Code
  2004. status open
  2005. \begin_layout Plain Layout
  2006. limma
  2007. \end_layout
  2008. \end_inset
  2009. 's sample weighting method to assign lower weights to the noisy samples
  2010. of batch 1
  2011. \begin_inset CommandInset citation
  2012. LatexCommand cite
  2013. key "Ritchie2006,Liu2015"
  2014. literal "false"
  2015. \end_inset
  2016. .
  2017. The resulting analysis gives an accurate assessment of statistical significance
  2018. for all comparisons, which unfortunately means a loss of statistical power
  2019. for comparisons involving samples in batch 1.
  2020. \end_layout
  2021. \begin_layout Standard
  2022. In any case, the
  2023. \begin_inset Flex Glossary Term
  2024. status open
  2025. \begin_layout Plain Layout
  2026. RNA-seq
  2027. \end_layout
  2028. \end_inset
  2029. counts were first normalized using trimmed mean of M-values
  2030. \begin_inset CommandInset citation
  2031. LatexCommand cite
  2032. key "Robinson2010"
  2033. literal "false"
  2034. \end_inset
  2035. , converted to normalized logCPM with quality weights using
  2036. \begin_inset Flex Code
  2037. status open
  2038. \begin_layout Plain Layout
  2039. voomWithQualityWeights
  2040. \end_layout
  2041. \end_inset
  2042. \begin_inset CommandInset citation
  2043. LatexCommand cite
  2044. key "Law2013,Liu2015"
  2045. literal "false"
  2046. \end_inset
  2047. , and batch-corrected at this point using ComBat.
  2048. A linear model was fit to the batch-corrected, quality-weighted data for
  2049. each gene using
  2050. \begin_inset Flex Code
  2051. status open
  2052. \begin_layout Plain Layout
  2053. limma
  2054. \end_layout
  2055. \end_inset
  2056. , and each gene was tested for differential expression using
  2057. \begin_inset Flex Code
  2058. status open
  2059. \begin_layout Plain Layout
  2060. limma
  2061. \end_layout
  2062. \end_inset
  2063. 's empirical Bayes moderated
  2064. \begin_inset Formula $t$
  2065. \end_inset
  2066. -test
  2067. \begin_inset CommandInset citation
  2068. LatexCommand cite
  2069. key "Smyth2005,Law2013,Phipson2013"
  2070. literal "false"
  2071. \end_inset
  2072. .
  2073. \end_layout
  2074. \begin_layout Subsection
  2075. ChIP-seq differential modification analysis
  2076. \end_layout
  2077. \begin_layout Standard
  2078. \begin_inset Float figure
  2079. wide false
  2080. sideways false
  2081. status collapsed
  2082. \begin_layout Plain Layout
  2083. \align center
  2084. \begin_inset Float figure
  2085. wide false
  2086. sideways false
  2087. status open
  2088. \begin_layout Plain Layout
  2089. \align center
  2090. \begin_inset Graphics
  2091. filename graphics/CD4-csaw/csaw/CCF-plots-noBL-PAGE2-CROP.pdf
  2092. lyxscale 50
  2093. height 40theight%
  2094. groupId ccf-subfig
  2095. \end_inset
  2096. \end_layout
  2097. \begin_layout Plain Layout
  2098. \begin_inset Caption Standard
  2099. \begin_layout Plain Layout
  2100. \series bold
  2101. \begin_inset CommandInset label
  2102. LatexCommand label
  2103. name "fig:CCF-without-blacklist"
  2104. \end_inset
  2105. Cross-correlation plots without removing blacklisted reads.
  2106. \series default
  2107. Without blacklisting, many artifactual peaks are visible in the cross-correlatio
  2108. ns of the ChIP-seq samples, and the peak at the true fragment size (147
  2109. \begin_inset space ~
  2110. \end_inset
  2111. bp) is frequently overshadowed by the artifactual peak at the read length
  2112. (100
  2113. \begin_inset space ~
  2114. \end_inset
  2115. bp).
  2116. \end_layout
  2117. \end_inset
  2118. \end_layout
  2119. \end_inset
  2120. \end_layout
  2121. \begin_layout Plain Layout
  2122. \align center
  2123. \begin_inset Float figure
  2124. wide false
  2125. sideways false
  2126. status open
  2127. \begin_layout Plain Layout
  2128. \align center
  2129. \begin_inset Graphics
  2130. filename graphics/CD4-csaw/csaw/CCF-plots-PAGE2-CROP.pdf
  2131. lyxscale 50
  2132. height 40theight%
  2133. groupId ccf-subfig
  2134. \end_inset
  2135. \end_layout
  2136. \begin_layout Plain Layout
  2137. \begin_inset Caption Standard
  2138. \begin_layout Plain Layout
  2139. \series bold
  2140. \begin_inset CommandInset label
  2141. LatexCommand label
  2142. name "fig:CCF-with-blacklist"
  2143. \end_inset
  2144. Cross-correlation plots with blacklisted reads removed.
  2145. \series default
  2146. After blacklisting, most ChIP-seq samples have clean-looking periodic cross-cor
  2147. relation plots, with the largest peak around 147
  2148. \begin_inset space ~
  2149. \end_inset
  2150. bp, the expected size for a fragment of DNA from a single nucleosome, and
  2151. little to no peak at the read length, 100
  2152. \begin_inset space ~
  2153. \end_inset
  2154. bp.
  2155. \end_layout
  2156. \end_inset
  2157. \end_layout
  2158. \end_inset
  2159. \end_layout
  2160. \begin_layout Plain Layout
  2161. \begin_inset Caption Standard
  2162. \begin_layout Plain Layout
  2163. \series bold
  2164. \begin_inset CommandInset label
  2165. LatexCommand label
  2166. name "fig:CCF-master"
  2167. \end_inset
  2168. Strand cross-correlation plots for ChIP-seq data, before and after blacklisting.
  2169. \end_layout
  2170. \end_inset
  2171. \end_layout
  2172. \end_inset
  2173. \end_layout
  2174. \begin_layout Standard
  2175. \begin_inset Note Note
  2176. status open
  2177. \begin_layout Plain Layout
  2178. \begin_inset Float figure
  2179. wide false
  2180. sideways false
  2181. status collapsed
  2182. \begin_layout Plain Layout
  2183. \align center
  2184. \begin_inset Graphics
  2185. filename graphics/CD4-csaw/ChIP-seq/H3K4me2-sample-MAplot-bins-CROP.png
  2186. lyxscale 25
  2187. width 100col%
  2188. groupId colwidth-raster
  2189. \end_inset
  2190. \end_layout
  2191. \begin_layout Plain Layout
  2192. \begin_inset Caption Standard
  2193. \begin_layout Plain Layout
  2194. \series bold
  2195. \begin_inset CommandInset label
  2196. LatexCommand label
  2197. name "fig:MA-plot-bigbins"
  2198. \end_inset
  2199. MA plot of H3K4me2 read counts in 10kb bins for two arbitrary samples.
  2200. \end_layout
  2201. \end_inset
  2202. \end_layout
  2203. \end_inset
  2204. \end_layout
  2205. \end_inset
  2206. \end_layout
  2207. \begin_layout Standard
  2208. \begin_inset Flex TODO Note (inline)
  2209. status open
  2210. \begin_layout Plain Layout
  2211. Be consistent about use of
  2212. \begin_inset Quotes eld
  2213. \end_inset
  2214. differential binding
  2215. \begin_inset Quotes erd
  2216. \end_inset
  2217. vs
  2218. \begin_inset Quotes eld
  2219. \end_inset
  2220. differential modification
  2221. \begin_inset Quotes erd
  2222. \end_inset
  2223. throughout this chapter.
  2224. The latter is usually preferred.
  2225. \end_layout
  2226. \end_inset
  2227. \end_layout
  2228. \begin_layout Standard
  2229. Sequence reads were retrieved from SRA
  2230. \begin_inset CommandInset citation
  2231. LatexCommand cite
  2232. key "Leinonen2011"
  2233. literal "false"
  2234. \end_inset
  2235. .
  2236. ChIP-seq (and input) reads were aligned to the GRCh38 genome assembly using
  2237. Bowtie 2
  2238. \begin_inset CommandInset citation
  2239. LatexCommand cite
  2240. key "Langmead2012,Schneider2017,gh-hg38-ref"
  2241. literal "false"
  2242. \end_inset
  2243. .
  2244. Artifact regions were annotated using a custom implementation of the
  2245. \begin_inset Flex Code
  2246. status open
  2247. \begin_layout Plain Layout
  2248. GreyListChIP
  2249. \end_layout
  2250. \end_inset
  2251. algorithm, and these
  2252. \begin_inset Quotes eld
  2253. \end_inset
  2254. greylists
  2255. \begin_inset Quotes erd
  2256. \end_inset
  2257. were merged with the published ENCODE blacklists
  2258. \begin_inset CommandInset citation
  2259. LatexCommand cite
  2260. key "greylistchip,Amemiya2019,Dunham2012,gh-cd4-csaw"
  2261. literal "false"
  2262. \end_inset
  2263. .
  2264. Any read or called peak overlapping one of these regions was regarded as
  2265. artifactual and excluded from downstream analyses.
  2266. Figure
  2267. \begin_inset CommandInset ref
  2268. LatexCommand ref
  2269. reference "fig:CCF-master"
  2270. plural "false"
  2271. caps "false"
  2272. noprefix "false"
  2273. \end_inset
  2274. shows the improvement after blacklisting in the strand cross-correlation
  2275. plots, a common quality control plot for ChIP-seq data.
  2276. Peaks were called using epic, an implementation of the SICER algorithm
  2277. \begin_inset CommandInset citation
  2278. LatexCommand cite
  2279. key "Zang2009,gh-epic"
  2280. literal "false"
  2281. \end_inset
  2282. .
  2283. Peaks were also called separately using MACS, but MACS was determined to
  2284. be a poor fit for the data, and these peak calls are not used in any further
  2285. analyses
  2286. \begin_inset CommandInset citation
  2287. LatexCommand cite
  2288. key "Zhang2008"
  2289. literal "false"
  2290. \end_inset
  2291. .
  2292. Consensus peaks were determined by applying the irreproducible discovery
  2293. rate (IDR) framework
  2294. \begin_inset CommandInset citation
  2295. LatexCommand cite
  2296. key "Li2006,gh-idr"
  2297. literal "false"
  2298. \end_inset
  2299. to find peaks consistently called in the same locations across all 4 donors.
  2300. \end_layout
  2301. \begin_layout Standard
  2302. Promoters were defined by computing the distance from each annotated TSS
  2303. to the nearest called peak and examining the distribution of distances,
  2304. observing that peaks for each histone mark were enriched within a certain
  2305. distance of the TSS.
  2306. For H3K4me2 and H3K4me3, this distance was about 1
  2307. \begin_inset space ~
  2308. \end_inset
  2309. kb, while for H3K27me3 it was 2.5
  2310. \begin_inset space ~
  2311. \end_inset
  2312. kb.
  2313. These distances were used as an
  2314. \begin_inset Quotes eld
  2315. \end_inset
  2316. effective promoter radius
  2317. \begin_inset Quotes erd
  2318. \end_inset
  2319. for each mark.
  2320. The promoter region for each gene was defined as the region of the genome
  2321. within this distance upstream or downstream of the gene's annotated TSS.
  2322. For genes with multiple annotated TSSs, a promoter region was defined for
  2323. each TSS individually, and any promoters that overlapped (due to multiple
  2324. TSSs being closer than 2 times the radius) were merged into one large promoter.
  2325. Thus, some genes had multiple promoters defined, which were each analyzed
  2326. separately for differential modification.
  2327. \end_layout
  2328. \begin_layout Standard
  2329. \begin_inset Float figure
  2330. wide false
  2331. sideways false
  2332. status collapsed
  2333. \begin_layout Plain Layout
  2334. \begin_inset Float figure
  2335. wide false
  2336. sideways false
  2337. status collapsed
  2338. \begin_layout Plain Layout
  2339. \align center
  2340. \begin_inset Graphics
  2341. filename graphics/CD4-csaw/ChIP-seq/H3K4me2-PCA-raw-CROP.png
  2342. lyxscale 25
  2343. width 45col%
  2344. groupId pcoa-subfig
  2345. \end_inset
  2346. \end_layout
  2347. \begin_layout Plain Layout
  2348. \begin_inset Caption Standard
  2349. \begin_layout Plain Layout
  2350. \series bold
  2351. \begin_inset CommandInset label
  2352. LatexCommand label
  2353. name "fig:PCoA-H3K4me2-bad"
  2354. \end_inset
  2355. H3K4me2, no correction
  2356. \end_layout
  2357. \end_inset
  2358. \end_layout
  2359. \end_inset
  2360. \begin_inset space \hfill{}
  2361. \end_inset
  2362. \begin_inset Float figure
  2363. wide false
  2364. sideways false
  2365. status collapsed
  2366. \begin_layout Plain Layout
  2367. \align center
  2368. \begin_inset Graphics
  2369. filename graphics/CD4-csaw/ChIP-seq/H3K4me2-PCA-SVsub-CROP.png
  2370. lyxscale 25
  2371. width 45col%
  2372. groupId pcoa-subfig
  2373. \end_inset
  2374. \end_layout
  2375. \begin_layout Plain Layout
  2376. \begin_inset Caption Standard
  2377. \begin_layout Plain Layout
  2378. \series bold
  2379. \begin_inset CommandInset label
  2380. LatexCommand label
  2381. name "fig:PCoA-H3K4me2-good"
  2382. \end_inset
  2383. H3K4me2, SVs subtracted
  2384. \end_layout
  2385. \end_inset
  2386. \end_layout
  2387. \end_inset
  2388. \end_layout
  2389. \begin_layout Plain Layout
  2390. \begin_inset Float figure
  2391. wide false
  2392. sideways false
  2393. status collapsed
  2394. \begin_layout Plain Layout
  2395. \align center
  2396. \begin_inset Graphics
  2397. filename graphics/CD4-csaw/ChIP-seq/H3K4me3-PCA-raw-CROP.png
  2398. lyxscale 25
  2399. width 45col%
  2400. groupId pcoa-subfig
  2401. \end_inset
  2402. \end_layout
  2403. \begin_layout Plain Layout
  2404. \begin_inset Caption Standard
  2405. \begin_layout Plain Layout
  2406. \series bold
  2407. \begin_inset CommandInset label
  2408. LatexCommand label
  2409. name "fig:PCoA-H3K4me3-bad"
  2410. \end_inset
  2411. H3K4me3, no correction
  2412. \end_layout
  2413. \end_inset
  2414. \end_layout
  2415. \end_inset
  2416. \begin_inset space \hfill{}
  2417. \end_inset
  2418. \begin_inset Float figure
  2419. wide false
  2420. sideways false
  2421. status collapsed
  2422. \begin_layout Plain Layout
  2423. \align center
  2424. \begin_inset Graphics
  2425. filename graphics/CD4-csaw/ChIP-seq/H3K4me3-PCA-SVsub-CROP.png
  2426. lyxscale 25
  2427. width 45col%
  2428. groupId pcoa-subfig
  2429. \end_inset
  2430. \end_layout
  2431. \begin_layout Plain Layout
  2432. \begin_inset Caption Standard
  2433. \begin_layout Plain Layout
  2434. \series bold
  2435. \begin_inset CommandInset label
  2436. LatexCommand label
  2437. name "fig:PCoA-H3K4me3-good"
  2438. \end_inset
  2439. H3K4me3, SVs subtracted
  2440. \end_layout
  2441. \end_inset
  2442. \end_layout
  2443. \end_inset
  2444. \end_layout
  2445. \begin_layout Plain Layout
  2446. \begin_inset Float figure
  2447. wide false
  2448. sideways false
  2449. status collapsed
  2450. \begin_layout Plain Layout
  2451. \align center
  2452. \begin_inset Graphics
  2453. filename graphics/CD4-csaw/ChIP-seq/H3K27me3-PCA-raw-CROP.png
  2454. lyxscale 25
  2455. width 45col%
  2456. groupId pcoa-subfig
  2457. \end_inset
  2458. \end_layout
  2459. \begin_layout Plain Layout
  2460. \begin_inset Caption Standard
  2461. \begin_layout Plain Layout
  2462. \series bold
  2463. \begin_inset CommandInset label
  2464. LatexCommand label
  2465. name "fig:PCoA-H3K27me3-bad"
  2466. \end_inset
  2467. H3K27me3, no correction
  2468. \end_layout
  2469. \end_inset
  2470. \end_layout
  2471. \end_inset
  2472. \begin_inset space \hfill{}
  2473. \end_inset
  2474. \begin_inset Float figure
  2475. wide false
  2476. sideways false
  2477. status collapsed
  2478. \begin_layout Plain Layout
  2479. \align center
  2480. \begin_inset Graphics
  2481. filename graphics/CD4-csaw/ChIP-seq/H3K27me3-PCA-SVsub-CROP.png
  2482. lyxscale 25
  2483. width 45col%
  2484. groupId pcoa-subfig
  2485. \end_inset
  2486. \end_layout
  2487. \begin_layout Plain Layout
  2488. \begin_inset Caption Standard
  2489. \begin_layout Plain Layout
  2490. \series bold
  2491. \begin_inset CommandInset label
  2492. LatexCommand label
  2493. name "fig:PCoA-H3K27me3-good"
  2494. \end_inset
  2495. H3K27me3, SVs subtracted
  2496. \end_layout
  2497. \end_inset
  2498. \end_layout
  2499. \end_inset
  2500. \end_layout
  2501. \begin_layout Plain Layout
  2502. \begin_inset Caption Standard
  2503. \begin_layout Plain Layout
  2504. \series bold
  2505. \begin_inset CommandInset label
  2506. LatexCommand label
  2507. name "fig:PCoA-ChIP"
  2508. \end_inset
  2509. PCoA plots of ChIP-seq sliding window data, before and after subtracting
  2510. surrogate variables (SVs).
  2511. \end_layout
  2512. \end_inset
  2513. \end_layout
  2514. \end_inset
  2515. \end_layout
  2516. \begin_layout Standard
  2517. Reads in promoters, peaks, and sliding windows across the genome were counted
  2518. and normalized using
  2519. \begin_inset Flex Code
  2520. status open
  2521. \begin_layout Plain Layout
  2522. csaw
  2523. \end_layout
  2524. \end_inset
  2525. and analyzed for differential modification using
  2526. \begin_inset Flex Code
  2527. status open
  2528. \begin_layout Plain Layout
  2529. edgeR
  2530. \end_layout
  2531. \end_inset
  2532. \begin_inset CommandInset citation
  2533. LatexCommand cite
  2534. key "Lun2014,Lun2015a,Lund2012,Phipson2016"
  2535. literal "false"
  2536. \end_inset
  2537. .
  2538. Unobserved confounding factors in the ChIP-seq data were corrected using
  2539. SVA
  2540. \begin_inset CommandInset citation
  2541. LatexCommand cite
  2542. key "Leek2007,Leek2014"
  2543. literal "false"
  2544. \end_inset
  2545. .
  2546. Principal coordinate plots of the promoter count data for each histone
  2547. mark before and after subtracting surrogate variable effects are shown
  2548. in Figure
  2549. \begin_inset CommandInset ref
  2550. LatexCommand ref
  2551. reference "fig:PCoA-ChIP"
  2552. plural "false"
  2553. caps "false"
  2554. noprefix "false"
  2555. \end_inset
  2556. .
  2557. \end_layout
  2558. \begin_layout Standard
  2559. To investigate whether the location of a peak within the promoter region
  2560. was important,
  2561. \begin_inset Quotes eld
  2562. \end_inset
  2563. relative coverage profiles
  2564. \begin_inset Quotes erd
  2565. \end_inset
  2566. were generated.
  2567. First, 500-bp sliding windows were tiled around each annotated TSS: one
  2568. window centered on the TSS itself, and 10 windows each upstream and downstream,
  2569. thus covering a 10.5-kb region centered on the TSS with 21 windows.
  2570. Reads in each window for each TSS were counted in each sample, and the
  2571. counts were normalized and converted to log CPM as in the differential
  2572. modification analysis.
  2573. Then, the log CPM values within each promoter were normalized to an average
  2574. of zero, such that each window's normalized abundance now represents the
  2575. relative read depth of that window compared to all other windows in the
  2576. same promoter.
  2577. The normalized abundance values for each window in a promoter are collectively
  2578. referred to as that promoter's
  2579. \begin_inset Quotes eld
  2580. \end_inset
  2581. relative coverage profile
  2582. \begin_inset Quotes erd
  2583. \end_inset
  2584. .
  2585. \end_layout
  2586. \begin_layout Subsection
  2587. MOFA recovers biologically relevant variation from blind analysis by correlating
  2588. across datasets
  2589. \end_layout
  2590. \begin_layout Standard
  2591. \begin_inset ERT
  2592. status open
  2593. \begin_layout Plain Layout
  2594. \backslash
  2595. afterpage{
  2596. \end_layout
  2597. \begin_layout Plain Layout
  2598. \backslash
  2599. begin{landscape}
  2600. \end_layout
  2601. \end_inset
  2602. \end_layout
  2603. \begin_layout Standard
  2604. \begin_inset Float figure
  2605. wide false
  2606. sideways false
  2607. status open
  2608. \begin_layout Plain Layout
  2609. \begin_inset Float figure
  2610. wide false
  2611. sideways false
  2612. status open
  2613. \begin_layout Plain Layout
  2614. \align center
  2615. \begin_inset Graphics
  2616. filename graphics/CD4-csaw/MOFA-varExplaiend-matrix-CROP.png
  2617. lyxscale 25
  2618. width 45col%
  2619. groupId mofa-subfig
  2620. \end_inset
  2621. \end_layout
  2622. \begin_layout Plain Layout
  2623. \begin_inset Caption Standard
  2624. \begin_layout Plain Layout
  2625. \series bold
  2626. \begin_inset CommandInset label
  2627. LatexCommand label
  2628. name "fig:mofa-varexplained"
  2629. \end_inset
  2630. Variance explained in each data set by each latent factor estimated by MOFA.
  2631. \series default
  2632. For each latent factor (LF) learned by MOFA, the variance explained by
  2633. that factor in each data set (
  2634. \begin_inset Quotes eld
  2635. \end_inset
  2636. view
  2637. \begin_inset Quotes erd
  2638. \end_inset
  2639. ) is shown by the shading of the cells in the lower section.
  2640. The upper section shows the total fraction of each data set's variance
  2641. that is explained by all LFs combined.
  2642. \end_layout
  2643. \end_inset
  2644. \end_layout
  2645. \end_inset
  2646. \begin_inset space \hfill{}
  2647. \end_inset
  2648. \begin_inset Float figure
  2649. wide false
  2650. sideways false
  2651. status open
  2652. \begin_layout Plain Layout
  2653. \align center
  2654. \begin_inset Graphics
  2655. filename graphics/CD4-csaw/MOFA-LF-scatter-CROP.png
  2656. lyxscale 25
  2657. width 45col%
  2658. groupId mofa-subfig
  2659. \end_inset
  2660. \end_layout
  2661. \begin_layout Plain Layout
  2662. \begin_inset Caption Standard
  2663. \begin_layout Plain Layout
  2664. \series bold
  2665. \begin_inset CommandInset label
  2666. LatexCommand label
  2667. name "fig:mofa-lf-scatter"
  2668. \end_inset
  2669. Scatter plots of specific pairs of MOFA latent factors.
  2670. \series default
  2671. LFs 1, 4, and 5 explain substantial variation in all data sets, so they
  2672. are plotted against each other in order to reveal patterns of variation
  2673. that are shared across all data sets.
  2674. \end_layout
  2675. \end_inset
  2676. \end_layout
  2677. \end_inset
  2678. \end_layout
  2679. \begin_layout Plain Layout
  2680. \begin_inset Caption Standard
  2681. \begin_layout Plain Layout
  2682. \series bold
  2683. \begin_inset CommandInset label
  2684. LatexCommand label
  2685. name "fig:MOFA-master"
  2686. \end_inset
  2687. MOFA latent factors separate technical confounders from biologically relevant variation.
  2688. \end_layout
  2689. \end_inset
  2690. \end_layout
  2691. \end_inset
  2692. \end_layout
  2693. \begin_layout Standard
  2694. \begin_inset ERT
  2695. status open
  2696. \begin_layout Plain Layout
  2697. \backslash
  2698. end{landscape}
  2699. \end_layout
  2700. \begin_layout Plain Layout
  2701. }
  2702. \end_layout
  2703. \end_inset
  2704. \end_layout
  2705. \begin_layout Standard
  2706. MOFA was run on all the ChIP-seq windows overlapping consensus peaks for
  2707. each histone mark, as well as the
  2708. \begin_inset Flex Glossary Term
  2709. status open
  2710. \begin_layout Plain Layout
  2711. RNA-seq
  2712. \end_layout
  2713. \end_inset
  2714. data, in order to identify patterns of coordinated variation across all
  2715. data sets
  2716. \begin_inset CommandInset citation
  2717. LatexCommand cite
  2718. key "Argelaguet2018"
  2719. literal "false"
  2720. \end_inset
  2721. .
  2722. The results are summarized in Figure
  2723. \begin_inset CommandInset ref
  2724. LatexCommand ref
  2725. reference "fig:MOFA-master"
  2726. plural "false"
  2727. caps "false"
  2728. noprefix "false"
  2729. \end_inset
  2730. .
  2731. Latent factors 1, 4, and 5 were determined to explain the most variation
  2732. consistently across all data sets (Figure
  2733. \begin_inset CommandInset ref
  2734. LatexCommand ref
  2735. reference "fig:mofa-varexplained"
  2736. plural "false"
  2737. caps "false"
  2738. noprefix "false"
  2739. \end_inset
  2740. ), and scatter plots of these factors show that they also correlate best
  2741. with the experimental factors (Figure
  2742. \begin_inset CommandInset ref
  2743. LatexCommand ref
  2744. reference "fig:mofa-lf-scatter"
  2745. plural "false"
  2746. caps "false"
  2747. noprefix "false"
  2748. \end_inset
  2749. ).
  2750. Latent factor 2 captures the batch effect in the
  2751. \begin_inset Flex Glossary Term
  2752. status open
  2753. \begin_layout Plain Layout
  2754. RNA-seq
  2755. \end_layout
  2756. \end_inset
  2757. data.
  2758. Removing the effect of LF2 using MOFA theoretically yields a batch correction
  2759. that does not depend on knowing the experimental factors.
  2760. When this was attempted, the resulting batch correction was comparable
  2761. to ComBat (see Figure
  2762. \begin_inset CommandInset ref
  2763. LatexCommand ref
  2764. reference "fig:RNA-PCA-ComBat-batchsub"
  2765. plural "false"
  2766. caps "false"
  2767. noprefix "false"
  2768. \end_inset
  2769. ), indicating that the ComBat-based batch correction has little room for
  2770. improvement given the problems with the data set.
  2771. \end_layout
  2772. \begin_layout Standard
  2773. \begin_inset Note Note
  2774. status collapsed
  2775. \begin_layout Plain Layout
  2776. \begin_inset Float figure
  2777. wide false
  2778. sideways false
  2779. status open
  2780. \begin_layout Plain Layout
  2781. \align center
  2782. \begin_inset Graphics
  2783. filename graphics/CD4-csaw/MOFA-batch-correct-CROP.png
  2784. lyxscale 25
  2785. width 100col%
  2786. groupId colwidth-raster
  2787. \end_inset
  2788. \end_layout
  2789. \begin_layout Plain Layout
  2790. \begin_inset Caption Standard
  2791. \begin_layout Plain Layout
  2792. \series bold
  2793. \begin_inset CommandInset label
  2794. LatexCommand label
  2795. name "fig:mofa-batchsub"
  2796. \end_inset
  2797. Result of RNA-seq batch-correction using MOFA latent factors
  2798. \end_layout
  2799. \end_inset
  2800. \end_layout
  2801. \end_inset
  2802. \end_layout
  2803. \end_inset
  2804. \end_layout
  2805. \begin_layout Section
  2806. Results
  2807. \end_layout
  2808. \begin_layout Standard
  2809. \begin_inset Flex TODO Note (inline)
  2810. status open
  2811. \begin_layout Plain Layout
  2812. Focus on what hypotheses were tested, then select figures that show how
  2813. those hypotheses were tested, even if the result is a negative.
  2814. Not every interesting result needs to be in here.
  2815. Chapter should tell a story.
  2816. \end_layout
  2817. \end_inset
  2818. \end_layout
  2819. \begin_layout Standard
  2820. \begin_inset Flex TODO Note (inline)
  2821. status open
  2822. \begin_layout Plain Layout
  2823. Maybe reorder these sections to do RNA-seq, then ChIP-seq, then combined
  2824. analyses?
  2825. \end_layout
  2826. \end_inset
  2827. \end_layout
  2828. \begin_layout Subsection
  2829. Interpretation of RNA-seq analysis is limited by a major confounding factor
  2830. \end_layout
  2831. \begin_layout Standard
  2832. \begin_inset Float table
  2833. wide false
  2834. sideways false
  2835. status collapsed
  2836. \begin_layout Plain Layout
  2837. \align center
  2838. \begin_inset Tabular
  2839. <lyxtabular version="3" rows="11" columns="3">
  2840. <features tabularvalignment="middle">
  2841. <column alignment="center" valignment="top">
  2842. <column alignment="center" valignment="top">
  2843. <column alignment="center" valignment="top">
  2844. <row>
  2845. <cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" usebox="none">
  2846. \begin_inset Text
  2847. \begin_layout Plain Layout
  2848. Test
  2849. \end_layout
  2850. \end_inset
  2851. </cell>
  2852. <cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" usebox="none">
  2853. \begin_inset Text
  2854. \begin_layout Plain Layout
  2855. Est.
  2856. non-null
  2857. \end_layout
  2858. \end_inset
  2859. </cell>
  2860. <cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" rightline="true" usebox="none">
  2861. \begin_inset Text
  2862. \begin_layout Plain Layout
  2863. \begin_inset Formula $\mathrm{FDR}\le10\%$
  2864. \end_inset
  2865. \end_layout
  2866. \end_inset
  2867. </cell>
  2868. </row>
  2869. <row>
  2870. <cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
  2871. \begin_inset Text
  2872. \begin_layout Plain Layout
  2873. Naïve Day 0 vs Day 1
  2874. \end_layout
  2875. \end_inset
  2876. </cell>
  2877. <cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
  2878. \begin_inset Text
  2879. \begin_layout Plain Layout
  2880. 5992
  2881. \end_layout
  2882. \end_inset
  2883. </cell>
  2884. <cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
  2885. \begin_inset Text
  2886. \begin_layout Plain Layout
  2887. 1613
  2888. \end_layout
  2889. \end_inset
  2890. </cell>
  2891. </row>
  2892. <row>
  2893. <cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
  2894. \begin_inset Text
  2895. \begin_layout Plain Layout
  2896. Naïve Day 0 vs Day 5
  2897. \end_layout
  2898. \end_inset
  2899. </cell>
  2900. <cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
  2901. \begin_inset Text
  2902. \begin_layout Plain Layout
  2903. 3038
  2904. \end_layout
  2905. \end_inset
  2906. </cell>
  2907. <cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
  2908. \begin_inset Text
  2909. \begin_layout Plain Layout
  2910. 32
  2911. \end_layout
  2912. \end_inset
  2913. </cell>
  2914. </row>
  2915. <row>
  2916. <cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
  2917. \begin_inset Text
  2918. \begin_layout Plain Layout
  2919. Naïve Day 0 vs Day 14
  2920. \end_layout
  2921. \end_inset
  2922. </cell>
  2923. <cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
  2924. \begin_inset Text
  2925. \begin_layout Plain Layout
  2926. 1870
  2927. \end_layout
  2928. \end_inset
  2929. </cell>
  2930. <cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
  2931. \begin_inset Text
  2932. \begin_layout Plain Layout
  2933. 190
  2934. \end_layout
  2935. \end_inset
  2936. </cell>
  2937. </row>
  2938. <row>
  2939. <cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
  2940. \begin_inset Text
  2941. \begin_layout Plain Layout
  2942. Memory Day 0 vs Day 1
  2943. \end_layout
  2944. \end_inset
  2945. </cell>
  2946. <cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
  2947. \begin_inset Text
  2948. \begin_layout Plain Layout
  2949. 3195
  2950. \end_layout
  2951. \end_inset
  2952. </cell>
  2953. <cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
  2954. \begin_inset Text
  2955. \begin_layout Plain Layout
  2956. 411
  2957. \end_layout
  2958. \end_inset
  2959. </cell>
  2960. </row>
  2961. <row>
  2962. <cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
  2963. \begin_inset Text
  2964. \begin_layout Plain Layout
  2965. Memory Day 0 vs Day 5
  2966. \end_layout
  2967. \end_inset
  2968. </cell>
  2969. <cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
  2970. \begin_inset Text
  2971. \begin_layout Plain Layout
  2972. 2688
  2973. \end_layout
  2974. \end_inset
  2975. </cell>
  2976. <cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
  2977. \begin_inset Text
  2978. \begin_layout Plain Layout
  2979. 18
  2980. \end_layout
  2981. \end_inset
  2982. </cell>
  2983. </row>
  2984. <row>
  2985. <cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
  2986. \begin_inset Text
  2987. \begin_layout Plain Layout
  2988. Memory Day 0 vs Day 14
  2989. \end_layout
  2990. \end_inset
  2991. </cell>
  2992. <cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
  2993. \begin_inset Text
  2994. \begin_layout Plain Layout
  2995. 1911
  2996. \end_layout
  2997. \end_inset
  2998. </cell>
  2999. <cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
  3000. \begin_inset Text
  3001. \begin_layout Plain Layout
  3002. 227
  3003. \end_layout
  3004. \end_inset
  3005. </cell>
  3006. </row>
  3007. <row>
  3008. <cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
  3009. \begin_inset Text
  3010. \begin_layout Plain Layout
  3011. Day 0 Naïve vs Memory
  3012. \end_layout
  3013. \end_inset
  3014. </cell>
  3015. <cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
  3016. \begin_inset Text
  3017. \begin_layout Plain Layout
  3018. 0
  3019. \end_layout
  3020. \end_inset
  3021. </cell>
  3022. <cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
  3023. \begin_inset Text
  3024. \begin_layout Plain Layout
  3025. 2
  3026. \end_layout
  3027. \end_inset
  3028. </cell>
  3029. </row>
  3030. <row>
  3031. <cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
  3032. \begin_inset Text
  3033. \begin_layout Plain Layout
  3034. Day 1 Naïve vs Memory
  3035. \end_layout
  3036. \end_inset
  3037. </cell>
  3038. <cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
  3039. \begin_inset Text
  3040. \begin_layout Plain Layout
  3041. 9167
  3042. \end_layout
  3043. \end_inset
  3044. </cell>
  3045. <cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
  3046. \begin_inset Text
  3047. \begin_layout Plain Layout
  3048. 5532
  3049. \end_layout
  3050. \end_inset
  3051. </cell>
  3052. </row>
  3053. <row>
  3054. <cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
  3055. \begin_inset Text
  3056. \begin_layout Plain Layout
  3057. Day 5 Naïve vs Memory
  3058. \end_layout
  3059. \end_inset
  3060. </cell>
  3061. <cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
  3062. \begin_inset Text
  3063. \begin_layout Plain Layout
  3064. 0
  3065. \end_layout
  3066. \end_inset
  3067. </cell>
  3068. <cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
  3069. \begin_inset Text
  3070. \begin_layout Plain Layout
  3071. 0
  3072. \end_layout
  3073. \end_inset
  3074. </cell>
  3075. </row>
  3076. <row>
  3077. <cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" usebox="none">
  3078. \begin_inset Text
  3079. \begin_layout Plain Layout
  3080. Day 14 Naïve vs Memory
  3081. \end_layout
  3082. \end_inset
  3083. </cell>
  3084. <cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" usebox="none">
  3085. \begin_inset Text
  3086. \begin_layout Plain Layout
  3087. 6446
  3088. \end_layout
  3089. \end_inset
  3090. </cell>
  3091. <cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" rightline="true" usebox="none">
  3092. \begin_inset Text
  3093. \begin_layout Plain Layout
  3094. 2319
  3095. \end_layout
  3096. \end_inset
  3097. </cell>
  3098. </row>
  3099. </lyxtabular>
  3100. \end_inset
  3101. \end_layout
  3102. \begin_layout Plain Layout
  3103. \begin_inset Caption Standard
  3104. \begin_layout Plain Layout
  3105. \series bold
  3106. \begin_inset CommandInset label
  3107. LatexCommand label
  3108. name "tab:Estimated-and-detected-rnaseq"
  3109. \end_inset
  3110. Estimated and detected differentially expressed genes.
  3111. \series default
  3112. \begin_inset Quotes eld
  3113. \end_inset
  3114. Test
  3115. \begin_inset Quotes erd
  3116. \end_inset
  3117. : Which sample groups were compared;
  3118. \begin_inset Quotes eld
  3119. \end_inset
  3120. Est. non-null
  3121. \begin_inset Quotes erd
  3122. \end_inset
  3123. : Estimated number of differentially expressed genes, using the method of
  3124. averaging local FDR values
  3125. \begin_inset CommandInset citation
  3126. LatexCommand cite
  3127. key "Phipson2013Thesis"
  3128. literal "false"
  3129. \end_inset
  3130. ;
  3131. \begin_inset Quotes eld
  3132. \end_inset
  3133. \begin_inset Formula $\mathrm{FDR}\le10\%$
  3134. \end_inset
  3135. \begin_inset Quotes erd
  3136. \end_inset
  3137. : Number of significantly differentially expressed genes at an FDR threshold
  3138. of 10%.
  3139. The total number of genes tested was 16707.
  3140. \end_layout
  3141. \end_inset
  3142. \end_layout
  3143. \end_inset
  3144. \end_layout
  3145. \begin_layout Standard
  3146. \begin_inset Float figure
  3147. wide false
  3148. sideways false
  3149. status collapsed
  3150. \begin_layout Plain Layout
  3151. \align center
  3152. \begin_inset Graphics
  3153. filename graphics/CD4-csaw/RNA-seq/PCA-final-12-CROP.png
  3154. lyxscale 25
  3155. width 100col%
  3156. groupId colwidth-raster
  3157. \end_inset
  3158. \end_layout
  3159. \begin_layout Plain Layout
  3160. \begin_inset Caption Standard
  3161. \begin_layout Plain Layout
  3162. \series bold
  3163. \begin_inset CommandInset label
  3164. LatexCommand label
  3165. name "fig:rna-pca-final"
  3166. \end_inset
  3167. PCoA plot of RNA-seq samples after ComBat batch correction.
  3168. \series default
  3169. Each point represents an individual sample.
  3170. Samples with the same combination of cell type and time point are encircled
  3171. with a shaded region to aid in visual identification of the sample groups.
  3172. Samples of the same cell type from the same donor are connected by lines
  3173. to indicate the
  3174. \begin_inset Quotes eld
  3175. \end_inset
  3176. trajectory
  3177. \begin_inset Quotes erd
  3178. \end_inset
  3179. of each donor's cells over time in PCoA space.
  3180. \end_layout
  3181. \end_inset
  3182. \end_layout
  3183. \begin_layout Plain Layout
  3184. \end_layout
  3185. \end_inset
  3186. \end_layout
  3187. \begin_layout Standard
  3188. Genes called present in the
  3189. \begin_inset Flex Glossary Term
  3190. status open
  3191. \begin_layout Plain Layout
  3192. RNA-seq
  3193. \end_layout
  3194. \end_inset
  3195. data were tested for differential expression between all time points and
  3196. cell types.
  3197. The counts of differentially expressed genes are shown in Table
  3198. \begin_inset CommandInset ref
  3199. LatexCommand ref
  3200. reference "tab:Estimated-and-detected-rnaseq"
  3201. plural "false"
  3202. caps "false"
  3203. noprefix "false"
  3204. \end_inset
  3205. .
  3206. Notably, all the results for Day 0 and Day 5 have substantially fewer genes
  3207. called differentially expressed than any of the results for other time
  3208. points.
  3209. This is an unfortunate result of the difference in sample quality between
  3210. the two batches of
  3211. \begin_inset Flex Glossary Term
  3212. status open
  3213. \begin_layout Plain Layout
  3214. RNA-seq
  3215. \end_layout
  3216. \end_inset
  3217. data.
  3218. All the samples in Batch 1, which includes all the samples from Days 0
  3219. and 5, have substantially more variability than the samples in Batch 2,
  3220. which includes the other time points.
  3221. This is reflected in the substantially higher weights assigned to Batch
  3222. 2 (Figure
  3223. \begin_inset CommandInset ref
  3224. LatexCommand ref
  3225. reference "fig:RNA-seq-weights-vs-covars"
  3226. plural "false"
  3227. caps "false"
  3228. noprefix "false"
  3229. \end_inset
  3230. ).
  3231. The batch effect has both a systematic component and a random noise component.
  3232. While the systematic component was subtracted out using ComBat (Figure
  3233. \begin_inset CommandInset ref
  3234. LatexCommand ref
  3235. reference "fig:RNA-PCA"
  3236. plural "false"
  3237. caps "false"
  3238. noprefix "false"
  3239. \end_inset
  3240. ), no such correction is possible for the noise component: Batch 1 simply
  3241. has substantially more random noise in it, which reduces the statistical
  3242. power for any differential expression tests involving samples in that batch.
  3243. \end_layout
  3244. \begin_layout Standard
  3245. Despite the difficulty in detecting specific differentially expressed genes,
  3246. there is still evidence that differential expression is present for these
  3247. time points.
  3248. In Figure
  3249. \begin_inset CommandInset ref
  3250. LatexCommand ref
  3251. reference "fig:rna-pca-final"
  3252. plural "false"
  3253. caps "false"
  3254. noprefix "false"
  3255. \end_inset
  3256. , there is a clear separation between naïve and memory samples at Day 0,
  3257. despite the fact that only 2 genes were significantly differentially expressed
  3258. for this comparison.
  3259. Similarly, the small numbers of genes detected for the Day 0 vs Day 5 compariso
  3260. ns do not reflect the large separation between these time points in Figure
  3261. \begin_inset CommandInset ref
  3262. LatexCommand ref
  3263. reference "fig:rna-pca-final"
  3264. plural "false"
  3265. caps "false"
  3266. noprefix "false"
  3267. \end_inset
  3268. .
  3269. In addition, the MOFA latent factor plots in Figure
  3270. \begin_inset CommandInset ref
  3271. LatexCommand ref
  3272. reference "fig:mofa-lf-scatter"
  3273. plural "false"
  3274. caps "false"
  3275. noprefix "false"
  3276. \end_inset
  3277. show a similar separation between these sample groups.
  3278. This suggests that there is indeed a differential expression signal present
  3279. in the data for these comparisons, but the large variability in the Batch
  3280. 1 samples obfuscates this signal at the individual gene level.
  3281. As a result, it is impossible to make any meaningful statements about the
  3282. \begin_inset Quotes eld
  3283. \end_inset
  3284. size
  3285. \begin_inset Quotes erd
  3286. \end_inset
  3287. of the gene signature for any time point, since the number of significant
  3288. genes as well as the estimated number of differentially expressed genes
  3289. depends so strongly on the variations in sample quality in addition to
  3290. the size of the differential expression signal in the data.
  3291. Gene-set enrichment analyses are similarly impractical.
  3292. However, analyses looking at genome-wide patterns of expression are still
  3293. practical.
  3294. \end_layout
  3295. \begin_layout Subsection
  3296. H3K4 and H3K27 methylation occur in broad regions and are enriched near
  3297. promoters
  3298. \end_layout
  3299. \begin_layout Standard
  3300. \begin_inset Float table
  3301. wide false
  3302. sideways false
  3303. status collapsed
  3304. \begin_layout Plain Layout
  3305. \align center
  3306. \begin_inset Flex TODO Note (inline)
  3307. status open
  3308. \begin_layout Plain Layout
  3309. Also get
  3310. \emph on
  3311. median
  3312. \emph default
  3313. peak width and maybe other quantiles (25%, 75%)
  3314. \end_layout
  3315. \end_inset
  3316. \end_layout
  3317. \begin_layout Plain Layout
  3318. \align center
  3319. \begin_inset Tabular
  3320. <lyxtabular version="3" rows="4" columns="5">
  3321. <features tabularvalignment="middle">
  3322. <column alignment="center" valignment="top">
  3323. <column alignment="center" valignment="top">
  3324. <column alignment="center" valignment="top">
  3325. <column alignment="center" valignment="top">
  3326. <column alignment="center" valignment="top">
  3327. <row>
  3328. <cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" usebox="none">
  3329. \begin_inset Text
  3330. \begin_layout Plain Layout
  3331. Histone Mark
  3332. \end_layout
  3333. \end_inset
  3334. </cell>
  3335. <cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" usebox="none">
  3336. \begin_inset Text
  3337. \begin_layout Plain Layout
  3338. # Peaks
  3339. \end_layout
  3340. \end_inset
  3341. </cell>
  3342. <cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" usebox="none">
  3343. \begin_inset Text
  3344. \begin_layout Plain Layout
  3345. Mean peak width
  3346. \end_layout
  3347. \end_inset
  3348. </cell>
  3349. <cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" usebox="none">
  3350. \begin_inset Text
  3351. \begin_layout Plain Layout
  3352. Genome coverage
  3353. \end_layout
  3354. \end_inset
  3355. </cell>
  3356. <cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" rightline="true" usebox="none">
  3357. \begin_inset Text
  3358. \begin_layout Plain Layout
  3359. FRiP
  3360. \end_layout
  3361. \end_inset
  3362. </cell>
  3363. </row>
  3364. <row>
  3365. <cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
  3366. \begin_inset Text
  3367. \begin_layout Plain Layout
  3368. H3K4me2
  3369. \end_layout
  3370. \end_inset
  3371. </cell>
  3372. <cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
  3373. \begin_inset Text
  3374. \begin_layout Plain Layout
  3375. 14965
  3376. \end_layout
  3377. \end_inset
  3378. </cell>
  3379. <cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
  3380. \begin_inset Text
  3381. \begin_layout Plain Layout
  3382. 3970
  3383. \end_layout
  3384. \end_inset
  3385. </cell>
  3386. <cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
  3387. \begin_inset Text
  3388. \begin_layout Plain Layout
  3389. 1.92%
  3390. \end_layout
  3391. \end_inset
  3392. </cell>
  3393. <cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
  3394. \begin_inset Text
  3395. \begin_layout Plain Layout
  3396. 14.2%
  3397. \end_layout
  3398. \end_inset
  3399. </cell>
  3400. </row>
  3401. <row>
  3402. <cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
  3403. \begin_inset Text
  3404. \begin_layout Plain Layout
  3405. H3K4me3
  3406. \end_layout
  3407. \end_inset
  3408. </cell>
  3409. <cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
  3410. \begin_inset Text
  3411. \begin_layout Plain Layout
  3412. 6163
  3413. \end_layout
  3414. \end_inset
  3415. </cell>
  3416. <cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
  3417. \begin_inset Text
  3418. \begin_layout Plain Layout
  3419. 2946
  3420. \end_layout
  3421. \end_inset
  3422. </cell>
  3423. <cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
  3424. \begin_inset Text
  3425. \begin_layout Plain Layout
  3426. 0.588%
  3427. \end_layout
  3428. \end_inset
  3429. </cell>
  3430. <cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
  3431. \begin_inset Text
  3432. \begin_layout Plain Layout
  3433. 6.57%
  3434. \end_layout
  3435. \end_inset
  3436. </cell>
  3437. </row>
  3438. <row>
  3439. <cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" usebox="none">
  3440. \begin_inset Text
  3441. \begin_layout Plain Layout
  3442. H3K27me3
  3443. \end_layout
  3444. \end_inset
  3445. </cell>
  3446. <cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" usebox="none">
  3447. \begin_inset Text
  3448. \begin_layout Plain Layout
  3449. 18139
  3450. \end_layout
  3451. \end_inset
  3452. </cell>
  3453. <cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" usebox="none">
  3454. \begin_inset Text
  3455. \begin_layout Plain Layout
  3456. 18967
  3457. \end_layout
  3458. \end_inset
  3459. </cell>
  3460. <cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" usebox="none">
  3461. \begin_inset Text
  3462. \begin_layout Plain Layout
  3463. 11.1%
  3464. \end_layout
  3465. \end_inset
  3466. </cell>
  3467. <cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" rightline="true" usebox="none">
  3468. \begin_inset Text
  3469. \begin_layout Plain Layout
  3470. 22.5%
  3471. \end_layout
  3472. \end_inset
  3473. </cell>
  3474. </row>
  3475. </lyxtabular>
  3476. \end_inset
  3477. \end_layout
  3478. \begin_layout Plain Layout
  3479. \begin_inset Caption Standard
  3480. \begin_layout Plain Layout
  3481. \series bold
  3482. \begin_inset CommandInset label
  3483. LatexCommand label
  3484. name "tab:peak-calling-summary"
  3485. \end_inset
  3486. Peak-calling summary.
  3487. \series default
  3488. For each histone mark, the number of peaks called using SICER at an IDR
  3489. threshold of ???, the mean width of those peaks, the fraction of the genome
  3490. covered by peaks, and the fraction of reads in peaks (FRiP).
  3491. \end_layout
  3492. \end_inset
  3493. \end_layout
  3494. \end_inset
  3495. \end_layout
  3496. \begin_layout Standard
  3497. Table
  3498. \begin_inset CommandInset ref
  3499. LatexCommand ref
  3500. reference "tab:peak-calling-summary"
  3501. plural "false"
  3502. caps "false"
  3503. noprefix "false"
  3504. \end_inset
  3505. gives a summary of the peak calling statistics for each histone mark.
  3506. Consistent with previous observations [CITATION NEEDED], all 3 histone
  3507. marks occur in broad regions spanning many consecutive nucleosomes, rather
  3508. than in sharp peaks as would be expected for a transcription factor or
  3509. other molecule that binds to specific sites.
  3510. This conclusion is further supported by Figure
  3511. \begin_inset CommandInset ref
  3512. LatexCommand ref
  3513. reference "fig:CCF-with-blacklist"
  3514. plural "false"
  3515. caps "false"
  3516. noprefix "false"
  3517. \end_inset
  3518. , in which a clear nucleosome-sized periodicity is visible in the cross-correlat
  3519. ion value for each sample, indicating that each time a given mark is present
  3520. on one histone, it is likely to be found on adjacent histones as well.
  3521. H3K27me3 enrichment in particular is substantially broader than either
  3522. H3K4 mark, with a mean peak width of almost 19,000 bp.
  3523. This is also reflected in the periodicity observed in Figure
  3524. \begin_inset CommandInset ref
  3525. LatexCommand ref
  3526. reference "fig:CCF-with-blacklist"
  3527. plural "false"
  3528. caps "false"
  3529. noprefix "false"
  3530. \end_inset
  3531. , which remains strong much farther out for H3K27me3 than for the other marks,
  3532. showing that H3K27me3 especially tends to be found on long runs of consecutive
  3533. histones.
  3534. \end_layout
  3535. \begin_layout Standard
  3536. \begin_inset Float figure
  3537. wide false
  3538. sideways false
  3539. status open
  3540. \begin_layout Plain Layout
  3541. \begin_inset Flex TODO Note (inline)
  3542. status open
  3543. \begin_layout Plain Layout
  3544. Ensure this figure uses the peak calls from the new analysis.
  3545. \end_layout
  3546. \end_inset
  3547. \end_layout
  3548. \begin_layout Plain Layout
  3549. \begin_inset Flex TODO Note (inline)
  3550. status open
  3551. \begin_layout Plain Layout
  3552. Need a control: shuffle all peaks and repeat, N times.
  3553. Do real vs shuffled control both in a top/bottom arrangement.
  3554. \end_layout
  3555. \end_inset
  3556. \end_layout
  3557. \begin_layout Plain Layout
  3558. \begin_inset Flex TODO Note (inline)
  3559. status open
  3560. \begin_layout Plain Layout
  3561. Consider counting TSS inside peaks as negative number indicating how far
  3562. \emph on
  3563. inside
  3564. \emph default
  3565. the peak the TSS is (i.e.
  3566. distance to nearest non-peak area).
  3567. \end_layout
  3568. \end_inset
  3569. \end_layout
  3570. \begin_layout Plain Layout
  3571. \begin_inset Flex TODO Note (inline)
  3572. status open
  3573. \begin_layout Plain Layout
  3574. The H3K4 part of this figure is included in
  3575. \begin_inset CommandInset citation
  3576. LatexCommand cite
  3577. key "LaMere2016"
  3578. literal "false"
  3579. \end_inset
  3580. as Fig.
  3581. S2.
  3582. Do I need to do anything about that?
  3583. \end_layout
  3584. \end_inset
  3585. \end_layout
  3586. \begin_layout Plain Layout
  3587. \align center
  3588. \begin_inset Graphics
  3589. filename graphics/CD4-csaw/Promoter Peak Distance Profile-PAGE1-CROP.pdf
  3590. lyxscale 50
  3591. width 80col%
  3592. \end_inset
  3593. \end_layout
  3594. \begin_layout Plain Layout
  3595. \begin_inset Caption Standard
  3596. \begin_layout Plain Layout
  3597. \series bold
  3598. \begin_inset CommandInset label
  3599. LatexCommand label
  3600. name "fig:near-promoter-peak-enrich"
  3601. \end_inset
  3602. Enrichment of peaks in promoter neighborhoods.
  3603. \series default
  3604. This plot shows the distribution of distances from each annotated transcription
  3605. start site in the genome to the nearest called peak.
  3606. Each line represents one combination of histone mark, cell type, and time
  3607. point.
  3608. Distributions are smoothed using kernel density estimation [CITE? see ggplot2
  3609. stat_density()].
  3610. Transcription start sites that occur
  3611. \emph on
  3612. within
  3613. \emph default
  3614. peaks were excluded from this plot to avoid a large spike at zero that
  3615. would overshadow the rest of the distribution.
  3616. \end_layout
  3617. \end_inset
  3618. \end_layout
  3619. \end_inset
  3620. \end_layout
  3621. \begin_layout Standard
  3622. \begin_inset Float table
  3623. wide false
  3624. sideways false
  3625. status collapsed
  3626. \begin_layout Plain Layout
  3627. \align center
  3628. \begin_inset Tabular
  3629. <lyxtabular version="3" rows="4" columns="2">
  3630. <features tabularvalignment="middle">
  3631. <column alignment="center" valignment="top">
  3632. <column alignment="center" valignment="top">
  3633. <row>
  3634. <cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" usebox="none">
  3635. \begin_inset Text
  3636. \begin_layout Plain Layout
  3637. Histone mark
  3638. \end_layout
  3639. \end_inset
  3640. </cell>
  3641. <cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" rightline="true" usebox="none">
  3642. \begin_inset Text
  3643. \begin_layout Plain Layout
  3644. Effective promoter radius
  3645. \end_layout
  3646. \end_inset
  3647. </cell>
  3648. </row>
  3649. <row>
  3650. <cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
  3651. \begin_inset Text
  3652. \begin_layout Plain Layout
  3653. H3K4me2
  3654. \end_layout
  3655. \end_inset
  3656. </cell>
  3657. <cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
  3658. \begin_inset Text
  3659. \begin_layout Plain Layout
  3660. 1 kb
  3661. \end_layout
  3662. \end_inset
  3663. </cell>
  3664. </row>
  3665. <row>
  3666. <cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
  3667. \begin_inset Text
  3668. \begin_layout Plain Layout
  3669. H3K4me3
  3670. \end_layout
  3671. \end_inset
  3672. </cell>
  3673. <cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
  3674. \begin_inset Text
  3675. \begin_layout Plain Layout
  3676. 1 kb
  3677. \end_layout
  3678. \end_inset
  3679. </cell>
  3680. </row>
  3681. <row>
  3682. <cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" usebox="none">
  3683. \begin_inset Text
  3684. \begin_layout Plain Layout
  3685. H3K27me3
  3686. \end_layout
  3687. \end_inset
  3688. </cell>
  3689. <cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" rightline="true" usebox="none">
  3690. \begin_inset Text
  3691. \begin_layout Plain Layout
  3692. 2.5 kb
  3693. \end_layout
  3694. \end_inset
  3695. </cell>
  3696. </row>
  3697. </lyxtabular>
  3698. \end_inset
  3699. \end_layout
  3700. \begin_layout Plain Layout
  3701. \begin_inset Caption Standard
  3702. \begin_layout Plain Layout
  3703. \series bold
  3704. \begin_inset CommandInset label
  3705. LatexCommand label
  3706. name "tab:effective-promoter-radius"
  3707. \end_inset
  3708. Effective promoter radius for each histone mark.
  3709. \series default
  3710. These values represent the approximate distance from transcription start
  3711. site positions within which an excess of peaks is found, as shown in Figure
  3712. \begin_inset CommandInset ref
  3713. LatexCommand ref
  3714. reference "fig:near-promoter-peak-enrich"
  3715. plural "false"
  3716. caps "false"
  3717. noprefix "false"
  3718. \end_inset
  3719. .
  3720. \end_layout
  3721. \end_inset
  3722. \end_layout
  3723. \begin_layout Plain Layout
  3724. \end_layout
  3725. \end_inset
  3726. \end_layout
  3727. \begin_layout Standard
  3728. All 3 histone marks tend to occur more often near promoter regions, as shown
  3729. in Figure
  3730. \begin_inset CommandInset ref
  3731. LatexCommand ref
  3732. reference "fig:near-promoter-peak-enrich"
  3733. plural "false"
  3734. caps "false"
  3735. noprefix "false"
  3736. \end_inset
  3737. .
  3738. The majority of each density distribution is flat, representing the background
  3739. density of peaks genome-wide.
  3740. Each distribution has a peak near zero, representing an enrichment of peaks
  3741. close to transcription start site (TSS) positions relative to the remainder
  3742. of the genome.
  3743. Interestingly, the
  3744. \begin_inset Quotes eld
  3745. \end_inset
  3746. radius
  3747. \begin_inset Quotes erd
  3748. \end_inset
  3749. within which this enrichment occurs is not the same for every histone mark
  3750. (Table
  3751. \begin_inset CommandInset ref
  3752. LatexCommand ref
  3753. reference "tab:effective-promoter-radius"
  3754. plural "false"
  3755. caps "false"
  3756. noprefix "false"
  3757. \end_inset
  3758. ).
  3759. For H3K4me2 and H3K4me3, peaks are most enriched within 1
  3760. \begin_inset space ~
  3761. \end_inset
  3762. kbp of TSS positions, while for H3K27me3, enrichment is broader, extending
  3763. to 2.5
  3764. \begin_inset space ~
  3765. \end_inset
  3766. kbp.
  3767. These
  3768. \begin_inset Quotes eld
  3769. \end_inset
  3770. effective promoter radii
  3771. \begin_inset Quotes erd
  3772. \end_inset
  3773. remain approximately the same across all combinations of experimental conditions
  3774. (cell type, time point, and donor), so they appear to be a property of
  3775. the histone mark itself.
  3776. Hence, these radii were used to define the promoter regions for each histone
  3777. mark in all further analyses.
  3778. \end_layout
  3779. \begin_layout Standard
  3780. \begin_inset Flex TODO Note (inline)
  3781. status open
  3782. \begin_layout Plain Layout
  3783. Consider also showing figure for distance to nearest peak center, and reference
  3784. median peak size once that is known.
  3785. \end_layout
  3786. \end_inset
  3787. \end_layout
  3788. \begin_layout Subsection
  3789. H3K4 and H3K27 promoter methylation has broadly the expected correlation
  3790. with gene expression
  3791. \end_layout
  3792. \begin_layout Standard
  3793. \begin_inset Float figure
  3794. wide false
  3795. sideways false
  3796. status collapsed
  3797. \begin_layout Plain Layout
  3798. \begin_inset Flex TODO Note (inline)
  3799. status open
  3800. \begin_layout Plain Layout
  3801. This figure is generated from the old analysis.
  3802. Either note that in some way or re-generate it from the new peak calls.
  3803. \end_layout
  3804. \end_inset
  3805. \end_layout
  3806. \begin_layout Plain Layout
  3807. \align center
  3808. \begin_inset Graphics
  3809. filename graphics/CD4-csaw/FPKM by Peak Violin Plots-CROP.pdf
  3810. lyxscale 50
  3811. width 100col%
  3812. \end_inset
  3813. \end_layout
  3814. \begin_layout Plain Layout
  3815. \begin_inset Caption Standard
  3816. \begin_layout Plain Layout
  3817. \series bold
  3818. \begin_inset CommandInset label
  3819. LatexCommand label
  3820. name "fig:fpkm-by-peak"
  3821. \end_inset
  3822. Expression distributions of genes with and without promoter peaks.
  3823. \end_layout
  3824. \end_inset
  3825. \end_layout
  3826. \end_inset
  3827. \end_layout
  3828. \begin_layout Standard
  3829. H3K4me2 and H3K4me3 have previously been reported as activating marks whose
  3830. presence in a gene's promoter is associated with higher gene expression,
  3831. while H3K27me3 has been reported as inactivating [CITE].
  3832. The data are consistent with this characterization: genes whose promoters
  3833. (as defined by the radii for each histone mark listed in Table
  3834. \begin_inset CommandInset ref
  3835. LatexCommand ref
  3836. reference "tab:effective-promoter-radius"
  3837. plural "false"
  3838. caps "false"
  3839. noprefix "false"
  3840. \end_inset
  3841. ) overlap with an H3K4me2 or H3K4me3 peak tend to have higher expression
  3842. than those that don't, while H3K27me3 is likewise associated with lower
  3843. gene expression, as shown in Figure
  3844. \begin_inset CommandInset ref
  3845. LatexCommand ref
  3846. reference "fig:fpkm-by-peak"
  3847. plural "false"
  3848. caps "false"
  3849. noprefix "false"
  3850. \end_inset
  3851. .
  3852. This pattern holds across all combinations of cell type and time point
  3853. (Welch's
  3854. \emph on
  3855. t
  3856. \emph default
  3857. -test, all
  3858. \begin_inset Formula $p\mathrm{-values}\ll2.2\times10^{-16}$
  3859. \end_inset
  3860. ).
  3861. The difference in average log FPKM values when a peak overlaps the promoter
  3862. is about
  3863. \begin_inset Formula $+5.67$
  3864. \end_inset
  3865. for H3K4me2,
  3866. \begin_inset Formula $+5.76$
  3867. \end_inset
  3868. for H3K4me3, and
  3869. \begin_inset Formula $-4.00$
  3870. \end_inset
  3871. for H3K27me3.
  3872. \end_layout
  3873. \begin_layout Standard
  3874. \begin_inset Flex TODO Note (inline)
  3875. status open
  3876. \begin_layout Plain Layout
  3877. I also have some figures looking at interactions between marks (e.g.
  3878. what if a promoter has both H3K4me3 and H3K27me3), but I don't know if
  3879. that much detail is warranted here, since all the effects just seem approximate
  3880. ly additive anyway.
  3881. \end_layout
  3882. \end_inset
  3883. \end_layout
  3884. \begin_layout Subsection
  3885. Gene expression and promoter histone methylation patterns in naïve and memory
  3886. cells show convergence at day 14
  3887. \end_layout
  3888. \begin_layout Standard
  3889. \begin_inset ERT
  3890. status open
  3891. \begin_layout Plain Layout
  3892. \backslash
  3893. afterpage{
  3894. \end_layout
  3895. \begin_layout Plain Layout
  3896. \backslash
  3897. begin{landscape}
  3898. \end_layout
  3899. \end_inset
  3900. \end_layout
  3901. \begin_layout Standard
  3902. \begin_inset Float table
  3903. wide false
  3904. sideways false
  3905. status open
  3906. \begin_layout Plain Layout
  3907. \align center
  3908. \begin_inset Tabular
  3909. <lyxtabular version="3" rows="6" columns="7">
  3910. <features tabularvalignment="middle">
  3911. <column alignment="center" valignment="top">
  3912. <column alignment="center" valignment="top">
  3913. <column alignment="center" valignment="top">
  3914. <column alignment="center" valignment="top">
  3915. <column alignment="center" valignment="top">
  3916. <column alignment="center" valignment="top">
  3917. <column alignment="center" valignment="top">
  3918. <row>
  3919. <cell alignment="center" valignment="top" usebox="none">
  3920. \begin_inset Text
  3921. \begin_layout Plain Layout
  3922. \end_layout
  3923. \end_inset
  3924. </cell>
  3925. <cell multicolumn="1" alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
  3926. \begin_inset Text
  3927. \begin_layout Plain Layout
  3928. Number of significant promoters
  3929. \end_layout
  3930. \end_inset
  3931. </cell>
  3932. <cell multicolumn="2" alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
  3933. \begin_inset Text
  3934. \begin_layout Plain Layout
  3935. \end_layout
  3936. \end_inset
  3937. </cell>
  3938. <cell multicolumn="2" alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
  3939. \begin_inset Text
  3940. \begin_layout Plain Layout
  3941. \end_layout
  3942. \end_inset
  3943. </cell>
  3944. <cell multicolumn="1" alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
  3945. \begin_inset Text
  3946. \begin_layout Plain Layout
  3947. Est.
  3948. differentially modified promoters
  3949. \end_layout
  3950. \end_inset
  3951. </cell>
  3952. <cell multicolumn="2" alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
  3953. \begin_inset Text
  3954. \begin_layout Plain Layout
  3955. \end_layout
  3956. \end_inset
  3957. </cell>
  3958. <cell multicolumn="2" alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
  3959. \begin_inset Text
  3960. \begin_layout Plain Layout
  3961. \end_layout
  3962. \end_inset
  3963. </cell>
  3964. </row>
  3965. <row>
  3966. <cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" usebox="none">
  3967. \begin_inset Text
  3968. \begin_layout Plain Layout
  3969. Time Point
  3970. \end_layout
  3971. \end_inset
  3972. </cell>
  3973. <cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" usebox="none">
  3974. \begin_inset Text
  3975. \begin_layout Plain Layout
  3976. H3K4me2
  3977. \end_layout
  3978. \end_inset
  3979. </cell>
  3980. <cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" usebox="none">
  3981. \begin_inset Text
  3982. \begin_layout Plain Layout
  3983. H3K4me3
  3984. \end_layout
  3985. \end_inset
  3986. </cell>
  3987. <cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" rightline="true" usebox="none">
  3988. \begin_inset Text
  3989. \begin_layout Plain Layout
  3990. H3K27me3
  3991. \end_layout
  3992. \end_inset
  3993. </cell>
  3994. <cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" usebox="none">
  3995. \begin_inset Text
  3996. \begin_layout Plain Layout
  3997. H3K4me2
  3998. \end_layout
  3999. \end_inset
  4000. </cell>
  4001. <cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" usebox="none">
  4002. \begin_inset Text
  4003. \begin_layout Plain Layout
  4004. H3K4me3
  4005. \end_layout
  4006. \end_inset
  4007. </cell>
  4008. <cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" rightline="true" usebox="none">
  4009. \begin_inset Text
  4010. \begin_layout Plain Layout
  4011. H3K27me3
  4012. \end_layout
  4013. \end_inset
  4014. </cell>
  4015. </row>
  4016. <row>
  4017. <cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
  4018. \begin_inset Text
  4019. \begin_layout Plain Layout
  4020. Day 0
  4021. \end_layout
  4022. \end_inset
  4023. </cell>
  4024. <cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
  4025. \begin_inset Text
  4026. \begin_layout Plain Layout
  4027. 4553
  4028. \end_layout
  4029. \end_inset
  4030. </cell>
  4031. <cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
  4032. \begin_inset Text
  4033. \begin_layout Plain Layout
  4034. 927
  4035. \end_layout
  4036. \end_inset
  4037. </cell>
  4038. <cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
  4039. \begin_inset Text
  4040. \begin_layout Plain Layout
  4041. 6
  4042. \end_layout
  4043. \end_inset
  4044. </cell>
  4045. <cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
  4046. \begin_inset Text
  4047. \begin_layout Plain Layout
  4048. 9967
  4049. \end_layout
  4050. \end_inset
  4051. </cell>
  4052. <cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
  4053. \begin_inset Text
  4054. \begin_layout Plain Layout
  4055. 4149
  4056. \end_layout
  4057. \end_inset
  4058. </cell>
  4059. <cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
  4060. \begin_inset Text
  4061. \begin_layout Plain Layout
  4062. 2404
  4063. \end_layout
  4064. \end_inset
  4065. </cell>
  4066. </row>
  4067. <row>
  4068. <cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
  4069. \begin_inset Text
  4070. \begin_layout Plain Layout
  4071. Day 1
  4072. \end_layout
  4073. \end_inset
  4074. </cell>
  4075. <cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
  4076. \begin_inset Text
  4077. \begin_layout Plain Layout
  4078. 567
  4079. \end_layout
  4080. \end_inset
  4081. </cell>
  4082. <cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
  4083. \begin_inset Text
  4084. \begin_layout Plain Layout
  4085. 278
  4086. \end_layout
  4087. \end_inset
  4088. </cell>
  4089. <cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
  4090. \begin_inset Text
  4091. \begin_layout Plain Layout
  4092. 1570
  4093. \end_layout
  4094. \end_inset
  4095. </cell>
  4096. <cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
  4097. \begin_inset Text
  4098. \begin_layout Plain Layout
  4099. 4370
  4100. \end_layout
  4101. \end_inset
  4102. </cell>
  4103. <cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
  4104. \begin_inset Text
  4105. \begin_layout Plain Layout
  4106. 2145
  4107. \end_layout
  4108. \end_inset
  4109. </cell>
  4110. <cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
  4111. \begin_inset Text
  4112. \begin_layout Plain Layout
  4113. 6598
  4114. \end_layout
  4115. \end_inset
  4116. </cell>
  4117. </row>
  4118. <row>
  4119. <cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
  4120. \begin_inset Text
  4121. \begin_layout Plain Layout
  4122. Day 5
  4123. \end_layout
  4124. \end_inset
  4125. </cell>
  4126. <cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
  4127. \begin_inset Text
  4128. \begin_layout Plain Layout
  4129. 2313
  4130. \end_layout
  4131. \end_inset
  4132. </cell>
  4133. <cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
  4134. \begin_inset Text
  4135. \begin_layout Plain Layout
  4136. 139
  4137. \end_layout
  4138. \end_inset
  4139. </cell>
  4140. <cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
  4141. \begin_inset Text
  4142. \begin_layout Plain Layout
  4143. 490
  4144. \end_layout
  4145. \end_inset
  4146. </cell>
  4147. <cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
  4148. \begin_inset Text
  4149. \begin_layout Plain Layout
  4150. 9450
  4151. \end_layout
  4152. \end_inset
  4153. </cell>
  4154. <cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
  4155. \begin_inset Text
  4156. \begin_layout Plain Layout
  4157. 1148
  4158. \end_layout
  4159. \end_inset
  4160. </cell>
  4161. <cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
  4162. \begin_inset Text
  4163. \begin_layout Plain Layout
  4164. 4141
  4165. \end_layout
  4166. \end_inset
  4167. </cell>
  4168. </row>
  4169. <row>
  4170. <cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" usebox="none">
  4171. \begin_inset Text
  4172. \begin_layout Plain Layout
  4173. Day 14
  4174. \end_layout
  4175. \end_inset
  4176. </cell>
  4177. <cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" usebox="none">
  4178. \begin_inset Text
  4179. \begin_layout Plain Layout
  4180. 0
  4181. \end_layout
  4182. \end_inset
  4183. </cell>
  4184. <cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" usebox="none">
  4185. \begin_inset Text
  4186. \begin_layout Plain Layout
  4187. 0
  4188. \end_layout
  4189. \end_inset
  4190. </cell>
  4191. <cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" rightline="true" usebox="none">
  4192. \begin_inset Text
  4193. \begin_layout Plain Layout
  4194. 0
  4195. \end_layout
  4196. \end_inset
  4197. </cell>
  4198. <cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" usebox="none">
  4199. \begin_inset Text
  4200. \begin_layout Plain Layout
  4201. 0
  4202. \end_layout
  4203. \end_inset
  4204. </cell>
  4205. <cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" usebox="none">
  4206. \begin_inset Text
  4207. \begin_layout Plain Layout
  4208. 0
  4209. \end_layout
  4210. \end_inset
  4211. </cell>
  4212. <cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" rightline="true" usebox="none">
  4213. \begin_inset Text
  4214. \begin_layout Plain Layout
  4215. 0
  4216. \end_layout
  4217. \end_inset
  4218. </cell>
  4219. </row>
  4220. </lyxtabular>
  4221. \end_inset
  4222. \end_layout
  4223. \begin_layout Plain Layout
  4224. \begin_inset Caption Standard
  4225. \begin_layout Plain Layout
  4226. \series bold
  4227. \begin_inset CommandInset label
  4228. LatexCommand label
  4229. name "tab:Number-signif-promoters"
  4230. \end_inset
  4231. Number of differentially modified promoters between naïve and memory cells
  4232. at each time point after activation.
  4233. \series default
  4234. This table shows both the number of differentially modified promoters detected
  4235. at a 10% FDR threshold (left half), and the total number of differentially
  4236. modified promoters as estimated using the method of
  4237. \begin_inset CommandInset citation
  4238. LatexCommand cite
  4239. key "Phipson2013"
  4240. literal "false"
  4241. \end_inset
  4242. (right half).
  4243. \end_layout
  4244. \end_inset
  4245. \end_layout
  4246. \end_inset
  4247. \end_layout
  4248. \begin_layout Standard
  4249. \begin_inset ERT
  4250. status open
  4251. \begin_layout Plain Layout
  4252. \backslash
  4253. end{landscape}
  4254. \end_layout
  4255. \begin_layout Plain Layout
  4256. }
  4257. \end_layout
  4258. \end_inset
  4259. \end_layout
  4260. \begin_layout Standard
  4261. \begin_inset Float figure
  4262. placement p
  4263. wide false
  4264. sideways false
  4265. status open
  4266. \begin_layout Plain Layout
  4267. \align center
  4268. \begin_inset Float figure
  4269. wide false
  4270. sideways false
  4271. status open
  4272. \begin_layout Plain Layout
  4273. \align center
  4274. \begin_inset Graphics
  4275. filename graphics/CD4-csaw/ChIP-seq/H3K4me2-promoter-PCA-group-CROP.png
  4276. lyxscale 25
  4277. width 45col%
  4278. groupId pcoa-prom-subfig
  4279. \end_inset
  4280. \end_layout
  4281. \begin_layout Plain Layout
  4282. \begin_inset Caption Standard
  4283. \begin_layout Plain Layout
  4284. \series bold
  4285. \begin_inset CommandInset label
  4286. LatexCommand label
  4287. name "fig:PCoA-H3K4me2-prom"
  4288. \end_inset
  4289. PCoA plot of H3K4me2 promoters, after subtracting surrogate variables
  4290. \end_layout
  4291. \end_inset
  4292. \end_layout
  4293. \end_inset
  4294. \begin_inset space \hfill{}
  4295. \end_inset
  4296. \begin_inset Float figure
  4297. wide false
  4298. sideways false
  4299. status open
  4300. \begin_layout Plain Layout
  4301. \align center
  4302. \begin_inset Graphics
  4303. filename graphics/CD4-csaw/ChIP-seq/H3K4me3-promoter-PCA-group-CROP.png
  4304. lyxscale 25
  4305. width 45col%
  4306. groupId pcoa-prom-subfig
  4307. \end_inset
  4308. \end_layout
  4309. \begin_layout Plain Layout
  4310. \begin_inset Caption Standard
  4311. \begin_layout Plain Layout
  4312. \series bold
  4313. \begin_inset CommandInset label
  4314. LatexCommand label
  4315. name "fig:PCoA-H3K4me3-prom"
  4316. \end_inset
  4317. PCoA plot of H3K4me3 promoters, after subtracting surrogate variables
  4318. \end_layout
  4319. \end_inset
  4320. \end_layout
  4321. \end_inset
  4322. \end_layout
  4323. \begin_layout Plain Layout
  4324. \align center
  4325. \begin_inset Float figure
  4326. wide false
  4327. sideways false
  4328. status collapsed
  4329. \begin_layout Plain Layout
  4330. \align center
  4331. \begin_inset Graphics
  4332. filename graphics/CD4-csaw/ChIP-seq/H3K27me3-promoter-PCA-group-CROP.png
  4333. lyxscale 25
  4334. width 45col%
  4335. groupId pcoa-prom-subfig
  4336. \end_inset
  4337. \end_layout
  4338. \begin_layout Plain Layout
  4339. \begin_inset Caption Standard
  4340. \begin_layout Plain Layout
  4341. \series bold
  4342. \begin_inset CommandInset label
  4343. LatexCommand label
  4344. name "fig:PCoA-H3K27me3-prom"
  4345. \end_inset
  4346. PCoA plot of H3K27me3 promoters, after subtracting surrogate variables
  4347. \end_layout
  4348. \end_inset
  4349. \end_layout
  4350. \end_inset
  4351. \begin_inset space \hfill{}
  4352. \end_inset
  4353. \begin_inset Float figure
  4354. wide false
  4355. sideways false
  4356. status open
  4357. \begin_layout Plain Layout
  4358. \align center
  4359. \begin_inset Graphics
  4360. filename graphics/CD4-csaw/RNA-seq/PCA-final-23-CROP.png
  4361. lyxscale 25
  4362. width 45col%
  4363. groupId pcoa-prom-subfig
  4364. \end_inset
  4365. \end_layout
  4366. \begin_layout Plain Layout
  4367. \begin_inset Caption Standard
  4368. \begin_layout Plain Layout
  4369. \series bold
  4370. \begin_inset CommandInset label
  4371. LatexCommand label
  4372. name "fig:RNA-PCA-group"
  4373. \end_inset
  4374. RNA-seq PCoA showing principal coordinates 2 and 3.
  4375. \end_layout
  4376. \end_inset
  4377. \end_layout
  4378. \end_inset
  4379. \end_layout
  4380. \begin_layout Plain Layout
  4381. \begin_inset Caption Standard
  4382. \begin_layout Plain Layout
  4383. \series bold
  4384. \begin_inset CommandInset label
  4385. LatexCommand label
  4386. name "fig:PCoA-promoters"
  4387. \end_inset
  4388. PCoA plots for promoter ChIP-seq and expression RNA-seq data
  4389. \end_layout
  4390. \end_inset
  4391. \end_layout
  4392. \end_inset
  4393. \end_layout
  4394. \begin_layout Standard
  4395. \begin_inset Flex TODO Note (inline)
  4396. status open
  4397. \begin_layout Plain Layout
  4398. Check up on figure refs in this paragraph
  4399. \end_layout
  4400. \end_inset
  4401. \end_layout
  4402. \begin_layout Standard
  4403. We hypothesized that if naïve cells had differentiated into memory cells
  4404. by Day 14, then their patterns of expression and histone modification should
  4405. converge with those of memory cells at Day 14.
  4406. Figure
  4407. \begin_inset CommandInset ref
  4408. LatexCommand ref
  4409. reference "fig:PCoA-promoters"
  4410. plural "false"
  4411. caps "false"
  4412. noprefix "false"
  4413. \end_inset
  4414. shows the patterns of variation in all 3 histone marks in the promoter
  4415. regions of the genome using principal coordinate analysis.
  4416. All 3 marks show a noticeable convergence between the naïve and memory
  4417. samples at day 14, visible as an overlapping of the day 14 groups on each
  4418. plot.
  4419. This is consistent with the counts of significantly differentially modified
  4420. promoters and estimates of the total numbers of differentially modified
  4421. promoters shown in Table
  4422. \begin_inset CommandInset ref
  4423. LatexCommand ref
  4424. reference "tab:Number-signif-promoters"
  4425. plural "false"
  4426. caps "false"
  4427. noprefix "false"
  4428. \end_inset
  4429. .
  4430. For all histone marks, evidence of differential modification between naïve
  4431. and memory samples was detected at every time point except day 14.
  4432. The day 14 convergence pattern is also present in the
  4433. \begin_inset Flex Glossary Term
  4434. status open
  4435. \begin_layout Plain Layout
  4436. RNA-seq
  4437. \end_layout
  4438. \end_inset
  4439. data (Figure
  4440. \begin_inset CommandInset ref
  4441. LatexCommand ref
  4442. reference "fig:RNA-PCA-group"
  4443. plural "false"
  4444. caps "false"
  4445. noprefix "false"
  4446. \end_inset
  4447. ), albeit in the 2nd and 3rd principal coordinates, indicating that it is
  4448. not the most dominant pattern driving gene expression.
  4449. Taken together, the data show that promoter histone methylation for these
  4450. 3 histone marks and RNA expression for naïve and memory cells are most
  4451. similar at day 14, the furthest time point after activation.
  4452. MOFA was also able to capture this day 14 convergence pattern in latent
  4453. factor 5 (Figure
  4454. \begin_inset CommandInset ref
  4455. LatexCommand ref
  4456. reference "fig:mofa-lf-scatter"
  4457. plural "false"
  4458. caps "false"
  4459. noprefix "false"
  4460. \end_inset
  4461. ), which accounts for shared variation across all 3 histone marks and the
  4462. \begin_inset Flex Glossary Term
  4463. status open
  4464. \begin_layout Plain Layout
  4465. RNA-seq
  4466. \end_layout
  4467. \end_inset
  4468. data, confirming that this convergence is a coordinated pattern across
  4469. all 4 data sets.
  4470. While this observation does not prove that the naïve cells have differentiated
  4471. into memory cells at Day 14, it is consistent with that hypothesis.
  4472. \end_layout
  4473. \begin_layout Subsection
  4474. Effect of H3K4me2 and H3K4me3 promoter coverage upstream vs downstream of
  4475. TSS
  4476. \end_layout
  4477. \begin_layout Standard
  4478. \begin_inset Flex TODO Note (inline)
  4479. status open
  4480. \begin_layout Plain Layout
  4481. Need a better section title, for this and the next one.
  4482. \end_layout
  4483. \end_inset
  4484. \end_layout
  4485. \begin_layout Standard
  4486. \begin_inset Flex TODO Note (inline)
  4487. status open
  4488. \begin_layout Plain Layout
  4489. Make sure use of coverage/abundance/whatever is consistent.
  4490. \end_layout
  4491. \end_inset
  4492. \end_layout
  4493. \begin_layout Standard
  4494. \begin_inset Flex TODO Note (inline)
  4495. status open
  4496. \begin_layout Plain Layout
  4497. For the figures in this section and the next, the group labels are arbitrary,
  4498. so if time allows, it would be good to manually reorder them in a logical
  4499. way, e.g.
  4500. most upstream to most downstream.
  4501. If this is done, make sure to update the text with the correct group labels.
  4502. \end_layout
  4503. \end_inset
  4504. \end_layout
  4505. \begin_layout Standard
  4506. \begin_inset ERT
  4507. status open
  4508. \begin_layout Plain Layout
  4509. \backslash
  4510. afterpage{
  4511. \end_layout
  4512. \begin_layout Plain Layout
  4513. \backslash
  4514. begin{landscape}
  4515. \end_layout
  4516. \end_inset
  4517. \end_layout
  4518. \begin_layout Standard
  4519. \begin_inset Float figure
  4520. wide false
  4521. sideways false
  4522. status open
  4523. \begin_layout Plain Layout
  4524. \align center
  4525. \begin_inset Float figure
  4526. wide false
  4527. sideways false
  4528. status open
  4529. \begin_layout Plain Layout
  4530. \align center
  4531. \begin_inset Graphics
  4532. filename graphics/CD4-csaw/ChIP-seq/H3K4me2-neighborhood-clusters-CROP.png
  4533. lyxscale 25
  4534. width 30col%
  4535. groupId covprof-subfig
  4536. \end_inset
  4537. \end_layout
  4538. \begin_layout Plain Layout
  4539. \begin_inset Caption Standard
  4540. \begin_layout Plain Layout
  4541. \series bold
  4542. \begin_inset CommandInset label
  4543. LatexCommand label
  4544. name "fig:H3K4me2-neighborhood-clusters"
  4545. \end_inset
  4546. Average relative coverage for each bin in each cluster
  4547. \end_layout
  4548. \end_inset
  4549. \end_layout
  4550. \end_inset
  4551. \begin_inset space \hfill{}
  4552. \end_inset
  4553. \begin_inset Float figure
  4554. wide false
  4555. sideways false
  4556. status open
  4557. \begin_layout Plain Layout
  4558. \align center
  4559. \begin_inset Graphics
  4560. filename graphics/CD4-csaw/ChIP-seq/H3K4me2-neighborhood-PCA-CROP.png
  4561. lyxscale 25
  4562. width 30col%
  4563. groupId covprof-subfig
  4564. \end_inset
  4565. \end_layout
  4566. \begin_layout Plain Layout
  4567. \begin_inset Caption Standard
  4568. \begin_layout Plain Layout
  4569. \series bold
  4570. \begin_inset CommandInset label
  4571. LatexCommand label
  4572. name "fig:H3K4me2-neighborhood-pca"
  4573. \end_inset
  4574. PCA of relative coverage depth, colored by K-means cluster membership.
  4575. \end_layout
  4576. \end_inset
  4577. \end_layout
  4578. \end_inset
  4579. \begin_inset space \hfill{}
  4580. \end_inset
  4581. \begin_inset Float figure
  4582. wide false
  4583. sideways false
  4584. status open
  4585. \begin_layout Plain Layout
  4586. \align center
  4587. \begin_inset Graphics
  4588. filename graphics/CD4-csaw/ChIP-seq/H3K4me2-neighborhood-expression-CROP.png
  4589. lyxscale 25
  4590. width 30col%
  4591. groupId covprof-subfig
  4592. \end_inset
  4593. \end_layout
  4594. \begin_layout Plain Layout
  4595. \begin_inset Caption Standard
  4596. \begin_layout Plain Layout
  4597. \series bold
  4598. \begin_inset CommandInset label
  4599. LatexCommand label
  4600. name "fig:H3K4me2-neighborhood-expression"
  4601. \end_inset
  4602. Gene expression grouped by promoter coverage clusters.
  4603. \end_layout
  4604. \end_inset
  4605. \end_layout
  4606. \end_inset
  4607. \end_layout
  4608. \begin_layout Plain Layout
  4609. \begin_inset Caption Standard
  4610. \begin_layout Plain Layout
  4611. \series bold
  4612. \begin_inset CommandInset label
  4613. LatexCommand label
  4614. name "fig:H3K4me2-neighborhood"
  4615. \end_inset
  4616. K-means clustering of promoter H3K4me2 relative coverage depth in naïve
  4617. day 0 samples.
  4618. \series default
  4619. H3K4me2 ChIP-seq reads were binned into 500-bp windows tiled across each
  4620. promoter from 5
  4621. \begin_inset space ~
  4622. \end_inset
  4623. kbp upstream to 5
  4624. \begin_inset space ~
  4625. \end_inset
  4626. kbp downstream, and the logCPM values were normalized within each promoter
  4627. to an average of 0, yielding relative coverage depths.
  4628. These were then grouped using K-means clustering with
  4629. \begin_inset Formula $K=6$
  4630. \end_inset
  4631. ,
  4632. \series bold
  4633. \series default
  4634. and the average bin values were plotted for each cluster (a).
  4635. The
  4636. \begin_inset Formula $x$
  4637. \end_inset
  4638. -axis is the genomic coordinate of each bin relative to the transcription
  4639. start site, and the
  4640. \begin_inset Formula $y$
  4641. \end_inset
  4642. -axis is the mean relative coverage depth of that bin across all promoters
  4643. in the cluster.
  4644. Each line represents the average
  4645. \begin_inset Quotes eld
  4646. \end_inset
  4647. shape
  4648. \begin_inset Quotes erd
  4649. \end_inset
  4650. of the promoter coverage for promoters in that cluster.
  4651. PCA was performed on the same data, and the first two principal components
  4652. were plotted, coloring each point by its K-means cluster identity (b).
  4653. For each cluster, the distribution of gene expression values was plotted
  4654. (c).
  4655. \end_layout
  4656. \end_inset
  4657. \end_layout
  4658. \end_inset
  4659. \end_layout
  4660. \begin_layout Standard
  4661. \begin_inset ERT
  4662. status open
  4663. \begin_layout Plain Layout
  4664. \backslash
  4665. end{landscape}
  4666. \end_layout
  4667. \begin_layout Plain Layout
  4668. }
  4669. \end_layout
  4670. \end_inset
  4671. \end_layout
  4672. \begin_layout Standard
  4673. To test whether the position of a histone mark relative to a gene's transcriptio
  4674. n start site (TSS) was important, we looked at the
  4675. \begin_inset Quotes eld
  4676. \end_inset
  4677. landscape
  4678. \begin_inset Quotes erd
  4679. \end_inset
  4680. of ChIP-seq read coverage in naïve Day 0 samples within 5 kb of each gene's
  4681. TSS by binning reads into 500-bp windows tiled across each promoter. LogCPM
  4682. values were calculated for the bins in each promoter and then the average
  4683. logCPM for each promoter's bins was normalized to zero, such that the values
  4684. represent coverage relative to other regions of the same promoter rather
  4685. than being proportional to absolute read count.
  4686. The promoters were then clustered based on the normalized bin abundances
  4687. using
  4688. \begin_inset Formula $k$
  4689. \end_inset
  4690. -means clustering with
  4691. \begin_inset Formula $K=6$
  4692. \end_inset
  4693. .
  4694. Different values of
  4695. \begin_inset Formula $K$
  4696. \end_inset
  4697. were also tested, but did not substantially change the interpretation of
  4698. the data.
  4699. \end_layout
  4700. \begin_layout Standard
  4701. For H3K4me2, plotting the average bin abundances for each cluster reveals
  4702. a simple pattern (Figure
  4703. \begin_inset CommandInset ref
  4704. LatexCommand ref
  4705. reference "fig:H3K4me2-neighborhood-clusters"
  4706. plural "false"
  4707. caps "false"
  4708. noprefix "false"
  4709. \end_inset
  4710. ): Cluster 5 represents a completely flat promoter coverage profile, likely
  4711. consisting of genes with no H3K4me2 methylation in the promoter.
  4712. All the other clusters represent a continuum of peak positions relative
  4713. to the TSS.
  4714. In order from most upstream to most downstream, they are Clusters 6, 4,
  4715. 3, 1, and 2.
  4716. There do not appear to be any clusters representing coverage patterns other
  4717. than lone peaks, such as coverage troughs or double peaks.
  4718. Next, all promoters were plotted in a PCA plot based on the same relative
  4719. bin abundance data, and colored based on cluster membership (Figure
  4720. \begin_inset CommandInset ref
  4721. LatexCommand ref
  4722. reference "fig:H3K4me2-neighborhood-pca"
  4723. plural "false"
  4724. caps "false"
  4725. noprefix "false"
  4726. \end_inset
  4727. ).
  4728. The PCA plot shows Cluster 5 (the
  4729. \begin_inset Quotes eld
  4730. \end_inset
  4731. no peak
  4732. \begin_inset Quotes erd
  4733. \end_inset
  4734. cluster) at the center, with the other clusters arranged in a counter-clockwise
  4735. arc around it in the order noted above, from most upstream peak to most
  4736. downstream.
  4737. Notably, the
  4738. \begin_inset Quotes eld
  4739. \end_inset
  4740. clusters
  4741. \begin_inset Quotes erd
  4742. \end_inset
  4743. form a single large
  4744. \begin_inset Quotes eld
  4745. \end_inset
  4746. cloud
  4747. \begin_inset Quotes erd
  4748. \end_inset
  4749. with no apparent separation between them, further supporting the conclusion
  4750. that these clusters represent an arbitrary partitioning of a continuous
  4751. distribution of promoter coverage landscapes.
  4752. While the clusters are a useful abstraction that aids in visualization,
  4753. they are ultimately not an accurate representation of the data.
  4754. A better representation might be something like a polar coordinate system
  4755. with the origin at the center of Cluster 5, where the radius represents
  4756. the peak height above the background and the angle represents the peak's
  4757. position upstream or downstream of the TSS.
  4758. The continuous nature of the distribution also explains why different values
  4759. of
  4760. \begin_inset Formula $K$
  4761. \end_inset
  4762. led to similar conclusions.
  4763. \end_layout
  4764. \begin_layout Standard
  4765. \begin_inset Flex TODO Note (inline)
  4766. status open
  4767. \begin_layout Plain Layout
  4768. RNA-seq values in the plots use logCPM but should really use logFPKM or
  4769. logTPM.
  4770. Fix if time allows.
  4771. \end_layout
  4772. \end_inset
  4773. \end_layout
  4774. \begin_layout Standard
  4775. \begin_inset Flex TODO Note (inline)
  4776. status open
  4777. \begin_layout Plain Layout
  4778. Should have a table of p-values on difference of means between Cluster 5
  4779. and the others.
  4780. \end_layout
  4781. \end_inset
  4782. \end_layout
  4783. \begin_layout Standard
  4784. To investigate the association between relative peak position and gene expressio
  4785. n, we plotted the Naïve Day 0 expression for the genes in each cluster (Figure
  4786. \begin_inset CommandInset ref
  4787. LatexCommand ref
  4788. reference "fig:H3K4me2-neighborhood-expression"
  4789. plural "false"
  4790. caps "false"
  4791. noprefix "false"
  4792. \end_inset
  4793. ).
  4794. Most genes in Cluster 5, the
  4795. \begin_inset Quotes eld
  4796. \end_inset
  4797. no peak
  4798. \begin_inset Quotes erd
  4799. \end_inset
  4800. cluster, have low expression values.
  4801. Taking this as the
  4802. \begin_inset Quotes eld
  4803. \end_inset
  4804. baseline
  4805. \begin_inset Quotes erd
  4806. \end_inset
  4807. distribution when no H3K4me2 methylation is present, we can compare the
  4808. other clusters' distributions to determine which peak positions are associated
  4809. with elevated expression.
  4810. As might be expected, the 3 clusters representing peaks closest to the
  4811. TSS, Clusters 1, 3, and 4, show the highest average expression distributions.
  4812. Specifically, these clusters all have their highest ChIP-seq abundance
  4813. within 1 kb of the TSS, consistent with the previously determined promoter
  4814. radius.
  4815. In contrast, Cluster 6, which represents peaks several kb upstream of the
  4816. TSS, shows a slightly higher average expression than baseline, while Cluster
  4817. 2, which represents peaks several kb downstream, doesn't appear to show
  4818. any appreciable difference.
  4819. Interestingly, the cluster with the highest average expression is Cluster
  4820. 1, which represents peaks about 1 kb downstream of the TSS, rather than
  4821. Cluster 3, which represents peaks centered directly at the TSS.
  4822. This suggests that conceptualizing the promoter as a region centered on
  4823. the TSS with a certain
  4824. \begin_inset Quotes eld
  4825. \end_inset
  4826. radius
  4827. \begin_inset Quotes erd
  4828. \end_inset
  4829. may be an oversimplification – a peak that is a specific distance from
  4830. the TSS may have a different degree of influence depending on whether it
  4831. is upstream or downstream of the TSS.
  4832. \end_layout
  4833. \begin_layout Standard
  4834. \begin_inset ERT
  4835. status open
  4836. \begin_layout Plain Layout
  4837. \backslash
  4838. afterpage{
  4839. \end_layout
  4840. \begin_layout Plain Layout
  4841. \backslash
  4842. begin{landscape}
  4843. \end_layout
  4844. \end_inset
  4845. \end_layout
  4846. \begin_layout Standard
  4847. \begin_inset Float figure
  4848. wide false
  4849. sideways false
  4850. status open
  4851. \begin_layout Plain Layout
  4852. \align center
  4853. \begin_inset Float figure
  4854. wide false
  4855. sideways false
  4856. status open
  4857. \begin_layout Plain Layout
  4858. \align center
  4859. \begin_inset Graphics
  4860. filename graphics/CD4-csaw/ChIP-seq/H3K4me3-neighborhood-clusters-CROP.png
  4861. lyxscale 25
  4862. width 30col%
  4863. groupId covprof-subfig
  4864. \end_inset
  4865. \end_layout
  4866. \begin_layout Plain Layout
  4867. \begin_inset Caption Standard
  4868. \begin_layout Plain Layout
  4869. \series bold
  4870. \begin_inset CommandInset label
  4871. LatexCommand label
  4872. name "fig:H3K4me3-neighborhood-clusters"
  4873. \end_inset
  4874. Average relative coverage for each bin in each cluster
  4875. \end_layout
  4876. \end_inset
  4877. \end_layout
  4878. \end_inset
  4879. \begin_inset space \hfill{}
  4880. \end_inset
  4881. \begin_inset Float figure
  4882. wide false
  4883. sideways false
  4884. status open
  4885. \begin_layout Plain Layout
  4886. \align center
  4887. \begin_inset Graphics
  4888. filename graphics/CD4-csaw/ChIP-seq/H3K4me3-neighborhood-PCA-CROP.png
  4889. lyxscale 25
  4890. width 30col%
  4891. groupId covprof-subfig
  4892. \end_inset
  4893. \end_layout
  4894. \begin_layout Plain Layout
  4895. \begin_inset Caption Standard
  4896. \begin_layout Plain Layout
  4897. \series bold
  4898. \begin_inset CommandInset label
  4899. LatexCommand label
  4900. name "fig:H3K4me3-neighborhood-pca"
  4901. \end_inset
  4902. PCA of relative coverage depth, colored by K-means cluster membership.
  4903. \end_layout
  4904. \end_inset
  4905. \end_layout
  4906. \end_inset
  4907. \begin_inset space \hfill{}
  4908. \end_inset
  4909. \begin_inset Float figure
  4910. wide false
  4911. sideways false
  4912. status open
  4913. \begin_layout Plain Layout
  4914. \align center
  4915. \begin_inset Graphics
  4916. filename graphics/CD4-csaw/ChIP-seq/H3K4me3-neighborhood-expression-CROP.png
  4917. lyxscale 25
  4918. width 30col%
  4919. groupId covprof-subfig
  4920. \end_inset
  4921. \end_layout
  4922. \begin_layout Plain Layout
  4923. \begin_inset Caption Standard
  4924. \begin_layout Plain Layout
  4925. \series bold
  4926. \begin_inset CommandInset label
  4927. LatexCommand label
  4928. name "fig:H3K4me3-neighborhood-expression"
  4929. \end_inset
  4930. Gene expression grouped by promoter coverage clusters.
  4931. \end_layout
  4932. \end_inset
  4933. \end_layout
  4934. \end_inset
  4935. \end_layout
  4936. \begin_layout Plain Layout
  4937. \begin_inset Caption Standard
  4938. \begin_layout Plain Layout
  4939. \series bold
  4940. \begin_inset CommandInset label
  4941. LatexCommand label
  4942. name "fig:H3K4me3-neighborhood"
  4943. \end_inset
  4944. K-means clustering of promoter H3K4me3 relative coverage depth in naïve
  4945. day 0 samples.
  4946. \series default
  4947. H3K4me3 ChIP-seq reads were binned into 500-bp windows tiled across each
  4948. promoter from 5
  4949. \begin_inset space ~
  4950. \end_inset
  4951. kbp upstream to 5
  4952. \begin_inset space ~
  4953. \end_inset
  4954. kbp downstream, and the logCPM values were normalized within each promoter
  4955. to an average of 0, yielding relative coverage depths.
  4956. These were then grouped using K-means clustering with
  4957. \begin_inset Formula $K=6$
  4958. \end_inset
  4959. ,
  4960. \series bold
  4961. \series default
  4962. and the average bin values were plotted for each cluster (a).
  4963. The
  4964. \begin_inset Formula $x$
  4965. \end_inset
  4966. -axis is the genomic coordinate of each bin relative to the transcription
  4967. start site, and the
  4968. \begin_inset Formula $y$
  4969. \end_inset
  4970. -axis is the mean relative coverage depth of that bin across all promoters
  4971. in the cluster.
  4972. Each line represents the average
  4973. \begin_inset Quotes eld
  4974. \end_inset
  4975. shape
  4976. \begin_inset Quotes erd
  4977. \end_inset
  4978. of the promoter coverage for promoters in that cluster.
  4979. PCA was performed on the same data, and the first two principal components
  4980. were plotted, coloring each point by its K-means cluster identity (b).
  4981. For each cluster, the distribution of gene expression values was plotted
  4982. (c).
  4983. \end_layout
  4984. \end_inset
  4985. \end_layout
  4986. \end_inset
  4987. \end_layout
  4988. \begin_layout Standard
  4989. \begin_inset ERT
  4990. status open
  4991. \begin_layout Plain Layout
  4992. \backslash
  4993. end{landscape}
  4994. \end_layout
  4995. \begin_layout Plain Layout
  4996. }
  4997. \end_layout
  4998. \end_inset
  4999. \end_layout
  5000. \begin_layout Standard
  5001. \begin_inset Flex TODO Note (inline)
  5002. status open
  5003. \begin_layout Plain Layout
  5004. Is there more to say here?
  5005. \end_layout
  5006. \end_inset
  5007. \end_layout
  5008. \begin_layout Standard
  5009. All observations described above for H3K4me2 ChIP-seq appear to hold
  5010. for H3K4me3 as well (Figure
  5011. \begin_inset CommandInset ref
  5012. LatexCommand ref
  5013. reference "fig:H3K4me3-neighborhood"
  5014. plural "false"
  5015. caps "false"
  5016. noprefix "false"
  5017. \end_inset
  5018. ).
  5019. This is expected, since there is a high correlation between the positions
  5020. where both histone marks occur.
  5021. \end_layout
  5022. \begin_layout Subsection
  5023. Promoter coverage H3K27me3
  5024. \end_layout
  5025. \begin_layout Standard
  5026. \begin_inset ERT
  5027. status open
  5028. \begin_layout Plain Layout
  5029. \backslash
  5030. afterpage{
  5031. \end_layout
  5032. \begin_layout Plain Layout
  5033. \backslash
  5034. begin{landscape}
  5035. \end_layout
  5036. \end_inset
  5037. \end_layout
  5038. \begin_layout Standard
  5039. \begin_inset Float figure
  5040. wide false
  5041. sideways false
  5042. status collapsed
  5043. \begin_layout Plain Layout
  5044. \align center
  5045. \begin_inset Float figure
  5046. wide false
  5047. sideways false
  5048. status open
  5049. \begin_layout Plain Layout
  5050. \align center
  5051. \begin_inset Graphics
  5052. filename graphics/CD4-csaw/ChIP-seq/H3K27me3-neighborhood-clusters-CROP.png
  5053. lyxscale 25
  5054. width 30col%
  5055. groupId covprof-subfig
  5056. \end_inset
  5057. \end_layout
  5058. \begin_layout Plain Layout
  5059. \begin_inset Caption Standard
  5060. \begin_layout Plain Layout
  5061. \series bold
  5062. \begin_inset CommandInset label
  5063. LatexCommand label
  5064. name "fig:H3K27me3-neighborhood-clusters"
  5065. \end_inset
  5066. Average relative coverage for each bin in each cluster
  5067. \end_layout
  5068. \end_inset
  5069. \end_layout
  5070. \end_inset
  5071. \begin_inset space \hfill{}
  5072. \end_inset
  5073. \begin_inset Float figure
  5074. wide false
  5075. sideways false
  5076. status open
  5077. \begin_layout Plain Layout
  5078. \align center
  5079. \begin_inset Graphics
  5080. filename graphics/CD4-csaw/ChIP-seq/H3K27me3-neighborhood-PCA-CROP.png
  5081. lyxscale 25
  5082. width 30col%
  5083. groupId covprof-subfig
  5084. \end_inset
  5085. \end_layout
  5086. \begin_layout Plain Layout
  5087. \begin_inset Caption Standard
  5088. \begin_layout Plain Layout
  5089. \series bold
  5090. \begin_inset CommandInset label
  5091. LatexCommand label
  5092. name "fig:H3K27me3-neighborhood-pca"
  5093. \end_inset
  5094. PCA of relative coverage depth, colored by K-means cluster membership.
  5095. \series default
  5096. Note that Cluster 6 is hidden behind all the other clusters.
  5097. \end_layout
  5098. \end_inset
  5099. \end_layout
  5100. \end_inset
  5101. \begin_inset space \hfill{}
  5102. \end_inset
  5103. \begin_inset Float figure
  5104. wide false
  5105. sideways false
  5106. status open
  5107. \begin_layout Plain Layout
  5108. \align center
  5109. \begin_inset Graphics
  5110. filename graphics/CD4-csaw/ChIP-seq/H3K27me3-neighborhood-expression-CROP.png
  5111. lyxscale 25
  5112. width 30col%
  5113. groupId covprof-subfig
  5114. \end_inset
  5115. \end_layout
  5116. \begin_layout Plain Layout
  5117. \begin_inset Caption Standard
  5118. \begin_layout Plain Layout
  5119. \series bold
  5120. \begin_inset CommandInset label
  5121. LatexCommand label
  5122. name "fig:H3K27me3-neighborhood-expression"
  5123. \end_inset
  5124. Gene expression grouped by promoter coverage clusters.
  5125. \end_layout
  5126. \end_inset
  5127. \end_layout
  5128. \end_inset
  5129. \end_layout
  5130. \begin_layout Plain Layout
  5131. \begin_inset Flex TODO Note (inline)
  5132. status open
  5133. \begin_layout Plain Layout
  5134. Repeated figure legends are kind of an issue here.
  5135. What to do?
  5136. \end_layout
  5137. \end_inset
  5138. \end_layout
  5139. \begin_layout Plain Layout
  5140. \begin_inset Caption Standard
  5141. \begin_layout Plain Layout
  5142. \series bold
  5143. \begin_inset CommandInset label
  5144. LatexCommand label
  5145. name "fig:H3K27me3-neighborhood"
  5146. \end_inset
  5147. K-means clustering of promoter H3K27me3 relative coverage depth in naïve
  5148. day 0 samples.
  5149. \series default
  5150. H3K27me3 ChIP-seq reads were binned into 500-bp windows tiled across each
  5151. promoter from 5
  5152. \begin_inset space ~
  5153. \end_inset
  5154. kbp upstream to 5
  5155. \begin_inset space ~
  5156. \end_inset
  5157. kbp downstream, and the logCPM values were normalized within each promoter
  5158. to an average of 0, yielding relative coverage depths.
  5159. These were then grouped using
  5160. \begin_inset Formula $k$
  5161. \end_inset
  5162. -means clustering with
  5163. \begin_inset Formula $K=6$
  5164. \end_inset
  5165. ,
  5166. \series bold
  5167. \series default
  5168. and the average bin values were plotted for each cluster (a).
  5169. The
  5170. \begin_inset Formula $x$
  5171. \end_inset
  5172. -axis is the genomic coordinate of each bin relative to the transcription
  5173. start site, and the
  5174. \begin_inset Formula $y$
  5175. \end_inset
  5176. -axis is the mean relative coverage depth of that bin across all promoters
  5177. in the cluster.
  5178. Each line represents the average
  5179. \begin_inset Quotes eld
  5180. \end_inset
  5181. shape
  5182. \begin_inset Quotes erd
  5183. \end_inset
  5184. of the promoter coverage for promoters in that cluster.
  5185. PCA was performed on the same data, and the first two principal components
  5186. were plotted, coloring each point by its K-means cluster identity (b).
  5187. For each cluster, the distribution of gene expression values was plotted
  5188. (c).
  5189. \end_layout
  5190. \end_inset
  5191. \end_layout
  5192. \end_inset
  5193. \end_layout
  5194. \begin_layout Standard
  5195. \begin_inset ERT
  5196. status open
  5197. \begin_layout Plain Layout
  5198. \backslash
  5199. end{landscape}
  5200. \end_layout
  5201. \begin_layout Plain Layout
  5202. }
  5203. \end_layout
  5204. \end_inset
  5205. \end_layout
  5206. \begin_layout Standard
  5207. \begin_inset Flex TODO Note (inline)
  5208. status open
  5209. \begin_layout Plain Layout
  5210. Should maybe re-explain what was done or refer back to the previous section.
  5211. \end_layout
  5212. \end_inset
  5213. \end_layout
  5214. \begin_layout Standard
  5215. Unlike both H3K4 marks, whose main patterns of variation appear directly
  5216. related to the size and position of a single peak within the promoter,
  5217. the patterns of H3K27me3 methylation in promoters are more complex (Figure
  5218. \begin_inset CommandInset ref
  5219. LatexCommand ref
  5220. reference "fig:H3K27me3-neighborhood"
  5221. plural "false"
  5222. caps "false"
  5223. noprefix "false"
  5224. \end_inset
  5225. ).
  5226. Once again looking at the relative coverage in 500-bp wide bins in a
  5227. 5kb radius around each TSS, promoters were clustered based on the normalized
  5228. relative coverage values in each bin using
  5229. \begin_inset Formula $k$
  5230. \end_inset
  5231. -means clustering with
  5232. \begin_inset Formula $K=6$
  5233. \end_inset
  5234. (Figure
  5235. \begin_inset CommandInset ref
  5236. LatexCommand ref
  5237. reference "fig:H3K27me3-neighborhood-clusters"
  5238. plural "false"
  5239. caps "false"
  5240. noprefix "false"
  5241. \end_inset
  5242. ).
  5243. This time, 3
  5244. \begin_inset Quotes eld
  5245. \end_inset
  5246. axes
  5247. \begin_inset Quotes erd
  5248. \end_inset
  5249. of variation can be observed, each represented by 2 clusters with opposing
  5250. patterns.
  5251. The first axis is greater upstream coverage (Cluster 1) vs.
  5252. greater downstream coverage (Cluster 3); the second axis is the coverage
  5253. at the TSS itself: peak (Cluster 4) or trough (Cluster 2); lastly, the
  5254. third axis represents a trough upstream of the TSS (Cluster 5) vs.
  5255. downstream of the TSS (Cluster 6).
  5256. Referring to these opposing pairs of clusters as axes of variation is justified
  5257. , because they correspond precisely to the first 3 principal components
  5258. in the PCA plot of the relative coverage values (Figure
  5259. \begin_inset CommandInset ref
  5260. LatexCommand ref
  5261. reference "fig:H3K27me3-neighborhood-pca"
  5262. plural "false"
  5263. caps "false"
  5264. noprefix "false"
  5265. \end_inset
  5266. ).
  5267. The PCA plot reveals that as in the case of H3K4me2, all the
  5268. \begin_inset Quotes eld
  5269. \end_inset
  5270. clusters
  5271. \begin_inset Quotes erd
  5272. \end_inset
  5273. are really just sections of a single connected cloud rather than discrete
  5274. clusters.
  5275. The cloud is approximately ellipsoid-shaped, with each PC being an axis
  5276. of the ellipsoid, and each cluster consisting of a pyramidal section of the
  5277. ellipsoid.
  5278. \end_layout
  5279. \begin_layout Standard
  5280. In Figure
  5281. \begin_inset CommandInset ref
  5282. LatexCommand ref
  5283. reference "fig:H3K27me3-neighborhood-expression"
  5284. plural "false"
  5285. caps "false"
  5286. noprefix "false"
  5287. \end_inset
  5288. , we can see that Clusters 1 and 2 are the only clusters with higher gene
  5289. expression than the others.
  5290. For Cluster 2, this is expected, since this cluster represents genes with
  5291. depletion of H3K27me3 near the promoter.
  5292. Hence, elevated expression in Cluster 2 is consistent with the conventional
  5293. view of H3K27me3 as a deactivating mark.
  5294. However, Cluster 1, the cluster with the most elevated gene expression,
  5295. represents genes with elevated coverage upstream of the TSS, or equivalently,
  5296. decreased coverage downstream, inside the gene body.
  5297. The opposite pattern, in which H3K27me3 is more abundant within the gene
  5298. body and less abundant in the upstream promoter region, does not show
  5299. any elevation in gene expression.
  5300. As with H3K4me2, this shows that the location of H3K27 trimethylation relative
  5301. to the TSS is potentially an important factor beyond simple proximity.
  5302. \end_layout
  5303. \begin_layout Standard
  5304. \begin_inset Flex TODO Note (inline)
  5305. status open
  5306. \begin_layout Plain Layout
  5307. Show the figures where the negative result ended this line of inquiry.
  5308. I need to debug some errors resulting from an R upgrade to do this.
  5309. \end_layout
  5310. \end_inset
  5311. \end_layout
  5312. \begin_layout Subsection
  5313. Defined pattern analysis
  5314. \end_layout
  5315. \begin_layout Standard
  5316. \begin_inset Flex TODO Note (inline)
  5317. status open
  5318. \begin_layout Plain Layout
  5319. This was where I defined interesting expression patterns and then looked
  5320. at initial relative promoter coverage for each expression pattern.
  5321. Negative result.
  5322. I forgot about this until recently.
  5323. Worth including? Remember to also write methods.
  5324. \end_layout
  5325. \end_inset
  5326. \end_layout
  5327. \begin_layout Subsection
  5328. Promoter CpG islands?
  5329. \end_layout
  5330. \begin_layout Standard
  5331. \begin_inset Flex TODO Note (inline)
  5332. status collapsed
  5333. \begin_layout Plain Layout
  5334. I forgot until recently about the work I did on this.
  5335. Worth including? Remember to also write methods.
  5336. \end_layout
  5337. \end_inset
  5338. \end_layout
  5339. \begin_layout Section
  5340. Discussion
  5341. \end_layout
  5342. \begin_layout Standard
  5343. \begin_inset Flex TODO Note (inline)
  5344. status open
  5345. \begin_layout Plain Layout
  5346. Write better section headers
  5347. \end_layout
  5348. \end_inset
  5349. \end_layout
  5350. \begin_layout Subsection
  5351. Effective promoter radius
  5352. \end_layout
  5353. \begin_layout Standard
  5354. Figure
  5355. \begin_inset CommandInset ref
  5356. LatexCommand ref
  5357. reference "fig:near-promoter-peak-enrich"
  5358. plural "false"
  5359. caps "false"
  5360. noprefix "false"
  5361. \end_inset
  5362. shows that H3K4me2, H3K4me3, and H3K27me3 are all enriched near promoters,
  5363. relative to the rest of the genome, consistent with their conventionally
  5364. understood role in regulating gene transcription.
  5365. Interestingly, the radius within which this enrichment occurs is not the same
  5366. for each histone mark.
  5367. H3K4me2 and H3K4me3 are enriched within a 1
  5368. \begin_inset space \thinspace{}
  5369. \end_inset
  5370. kb radius, while H3K27me3 is enriched within 2.5
  5371. \begin_inset space \thinspace{}
  5372. \end_inset
  5373. kb.
  5374. Notably, the determined promoter radius was consistent across all experimental
  5375. conditions, varying only between different histone marks.
  5376. This suggests that the conventional
  5377. \begin_inset Quotes eld
  5378. \end_inset
  5379. one size fits all
  5380. \begin_inset Quotes erd
  5381. \end_inset
  5382. approach of defining a single promoter region for each gene (or each TSS)
  5383. and using that same promoter region for analyzing all types of genomic
  5384. data within an experiment may not be appropriate, and a better approach
  5385. may be to use a separate promoter radius for each kind of data, with each
  5386. radius being derived from the data itself.
  5387. Furthermore, the apparent asymmetry of upstream and downstream promoter
  5388. histone modification with respect to gene expression, seen in Figures
  5389. \begin_inset CommandInset ref
  5390. LatexCommand ref
  5391. reference "fig:H3K4me2-neighborhood"
  5392. plural "false"
  5393. caps "false"
  5394. noprefix "false"
  5395. \end_inset
  5396. ,
  5397. \begin_inset CommandInset ref
  5398. LatexCommand ref
  5399. reference "fig:H3K4me3-neighborhood"
  5400. plural "false"
  5401. caps "false"
  5402. noprefix "false"
  5403. \end_inset
  5404. , and
  5405. \begin_inset CommandInset ref
  5406. LatexCommand ref
  5407. reference "fig:H3K27me3-neighborhood"
  5408. plural "false"
  5409. caps "false"
  5410. noprefix "false"
  5411. \end_inset
  5412. , shows that even the concept of a promoter
  5413. \begin_inset Quotes eld
  5414. \end_inset
  5415. radius
  5416. \begin_inset Quotes erd
  5417. \end_inset
  5418. is likely an oversimplification.
  5419. At a minimum, nearby enrichment of peaks should be evaluated separately
  5420. for both upstream and downstream peaks, and an appropriate
  5421. \begin_inset Quotes eld
  5422. \end_inset
  5423. radius
  5424. \begin_inset Quotes erd
  5425. \end_inset
  5426. should be selected for each direction.
  5427. \end_layout
  5428. \begin_layout Standard
  5429. Figures
  5430. \begin_inset CommandInset ref
  5431. LatexCommand ref
  5432. reference "fig:H3K4me2-neighborhood"
  5433. plural "false"
  5434. caps "false"
  5435. noprefix "false"
  5436. \end_inset
  5437. and
  5438. \begin_inset CommandInset ref
  5439. LatexCommand ref
  5440. reference "fig:H3K4me3-neighborhood"
  5441. plural "false"
  5442. caps "false"
  5443. noprefix "false"
  5444. \end_inset
  5445. show that the determined promoter radius of 1
  5446. \begin_inset space ~
  5447. \end_inset
  5448. kb is approximately consistent with the distance from the TSS at which enrichmen
  5449. t of H3K4 methylation correlates with increased expression, showing that
  5450. this radius, which was determined by a simple analysis of measuring the
  5451. distance from each TSS to the nearest peak, also has functional significance.
  5452. For H3K27me3, the correlation between histone modification near the promoter
  5453. and gene expression is more complex, involving non-peak variations such
  5454. as troughs in coverage at the TSS and asymmetric coverage upstream and
  5455. downstream, so it is difficult in this case to evaluate whether the 2.5
  5456. \begin_inset space ~
  5457. \end_inset
  5458. kb radius determined from TSS-to-peak distances is functionally significant.
  5459. However, the two patterns of coverage associated with elevated expression
  5460. levels both have interesting features within this radius.
  5461. \end_layout
  5462. \begin_layout Standard
  5463. \begin_inset Flex TODO Note (inline)
  5464. status open
  5465. \begin_layout Plain Layout
  5466. My instinct is to say
  5467. \begin_inset Quotes eld
  5468. \end_inset
  5469. further study is needed
  5470. \begin_inset Quotes erd
  5471. \end_inset
  5472. here, but that goes in Chapter 5, right?
  5473. \end_layout
  5474. \end_inset
  5475. \end_layout
  5476. \begin_layout Subsection
  5477. Convergence
  5478. \end_layout
  5479. \begin_layout Standard
  5480. \begin_inset Flex TODO Note (inline)
  5481. status open
  5482. \begin_layout Plain Layout
  5483. Look up some more references for these histone marks being involved in memory
  5484. differentiation.
  5485. (Ask Sarah)
  5486. \end_layout
  5487. \end_inset
  5488. \end_layout
  5489. \begin_layout Standard
  5490. We have observed that all 3 histone marks and the gene expression data
  5491. exhibit evidence of convergence in abundance between naïve and memory cells
  5492. by day 14 after activation (Figure
  5493. \begin_inset CommandInset ref
  5494. LatexCommand ref
  5495. reference "fig:PCoA-promoters"
  5496. plural "false"
  5497. caps "false"
  5498. noprefix "false"
  5499. \end_inset
  5500. , Table
  5501. \begin_inset CommandInset ref
  5502. LatexCommand ref
  5503. reference "tab:Number-signif-promoters"
  5504. plural "false"
  5505. caps "false"
  5506. noprefix "false"
  5507. \end_inset
  5508. ).
  5509. The MOFA latent factor scatter plots (Figure
  5510. \begin_inset CommandInset ref
  5511. LatexCommand ref
  5512. reference "fig:mofa-lf-scatter"
  5513. plural "false"
  5514. caps "false"
  5515. noprefix "false"
  5516. \end_inset
  5517. ) show that this pattern of convergence is captured in latent factor 5.
  5518. Like all the latent factors in this plot, this factor explains a substantial
  5519. portion of the variance in all 4 data sets, indicating a coordinated pattern
  5520. of variation shared across all histone marks and gene expression.
  5521. This, of course, is consistent with the expectation that any naïve CD4
  5522. T-cells remaining at day 14 should have differentiated into memory cells
  5523. by that time, and should therefore have a genomic state similar to memory
  5524. cells.
  5525. This convergence is evidence that these histone marks all play an important
  5526. role in the naïve-to-memory differentiation process.
  5527. A histone mark that was not involved in naïve-to-memory differentiation
  5528. would not be expected to converge in this way after activation.
  5529. \end_layout
  5530. \begin_layout Standard
  5531. \begin_inset Float figure
  5532. wide false
  5533. sideways false
  5534. status collapsed
  5535. \begin_layout Plain Layout
  5536. \align center
  5537. \begin_inset Graphics
  5538. filename graphics/CD4-csaw/LaMere2016_fig8.pdf
  5539. lyxscale 50
  5540. width 60col%
  5541. groupId colwidth
  5542. \end_inset
  5543. \end_layout
  5544. \begin_layout Plain Layout
  5545. \begin_inset Caption Standard
  5546. \begin_layout Plain Layout
  5547. \series bold
  5548. \begin_inset CommandInset label
  5549. LatexCommand label
  5550. name "fig:Lamere2016-Fig8"
  5551. \end_inset
  5552. LaMere 2016 Figure 8
  5553. \begin_inset CommandInset citation
  5554. LatexCommand cite
  5555. key "LaMere2016"
  5556. literal "false"
  5557. \end_inset
  5558. ,
  5559. \begin_inset Quotes eld
  5560. \end_inset
  5561. Model for the role of H3K4 methylation during CD4 T-cell activation.
  5562. \begin_inset Quotes erd
  5563. \end_inset
  5564. \series default
  5565. Reproduced with permission.
  5566. \end_layout
  5567. \end_inset
  5568. \end_layout
  5569. \end_inset
  5570. \end_layout
  5571. \begin_layout Standard
  5572. In H3K4me2, H3K4me3, and
  5573. \begin_inset Flex Glossary Term
  5574. status open
  5575. \begin_layout Plain Layout
  5576. RNA-seq
  5577. \end_layout
  5578. \end_inset
  5579. , this convergence appears to be in progress already by day 5, shown by
  5580. the smaller distance between naïve and memory cells at day 5 along the
  5581. \begin_inset Formula $y$
  5582. \end_inset
  5583. -axes in Figures
  5584. \begin_inset CommandInset ref
  5585. LatexCommand ref
  5586. reference "fig:PCoA-H3K4me2-prom"
  5587. plural "false"
  5588. caps "false"
  5589. noprefix "false"
  5590. \end_inset
  5591. ,
  5592. \begin_inset CommandInset ref
  5593. LatexCommand ref
  5594. reference "fig:PCoA-H3K4me3-prom"
  5595. plural "false"
  5596. caps "false"
  5597. noprefix "false"
  5598. \end_inset
  5599. , and
  5600. \begin_inset CommandInset ref
  5601. LatexCommand ref
  5602. reference "fig:RNA-PCA-group"
  5603. plural "false"
  5604. caps "false"
  5605. noprefix "false"
  5606. \end_inset
  5607. .
  5608. This agrees with the model proposed by Sarah LaMere based on a prior analysis
  5609. of the same data, shown in Figure
  5610. \begin_inset CommandInset ref
  5611. LatexCommand ref
  5612. reference "fig:Lamere2016-Fig8"
  5613. plural "false"
  5614. caps "false"
  5615. noprefix "false"
  5616. \end_inset
  5617. , which shows the pattern of H3K4 methylation and expression for naïve cells
  5618. and memory cells converging at day 5.
  5619. This model was developed without the benefit of the PCoA plots in Figure
  5620. \begin_inset CommandInset ref
  5621. LatexCommand ref
  5622. reference "fig:PCoA-promoters"
  5623. plural "false"
  5624. caps "false"
  5625. noprefix "false"
  5626. \end_inset
  5627. , which have been corrected for confounding factors by ComBat and SVA.
  5628. This shows that proper batch correction assists in extracting meaningful
  5629. patterns in the data while eliminating systematic sources of irrelevant
  5630. variation in the data, allowing simple automated procedures like PCoA to
  5631. reveal interesting behaviors in the data that were previously only detectable
  5632. by a detailed manual analysis.
  5633. \end_layout
  5634. \begin_layout Standard
  5635. While the ideal comparison to demonstrate this convergence would be naïve
  5636. cells at day 14 to memory cells at day 0, this is not feasible in this
  5637. experimental system, since neither naïve nor memory cells are able to fully
  5638. return to their pre-activation state, as shown by the lack of overlap between
  5639. days 0 and 14 for either naïve or memory cells in Figure
  5640. \begin_inset CommandInset ref
  5641. LatexCommand ref
  5642. reference "fig:PCoA-promoters"
  5643. plural "false"
  5644. caps "false"
  5645. noprefix "false"
  5646. \end_inset
  5647. .
  5648. \end_layout
  5649. \begin_layout Subsection
  5650. Positional
  5651. \end_layout
  5652. \begin_layout Standard
  5653. When looking at patterns in the relative coverage of each histone mark near
  5654. the TSS of each gene, several interesting patterns were apparent.
  5655. For H3K4me2 and H3K4me3, the pattern was straightforward: the consistent
  5656. pattern across all promoters was a single peak a few kb wide, with the
  5657. main axis of variation being the position of this peak relative to the
  5658. TSS (Figures
  5659. \begin_inset CommandInset ref
  5660. LatexCommand ref
  5661. reference "fig:H3K4me2-neighborhood"
  5662. plural "false"
  5663. caps "false"
  5664. noprefix "false"
  5665. \end_inset
  5666. &
  5667. \begin_inset CommandInset ref
  5668. LatexCommand ref
  5669. reference "fig:H3K4me3-neighborhood"
  5670. plural "false"
  5671. caps "false"
  5672. noprefix "false"
  5673. \end_inset
  5674. ).
  5675. There were no obvious
  5676. \begin_inset Quotes eld
  5677. \end_inset
  5678. preferred
  5679. \begin_inset Quotes erd
  5680. \end_inset
  5681. positions, but rather a continuous distribution of relative positions ranging
  5682. all across the promoter region.
  5683. The association with gene expression was also straightforward: peaks closer
  5684. to the TSS were more strongly associated with elevated gene expression.
  5685. Coverage downstream of the TSS appears to be more strongly associated with
  5686. elevated expression than coverage the same distance upstream, indicating
  5687. that the
  5688. \begin_inset Quotes eld
  5689. \end_inset
  5690. effective promoter region
  5691. \begin_inset Quotes erd
  5692. \end_inset
  5693. for H3K4me2 and H3K4me3 may be centered downstream of the TSS.
  5694. \end_layout
  5695. \begin_layout Standard
  5696. The relative promoter coverage for H3K27me3 had a more complex pattern,
  5697. with two specific patterns of promoter coverage associated with elevated
  5698. expression: a sharp depletion of H3K27me3 around the TSS relative to the
  5699. surrounding area, and a depletion of H3K27me3 downstream of the TSS relative
  5700. to upstream (Figure
  5701. \begin_inset CommandInset ref
  5702. LatexCommand ref
  5703. reference "fig:H3K27me3-neighborhood"
  5704. plural "false"
  5705. caps "false"
  5706. noprefix "false"
  5707. \end_inset
  5708. ).
  5709. A previous study found that H3K27me3 depletion within the gene body was
  5710. associated with elevated gene expression in 4 different cell types in mice
  5711. \begin_inset CommandInset citation
  5712. LatexCommand cite
  5713. key "Young2011"
  5714. literal "false"
  5715. \end_inset
  5716. .
  5717. This is consistent with the second pattern described here.
  5718. The same study also reported that a spike in coverage at the TSS was associated
  5719. with
  5720. \emph on
  5721. lower
  5722. \emph default
  5723. expression, which is indirectly consistent with the first pattern described
  5724. here, in the sense that it associates lower H3K27me3 levels near the TSS
  5725. with higher expression.
  5726. \end_layout
  5727. \begin_layout Subsection
  5728. Workflow
  5729. \end_layout
  5730. \begin_layout Standard
  5731. \begin_inset ERT
  5732. status open
  5733. \begin_layout Plain Layout
  5734. \backslash
  5735. afterpage{
  5736. \end_layout
  5737. \begin_layout Plain Layout
  5738. \backslash
  5739. begin{landscape}
  5740. \end_layout
  5741. \end_inset
  5742. \end_layout
  5743. \begin_layout Standard
  5744. \begin_inset Float figure
  5745. wide false
  5746. sideways false
  5747. status open
  5748. \begin_layout Plain Layout
  5749. \align center
  5750. \begin_inset Graphics
  5751. filename graphics/CD4-csaw/rulegraphs/rulegraph-all.pdf
  5752. lyxscale 50
  5753. width 100col%
  5754. height 95theight%
  5755. \end_inset
  5756. \end_layout
  5757. \begin_layout Plain Layout
  5758. \begin_inset Caption Standard
  5759. \begin_layout Plain Layout
  5760. \begin_inset CommandInset label
  5761. LatexCommand label
  5762. name "fig:rulegraph"
  5763. \end_inset
  5764. \series bold
  5765. Dependency graph of steps in reproducible workflow.
  5766. \end_layout
  5767. \end_inset
  5768. \end_layout
  5769. \end_inset
  5770. \end_layout
  5771. \begin_layout Standard
  5772. \begin_inset ERT
  5773. status open
  5774. \begin_layout Plain Layout
  5775. \backslash
  5776. end{landscape}
  5777. \end_layout
  5778. \begin_layout Plain Layout
  5779. }
  5780. \end_layout
  5781. \end_inset
  5782. \end_layout
  5783. \begin_layout Standard
  5784. The analyses described in this chapter were organized into a reproducible
  5785. workflow using the Snakemake workflow management system
  5786. \begin_inset CommandInset citation
  5787. LatexCommand cite
  5788. key "Koster2012"
  5789. literal "false"
  5790. \end_inset
  5791. .
  5792. As shown in Figure
  5793. \begin_inset CommandInset ref
  5794. LatexCommand ref
  5795. reference "fig:rulegraph"
  5796. plural "false"
  5797. caps "false"
  5798. noprefix "false"
  5799. \end_inset
  5800. , the workflow includes many steps with complex dependencies between them.
  5801. For example, the step that counts the number of ChIP-seq reads in 500
  5802. \begin_inset space ~
  5803. \end_inset
  5804. bp windows in each promoter (the starting point for Figures
  5805. \begin_inset CommandInset ref
  5806. LatexCommand ref
  5807. reference "fig:H3K4me2-neighborhood"
  5808. plural "false"
  5809. caps "false"
  5810. noprefix "false"
  5811. \end_inset
  5812. ,
  5813. \begin_inset CommandInset ref
  5814. LatexCommand ref
  5815. reference "fig:H3K4me3-neighborhood"
  5816. plural "false"
  5817. caps "false"
  5818. noprefix "false"
  5819. \end_inset
  5820. , and
  5821. \begin_inset CommandInset ref
  5822. LatexCommand ref
  5823. reference "fig:H3K27me3-neighborhood"
  5824. plural "false"
  5825. caps "false"
  5826. noprefix "false"
  5827. \end_inset
  5828. ), named
  5829. \begin_inset Flex Code
  5830. status open
  5831. \begin_layout Plain Layout
  5832. chipseq_count_tss_neighborhoods
  5833. \end_layout
  5834. \end_inset
  5835. , depends on the
  5836. \begin_inset Flex Glossary Term
  5837. status open
  5838. \begin_layout Plain Layout
  5839. RNA-seq
  5840. \end_layout
  5841. \end_inset
  5842. abundance estimates in order to select the most-used TSS for each gene,
  5843. the aligned ChIP-seq reads, the index for those reads, and the blacklist
  5844. of regions to be excluded from ChIP-seq analysis.
  5845. Each step declares its inputs and outputs, and Snakemake uses these to
  5846. determine the dependencies between steps.
  5847. Each step is marked as depending on all the steps whose outputs match its
  5848. inputs, generating the workflow graph in Figure
  5849. \begin_inset CommandInset ref
  5850. LatexCommand ref
  5851. reference "fig:rulegraph"
  5852. plural "false"
  5853. caps "false"
  5854. noprefix "false"
  5855. \end_inset
  5856. , which Snakemake uses to determine the order in which to execute each step
  5857. so that each step is executed only after all of the steps it depends on
  5858. have completed, thereby automating the entire workflow from start to finish.
  5859. \end_layout
  5860. \begin_layout Standard
  5861. In addition to simply making it easier to organize the steps in the analysis,
  5862. structuring the analysis as a workflow allowed for some analysis strategies
  5863. that would not have been practical otherwise.
  5864. For example, 5 different
  5865. \begin_inset Flex Glossary Term
  5866. status open
  5867. \begin_layout Plain Layout
  5868. RNA-seq
  5869. \end_layout
  5870. \end_inset
  5871. quantification methods were tested against two different reference transcriptom
  5872. e annotations for a total of 10 different quantifications of the same
  5873. \begin_inset Flex Glossary Term
  5874. status open
  5875. \begin_layout Plain Layout
  5876. RNA-seq
  5877. \end_layout
  5878. \end_inset
  5879. data.
  5880. These were then compared against each other in the exploratory data analysis
  5881. step, to determine that the results were not very sensitive to either the
  5882. choice of quantification method or the choice of annotation.
  5883. This was possible with a single script for the exploratory data analysis,
  5884. because Snakemake was able to automate running this script for every combinatio
  5885. n of method and reference.
  5886. In a similar manner, two different peak calling methods were tested against
  5887. each other, and in this case it was determined that SICER was unambiguously
  5888. superior to MACS for all histone marks studied.
  5889. By enabling these types of comparisons, structuring the analysis as an
  5890. automated workflow allowed important analysis decisions to be made in a
  5891. data-driven way, by running every reasonable option through the downstream
  5892. steps, seeing the consequences of choosing each option, and deciding accordingl
  5893. y.
  5894. \end_layout
  5895. \begin_layout Subsection
  5896. Data quality issues limit conclusions
  5897. \end_layout
  5898. \begin_layout Standard
  5899. \begin_inset Flex TODO Note (inline)
  5900. status open
  5901. \begin_layout Plain Layout
  5902. Is this needed?
  5903. \end_layout
  5904. \end_inset
  5905. \end_layout
  5906. \begin_layout Section
  5907. Future Directions
  5908. \end_layout
  5909. \begin_layout Standard
  5910. The analysis of
  5911. \begin_inset Flex Glossary Term
  5912. status open
  5913. \begin_layout Plain Layout
  5914. RNA-seq
  5915. \end_layout
  5916. \end_inset
  5917. and ChIP-seq in CD4 T-cells in Chapter 2 is in many ways a preliminary
  5918. study that suggests a multitude of new avenues of investigation.
  5919. Here we consider a selection of such avenues.
  5920. \end_layout
  5921. \begin_layout Subsection
  5922. Negative results
  5923. \end_layout
  5924. \begin_layout Standard
  5925. Two additional analyses were conducted beyond those reported in the results.
  5926. First, we searched for evidence that the presence or absence of a CpG island
  5927. in the promoter was correlated with increases or decreases in gene expression
  5928. or any histone mark in any of the tested contrasts.
  5929. Second, we searched for evidence that the relative ChIP-seq coverage profiles
  5930. prior to activation could predict the change in expression of a gene after
  5931. activation.
  5932. Neither analysis turned up any clear positive results.
  5933. \end_layout
  5934. \begin_layout Subsection
  5935. Improve on the idea of an effective promoter radius
  5936. \end_layout
  5937. \begin_layout Standard
  5938. This study introduced the concept of an
  5939. \begin_inset Quotes eld
  5940. \end_inset
  5941. effective promoter radius
  5942. \begin_inset Quotes erd
  5943. \end_inset
  5944. specific to each histone mark based on distance from the TSS within which
  5945. an excess of peaks was called for that mark.
  5946. This concept was then used to guide further analyses throughout the study.
  5947. However, while the effective promoter radius was useful in those analyses,
  5948. it is both limited in theory and shown in practice to be a possible oversimplif
  5949. ication.
  5950. First, the effective promoter radii used in this study were chosen based
  5951. on manual inspection of the TSS-to-peak distance distributions in Figure
  5952. \begin_inset CommandInset ref
  5953. LatexCommand ref
  5954. reference "fig:near-promoter-peak-enrich"
  5955. plural "false"
  5956. caps "false"
  5957. noprefix "false"
  5958. \end_inset
  5959. , selecting round numbers for analyst convenience (Table
  5960. \begin_inset CommandInset ref
  5961. LatexCommand ref
  5962. reference "tab:effective-promoter-radius"
  5963. plural "false"
  5964. caps "false"
  5965. noprefix "false"
  5966. \end_inset
  5967. ).
  5968. It would be better to define an algorithm that selects a more precise radius
  5969. based on the features of the graph.
  5970. One possible way to do this would be to randomly rearrange the called peaks
  5971. throughout the genome many times (while preserving the distribution of peak widths)
  5972. and re-generate the same plot as in Figure
  5973. \begin_inset CommandInset ref
  5974. LatexCommand ref
  5975. reference "fig:near-promoter-peak-enrich"
  5976. plural "false"
  5977. caps "false"
  5978. noprefix "false"
  5979. \end_inset
  5980. .
  5981. This would yield a better
  5982. \begin_inset Quotes eld
  5983. \end_inset
  5984. background
  5985. \begin_inset Quotes erd
  5986. \end_inset
  5987. distribution that demonstrates the degree of near-TSS enrichment that would
  5988. be expected by random chance.
  5989. The effective promoter radius could be defined as the point where the true
  5990. distribution diverges from the randomized background distribution.
  5991. \end_layout
  5992. \begin_layout Standard
  5993. Furthermore, the above definition of effective promoter radius has the significa
  5994. nt limitation of being based on the peak calling method.
  5995. It is thus very sensitive to the choice of peak caller and significance
  5996. threshold for calling peaks, as well as the degree of saturation in the
  5997. sequencing.
  5998. Calling peaks from ChIP-seq samples with insufficient coverage depth, with
  5999. the wrong peak caller, or with a different significance threshold could
  6000. give a drastically different number of called peaks, and hence a drastically
  6001. different distribution of peak-to-TSS distances.
  6002. To address this, it is desirable to develop a better method of determining
  6003. the effective promoter radius that relies only on the distribution of read
  6004. coverage around the TSS, independent of the peak calling.
  6005. Furthermore, as demonstrated by the upstream-downstream asymmetries observed
  6006. in Figures
  6007. \begin_inset CommandInset ref
  6008. LatexCommand ref
  6009. reference "fig:H3K4me2-neighborhood"
  6010. plural "false"
  6011. caps "false"
  6012. noprefix "false"
  6013. \end_inset
  6014. ,
  6015. \begin_inset CommandInset ref
  6016. LatexCommand ref
  6017. reference "fig:H3K4me3-neighborhood"
  6018. plural "false"
  6019. caps "false"
  6020. noprefix "false"
  6021. \end_inset
  6022. , and
  6023. \begin_inset CommandInset ref
  6024. LatexCommand ref
  6025. reference "fig:H3K27me3-neighborhood"
  6026. plural "false"
  6027. caps "false"
  6028. noprefix "false"
  6029. \end_inset
  6030. , this definition should determine a different radius for the upstream and
  6031. downstream directions.
  6032. At this point, it may be better to rename this concept
  6033. \begin_inset Quotes eld
  6034. \end_inset
  6035. effective promoter extent
  6036. \begin_inset Quotes erd
  6037. \end_inset
  6038. and avoid the word
  6039. \begin_inset Quotes eld
  6040. \end_inset
  6041. radius
  6042. \begin_inset Quotes erd
  6043. \end_inset
  6044. , since a radius implies a symmetry about the TSS that is not supported
  6045. by the data.
  6046. \end_layout
  6047. \begin_layout Standard
  6048. Beyond improving the definition of effective promoter extent, functional
  6049. validation is necessary to show that this measure of near-TSS enrichment
  6050. has biological meaning.
  6051. Figures
  6052. \begin_inset CommandInset ref
  6053. LatexCommand ref
  6054. reference "fig:H3K4me2-neighborhood"
  6055. plural "false"
  6056. caps "false"
  6057. noprefix "false"
  6058. \end_inset
  6059. and
  6060. \begin_inset CommandInset ref
  6061. LatexCommand ref
  6062. reference "fig:H3K4me3-neighborhood"
  6063. plural "false"
  6064. caps "false"
  6065. noprefix "false"
  6066. \end_inset
  6067. already provide a very limited functional validation of the chosen promoter
  6068. extents for H3K4me2 and H3K4me3 by showing that spikes in coverage within
  6069. this region are most strongly correlated with elevated gene expression.
  6070. However, there are other ways to show functional relevance of the promoter
  6071. extent.
  6072. For example, correlations could be computed between read counts in peaks
  6073. near gene promoters and the expression level of those genes, and these
  6074. correlations could be plotted against the distance of the peak upstream
  6075. or downstream of the gene's TSS.
  6076. If the promoter extent truly defines a
  6077. \begin_inset Quotes eld
  6078. \end_inset
  6079. sphere of influence
  6080. \begin_inset Quotes erd
  6081. \end_inset
  6082. within which a histone mark is involved with the regulation of a gene,
  6083. then the correlations for peaks within this extent should be significantly
  6084. higher than those further upstream or downstream.
  6085. Peaks within these extents may also be more likely to show differential
  6086. modification than those outside genic regions of the genome.
  6087. \end_layout
  6088. \begin_layout Subsection
  6089. Design experiments to focus on post-activation convergence of naïve & memory
  6090. cells
  6091. \end_layout
  6092. \begin_layout Standard
  6093. In this study, a convergence between naïve and memory cells was observed
  6094. in both the pattern of gene expression and the epigenetic state of the 3
  6095. histone marks studied, consistent with the hypothesis that any naïve cells
  6096. remaining 14 days after activation have differentiated into memory cells,
  6097. and that both gene expression and these histone marks are involved in this
  6098. differentiation.
  6099. However, the current study was not designed with this specific hypothesis
  6100. in mind, and it therefore has some deficiencies with regard to testing
  6101. it.
  6102. The memory CD4 samples at day 14 do not resemble the memory samples at
  6103. day 0, indicating that in the specific model of activation used for this
  6104. experiment, the cells are not guaranteed to return to their original pre-activa
  6105. tion state, or perhaps this process takes substantially longer than 14 days.
  6106. This is a challenge for the convergence hypothesis because the ideal comparison
  6107. to prove that naïve cells are converging to a resting memory state would
  6108. be to compare the final naïve time point to the Day 0 memory samples, but
  6109. this comparison is only meaningful if memory cells generally return to
  6110. the same
  6111. \begin_inset Quotes eld
  6112. \end_inset
  6113. resting
  6114. \begin_inset Quotes erd
  6115. \end_inset
  6116. state that they started at.
  6117. \end_layout
  6118. \begin_layout Standard
  6119. To better study the convergence hypothesis, a new experiment should be designed
  6120. using a model system for T-cell activation that is known to allow cells
  6121. to return as closely as possible to their pre-activation state.
  6122. Alternatively, if it is not possible to find or design such a model system,
  6123. the same cell cultures could be activated serially multiple times, and
  6124. sequenced after each activation cycle right before the next activation.
  6125. It is likely that several activations in the same model system will settle
  6126. into a cyclical pattern, converging to a consistent
  6127. \begin_inset Quotes eld
  6128. \end_inset
  6129. resting
  6130. \begin_inset Quotes erd
  6131. \end_inset
  6132. state after each activation, even if this state is different from the initial
  6133. resting state at Day 0.
  6134. If so, it will be possible to compare the final states of both naïve and
  6135. memory cells to show that they converge despite different initial conditions.
  6136. \end_layout
  6137. \begin_layout Standard
  6138. In addition, if naïve-to-memory convergence is a general pattern, it should
  6139. also be detectable in other epigenetic marks, including other histone marks
  6140. and DNA methylation.
  6141. An experiment should be designed studying a large number of epigenetic
  6142. marks known or suspected to be involved in regulation of gene expression,
  6143. assaying all of these at the same pre- and post-activation time points.
  6144. Multi-dataset factor analysis methods like MOFA can then be used to identify
  6145. coordinated patterns of regulation shared across many epigenetic marks.
  6146. If possible, some
  6147. \begin_inset Quotes eld
  6148. \end_inset
  6149. negative control
  6150. \begin_inset Quotes erd
  6151. \end_inset
  6152. marks should be included that are known
  6153. \emph on
  6154. not
  6155. \emph default
  6156. to be involved in T-cell activation or memory formation.
  6157. Of course, CD4 T-cells are not the only adaptive immune cells with memory.
  6158. A similar study could be designed for CD8 T-cells, B-cells, and even specific
  6159. subsets of CD4 T-cells.
  6160. \end_layout
  6161. \begin_layout Subsection
  6162. Follow up on hints of interesting patterns in promoter relative coverage
  6163. profiles
  6164. \end_layout
  6165. \begin_layout Standard
  6166. \begin_inset Flex TODO Note (inline)
  6167. status open
  6168. \begin_layout Plain Layout
  6169. I think I might need to write up the negative results for the Promoter CpG
  6170. and defined pattern analysis before writing this section.
  6171. \end_layout
  6172. \end_inset
  6173. \end_layout
  6174. \begin_layout Itemize
  6175. Also find better normalizations: maybe borrow from MACS/SICER background
  6176. correction methods?
  6177. \end_layout
  6178. \begin_layout Itemize
  6179. For H3K4, define polar coordinates based on PC1 & 2: R = peak size, Theta
  6180. = peak position.
  6181. Then correlate with expression.
  6182. \end_layout
  6183. \begin_layout Itemize
  6184. Current analysis only at Day 0.
  6185. Need to study across time points.
  6186. \end_layout
  6187. \begin_layout Itemize
  6188. Integrating data across so many dimensions is a significant analysis challenge
  6189. \end_layout
  6190. \begin_layout Subsection
  6191. Investigate causes of high correlation between mutually exclusive histone
  6192. marks
  6193. \end_layout
  6194. \begin_layout Standard
  6195. The high correlation in coverage depth observed between H3K4me2 and
  6196. H3K4me3 is both expected and unexpected.
  6197. Since both marks are associated with elevated gene transcription, a positive
  6198. correlation between them is not surprising.
  6199. However, these two marks represent different post-translational modifications
  6200. of the
  6201. \emph on
  6202. same
  6203. \emph default
  6204. lysine residue on the histone H3 polypeptide, which means that they cannot
  6205. both be present on the same H3 subunit.
  6206. Thus, the high correlation between them has several potential explanations.
  6207. One possible reason is cell population heterogeneity: perhaps some genomic
  6208. loci are frequently marked with H3K4me2 in some cells, while in other cells
  6209. the same loci are marked with H3K4me3.
  6210. Another possibility is allele-specific modifications: the loci are marked
  6211. in each diploid cell with H3K4me2 on one allele and H3K4me3 on the other
  6212. allele.
  6213. Lastly, since each histone octamer contains 2 H3 subunits, it is possible
  6214. that having one H3K4me2 mark and one H3K4me3 mark on a given histone octamer
  6215. represents a distinct epigenetic state with a different function than either
  6216. double H3K4me2 or double H3K4me3.
  6217. \end_layout
  6218. \begin_layout Standard
  6219. These three hypotheses could be disentangled by single-cell ChIP-seq.
  6220. If the correlation between these two histone marks persists even within
  6221. the reads for each individual cell, then cell population heterogeneity
  6222. cannot explain the correlation.
  6223. Allele-specific modification can be tested for by looking at the correlation
  6224. between read coverage of the two histone marks at heterozygous loci.
  6225. If the correlation between read counts for opposite loci is low, then this
  6226. is consistent with allele-specific modification.
  6227. Finally, if the modifications do not separate by either cell or allele,
  6228. the colocation of these two marks is most likely occurring at the level
  6229. of individual histones, with the heterogeneously modified histone representing
  6230. a distinct state.
  6231. \end_layout
  6232. \begin_layout Standard
  6233. However, another experiment would be required to show direct evidence of
  6234. such a heterogeneously modified state.
  6235. Specifically, a
  6236. \begin_inset Quotes eld
  6237. \end_inset
  6238. double ChIP
  6239. \begin_inset Quotes erd
  6240. \end_inset
  6241. experiment would need to be performed, where the input DNA is first subjected
  6242. to an immunoprecipitation pulldown from the anti-H3K4me2 antibody, and
  6243. then the enriched material is collected, with proteins still bound, and
  6244. immunoprecipitated
  6245. \emph on
  6246. again
  6247. \emph default
  6248. using the anti-H3K4me3 antibody.
  6249. If this yields significant numbers of non-artifactual reads in the same
  6250. regions as the individual pulldowns of the two marks, this is strong evidence
  6251. that the two marks are occurring on opposite H3 subunits of the same histones.
  6252. \end_layout
  6253. \begin_layout Standard
  6254. \begin_inset Flex TODO Note (inline)
  6255. status open
  6256. \begin_layout Plain Layout
  6257. Try to see if double ChIP-seq is actually feasible, and if not, come up
  6258. with some other idea for directly detecting the mixed mod state.
  6259. Oh! Actually ChIP-seq isn't required, only double ChIP followed by quantificati
  6260. on.
  6261. That's one possible angle.
  6262. \end_layout
  6263. \end_inset
  6264. \end_layout
  6265. \begin_layout Chapter
  6266. Improving array-based diagnostics for transplant rejection by optimizing
  6267. data preprocessing
  6268. \end_layout
  6269. \begin_layout Standard
  6270. \begin_inset Note Note
  6271. status open
  6272. \begin_layout Plain Layout
  6273. Chapter author list: Me, Sunil, Tom, Padma, Dan
  6274. \end_layout
  6275. \end_inset
  6276. \end_layout
  6277. \begin_layout Standard
  6278. \begin_inset ERT
  6279. status collapsed
  6280. \begin_layout Plain Layout
  6281. \backslash
  6282. glsresetall
  6283. \end_layout
  6284. \end_inset
  6285. \end_layout
  6286. \begin_layout Section
  6287. Approach
  6288. \end_layout
  6289. \begin_layout Subsection
  6290. Proper pre-processing is essential for array data
  6291. \end_layout
  6292. \begin_layout Standard
  6293. \begin_inset Flex TODO Note (inline)
  6294. status open
  6295. \begin_layout Plain Layout
  6296. This section could probably use some citations
  6297. \end_layout
  6298. \end_inset
  6299. \end_layout
  6300. \begin_layout Standard
  6301. Microarrays, bead arrays, and similar assays produce raw data in the form
  6302. of fluorescence intensity measurements, with each intensity measurement
  6303. proportional to the abundance of some fluorescently labelled target DNA
  6304. or RNA sequence that base pairs to a specific probe sequence.
  6305. However, these measurements for each probe are also affected by many technical
  6306. confounding factors, such as the concentration of target material, strength
  6307. of off-target binding, and the sensitivity of the imaging sensor.
  6308. Some array designs also use multiple probe sequences for each target.
  6309. Hence, extensive pre-processing of array data is necessary to normalize
  6310. out the effects of these technical factors and summarize the information
  6311. from multiple probes to arrive at a single usable estimate of abundance
  6312. or other relevant quantity, such as a ratio of two abundances, for each
  6313. target.
  6314. \end_layout
  6315. \begin_layout Standard
  6316. The choice of pre-processing algorithms used in the analysis of an array
  6317. data set can have a large effect on the results of that analysis.
  6318. However, despite their importance, these steps are often neglected or rushed
  6319. in order to get to the more scientifically interesting analysis steps involving
  6320. the actual biology of the system under study.
  6321. Hence, it is often possible to achieve substantial gains in statistical
  6322. power, model goodness-of-fit, or other relevant performance measures, by
  6323. checking the assumptions made by each preprocessing step and choosing specific
  6324. normalization methods tailored to the specific goals of the current analysis.
  6325. \end_layout
  6326. \begin_layout Subsection
  6327. Clinical diagnostic applications for microarrays require single-channel
  6328. normalization
  6329. \end_layout
  6330. \begin_layout Standard
  6331. As the cost of performing microarray assays falls, there is increasing interest
  6332. in using genomic assays for diagnostic purposes, such as distinguishing
  6333. healthy transplants (TX) from transplants undergoing acute rejection (AR)
  6334. or acute dysfunction with no rejection (ADNR).
  6335. However, the standard normalization algorithm used for microarray data,
  6336. Robust Multi-chip Average (RMA)
  6337. \begin_inset CommandInset citation
  6338. LatexCommand cite
  6339. key "Irizarry2003a"
  6340. literal "false"
  6341. \end_inset
  6342. , is not applicable in a clinical setting.
  6343. Two of the steps in RMA, quantile normalization and probe summarization
  6344. by median polish, depend on every array in the data set being normalized.
  6345. This means that adding or removing any arrays from a data set changes the
  6346. normalized values for all arrays, and data sets that have been normalized
  6347. separately cannot be compared to each other.
  6348. Hence, when using RMA, any arrays to be analyzed together must also be
  6349. normalized together, and the set of arrays included in the data set must
  6350. be held constant throughout an analysis.
  6351. \end_layout
  6352. \begin_layout Standard
  6353. These limitations present serious impediments to the use of arrays as a
  6354. diagnostic tool.
  6355. When training a classifier, the samples to be classified must not be involved
  6356. in any step of the training process, lest their inclusion bias the training
  6357. process.
  6358. Once a classifier is deployed in a clinical setting, the samples to be
  6359. classified will not even
  6360. \emph on
  6361. exist
  6362. \emph default
  6363. at the time of training, so including them would be impossible even if
  6364. it were statistically justifiable.
  6365. Therefore, any machine learning application for microarrays demands that
  6366. the normalized expression values computed for an array must depend only
  6367. on information contained within that array.
  6368. This would ensure that each array's normalization is independent of every
  6369. other array, and that arrays normalized separately can still be compared
  6370. to each other without bias.
  6371. Such a normalization is commonly referred to as
  6372. \begin_inset Quotes eld
  6373. \end_inset
  6374. single-channel normalization
  6375. \begin_inset Quotes erd
  6376. \end_inset
  6377. .
  6378. \end_layout
  6379. \begin_layout Standard
  6380. Frozen RMA (fRMA) addresses these concerns by replacing the quantile normalizati
  6381. on and median polish with alternatives that do not introduce inter-array
  6382. dependence, allowing each array to be normalized independently of all others
  6383. \begin_inset CommandInset citation
  6384. LatexCommand cite
  6385. key "McCall2010"
  6386. literal "false"
  6387. \end_inset
  6388. .
  6389. Quantile normalization is performed against a pre-generated set of quantiles
  6390. learned from a collection of 850 publicly available arrays sampled from
  6391. a wide variety of tissues in the Gene Expression Omnibus (GEO).
  6392. Each array's probe intensity distribution is normalized against these pre-gener
  6393. ated quantiles.
  6394. The median polish step is replaced with a robust weighted average of probe
  6395. intensities, using inverse variance weights learned from the same public
  6396. GEO data.
  6397. The result is a normalization that satisfies the requirements mentioned
  6398. above: each array is normalized independently of all others, and any two
  6399. normalized arrays can be compared directly to each other.
  6400. \end_layout
  6401. \begin_layout Standard
  6402. One important limitation of fRMA is that it requires a separate reference
  6403. data set from which to learn the parameters (reference quantiles and probe
  6404. weights) that will be used to normalize each array.
  6405. These parameters are specific to a given array platform, and pre-generated
  6406. parameters are only provided for the most common platforms, such as Affymetrix
  6407. hgu133plus2.
  6408. For a less common platform, such as hthgu133pluspm, it is necessary to
  6409. learn custom parameters from in-house data before fRMA can be used to normalize
  6410. samples on that platform
  6411. \begin_inset CommandInset citation
  6412. LatexCommand cite
  6413. key "McCall2011"
  6414. literal "false"
  6415. \end_inset
  6416. .
  6417. \end_layout
  6418. \begin_layout Standard
  6419. One other option is the aptly-named Single Channel Array Normalization (SCAN),
  6420. which adapts a normalization method originally designed for tiling arrays
  6421. \begin_inset CommandInset citation
  6422. LatexCommand cite
  6423. key "Piccolo2012"
  6424. literal "false"
  6425. \end_inset
  6426. .
  6427. SCAN is truly single-channel in that it does not require a set of normalization
  6428. parameters estimated from an external set of reference samples like fRMA
  6429. does.
  6430. \end_layout
  6431. \begin_layout Subsection
  6432. Heteroskedasticity must be accounted for in methylation array data
  6433. \end_layout
  6434. \begin_layout Standard
  6435. DNA methylation arrays are a relatively new kind of assay that uses microarrays
  6436. to measure the degree of methylation on cytosines in specific regions arrayed
  6437. across the genome.
  6438. First, bisulfite treatment converts all unmethylated cytosines to uracil
  6439. (which are read as thymine during amplification and sequencing) while leaving
  6440. methylated cytosines unaffected.
  6441. Then, each target region is interrogated with two probes: one binds to
  6442. the original genomic sequence and interrogates the level of methylated
  6443. DNA, and the other binds to the same sequence with all cytosines replaced
  6444. by thymines and interrogates the level of unmethylated DNA.
  6445. \end_layout
  6446. \begin_layout Standard
  6447. \begin_inset Float figure
  6448. wide false
  6449. sideways false
  6450. status collapsed
  6451. \begin_layout Plain Layout
  6452. \align center
  6453. \begin_inset Graphics
  6454. filename graphics/methylvoom/sigmoid.pdf
  6455. lyxscale 50
  6456. width 60col%
  6457. groupId colwidth
  6458. \end_inset
  6459. \end_layout
  6460. \begin_layout Plain Layout
  6461. \begin_inset Caption Standard
  6462. \begin_layout Plain Layout
  6463. \begin_inset CommandInset label
  6464. LatexCommand label
  6465. name "fig:Sigmoid-beta-m-mapping"
  6466. \end_inset
  6467. \series bold
  6468. Sigmoid shape of the mapping between β and M values
  6469. \end_layout
  6470. \end_inset
  6471. \end_layout
  6472. \end_inset
  6473. \end_layout
  6474. \begin_layout Standard
  6475. After normalization, these two probe intensities are summarized in one of
  6476. two ways, each with advantages and disadvantages.
  6477. β
  6478. \series bold
  6479. \series default
  6480. values, interpreted as fraction of DNA copies methylated, range from 0 to
  6481. 1.
  6482. β
  6483. \series bold
  6484. \series default
  6485. values are conceptually easy to interpret, but the constrained range makes
  6486. them unsuitable for linear modeling, and their error distributions are
  6487. highly non-normal, which also frustrates linear modeling.
  6488. M-values, interpreted as the log ratio of methylated to unmethylated copies,
  6489. are computed by mapping the beta values from
  6490. \begin_inset Formula $[0,1]$
  6491. \end_inset
  6492. onto
  6493. \begin_inset Formula $(-\infty,+\infty)$
  6494. \end_inset
  6495. using a sigmoid curve (Figure
  6496. \begin_inset CommandInset ref
  6497. LatexCommand ref
  6498. reference "fig:Sigmoid-beta-m-mapping"
  6499. plural "false"
  6500. caps "false"
  6501. noprefix "false"
  6502. \end_inset
  6503. ).
  6504. This transformation results in values with better statistical properties:
  6505. the unconstrained range is suitable for linear modeling, and the error
  6506. distributions are more normal.
  6507. Hence, most linear modeling and other statistical testing on methylation
  6508. arrays is performed using M-values.
  6509. \end_layout
  6510. \begin_layout Standard
  6511. However, the steep slope of the sigmoid transformation near 0 and 1 tends
  6512. to exaggerate small differences in β values near those extremes, which
  6513. in turn amplifies the error in those values, leading to a U-shaped trend
  6514. in the mean-variance curve: extreme values have higher variances than values
  6515. near the middle.
  6516. This mean-variance dependency must be accounted for when fitting the linear
  6517. model for differential methylation, or else the variance will be systematically
  6518. overestimated for probes with moderate M-values and underestimated for
  6519. probes with extreme M-values.
  6520. This is particularly undesirable for methylation data because the intermediate
  6521. M-values are the ones of most interest, since they are more likely to represent
  6522. areas of varying methylation, whereas extreme M-values typically represent
  6523. complete methylation or complete lack of methylation.
  6524. \end_layout
  6525. \begin_layout Standard
  6526. \begin_inset Flex Glossary Term (Capital)
  6527. status open
  6528. \begin_layout Plain Layout
  6529. RNA-seq
  6530. \end_layout
  6531. \end_inset
  6532. read count data are also known to show heteroskedasticity, and the voom
  6533. method was introduced for modeling this heteroskedasticity by estimating
  6534. the mean-variance trend in the data and using this trend to assign precision
  6535. weights to each observation
  6536. \begin_inset CommandInset citation
  6537. LatexCommand cite
  6538. key "Law2013"
  6539. literal "false"
  6540. \end_inset
  6541. .
  6542. While methylation array data are not derived from counts and have a very
  6543. different mean-variance relationship from that of typical
  6544. \begin_inset Flex Glossary Term
  6545. status open
  6546. \begin_layout Plain Layout
  6547. RNA-seq
  6548. \end_layout
  6549. \end_inset
  6550. data, the voom method makes no specific assumptions on the shape of the
  6551. mean-variance relationship – it only assumes that the relationship can
  6552. be modeled as a smooth curve.
  6553. Hence, the method is sufficiently general to model the mean-variance relationsh
  6554. ip in methylation array data.
  6555. However, the standard implementation of voom assumes that the input is
  6556. given in raw read counts, and it must be adapted to run on methylation
  6557. M-values.
  6558. \end_layout
  6559. \begin_layout Section
  6560. Methods
  6561. \end_layout
  6562. \begin_layout Subsection
  6563. Evaluation of classifier performance with different normalization methods
  6564. \end_layout
  6565. \begin_layout Standard
  6566. For testing different expression microarray normalizations, a data set of
  6567. 157 hgu133plus2 arrays was used, consisting of blood samples from kidney
  6568. transplant patients whose grafts had been graded as TX, AR, or ADNR via
  6569. biopsy and histology (46 TX, 69 AR, 42 ADNR)
  6570. \begin_inset CommandInset citation
  6571. LatexCommand cite
  6572. key "Kurian2014"
  6573. literal "true"
  6574. \end_inset
  6575. .
  6576. Additionally, an external validation set of 75 samples was gathered from
  6577. public GEO data (37 TX, 38 AR, no ADNR).
  6578. \end_layout
  6579. \begin_layout Standard
  6580. \begin_inset Flex TODO Note (inline)
  6581. status open
  6582. \begin_layout Plain Layout
  6583. Find appropriate GEO identifiers if possible.
  6584. Kurian 2014 says GSE15296, but this seems to be different data.
  6585. I also need to look up the GEO accession for the external validation set.
  6586. \end_layout
  6587. \end_inset
  6588. \end_layout
  6589. \begin_layout Standard
  6590. To evaluate the effect of each normalization on classifier performance,
  6591. the same classifier training and validation procedure was used after each
  6592. normalization method.
  6593. The PAM package was used to train a nearest shrunken centroid classifier
  6594. on the training set and select the appropriate threshold for centroid shrinking.
  6595. Then the trained classifier was used to predict the class probabilities
  6596. of each validation sample.
  6597. From these class probabilities, ROC curves and area-under-curve (AUC) values
  6598. were generated
  6599. \begin_inset CommandInset citation
  6600. LatexCommand cite
  6601. key "Turck2011"
  6602. literal "false"
  6603. \end_inset
  6604. .
  6605. Each normalization was tested on two different sets of training and validation
  6606. samples.
  6607. For internal validation, the 115 TX and AR arrays in the internal set were
  6608. split at random into two equal sized sets, one for training and one for
  6609. validation, each containing the same numbers of TX and AR samples as the
  6610. other set.
  6611. For external validation, the full set of 115 TX and AR samples was used
  6612. as a training set, and the 75 external TX and AR samples were used as the
  6613. validation set.
  6614. Thus, 2 ROC curves and AUC values were generated for each normalization
  6615. method: one internal and one external.
  6616. Because the external validation set contains no ADNR samples, only classificati
  6617. on of TX and AR samples was considered.
  6618. The ADNR samples were included during normalization but excluded from all
  6619. classifier training and validation.
  6620. This ensures that the performance on internal and external validation sets
  6621. is directly comparable, since both are performing the same task: distinguishing
  6622. TX from AR.
  6623. \end_layout
  6624. \begin_layout Standard
  6625. \begin_inset Flex TODO Note (inline)
  6626. status open
  6627. \begin_layout Plain Layout
  6628. Summarize the get.best.threshold algorithm for PAM threshold selection, or
  6629. just put the code online?
  6630. \end_layout
  6631. \end_inset
  6632. \end_layout
  6633. \begin_layout Standard
  6634. Six different normalization strategies were evaluated.
  6635. First, 2 well-known non-single-channel normalization methods were considered:
  6636. RMA and dChip
  6637. \begin_inset CommandInset citation
  6638. LatexCommand cite
  6639. key "Li2001,Irizarry2003a"
  6640. literal "false"
  6641. \end_inset
  6642. .
  6643. Since RMA produces expression values on a log2 scale and dChip does not,
  6644. the values from dChip were log2 transformed after normalization.
  6645. Next, RMA and dChip followed by Global Rank-invariant Set Normalization
  6646. (GRSN) were tested
  6647. \begin_inset CommandInset citation
  6648. LatexCommand cite
  6649. key "Pelz2008"
  6650. literal "false"
  6651. \end_inset
  6652. .
  6653. Post-processing with GRSN does not turn RMA or dChip into single-channel
  6654. methods, but it may help mitigate batch effects and is therefore useful
  6655. as a benchmark.
  6656. Lastly, the two single-channel normalization methods, fRMA and SCAN, were
  6657. tested
  6658. \begin_inset CommandInset citation
  6659. LatexCommand cite
  6660. key "McCall2010,Piccolo2012"
  6661. literal "false"
  6662. \end_inset
  6663. .
  6664. When evaluating internal validation performance, only the 157 internal
  6665. samples were normalized; when evaluating external validation performance,
  6666. all 157 internal samples and 75 external samples were normalized together.
  6667. \end_layout
  6668. \begin_layout Standard
  6669. For demonstrating the problem with separate normalization of training and
  6670. validation data, one additional normalization was performed: the internal
  6671. and external sets were each normalized separately using RMA, and the normalized
  6672. data for each set were combined into a single set with no further attempts
  6673. at normalizing between the two sets.
  6674. This represents approximately how RMA would have to be used in a clinical
  6675. setting, where the samples to be classified are not available at the time
  6676. the classifier is trained.
  6677. \end_layout
  6678. \begin_layout Subsection
  6679. Generating custom fRMA vectors for hthgu133pluspm array platform
  6680. \end_layout
  6681. \begin_layout Standard
  6682. In order to enable fRMA normalization for the hthgu133pluspm array platform,
  6683. custom fRMA normalization vectors were trained using the
  6684. \begin_inset Flex Code
  6685. status open
  6686. \begin_layout Plain Layout
  6687. frmaTools
  6688. \end_layout
  6689. \end_inset
  6690. package
  6691. \begin_inset CommandInset citation
  6692. LatexCommand cite
  6693. key "McCall2011"
  6694. literal "false"
  6695. \end_inset
  6696. .
  6697. Separate vectors were created for two types of samples: kidney graft biopsy
  6698. samples and blood samples from graft recipients.
  6699. For training, 341 kidney biopsy samples from 2 data sets and 965 blood
  6700. samples from 5 data sets were used as the reference set.
  6701. Arrays were grouped into batches based on unique combinations of sample
  6702. type (blood or biopsy), diagnosis (TX, AR, etc.), data set, and scan date.
  6703. Thus, each batch represents arrays of the same kind that were run together
  6704. on the same day.
  6705. For estimating the probe inverse variance weights, frmaTools requires equal-siz
  6706. ed batches, which means a batch size must be chosen, and then batches smaller
  6707. than that size must be ignored, while batches larger than the chosen size
  6708. must be downsampled.
  6709. This downsampling is performed randomly, so the sampling process is repeated
  6710. 5 times and the resulting normalizations are compared to each other.
  6711. \end_layout
  6712. \begin_layout Standard
  6713. To evaluate the consistency of the generated normalization vectors, the
  6714. 5 fRMA vector sets generated from 5 random batch samplings were each used
  6715. to normalize the same 20 randomly selected samples from each tissue.
  6716. Then the normalized expression values for each probe on each array were
  6717. compared across all normalizations.
  6718. Each fRMA normalization was also compared against the normalized expression
  6719. values obtained by normalizing the same 20 samples with ordinary RMA.
  6720. \end_layout
  6721. \begin_layout Subsection
  6722. Modeling methylation array M-value heteroskedasticity in linear models with
  6723. modified voom implementation
  6724. \end_layout
  6725. \begin_layout Standard
  6726. \begin_inset Flex TODO Note (inline)
  6727. status open
  6728. \begin_layout Plain Layout
  6729. Put code on Github and reference it.
  6730. \end_layout
  6731. \end_inset
  6732. \end_layout
  6733. \begin_layout Standard
  6734. To investigate whether DNA methylation could be used to distinguish
  6735. between healthy and dysfunctional transplants, a data set of 78 Illumina
  6736. 450k methylation arrays from human kidney graft biopsies was analyzed for
  6737. differential methylation between 4 transplant statuses: healthy transplant
  6738. (TX), transplants undergoing acute rejection (AR), acute dysfunction with
  6739. no rejection (ADNR), and chronic allograft nephropathy (CAN).
  6740. The data consisted of 33 TX, 9 AR, 8 ADNR, and 28 CAN samples.
  6741. The uneven group sizes are a result of taking the biopsy samples before
  6742. the eventual fate of the transplant was known.
  6743. Each sample was additionally annotated with a donor ID (anonymized), Sex,
  6744. Age, Ethnicity, Creatinine Level, and Diabetes diagnosis (all samples in
  6745. this data set came from patients with either Type 1 or Type 2 diabetes).
  6746. \end_layout
  6747. \begin_layout Standard
  6748. The intensity data were first normalized using subset-quantile within array
  6749. normalization (SWAN)
  6750. \begin_inset CommandInset citation
  6751. LatexCommand cite
  6752. key "Maksimovic2012"
  6753. literal "false"
  6754. \end_inset
  6755. , then converted to intensity ratios (beta values)
  6756. \begin_inset CommandInset citation
  6757. LatexCommand cite
  6758. key "Aryee2014"
  6759. literal "false"
  6760. \end_inset
  6761. .
  6762. Any probes binding to loci that overlapped annotated SNPs were dropped,
  6763. and the annotated sex of each sample was verified against the sex inferred
  6764. from the ratio of median probe intensities for the X and Y chromosomes.
  6765. Then, the ratios were transformed to M-values.
  6766. \end_layout
  6767. \begin_layout Standard
  6768. \begin_inset Float table
  6769. wide false
  6770. sideways false
  6771. status open
  6772. \begin_layout Plain Layout
  6773. \align center
  6774. \begin_inset Tabular
  6775. <lyxtabular version="3" rows="4" columns="6">
  6776. <features tabularvalignment="middle">
  6777. <column alignment="center" valignment="top">
  6778. <column alignment="center" valignment="top">
  6779. <column alignment="center" valignment="top">
  6780. <column alignment="center" valignment="top">
  6781. <column alignment="center" valignment="top">
  6782. <column alignment="center" valignment="top">
  6783. <row>
  6784. <cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" usebox="none">
  6785. \begin_inset Text
  6786. \begin_layout Plain Layout
  6787. Analysis
  6788. \end_layout
  6789. \end_inset
  6790. </cell>
  6791. <cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" usebox="none">
  6792. \begin_inset Text
  6793. \begin_layout Plain Layout
  6794. random effect
  6795. \end_layout
  6796. \end_inset
  6797. </cell>
  6798. <cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" usebox="none">
  6799. \begin_inset Text
  6800. \begin_layout Plain Layout
  6801. eBayes
  6802. \end_layout
  6803. \end_inset
  6804. </cell>
  6805. <cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" usebox="none">
  6806. \begin_inset Text
  6807. \begin_layout Plain Layout
  6808. SVA
  6809. \end_layout
  6810. \end_inset
  6811. </cell>
  6812. <cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" usebox="none">
  6813. \begin_inset Text
  6814. \begin_layout Plain Layout
  6815. weights
  6816. \end_layout
  6817. \end_inset
  6818. </cell>
  6819. <cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" rightline="true" usebox="none">
  6820. \begin_inset Text
  6821. \begin_layout Plain Layout
  6822. voom
  6823. \end_layout
  6824. \end_inset
  6825. </cell>
  6826. </row>
  6827. <row>
  6828. <cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
  6829. \begin_inset Text
  6830. \begin_layout Plain Layout
  6831. A
  6832. \end_layout
  6833. \end_inset
  6834. </cell>
  6835. <cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
  6836. \begin_inset Text
  6837. \begin_layout Plain Layout
  6838. Yes
  6839. \end_layout
  6840. \end_inset
  6841. </cell>
  6842. <cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
  6843. \begin_inset Text
  6844. \begin_layout Plain Layout
  6845. Yes
  6846. \end_layout
  6847. \end_inset
  6848. </cell>
  6849. <cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
  6850. \begin_inset Text
  6851. \begin_layout Plain Layout
  6852. No
  6853. \end_layout
  6854. \end_inset
  6855. </cell>
  6856. <cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
  6857. \begin_inset Text
  6858. \begin_layout Plain Layout
  6859. No
  6860. \end_layout
  6861. \end_inset
  6862. </cell>
  6863. <cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
  6864. \begin_inset Text
  6865. \begin_layout Plain Layout
  6866. No
  6867. \end_layout
  6868. \end_inset
  6869. </cell>
  6870. </row>
  6871. <row>
  6872. <cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
  6873. \begin_inset Text
  6874. \begin_layout Plain Layout
  6875. B
  6876. \end_layout
  6877. \end_inset
  6878. </cell>
  6879. <cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
  6880. \begin_inset Text
  6881. \begin_layout Plain Layout
  6882. Yes
  6883. \end_layout
  6884. \end_inset
  6885. </cell>
  6886. <cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
  6887. \begin_inset Text
  6888. \begin_layout Plain Layout
  6889. Yes
  6890. \end_layout
  6891. \end_inset
  6892. </cell>
  6893. <cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
  6894. \begin_inset Text
  6895. \begin_layout Plain Layout
  6896. Yes
  6897. \end_layout
  6898. \end_inset
  6899. </cell>
  6900. <cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
  6901. \begin_inset Text
  6902. \begin_layout Plain Layout
  6903. Yes
  6904. \end_layout
  6905. \end_inset
  6906. </cell>
  6907. <cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
  6908. \begin_inset Text
  6909. \begin_layout Plain Layout
  6910. No
  6911. \end_layout
  6912. \end_inset
  6913. </cell>
  6914. </row>
  6915. <row>
  6916. <cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" usebox="none">
  6917. \begin_inset Text
  6918. \begin_layout Plain Layout
  6919. C
  6920. \end_layout
  6921. \end_inset
  6922. </cell>
  6923. <cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" usebox="none">
  6924. \begin_inset Text
  6925. \begin_layout Plain Layout
  6926. Yes
  6927. \end_layout
  6928. \end_inset
  6929. </cell>
  6930. <cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" usebox="none">
  6931. \begin_inset Text
  6932. \begin_layout Plain Layout
  6933. Yes
  6934. \end_layout
  6935. \end_inset
  6936. </cell>
  6937. <cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" usebox="none">
  6938. \begin_inset Text
  6939. \begin_layout Plain Layout
  6940. Yes
  6941. \end_layout
  6942. \end_inset
  6943. </cell>
  6944. <cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" usebox="none">
  6945. \begin_inset Text
  6946. \begin_layout Plain Layout
  6947. Yes
  6948. \end_layout
  6949. \end_inset
  6950. </cell>
  6951. <cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" rightline="true" usebox="none">
  6952. \begin_inset Text
  6953. \begin_layout Plain Layout
  6954. Yes
  6955. \end_layout
  6956. \end_inset
  6957. </cell>
  6958. </row>
  6959. </lyxtabular>
  6960. \end_inset
  6961. \end_layout
  6962. \begin_layout Plain Layout
  6963. \begin_inset Caption Standard
  6964. \begin_layout Plain Layout
  6965. \series bold
  6966. \begin_inset CommandInset label
  6967. LatexCommand label
  6968. name "tab:Summary-of-meth-analysis"
  6969. \end_inset
  6970. Summary of analysis variants for methylation array data.
  6971. \series default
  6972. Each analysis included a different set of steps to adjust or account for
  6973. various systematic features of the data.
  6974. Random effect: The model included a random effect accounting for correlation
  6975. between samples from the same patient
  6976. \begin_inset CommandInset citation
  6977. LatexCommand cite
  6978. key "Smyth2005a"
  6979. literal "false"
  6980. \end_inset
  6981. ; eBayes: Empirical Bayes squeezing of per-probe variances toward the mean-varia
  6982. nce trend
  6983. \begin_inset CommandInset citation
  6984. LatexCommand cite
  6985. key "Ritchie2015"
  6986. literal "false"
  6987. \end_inset
  6988. ; SVA: Surrogate variable analysis to account for unobserved confounders
  6989. \begin_inset CommandInset citation
  6990. LatexCommand cite
  6991. key "Leek2007"
  6992. literal "false"
  6993. \end_inset
  6994. ; Weights: Estimate sample weights to account for differences in sample
  6995. quality
  6996. \begin_inset CommandInset citation
  6997. LatexCommand cite
  6998. key "Liu2015,Ritchie2006"
  6999. literal "false"
  7000. \end_inset
  7001. ; voom: Use mean-variance trend to assign individual observation weights
  7002. \begin_inset CommandInset citation
  7003. LatexCommand cite
  7004. key "Law2013"
  7005. literal "false"
  7006. \end_inset
  7007. .
  7008. See the text for a more detailed explanation of each step.
  7009. \end_layout
  7010. \end_inset
  7011. \end_layout
  7012. \end_inset
  7013. \end_layout
  7014. \begin_layout Standard
  7015. From the M-values, a series of parallel analyses was performed, each adding
  7016. additional steps into the model fit to accommodate a feature of the data
  7017. (see Table
  7018. \begin_inset CommandInset ref
  7019. LatexCommand ref
  7020. reference "tab:Summary-of-meth-analysis"
  7021. plural "false"
  7022. caps "false"
  7023. noprefix "false"
  7024. \end_inset
  7025. ).
  7026. For analysis A, a
  7027. \begin_inset Quotes eld
  7028. \end_inset
  7029. basic
  7030. \begin_inset Quotes erd
  7031. \end_inset
  7032. linear modeling analysis was performed, compensating for known confounders
  7033. by including terms for the factor of interest (transplant status) as well
  7034. as the known biological confounders: sex, age, ethnicity, and diabetes.
  7035. Since some samples came from the same patients at different times, the
  7036. intra-patient correlation was modeled as a random effect, estimating a
  7037. shared correlation value across all probes
  7038. \begin_inset CommandInset citation
  7039. LatexCommand cite
  7040. key "Smyth2005a"
  7041. literal "false"
  7042. \end_inset
  7043. .
  7044. Then the linear model was fit, and the variance was modeled using empirical
  7045. Bayes squeezing toward the mean-variance trend
  7046. \begin_inset CommandInset citation
  7047. LatexCommand cite
  7048. key "Ritchie2015"
  7049. literal "false"
  7050. \end_inset
  7051. .
  7052. Finally, t-tests or F-tests were performed as appropriate for each test:
  7053. t-tests for single contrasts, and F-tests for multiple contrasts.
  7054. P-values were corrected for multiple testing using the Benjamini-Hochberg
  7055. procedure for FDR control
  7056. \begin_inset CommandInset citation
  7057. LatexCommand cite
  7058. key "Benjamini1995"
  7059. literal "false"
  7060. \end_inset
  7061. .
  7062. \end_layout
  7063. \begin_layout Standard
  7064. For analysis B, surrogate variable analysis (SVA) was used to infer
  7065. additional unobserved sources of heterogeneity in the data
  7066. \begin_inset CommandInset citation
  7067. LatexCommand cite
  7068. key "Leek2007"
  7069. literal "false"
  7070. \end_inset
  7071. .
  7072. These surrogate variables were added to the design matrix before fitting
  7073. the linear model.
  7074. In addition, sample quality weights were estimated from the data and used
  7075. during linear modeling to down-weight the contribution of highly variable
  7076. arrays while increasing the weight to arrays with lower variability
  7077. \begin_inset CommandInset citation
  7078. LatexCommand cite
  7079. key "Ritchie2006"
  7080. literal "false"
  7081. \end_inset
  7082. .
  7083. The remainder of the analysis proceeded as in analysis A.
  7084. For analysis C, the voom method was adapted to run on methylation array
  7085. data and used to model and correct for the mean-variance trend using individual
  7086. observation weights
  7087. \begin_inset CommandInset citation
  7088. LatexCommand cite
  7089. key "Law2013"
  7090. literal "false"
  7091. \end_inset
  7092. , which were combined with the sample weights
  7093. \begin_inset CommandInset citation
  7094. LatexCommand cite
  7095. key "Liu2015,Ritchie2006"
  7096. literal "false"
  7097. \end_inset
  7098. .
  7099. Each time weights were used, they were estimated once before estimating
  7100. the random effect correlation value, and then the weights were re-estimated
  7101. taking the random effect into account.
  7102. The remainder of the analysis proceeded as in analysis B.
  7103. \end_layout
  7104. \begin_layout Section
  7105. Results
  7106. \end_layout
  7107. \begin_layout Standard
  7108. \begin_inset Flex TODO Note (inline)
  7109. status open
  7110. \begin_layout Plain Layout
  7111. Improve subsection titles in this section.
  7112. \end_layout
  7113. \end_inset
  7114. \end_layout
  7115. \begin_layout Standard
  7116. \begin_inset Flex TODO Note (inline)
  7117. status open
  7118. \begin_layout Plain Layout
  7119. Reconsider subsection organization?
  7120. \end_layout
  7121. \end_inset
  7122. \end_layout
  7123. \begin_layout Subsection
  7124. Separate normalization with RMA introduces unwanted biases in classification
  7125. \end_layout
  7126. \begin_layout Standard
  7127. \begin_inset Float figure
  7128. wide false
  7129. sideways false
  7130. status open
  7131. \begin_layout Plain Layout
  7132. \align center
  7133. \begin_inset Graphics
  7134. filename graphics/PAM/predplot.pdf
  7135. lyxscale 50
  7136. width 60col%
  7137. groupId colwidth
  7138. \end_inset
  7139. \end_layout
  7140. \begin_layout Plain Layout
  7141. \begin_inset Caption Standard
  7142. \begin_layout Plain Layout
  7143. \begin_inset CommandInset label
  7144. LatexCommand label
  7145. name "fig:Classifier-probabilities-RMA"
  7146. \end_inset
  7147. \series bold
  7148. Classifier probabilities on validation samples when normalized with RMA
  7149. together vs.
  7150. separately.
  7151. \series default
  7152. The PAM classifier algorithm was trained on the training set of arrays to
  7153. distinguish AR from TX and then used to assign class probabilities to the
  7154. validation set.
  7155. The process was performed after normalizing all samples together and after
  7156. normalizing the training and test sets separately, and the class probabilities
  7157. assigned to each sample in the validation set were plotted against each
  7158. other (PP(AR), posterior probability of being AR).
  7159. The color of each point indicates the true classification of that sample.
  7160. \end_layout
  7161. \end_inset
  7162. \end_layout
  7163. \end_inset
  7164. \end_layout
  7165. \begin_layout Standard
  7166. To demonstrate the problem with non-single-channel normalization methods,
  7167. we considered the problem of training a classifier to distinguish TX from
  7168. AR using the samples from the internal set as training data, evaluating
  7169. performance on the external set.
  7170. First, training and evaluation were performed after normalizing all array
  7171. samples together as a single set using RMA, and second, the internal samples
  7172. were normalized separately from the external samples and the training and
  7173. evaluation were repeated.
  7174. For each sample in the validation set, the classifier probabilities from
  7175. both classifiers were plotted against each other (Fig.
  7176. \begin_inset CommandInset ref
  7177. LatexCommand ref
  7178. reference "fig:Classifier-probabilities-RMA"
  7179. plural "false"
  7180. caps "false"
  7181. noprefix "false"
  7182. \end_inset
  7183. ).
  7184. As expected, separate normalization biases the classifier probabilities,
  7185. resulting in several misclassifications.
  7186. In this case, the bias from separate normalization causes the classifier
  7187. to assign a lower probability of AR to every sample.
  7188. \end_layout
  7189. \begin_layout Subsection
  7190. fRMA and SCAN maintain classification performance while eliminating dependence
  7191. on normalization strategy
  7192. \end_layout
  7193. \begin_layout Standard
  7194. \begin_inset Float figure
  7195. wide false
  7196. sideways false
  7197. status open
  7198. \begin_layout Plain Layout
  7199. \align center
  7200. \begin_inset Float figure
  7201. placement tb
  7202. wide false
  7203. sideways false
  7204. status open
  7205. \begin_layout Plain Layout
  7206. \align center
  7207. \begin_inset Graphics
  7208. filename graphics/PAM/ROC-TXvsAR-internal.pdf
  7209. lyxscale 50
  7210. height 40theight%
  7211. groupId roc-pam
  7212. \end_inset
  7213. \end_layout
  7214. \begin_layout Plain Layout
  7215. \begin_inset Caption Standard
  7216. \begin_layout Plain Layout
  7217. \begin_inset CommandInset label
  7218. LatexCommand label
  7219. name "fig:ROC-PAM-int"
  7220. \end_inset
  7221. ROC curves for PAM on internal validation data
  7222. \end_layout
  7223. \end_inset
  7224. \end_layout
  7225. \end_inset
  7226. \end_layout
  7227. \begin_layout Plain Layout
  7228. \align center
  7229. \begin_inset Float figure
  7230. placement tb
  7231. wide false
  7232. sideways false
  7233. status open
  7234. \begin_layout Plain Layout
  7235. \align center
  7236. \begin_inset Graphics
  7237. filename graphics/PAM/ROC-TXvsAR-external.pdf
  7238. lyxscale 50
  7239. height 40theight%
  7240. groupId roc-pam
  7241. \end_inset
  7242. \end_layout
  7243. \begin_layout Plain Layout
  7244. \begin_inset Caption Standard
  7245. \begin_layout Plain Layout
  7246. \begin_inset CommandInset label
  7247. LatexCommand label
  7248. name "fig:ROC-PAM-ext"
  7249. \end_inset
  7250. ROC curves for PAM on external validation data
  7251. \end_layout
  7252. \end_inset
  7253. \end_layout
  7254. \end_inset
  7255. \end_layout
  7256. \begin_layout Plain Layout
  7257. \begin_inset Caption Standard
  7258. \begin_layout Plain Layout
  7259. \series bold
  7260. \begin_inset CommandInset label
  7261. LatexCommand label
  7262. name "fig:ROC-PAM-main"
  7263. \end_inset
  7264. ROC curves for PAM using different normalization strategies.
  7265. \series default
  7266. ROC curves were generated for PAM classification of AR vs TX after 6 different
  7267. normalization strategies were applied to the same data sets.
  7268. Only fRMA and SCAN are single-channel normalizations.
  7269. The other normalizations are for comparison.
  7270. \end_layout
  7271. \end_inset
  7272. \end_layout
  7273. \end_inset
  7274. \end_layout
  7275. \begin_layout Standard
  7276. \begin_inset Float table
  7277. wide false
  7278. sideways false
  7279. status open
  7280. \begin_layout Plain Layout
  7281. \align center
  7282. \begin_inset Tabular
  7283. <lyxtabular version="3" rows="7" columns="4">
  7284. <features tabularvalignment="middle">
  7285. <column alignment="center" valignment="top">
  7286. <column alignment="center" valignment="top">
  7287. <column alignment="center" valignment="top">
  7288. <column alignment="center" valignment="top">
  7289. <row>
  7290. <cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" usebox="none">
  7291. \begin_inset Text
  7292. \begin_layout Plain Layout
  7293. \family roman
  7294. \series medium
  7295. \shape up
  7296. \size normal
  7297. \emph off
  7298. \bar no
  7299. \strikeout off
  7300. \xout off
  7301. \uuline off
  7302. \uwave off
  7303. \noun off
  7304. \color none
  7305. Normalization
  7306. \end_layout
  7307. \end_inset
  7308. </cell>
  7309. <cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" usebox="none">
  7310. \begin_inset Text
  7311. \begin_layout Plain Layout
  7312. Single-channel?
  7313. \end_layout
  7314. \end_inset
  7315. </cell>
  7316. <cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" usebox="none">
  7317. \begin_inset Text
  7318. \begin_layout Plain Layout
  7319. \family roman
  7320. \series medium
  7321. \shape up
  7322. \size normal
  7323. \emph off
  7324. \bar no
  7325. \strikeout off
  7326. \xout off
  7327. \uuline off
  7328. \uwave off
  7329. \noun off
  7330. \color none
  7331. Internal Val.
  7332. AUC
  7333. \end_layout
  7334. \end_inset
  7335. </cell>
  7336. <cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" rightline="true" usebox="none">
  7337. \begin_inset Text
  7338. \begin_layout Plain Layout
  7339. External Val.
  7340. AUC
  7341. \end_layout
  7342. \end_inset
  7343. </cell>
  7344. </row>
  7345. <row>
  7346. <cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
  7347. \begin_inset Text
  7348. \begin_layout Plain Layout
  7349. \family roman
  7350. \series medium
  7351. \shape up
  7352. \size normal
  7353. \emph off
  7354. \bar no
  7355. \strikeout off
  7356. \xout off
  7357. \uuline off
  7358. \uwave off
  7359. \noun off
  7360. \color none
  7361. RMA
  7362. \end_layout
  7363. \end_inset
  7364. </cell>
  7365. <cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
  7366. \begin_inset Text
  7367. \begin_layout Plain Layout
  7368. No
  7369. \end_layout
  7370. \end_inset
  7371. </cell>
  7372. <cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
  7373. \begin_inset Text
  7374. \begin_layout Plain Layout
  7375. \family roman
  7376. \series medium
  7377. \shape up
  7378. \size normal
  7379. \emph off
  7380. \bar no
  7381. \strikeout off
  7382. \xout off
  7383. \uuline off
  7384. \uwave off
  7385. \noun off
  7386. \color none
  7387. 0.852
  7388. \end_layout
  7389. \end_inset
  7390. </cell>
  7391. <cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
  7392. \begin_inset Text
  7393. \begin_layout Plain Layout
  7394. \family roman
  7395. \series medium
  7396. \shape up
  7397. \size normal
  7398. \emph off
  7399. \bar no
  7400. \strikeout off
  7401. \xout off
  7402. \uuline off
  7403. \uwave off
  7404. \noun off
  7405. \color none
  7406. 0.713
  7407. \end_layout
  7408. \end_inset
  7409. </cell>
  7410. </row>
  7411. <row>
  7412. <cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
  7413. \begin_inset Text
  7414. \begin_layout Plain Layout
  7415. \family roman
  7416. \series medium
  7417. \shape up
  7418. \size normal
  7419. \emph off
  7420. \bar no
  7421. \strikeout off
  7422. \xout off
  7423. \uuline off
  7424. \uwave off
  7425. \noun off
  7426. \color none
  7427. dChip
  7428. \end_layout
  7429. \end_inset
  7430. </cell>
  7431. <cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
  7432. \begin_inset Text
  7433. \begin_layout Plain Layout
  7434. No
  7435. \end_layout
  7436. \end_inset
  7437. </cell>
  7438. <cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
  7439. \begin_inset Text
  7440. \begin_layout Plain Layout
  7441. \family roman
  7442. \series medium
  7443. \shape up
  7444. \size normal
  7445. \emph off
  7446. \bar no
  7447. \strikeout off
  7448. \xout off
  7449. \uuline off
  7450. \uwave off
  7451. \noun off
  7452. \color none
  7453. 0.891
  7454. \end_layout
  7455. \end_inset
  7456. </cell>
  7457. <cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
  7458. \begin_inset Text
  7459. \begin_layout Plain Layout
  7460. \family roman
  7461. \series medium
  7462. \shape up
  7463. \size normal
  7464. \emph off
  7465. \bar no
  7466. \strikeout off
  7467. \xout off
  7468. \uuline off
  7469. \uwave off
  7470. \noun off
  7471. \color none
  7472. 0.657
  7473. \end_layout
  7474. \end_inset
  7475. </cell>
  7476. </row>
  7477. <row>
  7478. <cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
  7479. \begin_inset Text
  7480. \begin_layout Plain Layout
  7481. \family roman
  7482. \series medium
  7483. \shape up
  7484. \size normal
  7485. \emph off
  7486. \bar no
  7487. \strikeout off
  7488. \xout off
  7489. \uuline off
  7490. \uwave off
  7491. \noun off
  7492. \color none
  7493. RMA + GRSN
  7494. \end_layout
  7495. \end_inset
  7496. </cell>
  7497. <cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
  7498. \begin_inset Text
  7499. \begin_layout Plain Layout
  7500. No
  7501. \end_layout
  7502. \end_inset
  7503. </cell>
  7504. <cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
  7505. \begin_inset Text
  7506. \begin_layout Plain Layout
  7507. \family roman
  7508. \series medium
  7509. \shape up
  7510. \size normal
  7511. \emph off
  7512. \bar no
  7513. \strikeout off
  7514. \xout off
  7515. \uuline off
  7516. \uwave off
  7517. \noun off
  7518. \color none
  7519. 0.816
  7520. \end_layout
  7521. \end_inset
  7522. </cell>
  7523. <cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
  7524. \begin_inset Text
  7525. \begin_layout Plain Layout
  7526. \family roman
  7527. \series medium
  7528. \shape up
  7529. \size normal
  7530. \emph off
  7531. \bar no
  7532. \strikeout off
  7533. \xout off
  7534. \uuline off
  7535. \uwave off
  7536. \noun off
  7537. \color none
  7538. 0.750
  7539. \end_layout
  7540. \end_inset
  7541. </cell>
  7542. </row>
  7543. <row>
  7544. <cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
  7545. \begin_inset Text
  7546. \begin_layout Plain Layout
  7547. \family roman
  7548. \series medium
  7549. \shape up
  7550. \size normal
  7551. \emph off
  7552. \bar no
  7553. \strikeout off
  7554. \xout off
  7555. \uuline off
  7556. \uwave off
  7557. \noun off
  7558. \color none
  7559. dChip + GRSN
  7560. \end_layout
  7561. \end_inset
  7562. </cell>
  7563. <cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
  7564. \begin_inset Text
  7565. \begin_layout Plain Layout
  7566. No
  7567. \end_layout
  7568. \end_inset
  7569. </cell>
  7570. <cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
  7571. \begin_inset Text
  7572. \begin_layout Plain Layout
  7573. \family roman
  7574. \series medium
  7575. \shape up
  7576. \size normal
  7577. \emph off
  7578. \bar no
  7579. \strikeout off
  7580. \xout off
  7581. \uuline off
  7582. \uwave off
  7583. \noun off
  7584. \color none
  7585. 0.875
  7586. \end_layout
  7587. \end_inset
  7588. </cell>
  7589. <cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
  7590. \begin_inset Text
  7591. \begin_layout Plain Layout
  7592. \family roman
  7593. \series medium
  7594. \shape up
  7595. \size normal
  7596. \emph off
  7597. \bar no
  7598. \strikeout off
  7599. \xout off
  7600. \uuline off
  7601. \uwave off
  7602. \noun off
  7603. \color none
  7604. 0.642
  7605. \end_layout
  7606. \end_inset
  7607. </cell>
  7608. </row>
  7609. <row>
  7610. <cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
  7611. \begin_inset Text
  7612. \begin_layout Plain Layout
  7613. \family roman
  7614. \series medium
  7615. \shape up
  7616. \size normal
  7617. \emph off
  7618. \bar no
  7619. \strikeout off
  7620. \xout off
  7621. \uuline off
  7622. \uwave off
  7623. \noun off
  7624. \color none
  7625. fRMA
  7626. \end_layout
  7627. \end_inset
  7628. </cell>
  7629. <cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
  7630. \begin_inset Text
  7631. \begin_layout Plain Layout
  7632. Yes
  7633. \end_layout
  7634. \end_inset
  7635. </cell>
  7636. <cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
  7637. \begin_inset Text
  7638. \begin_layout Plain Layout
  7639. \family roman
  7640. \series medium
  7641. \shape up
  7642. \size normal
  7643. \emph off
  7644. \bar no
  7645. \strikeout off
  7646. \xout off
  7647. \uuline off
  7648. \uwave off
  7649. \noun off
  7650. \color none
  7651. 0.863
  7652. \end_layout
  7653. \end_inset
  7654. </cell>
  7655. <cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
  7656. \begin_inset Text
  7657. \begin_layout Plain Layout
  7658. \family roman
  7659. \series medium
  7660. \shape up
  7661. \size normal
  7662. \emph off
  7663. \bar no
  7664. \strikeout off
  7665. \xout off
  7666. \uuline off
  7667. \uwave off
  7668. \noun off
  7669. \color none
  7670. 0.718
  7671. \end_layout
  7672. \end_inset
  7673. </cell>
  7674. </row>
  7675. <row>
  7676. <cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" usebox="none">
  7677. \begin_inset Text
  7678. \begin_layout Plain Layout
  7679. \family roman
  7680. \series medium
  7681. \shape up
  7682. \size normal
  7683. \emph off
  7684. \bar no
  7685. \strikeout off
  7686. \xout off
  7687. \uuline off
  7688. \uwave off
  7689. \noun off
  7690. \color none
  7691. SCAN
  7692. \end_layout
  7693. \end_inset
  7694. </cell>
  7695. <cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" usebox="none">
  7696. \begin_inset Text
  7697. \begin_layout Plain Layout
  7698. Yes
  7699. \end_layout
  7700. \end_inset
  7701. </cell>
  7702. <cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" usebox="none">
  7703. \begin_inset Text
  7704. \begin_layout Plain Layout
  7705. \family roman
  7706. \series medium
  7707. \shape up
  7708. \size normal
  7709. \emph off
  7710. \bar no
  7711. \strikeout off
  7712. \xout off
  7713. \uuline off
  7714. \uwave off
  7715. \noun off
  7716. \color none
  7717. 0.853
  7718. \end_layout
  7719. \end_inset
  7720. </cell>
  7721. <cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" rightline="true" usebox="none">
  7722. \begin_inset Text
  7723. \begin_layout Plain Layout
  7724. \family roman
  7725. \series medium
  7726. \shape up
  7727. \size normal
  7728. \emph off
  7729. \bar no
  7730. \strikeout off
  7731. \xout off
  7732. \uuline off
  7733. \uwave off
  7734. \noun off
  7735. \color none
  7736. 0.689
  7737. \end_layout
  7738. \end_inset
  7739. </cell>
  7740. </row>
  7741. </lyxtabular>
  7742. \end_inset
  7743. \end_layout
  7744. \begin_layout Plain Layout
  7745. \begin_inset Caption Standard
  7746. \begin_layout Plain Layout
  7747. \begin_inset CommandInset label
  7748. LatexCommand label
  7749. name "tab:AUC-PAM"
  7750. \end_inset
  7751. \series bold
  7752. ROC curve AUC values for internal and external validation with 6 different
  7753. normalization strategies.
  7754. \series default
  7755. These AUC values correspond to the ROC curves in Figure
  7756. \begin_inset CommandInset ref
  7757. LatexCommand ref
  7758. reference "fig:ROC-PAM-main"
  7759. plural "false"
  7760. caps "false"
  7761. noprefix "false"
  7762. \end_inset
  7763. .
  7764. \end_layout
  7765. \end_inset
  7766. \end_layout
  7767. \end_inset
  7768. \end_layout
  7769. \begin_layout Standard
  7770. For internal validation, the 6 methods' AUC values ranged from 0.816 to 0.891,
  7771. as shown in Table
  7772. \begin_inset CommandInset ref
  7773. LatexCommand ref
  7774. reference "tab:AUC-PAM"
  7775. plural "false"
  7776. caps "false"
  7777. noprefix "false"
  7778. \end_inset
  7779. .
  7780. Among the non-single-channel normalizations, dChip outperformed RMA, while
  7781. GRSN reduced the AUC values for both dChip and RMA.
  7782. Both single-channel methods, fRMA and SCAN, slightly outperformed RMA,
  7783. with fRMA ahead of SCAN.
  7784. However, the difference between RMA and fRMA is still quite small.
  7785. Figure
  7786. \begin_inset CommandInset ref
  7787. LatexCommand ref
  7788. reference "fig:ROC-PAM-int"
  7789. plural "false"
  7790. caps "false"
  7791. noprefix "false"
  7792. \end_inset
  7793. shows that the ROC curves for RMA, dChip, and fRMA look very similar and
  7794. relatively smooth, while both GRSN curves and the curve for SCAN have a
  7795. more jagged appearance.
  7796. \end_layout
  7797. \begin_layout Standard
  7798. For external validation, as expected, all the AUC values are lower than
  7799. those for internal validation, ranging from 0.642 to 0.750 (Table
  7800. \begin_inset CommandInset ref
  7801. LatexCommand ref
  7802. reference "tab:AUC-PAM"
  7803. plural "false"
  7804. caps "false"
  7805. noprefix "false"
  7806. \end_inset
  7807. ).
  7808. With or without GRSN, RMA shows its dominance over dChip in this more challengi
  7809. ng test.
  7810. Unlike in the internal validation, GRSN actually improves the classifier
  7811. performance for RMA, although it does not for dChip.
  7812. Once again, both single-channel methods perform about on par with RMA,
  7813. with fRMA performing slightly better and SCAN performing a bit worse.
  7814. Figure
  7815. \begin_inset CommandInset ref
  7816. LatexCommand ref
  7817. reference "fig:ROC-PAM-ext"
  7818. plural "false"
  7819. caps "false"
  7820. noprefix "false"
  7821. \end_inset
  7822. shows the ROC curves for the external validation test.
  7823. As expected, none of them are as clean-looking as the internal validation
  7824. ROC curves.
  7825. The curves for RMA, RMA+GRSN, and fRMA all look similar, while the other
  7826. curves look more divergent.
  7827. \end_layout
  7828. \begin_layout Subsection
  7829. fRMA with custom-generated vectors enables single-channel normalization
  7830. on hthgu133pluspm platform
  7831. \end_layout
  7832. \begin_layout Standard
  7833. \begin_inset Float figure
  7834. wide false
  7835. sideways false
  7836. status open
  7837. \begin_layout Plain Layout
  7838. \align center
  7839. \begin_inset Float figure
  7840. placement tb
  7841. wide false
  7842. sideways false
  7843. status collapsed
  7844. \begin_layout Plain Layout
  7845. \align center
  7846. \begin_inset Graphics
  7847. filename graphics/frma-pax-bx/batchsize_batches.pdf
  7848. lyxscale 50
  7849. height 35theight%
  7850. groupId frmatools-subfig
  7851. \end_inset
  7852. \end_layout
  7853. \begin_layout Plain Layout
  7854. \begin_inset Caption Standard
  7855. \begin_layout Plain Layout
  7856. \begin_inset CommandInset label
  7857. LatexCommand label
  7858. name "fig:batch-size-batches"
  7859. \end_inset
  7860. \series bold
  7861. Number of batches usable in fRMA probe weight learning as a function of
  7862. batch size.
  7863. \end_layout
  7864. \end_inset
  7865. \end_layout
  7866. \end_inset
  7867. \end_layout
  7868. \begin_layout Plain Layout
  7869. \align center
  7870. \begin_inset Float figure
  7871. placement tb
  7872. wide false
  7873. sideways false
  7874. status collapsed
  7875. \begin_layout Plain Layout
  7876. \align center
  7877. \begin_inset Graphics
  7878. filename graphics/frma-pax-bx/batchsize_samples.pdf
  7879. lyxscale 50
  7880. height 35theight%
  7881. groupId frmatools-subfig
  7882. \end_inset
  7883. \end_layout
  7884. \begin_layout Plain Layout
  7885. \begin_inset Caption Standard
  7886. \begin_layout Plain Layout
  7887. \begin_inset CommandInset label
  7888. LatexCommand label
  7889. name "fig:batch-size-samples"
  7890. \end_inset
  7891. \series bold
  7892. Number of samples usable in fRMA probe weight learning as a function of
  7893. batch size.
  7894. \end_layout
  7895. \end_inset
  7896. \end_layout
  7897. \end_inset
  7898. \end_layout
  7899. \begin_layout Plain Layout
  7900. \begin_inset Caption Standard
  7901. \begin_layout Plain Layout
  7902. \series bold
  7903. \begin_inset CommandInset label
  7904. LatexCommand label
  7905. name "fig:frmatools-batch-size"
  7906. \end_inset
  7907. Effect of batch size selection on number of batches and number of samples
  7908. included in fRMA probe weight learning.
  7909. \series default
  7910. For batch sizes ranging from 3 to 15, the number of batches (a) and samples
  7911. (b) included in probe weight training were plotted for biopsy (BX) and
  7912. blood (PAX) samples.
  7913. The selected batch size, 5, is marked with a dotted vertical line.
  7914. \end_layout
  7915. \end_inset
  7916. \end_layout
  7917. \end_inset
  7918. \end_layout
  7919. \begin_layout Standard
  7920. In order to enable use of fRMA to normalize hthgu133pluspm, a custom set
  7921. of fRMA vectors was created.
  7922. First, an appropriate batch size was chosen by looking at the number of
  7923. batches and number of samples included as a function of batch size (Figure
  7924. \begin_inset CommandInset ref
  7925. LatexCommand ref
  7926. reference "fig:frmatools-batch-size"
  7927. plural "false"
  7928. caps "false"
  7929. noprefix "false"
  7930. \end_inset
  7931. ).
  7932. For a given batch size, all batches with fewer samples than the chosen
  7933. size must be ignored during training, while larger batches must be randomly
  7934. downsampled to the chosen size.
  7935. Hence, the number of samples included for a given batch size equals the
  7936. batch size times the number of batches with at least that many samples.
  7937. From Figure
  7938. \begin_inset CommandInset ref
  7939. LatexCommand ref
  7940. reference "fig:batch-size-samples"
  7941. plural "false"
  7942. caps "false"
  7943. noprefix "false"
  7944. \end_inset
  7945. , it is apparent that a batch size of 8 maximizes the number of samples
  7946. included in training.
  7947. Increasing the batch size beyond this causes too many smaller batches to
  7948. be excluded, reducing the total number of samples for both tissue types.
  7949. However, a batch size of 8 is not necessarily optimal.
  7950. The article introducing frmaTools concluded that it was highly advantageous
  7951. to use a smaller batch size in order to include more batches, even at the
  7952. expense of including fewer total samples in training
  7953. \begin_inset CommandInset citation
  7954. LatexCommand cite
  7955. key "McCall2011"
  7956. literal "false"
  7957. \end_inset
  7958. .
  7959. To strike an appropriate balance between more batches and more samples,
  7960. a batch size of 5 was chosen.
  7961. For both blood and biopsy samples, this increased the number of batches
  7962. included by 10, with only a modest reduction in the number of samples compared
  7963. to a batch size of 8.
  7964. With a batch size of 5, 26 batches of biopsy samples and 46 batches of
  7965. blood samples were available.
  7966. \end_layout
  7967. \begin_layout Standard
  7968. \begin_inset Float figure
  7969. wide false
  7970. sideways false
  7971. status collapsed
  7972. \begin_layout Plain Layout
  7973. \begin_inset Float figure
  7974. wide false
  7975. sideways false
  7976. status open
  7977. \begin_layout Plain Layout
  7978. \align center
  7979. \begin_inset Graphics
  7980. filename graphics/frma-pax-bx/M-BX-violin.pdf
  7981. lyxscale 40
  7982. width 45col%
  7983. groupId m-violin
  7984. \end_inset
  7985. \end_layout
  7986. \begin_layout Plain Layout
  7987. \begin_inset Caption Standard
  7988. \begin_layout Plain Layout
  7989. \begin_inset CommandInset label
  7990. LatexCommand label
  7991. name "fig:m-bx-violin"
  7992. \end_inset
  7993. \series bold
  7994. Violin plot of inter-normalization log ratios for biopsy samples.
  7995. \end_layout
  7996. \end_inset
  7997. \end_layout
  7998. \end_inset
  7999. \begin_inset space \hfill{}
  8000. \end_inset
  8001. \begin_inset Float figure
  8002. wide false
  8003. sideways false
  8004. status collapsed
  8005. \begin_layout Plain Layout
  8006. \align center
  8007. \begin_inset Graphics
  8008. filename graphics/frma-pax-bx/M-PAX-violin.pdf
  8009. lyxscale 40
  8010. width 45col%
  8011. groupId m-violin
  8012. \end_inset
  8013. \end_layout
  8014. \begin_layout Plain Layout
  8015. \begin_inset Caption Standard
  8016. \begin_layout Plain Layout
  8017. \begin_inset CommandInset label
  8018. LatexCommand label
  8019. name "fig:m-pax-violin"
  8020. \end_inset
  8021. \series bold
  8022. Violin plot of inter-normalization log ratios for blood samples.
  8023. \end_layout
  8024. \end_inset
  8025. \end_layout
  8026. \end_inset
  8027. \end_layout
  8028. \begin_layout Plain Layout
  8029. \begin_inset Caption Standard
  8030. \begin_layout Plain Layout
  8031. \begin_inset CommandInset label
  8032. LatexCommand label
  8033. name "fig:frma-violin"
  8034. \end_inset
  8035. \series bold
  8036. Violin plot of log ratios between normalizations for 20 biopsy samples.
  8037. \series default
  8038. Each of 20 randomly selected samples was normalized with RMA and with 5
  8039. different sets of fRMA vectors.
  8040. The distribution of log ratios between normalized expression values, aggregated
  8041. across all 20 arrays, was plotted for each pair of normalizations.
  8042. \end_layout
  8043. \end_inset
  8044. \end_layout
  8045. \end_inset
  8046. \end_layout
  8047. \begin_layout Standard
  8048. Since fRMA training requires equal-size batches, larger batches are downsampled
  8049. randomly.
  8050. This introduces a nondeterministic step in the generation of normalization
  8051. vectors.
  8052. To show that this randomness does not substantially change the outcome,
  8053. the random downsampling and subsequent vector learning was repeated 5 times,
  8054. with a different random seed each time.
  8055. 20 samples were selected at random as a test set and normalized with each
  8056. of the 5 sets of fRMA normalization vectors as well as ordinary RMA, and
  8057. the normalized expression values were compared across normalizations.
  8058. Figure
  8059. \begin_inset CommandInset ref
  8060. LatexCommand ref
  8061. reference "fig:m-bx-violin"
  8062. plural "false"
  8063. caps "false"
  8064. noprefix "false"
  8065. \end_inset
  8066. shows a summary of these comparisons for biopsy samples.
  8067. Comparing RMA to each of the 5 fRMA normalizations, the distribution of
  8068. log ratios is somewhat wide, indicating that the normalizations disagree
  8069. on the expression values of a fair number of probe sets.
  8070. In contrast, in comparisons of fRMA against fRMA, the vast majority of probe
  8071. sets have very small log ratios, indicating a very high agreement between
  8072. the normalized values generated by the two normalizations.
  8073. This shows that the fRMA normalization's behavior is not very sensitive
  8074. to the random downsampling of larger batches during training.
  8075. \end_layout
  8076. \begin_layout Standard
  8077. \begin_inset Float figure
  8078. wide false
  8079. sideways false
  8080. status open
  8081. \begin_layout Plain Layout
  8082. \align center
  8083. \begin_inset Float figure
  8084. wide false
  8085. sideways false
  8086. status collapsed
  8087. \begin_layout Plain Layout
  8088. \align center
  8089. \begin_inset Graphics
  8090. filename graphics/frma-pax-bx/MA-BX-RMA.fRMA-RASTER.png
  8091. lyxscale 10
  8092. width 45col%
  8093. groupId ma-frma
  8094. \end_inset
  8095. \end_layout
  8096. \begin_layout Plain Layout
  8097. \begin_inset Caption Standard
  8098. \begin_layout Plain Layout
  8099. \begin_inset CommandInset label
  8100. LatexCommand label
  8101. name "fig:ma-bx-rma-frma"
  8102. \end_inset
  8103. RMA vs.
  8104. fRMA for biopsy samples.
  8105. \end_layout
  8106. \end_inset
  8107. \end_layout
  8108. \end_inset
  8109. \begin_inset space \hfill{}
  8110. \end_inset
  8111. \begin_inset Float figure
  8112. wide false
  8113. sideways false
  8114. status collapsed
  8115. \begin_layout Plain Layout
  8116. \align center
  8117. \begin_inset Graphics
  8118. filename graphics/frma-pax-bx/MA-BX-fRMA.fRMA-RASTER.png
  8119. lyxscale 10
  8120. width 45col%
  8121. groupId ma-frma
  8122. \end_inset
  8123. \end_layout
  8124. \begin_layout Plain Layout
  8125. \begin_inset Caption Standard
  8126. \begin_layout Plain Layout
  8127. \begin_inset CommandInset label
  8128. LatexCommand label
  8129. name "fig:ma-bx-frma-frma"
  8130. \end_inset
  8131. fRMA vs fRMA for biopsy samples.
  8132. \end_layout
  8133. \end_inset
  8134. \end_layout
  8135. \end_inset
  8136. \end_layout
  8137. \begin_layout Plain Layout
  8138. \align center
  8139. \begin_inset Float figure
  8140. wide false
  8141. sideways false
  8142. status collapsed
  8143. \begin_layout Plain Layout
  8144. \align center
  8145. \begin_inset Graphics
  8146. filename graphics/frma-pax-bx/MA-PAX-RMA.fRMA-RASTER.png
  8147. lyxscale 10
  8148. width 45col%
  8149. groupId ma-frma
  8150. \end_inset
  8151. \end_layout
  8152. \begin_layout Plain Layout
  8153. \begin_inset Caption Standard
  8154. \begin_layout Plain Layout
  8155. \begin_inset CommandInset label
  8156. LatexCommand label
  8157. name "fig:MA-PAX-rma-frma"
  8158. \end_inset
  8159. RMA vs.
  8160. fRMA for blood samples.
  8161. \end_layout
  8162. \end_inset
  8163. \end_layout
  8164. \end_inset
  8165. \begin_inset space \hfill{}
  8166. \end_inset
  8167. \begin_inset Float figure
  8168. wide false
  8169. sideways false
  8170. status collapsed
  8171. \begin_layout Plain Layout
  8172. \align center
  8173. \begin_inset Graphics
  8174. filename graphics/frma-pax-bx/MA-PAX-fRMA.fRMA-RASTER.png
  8175. lyxscale 10
  8176. width 45col%
  8177. groupId ma-frma
  8178. \end_inset
  8179. \end_layout
  8180. \begin_layout Plain Layout
  8181. \begin_inset Caption Standard
  8182. \begin_layout Plain Layout
  8183. \begin_inset CommandInset label
  8184. LatexCommand label
  8185. name "fig:MA-PAX-frma-frma"
  8186. \end_inset
  8187. fRMA vs fRMA for blood samples.
  8188. \end_layout
  8189. \end_inset
  8190. \end_layout
  8191. \end_inset
  8192. \end_layout
  8193. \begin_layout Plain Layout
  8194. \begin_inset Caption Standard
  8195. \begin_layout Plain Layout
  8196. \series bold
  8197. \begin_inset CommandInset label
  8198. LatexCommand label
  8199. name "fig:Representative-MA-plots"
  8200. \end_inset
  8201. Representative MA plots comparing RMA and custom fRMA normalizations.
  8202. \series default
  8203. For each plot, 20 samples were normalized using 2 different normalizations,
  8204. and then averages (A) and log ratios (M) were plotted between the two different
  8205. normalizations for every probe.
  8206. For the
  8207. \begin_inset Quotes eld
  8208. \end_inset
  8209. fRMA vs fRMA
  8210. \begin_inset Quotes erd
  8211. \end_inset
  8212. plots (b & d), two different fRMA normalizations using vectors from two
  8213. independent batch samplings were compared.
  8214. Density of points is represented by blue shading, and individual outlier
  8215. points are plotted.
  8216. \end_layout
  8217. \end_inset
  8218. \end_layout
  8219. \end_inset
  8220. \end_layout
  8221. \begin_layout Standard
  8222. Figure
  8223. \begin_inset CommandInset ref
  8224. LatexCommand ref
  8225. reference "fig:ma-bx-rma-frma"
  8226. plural "false"
  8227. caps "false"
  8228. noprefix "false"
  8229. \end_inset
  8230. shows an MA plot of the RMA-normalized values against the fRMA-normalized
  8231. values for the same probe sets and arrays, corresponding to the first row
  8232. of Figure
  8233. \begin_inset CommandInset ref
  8234. LatexCommand ref
  8235. reference "fig:m-bx-violin"
  8236. plural "false"
  8237. caps "false"
  8238. noprefix "false"
  8239. \end_inset
  8240. .
  8241. This MA plot shows that not only is there a wide distribution of M-values,
  8242. but the trend of M-values is dependent on the average normalized intensity.
  8243. This is expected, since the overall trend represents the differences in
  8244. the quantile normalization step.
  8245. When running RMA, only the quantiles for these specific 20 arrays are used,
  8246. while for fRMA the quantile distribution is taken from all arrays used
  8247. in training.
  8248. Figure
  8249. \begin_inset CommandInset ref
  8250. LatexCommand ref
  8251. reference "fig:ma-bx-frma-frma"
  8252. plural "false"
  8253. caps "false"
  8254. noprefix "false"
  8255. \end_inset
  8256. shows a similar MA plot comparing 2 different fRMA normalizations, correspondin
  8257. g to the 6th row of Figure
  8258. \begin_inset CommandInset ref
  8259. LatexCommand ref
  8260. reference "fig:m-bx-violin"
  8261. plural "false"
  8262. caps "false"
  8263. noprefix "false"
  8264. \end_inset
  8265. .
  8266. The MA plot is very tightly centered around zero with no visible trend.
  8267. Figures
  8268. \begin_inset CommandInset ref
  8269. LatexCommand ref
  8270. reference "fig:m-pax-violin"
  8271. plural "false"
  8272. caps "false"
  8273. noprefix "false"
  8274. \end_inset
  8275. ,
  8276. \begin_inset CommandInset ref
  8277. LatexCommand ref
  8278. reference "fig:MA-PAX-rma-frma"
  8279. plural "false"
  8280. caps "false"
  8281. noprefix "false"
  8282. \end_inset
  8283. , and
  8284. \begin_inset CommandInset ref
  8285. LatexCommand ref
  8286. reference "fig:MA-PAX-frma-frma"
  8287. plural "false"
  8288. caps "false"
  8289. noprefix "false"
  8290. \end_inset
  8291. show exactly the same information for the blood samples, once again comparing
  8292. the normalized expression values between normalizations for all probe sets
  8293. across 20 randomly selected test arrays.
  8294. Once again, there is a wider distribution of log ratios between RMA-normalized
  8295. values and fRMA-normalized values, and a much tighter distribution when comparing
  8296. different fRMA normalizations to each other, indicating that the fRMA training
  8297. process is robust to random batch downsampling for the blood samples as
  8298. well.
  8299. \end_layout
  8300. \begin_layout Subsection
  8301. SVA, voom, and array weights improve model fit for methylation array data
  8302. \end_layout
  8303. \begin_layout Standard
  8304. \begin_inset ERT
  8305. status open
  8306. \begin_layout Plain Layout
  8307. \backslash
  8308. afterpage{
  8309. \end_layout
  8310. \begin_layout Plain Layout
  8311. \backslash
  8312. begin{landscape}
  8313. \end_layout
  8314. \end_inset
  8315. \end_layout
  8316. \begin_layout Standard
  8317. \begin_inset Float figure
  8318. wide false
  8319. sideways false
  8320. status open
  8321. \begin_layout Plain Layout
  8322. \begin_inset Flex TODO Note (inline)
  8323. status open
  8324. \begin_layout Plain Layout
  8325. Fix axis labels:
  8326. \begin_inset Quotes eld
  8327. \end_inset
  8328. log2 M-value
  8329. \begin_inset Quotes erd
  8330. \end_inset
  8331. is redundant because M-values are already log scale
  8332. \end_layout
  8333. \end_inset
  8334. \end_layout
  8335. \begin_layout Plain Layout
  8336. \begin_inset Float figure
  8337. wide false
  8338. sideways false
  8339. status collapsed
  8340. \begin_layout Plain Layout
  8341. \align center
  8342. \begin_inset Graphics
  8343. filename graphics/methylvoom/unadj.dupcor/meanvar-trends-PAGE1-CROP-RASTER.png
  8344. lyxscale 15
  8345. width 30col%
  8346. groupId voomaw-subfig
  8347. \end_inset
  8348. \end_layout
  8349. \begin_layout Plain Layout
  8350. \begin_inset Caption Standard
  8351. \begin_layout Plain Layout
  8352. \begin_inset CommandInset label
  8353. LatexCommand label
  8354. name "fig:meanvar-basic"
  8355. \end_inset
  8356. Mean-variance trend for analysis A.
  8357. \end_layout
  8358. \end_inset
  8359. \end_layout
  8360. \end_inset
  8361. \begin_inset space \hfill{}
  8362. \end_inset
  8363. \begin_inset Float figure
  8364. wide false
  8365. sideways false
  8366. status collapsed
  8367. \begin_layout Plain Layout
  8368. \align center
  8369. \begin_inset Graphics
  8370. filename graphics/methylvoom/unadj.dupcor.sva.aw/meanvar-trends-PAGE1-CROP-RASTER.png
  8371. lyxscale 15
  8372. width 30col%
  8373. groupId voomaw-subfig
  8374. \end_inset
  8375. \end_layout
  8376. \begin_layout Plain Layout
  8377. \begin_inset Caption Standard
  8378. \begin_layout Plain Layout
  8379. \begin_inset CommandInset label
  8380. LatexCommand label
  8381. name "fig:meanvar-sva-aw"
  8382. \end_inset
  8383. Mean-variance trend for analysis B.
  8384. \end_layout
  8385. \end_inset
  8386. \end_layout
  8387. \end_inset
  8388. \begin_inset space \hfill{}
  8389. \end_inset
  8390. \begin_inset Float figure
  8391. wide false
  8392. sideways false
  8393. status collapsed
  8394. \begin_layout Plain Layout
  8395. \align center
  8396. \begin_inset Graphics
  8397. filename graphics/methylvoom/unadj.dupcor.sva.voomaw/meanvar-trends-PAGE2-CROP-RASTER.png
  8398. lyxscale 15
  8399. width 30col%
  8400. groupId voomaw-subfig
  8401. \end_inset
  8402. \end_layout
  8403. \begin_layout Plain Layout
  8404. \begin_inset Caption Standard
  8405. \begin_layout Plain Layout
  8406. \begin_inset CommandInset label
  8407. LatexCommand label
  8408. name "fig:meanvar-sva-voomaw"
  8409. \end_inset
  8410. Mean-variance trend after voom modeling in analysis C.
  8411. \end_layout
  8412. \end_inset
  8413. \end_layout
  8414. \end_inset
  8415. \end_layout
  8416. \begin_layout Plain Layout
  8417. \begin_inset Caption Standard
  8418. \begin_layout Plain Layout
  8419. \series bold
  8420. Mean-variance trend modeling in methylation array data.
  8421. \series default
  8422. The estimated log2(standard deviation) for each probe is plotted against
  8423. the probe's average M-value across all samples as a black point, with some
  8424. transparency to make over-plotting more visible, since there are about
  8425. 450,000 points.
  8426. Density of points is also indicated by the dark blue contour lines.
  8427. The prior variance trend estimated by eBayes is shown in light blue, while
  8428. the lowess trend of the points is shown in red.
  8429. \end_layout
  8430. \end_inset
  8431. \end_layout
  8432. \end_inset
  8433. \end_layout
  8434. \begin_layout Standard
  8435. \begin_inset ERT
  8436. status open
  8437. \begin_layout Plain Layout
  8438. \backslash
  8439. end{landscape}
  8440. \end_layout
  8441. \begin_layout Plain Layout
  8442. }
  8443. \end_layout
  8444. \end_inset
  8445. \end_layout
  8446. \begin_layout Standard
  8447. Figure
  8448. \begin_inset CommandInset ref
  8449. LatexCommand ref
  8450. reference "fig:meanvar-basic"
  8451. plural "false"
  8452. caps "false"
  8453. noprefix "false"
  8454. \end_inset
  8455. shows the relationship between the mean M-value and the standard deviation
  8456. calculated for each probe in the methylation array data set.
  8457. A few features of the data are apparent.
  8458. First, the data are very strongly bimodal, with peaks in the density around
  8459. M-values of +4 and -4.
  8460. These modes correspond to methylation sites that are nearly 100% methylated
  8461. and nearly 100% unmethylated, respectively.
  8462. The strong bimodality indicates that a majority of probes interrogate sites
  8463. that fall into one of these two categories.
  8464. The points in between these modes represent sites that are either partially
  8465. methylated in many samples, or are fully methylated in some samples and
  8466. fully unmethylated in other samples, or some combination.
  8467. The next visible feature of the data is the W-shaped variance trend.
  8468. The upticks in the variance trend on either side are expected, based on
  8469. the sigmoid transformation exaggerating small differences at extreme M-values
  8470. (Figure
  8471. \begin_inset CommandInset ref
  8472. LatexCommand ref
  8473. reference "fig:Sigmoid-beta-m-mapping"
  8474. plural "false"
  8475. caps "false"
  8476. noprefix "false"
  8477. \end_inset
  8478. ).
  8479. However, the uptick in the center is interesting: it indicates that sites
  8480. that are not constitutively methylated or unmethylated have a higher variance.
  8481. This could be a genuine biological effect, or it could be spurious noise
  8482. that is only observable at sites with varying methylation.
  8483. \end_layout
  8484. \begin_layout Standard
  8485. In Figure
  8486. \begin_inset CommandInset ref
  8487. LatexCommand ref
  8488. reference "fig:meanvar-sva-aw"
  8489. plural "false"
  8490. caps "false"
  8491. noprefix "false"
  8492. \end_inset
  8493. , we see the mean-variance trend for the same methylation array data, this
  8494. time with surrogate variables and sample quality weights estimated from
  8495. the data and included in the model.
  8496. As expected, the overall average variance is smaller, since the surrogate
  8497. variables account for some of the variance.
  8498. In addition, the uptick in variance in the middle of the M-value range
  8499. has disappeared, turning the W shape into a wide U shape.
  8500. This indicates that the excess variance in the probes with intermediate
  8501. M-values was explained by systematic variations not correlated with known
  8502. covariates, and these variations were modeled by the surrogate variables.
  8503. The result is a nearly flat variance trend for the entire intermediate
  8504. M-value range from about -3 to +3.
  8505. Note that this corresponds closely to the range within which the M-value
  8506. transformation shown in Figure
  8507. \begin_inset CommandInset ref
  8508. LatexCommand ref
  8509. reference "fig:Sigmoid-beta-m-mapping"
  8510. plural "false"
  8511. caps "false"
  8512. noprefix "false"
  8513. \end_inset
  8514. is nearly linear.
  8515. In contrast, the excess variance at the extremes (greater than +3 and less
  8516. than -3) was not
  8517. \begin_inset Quotes eld
  8518. \end_inset
  8519. absorbed
  8520. \begin_inset Quotes erd
  8521. \end_inset
  8522. by the surrogate variables and remains in the plot, indicating that this
  8523. variation has no systematic component: probes with extreme M-values are
  8524. uniformly more variable across all samples, as expected.
  8525. \end_layout
  8526. \begin_layout Standard
  8527. Figure
  8528. \begin_inset CommandInset ref
  8529. LatexCommand ref
  8530. reference "fig:meanvar-sva-voomaw"
  8531. plural "false"
  8532. caps "false"
  8533. noprefix "false"
  8534. \end_inset
  8535. shows the mean-variance trend after fitting the model with the observation
  8536. weights assigned by voom based on the mean-variance trend shown in Figure
  8537. \begin_inset CommandInset ref
  8538. LatexCommand ref
  8539. reference "fig:meanvar-sva-aw"
  8540. plural "false"
  8541. caps "false"
  8542. noprefix "false"
  8543. \end_inset
  8544. .
  8545. As expected, the weights exactly counteract the trend in the data, resulting
  8546. in a nearly flat trend centered vertically at 1 (i.e.
  8547. 0 on the log scale).
  8548. This shows that the observations with extreme M-values have been appropriately
  8549. down-weighted to account for the fact that the noise in those observations
  8550. has been amplified by the non-linear M-value transformation.
  8551. In turn, this gives relatively more weight to observations in the middle
  8552. region, which are more likely to correspond to probes measuring interesting
  8553. biology (not constitutively methylated or unmethylated).
  8554. \end_layout
  8555. \begin_layout Standard
  8556. \begin_inset Float table
  8557. wide false
  8558. sideways false
  8559. status open
  8560. \begin_layout Plain Layout
  8561. \align center
  8562. \begin_inset Tabular
  8563. <lyxtabular version="3" rows="5" columns="3">
  8564. <features tabularvalignment="middle">
  8565. <column alignment="center" valignment="top">
  8566. <column alignment="center" valignment="top">
  8567. <column alignment="center" valignment="top">
  8568. <row>
  8569. <cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" usebox="none">
  8570. \begin_inset Text
  8571. \begin_layout Plain Layout
  8572. Covariate
  8573. \end_layout
  8574. \end_inset
  8575. </cell>
  8576. <cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" usebox="none">
  8577. \begin_inset Text
  8578. \begin_layout Plain Layout
  8579. Test used
  8580. \end_layout
  8581. \end_inset
  8582. </cell>
  8583. <cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" rightline="true" usebox="none">
  8584. \begin_inset Text
  8585. \begin_layout Plain Layout
  8586. p-value
  8587. \end_layout
  8588. \end_inset
  8589. </cell>
  8590. </row>
  8591. <row>
  8592. <cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
  8593. \begin_inset Text
  8594. \begin_layout Plain Layout
  8595. Transplant Status
  8596. \end_layout
  8597. \end_inset
  8598. </cell>
  8599. <cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
  8600. \begin_inset Text
  8601. \begin_layout Plain Layout
  8602. F-test
  8603. \end_layout
  8604. \end_inset
  8605. </cell>
  8606. <cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
  8607. \begin_inset Text
  8608. \begin_layout Plain Layout
  8609. 0.404
  8610. \end_layout
  8611. \end_inset
  8612. </cell>
  8613. </row>
  8614. <row>
  8615. <cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
  8616. \begin_inset Text
  8617. \begin_layout Plain Layout
  8618. Diabetes Diagnosis
  8619. \end_layout
  8620. \end_inset
  8621. </cell>
  8622. <cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
  8623. \begin_inset Text
  8624. \begin_layout Plain Layout
  8625. \emph on
  8626. t
  8627. \emph default
  8628. -test
  8629. \end_layout
  8630. \end_inset
  8631. </cell>
  8632. <cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
  8633. \begin_inset Text
  8634. \begin_layout Plain Layout
  8635. 0.00106
  8636. \end_layout
  8637. \end_inset
  8638. </cell>
  8639. </row>
  8640. <row>
  8641. <cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
  8642. \begin_inset Text
  8643. \begin_layout Plain Layout
  8644. Sex
  8645. \end_layout
  8646. \end_inset
  8647. </cell>
  8648. <cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
  8649. \begin_inset Text
  8650. \begin_layout Plain Layout
  8651. \emph on
  8652. t
  8653. \emph default
  8654. -test
  8655. \end_layout
  8656. \end_inset
  8657. </cell>
  8658. <cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
  8659. \begin_inset Text
  8660. \begin_layout Plain Layout
  8661. 0.148
  8662. \end_layout
  8663. \end_inset
  8664. </cell>
  8665. </row>
  8666. <row>
  8667. <cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" usebox="none">
  8668. \begin_inset Text
  8669. \begin_layout Plain Layout
  8670. Age
  8671. \end_layout
  8672. \end_inset
  8673. </cell>
  8674. <cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" usebox="none">
  8675. \begin_inset Text
  8676. \begin_layout Plain Layout
  8677. linear regression
  8678. \end_layout
  8679. \end_inset
  8680. </cell>
  8681. <cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" rightline="true" usebox="none">
  8682. \begin_inset Text
  8683. \begin_layout Plain Layout
  8684. 0.212
  8685. \end_layout
  8686. \end_inset
  8687. </cell>
  8688. </row>
  8689. </lyxtabular>
  8690. \end_inset
  8691. \end_layout
  8692. \begin_layout Plain Layout
  8693. \begin_inset Caption Standard
  8694. \begin_layout Plain Layout
  8695. \series bold
  8696. \begin_inset CommandInset label
  8697. LatexCommand label
  8698. name "tab:weight-covariate-tests"
  8699. \end_inset
  8700. Association of sample weights with clinical covariates in methylation array
  8701. data.
  8702. \series default
  8703. Computed sample quality log weights were tested for significant association
  8704. with each of the variables in the model (1st column).
  8705. An appropriate test was selected for each variable based on whether the
  8706. variable had 2 categories (
  8707. \emph on
  8708. t
  8709. \emph default
  8710. -test), had more than 2 categories (F-test), or was numeric (linear regression).
  8711. The test selected is shown in the 2nd column.
  8712. P-values for association with the log weights are shown in the 3rd column.
  8713. No multiple testing adjustment was performed for these p-values.
  8714. \end_layout
  8715. \end_inset
  8716. \end_layout
  8717. \end_inset
  8718. \end_layout
  8719. \begin_layout Standard
  8720. \begin_inset Float figure
  8721. wide false
  8722. sideways false
  8723. status open
  8724. \begin_layout Plain Layout
  8725. \begin_inset Flex TODO Note (inline)
  8726. status open
  8727. \begin_layout Plain Layout
  8728. Redo the sample weight boxplot with notches, and remove fill colors
  8729. \end_layout
  8730. \end_inset
  8731. \end_layout
  8732. \begin_layout Plain Layout
  8733. \align center
  8734. \begin_inset Graphics
  8735. filename graphics/methylvoom/unadj.dupcor.sva.voomaw/sample-weights-PAGE3-CROP.pdf
  8736. lyxscale 50
  8737. width 60col%
  8738. groupId colwidth
  8739. \end_inset
  8740. \end_layout
  8741. \begin_layout Plain Layout
  8742. \begin_inset Caption Standard
  8743. \begin_layout Plain Layout
  8744. \begin_inset CommandInset label
  8745. LatexCommand label
  8746. name "fig:diabetes-sample-weights"
  8747. \end_inset
  8748. \series bold
  8749. Box-and-whiskers plot of sample quality weights grouped by diabetes diagnosis.
  8750. \series default
  8751. Samples were grouped based on diabetes diagnosis, and the distribution of
  8752. sample quality weights for each diagnosis was plotted as a box-and-whiskers
  8753. plot
  8754. \begin_inset CommandInset citation
  8755. LatexCommand cite
  8756. key "McGill1978"
  8757. literal "false"
  8758. \end_inset
  8759. .
  8760. \end_layout
  8761. \end_inset
  8762. \end_layout
  8763. \begin_layout Plain Layout
  8764. \end_layout
  8765. \end_inset
  8766. \end_layout
  8767. \begin_layout Standard
  8768. To determine whether any of the known experimental factors had an impact
  8769. on data quality, the sample quality weights estimated from the data were
  8770. tested for association with each of the experimental factors (Table
  8771. \begin_inset CommandInset ref
  8772. LatexCommand ref
  8773. reference "tab:weight-covariate-tests"
  8774. plural "false"
  8775. caps "false"
  8776. noprefix "false"
  8777. \end_inset
  8778. ).
  8779. Diabetes diagnosis was found to have a potentially significant association
  8780. with the sample weights, with a t-test p-value of
  8781. \begin_inset Formula $1.06\times10^{-3}$
  8782. \end_inset
  8783. .
  8784. Figure
  8785. \begin_inset CommandInset ref
  8786. LatexCommand ref
  8787. reference "fig:diabetes-sample-weights"
  8788. plural "false"
  8789. caps "false"
  8790. noprefix "false"
  8791. \end_inset
  8792. shows the distribution of sample weights grouped by diabetes diagnosis.
  8793. The samples from patients with Type 2 diabetes were assigned significantly
  8794. lower weights than those from patients with Type 1 diabetes.
  8795. This indicates that the Type 2 diabetes samples had an overall higher variance
  8796. on average across all probes.
  8797. \end_layout
  8798. \begin_layout Standard
  8799. \begin_inset Float table
  8800. wide false
  8801. sideways false
  8802. status open
  8803. \begin_layout Plain Layout
  8804. \align center
  8805. \begin_inset Flex TODO Note (inline)
  8806. status open
  8807. \begin_layout Plain Layout
  8808. Consider transposing these tables
  8809. \end_layout
  8810. \end_inset
  8811. \end_layout
  8812. \begin_layout Plain Layout
  8813. \begin_inset Float table
  8814. wide false
  8815. sideways false
  8816. status open
  8817. \begin_layout Plain Layout
  8818. \align center
  8819. \begin_inset Tabular
  8820. <lyxtabular version="3" rows="5" columns="4">
  8821. <features tabularvalignment="middle">
  8822. <column alignment="center" valignment="top">
  8823. <column alignment="center" valignment="top">
  8824. <column alignment="center" valignment="top">
  8825. <column alignment="center" valignment="top">
  8826. <row>
  8827. <cell alignment="center" valignment="top" usebox="none">
  8828. \begin_inset Text
  8829. \begin_layout Plain Layout
  8830. \end_layout
  8831. \end_inset
  8832. </cell>
  8833. <cell multicolumn="1" alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
  8834. \begin_inset Text
  8835. \begin_layout Plain Layout
  8836. Analysis
  8837. \end_layout
  8838. \end_inset
  8839. </cell>
  8840. <cell multicolumn="2" alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
  8841. \begin_inset Text
  8842. \begin_layout Plain Layout
  8843. \end_layout
  8844. \end_inset
  8845. </cell>
  8846. <cell multicolumn="2" alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
  8847. \begin_inset Text
  8848. \begin_layout Plain Layout
  8849. \end_layout
  8850. \end_inset
  8851. </cell>
  8852. </row>
  8853. <row>
  8854. <cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" usebox="none">
  8855. \begin_inset Text
  8856. \begin_layout Plain Layout
  8857. Contrast
  8858. \end_layout
  8859. \end_inset
  8860. </cell>
  8861. <cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" usebox="none">
  8862. \begin_inset Text
  8863. \begin_layout Plain Layout
  8864. A
  8865. \end_layout
  8866. \end_inset
  8867. </cell>
  8868. <cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" usebox="none">
  8869. \begin_inset Text
  8870. \begin_layout Plain Layout
  8871. B
  8872. \end_layout
  8873. \end_inset
  8874. </cell>
  8875. <cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" rightline="true" usebox="none">
  8876. \begin_inset Text
  8877. \begin_layout Plain Layout
  8878. C
  8879. \end_layout
  8880. \end_inset
  8881. </cell>
  8882. </row>
  8883. <row>
  8884. <cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
  8885. \begin_inset Text
  8886. \begin_layout Plain Layout
  8887. TX vs AR
  8888. \end_layout
  8889. \end_inset
  8890. </cell>
  8891. <cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
  8892. \begin_inset Text
  8893. \begin_layout Plain Layout
  8894. 0
  8895. \end_layout
  8896. \end_inset
  8897. </cell>
  8898. <cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
  8899. \begin_inset Text
  8900. \begin_layout Plain Layout
  8901. 25
  8902. \end_layout
  8903. \end_inset
  8904. </cell>
  8905. <cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
  8906. \begin_inset Text
  8907. \begin_layout Plain Layout
  8908. 22
  8909. \end_layout
  8910. \end_inset
  8911. </cell>
  8912. </row>
  8913. <row>
  8914. <cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
  8915. \begin_inset Text
  8916. \begin_layout Plain Layout
  8917. TX vs ADNR
  8918. \end_layout
  8919. \end_inset
  8920. </cell>
  8921. <cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
  8922. \begin_inset Text
  8923. \begin_layout Plain Layout
  8924. 7
  8925. \end_layout
  8926. \end_inset
  8927. </cell>
  8928. <cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
  8929. \begin_inset Text
  8930. \begin_layout Plain Layout
  8931. 338
  8932. \end_layout
  8933. \end_inset
  8934. </cell>
  8935. <cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
  8936. \begin_inset Text
  8937. \begin_layout Plain Layout
  8938. 369
  8939. \end_layout
  8940. \end_inset
  8941. </cell>
  8942. </row>
  8943. <row>
  8944. <cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" usebox="none">
  8945. \begin_inset Text
  8946. \begin_layout Plain Layout
  8947. TX vs CAN
  8948. \end_layout
  8949. \end_inset
  8950. </cell>
  8951. <cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" usebox="none">
  8952. \begin_inset Text
  8953. \begin_layout Plain Layout
  8954. 0
  8955. \end_layout
  8956. \end_inset
  8957. </cell>
  8958. <cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" usebox="none">
  8959. \begin_inset Text
  8960. \begin_layout Plain Layout
  8961. 231
  8962. \end_layout
  8963. \end_inset
  8964. </cell>
  8965. <cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" rightline="true" usebox="none">
  8966. \begin_inset Text
  8967. \begin_layout Plain Layout
  8968. 278
  8969. \end_layout
  8970. \end_inset
  8971. </cell>
  8972. </row>
  8973. </lyxtabular>
  8974. \end_inset
  8975. \end_layout
  8976. \begin_layout Plain Layout
  8977. \begin_inset Caption Standard
  8978. \begin_layout Plain Layout
  8979. \begin_inset CommandInset label
  8980. LatexCommand label
  8981. name "tab:methyl-num-signif"
  8982. \end_inset
  8983. Number of probes significant at 10% FDR.
  8984. \end_layout
  8985. \end_inset
  8986. \end_layout
  8987. \end_inset
  8988. \begin_inset space \hfill{}
  8989. \end_inset
  8990. \begin_inset Float table
  8991. wide false
  8992. sideways false
  8993. status open
  8994. \begin_layout Plain Layout
  8995. \align center
  8996. \begin_inset Tabular
  8997. <lyxtabular version="3" rows="5" columns="4">
  8998. <features tabularvalignment="middle">
  8999. <column alignment="center" valignment="top">
  9000. <column alignment="center" valignment="top">
  9001. <column alignment="center" valignment="top">
  9002. <column alignment="center" valignment="top">
  9003. <row>
  9004. <cell alignment="center" valignment="top" usebox="none">
  9005. \begin_inset Text
  9006. \begin_layout Plain Layout
  9007. \end_layout
  9008. \end_inset
  9009. </cell>
  9010. <cell multicolumn="1" alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
  9011. \begin_inset Text
  9012. \begin_layout Plain Layout
  9013. Analysis
  9014. \end_layout
  9015. \end_inset
  9016. </cell>
  9017. <cell multicolumn="2" alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
  9018. \begin_inset Text
  9019. \begin_layout Plain Layout
  9020. \end_layout
  9021. \end_inset
  9022. </cell>
  9023. <cell multicolumn="2" alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
  9024. \begin_inset Text
  9025. \begin_layout Plain Layout
  9026. \end_layout
  9027. \end_inset
  9028. </cell>
  9029. </row>
  9030. <row>
  9031. <cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" usebox="none">
  9032. \begin_inset Text
  9033. \begin_layout Plain Layout
  9034. Contrast
  9035. \end_layout
  9036. \end_inset
  9037. </cell>
  9038. <cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" usebox="none">
  9039. \begin_inset Text
  9040. \begin_layout Plain Layout
  9041. A
  9042. \end_layout
  9043. \end_inset
  9044. </cell>
  9045. <cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" usebox="none">
  9046. \begin_inset Text
  9047. \begin_layout Plain Layout
  9048. B
  9049. \end_layout
  9050. \end_inset
  9051. </cell>
  9052. <cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" rightline="true" usebox="none">
  9053. \begin_inset Text
  9054. \begin_layout Plain Layout
  9055. C
  9056. \end_layout
  9057. \end_inset
  9058. </cell>
  9059. </row>
  9060. <row>
  9061. <cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
  9062. \begin_inset Text
  9063. \begin_layout Plain Layout
  9064. TX vs AR
  9065. \end_layout
  9066. \end_inset
  9067. </cell>
  9068. <cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
  9069. \begin_inset Text
  9070. \begin_layout Plain Layout
  9071. 0
  9072. \end_layout
  9073. \end_inset
  9074. </cell>
  9075. <cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
  9076. \begin_inset Text
  9077. \begin_layout Plain Layout
  9078. 10,063
  9079. \end_layout
  9080. \end_inset
  9081. </cell>
  9082. <cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
  9083. \begin_inset Text
  9084. \begin_layout Plain Layout
  9085. 11,225
  9086. \end_layout
  9087. \end_inset
  9088. </cell>
  9089. </row>
  9090. <row>
  9091. <cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
  9092. \begin_inset Text
  9093. \begin_layout Plain Layout
  9094. TX vs ADNR
  9095. \end_layout
  9096. \end_inset
  9097. </cell>
  9098. <cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
  9099. \begin_inset Text
  9100. \begin_layout Plain Layout
  9101. 27
  9102. \end_layout
  9103. \end_inset
  9104. </cell>
  9105. <cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
  9106. \begin_inset Text
  9107. \begin_layout Plain Layout
  9108. 12,674
  9109. \end_layout
  9110. \end_inset
  9111. </cell>
  9112. <cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
  9113. \begin_inset Text
  9114. \begin_layout Plain Layout
  9115. 13,086
  9116. \end_layout
  9117. \end_inset
  9118. </cell>
  9119. </row>
  9120. <row>
  9121. <cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" usebox="none">
  9122. \begin_inset Text
  9123. \begin_layout Plain Layout
  9124. TX vs CAN
  9125. \end_layout
  9126. \end_inset
  9127. </cell>
  9128. <cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" usebox="none">
  9129. \begin_inset Text
  9130. \begin_layout Plain Layout
  9131. 966
  9132. \end_layout
  9133. \end_inset
  9134. </cell>
  9135. <cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" usebox="none">
  9136. \begin_inset Text
  9137. \begin_layout Plain Layout
  9138. 20,039
  9139. \end_layout
  9140. \end_inset
  9141. </cell>
  9142. <cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" rightline="true" usebox="none">
  9143. \begin_inset Text
  9144. \begin_layout Plain Layout
  9145. 20,955
  9146. \end_layout
  9147. \end_inset
  9148. </cell>
  9149. </row>
  9150. </lyxtabular>
  9151. \end_inset
  9152. \end_layout
  9153. \begin_layout Plain Layout
  9154. \begin_inset Caption Standard
  9155. \begin_layout Plain Layout
  9156. \begin_inset CommandInset label
  9157. LatexCommand label
  9158. name "tab:methyl-est-nonnull"
  9159. \end_inset
  9160. Estimated number of non-null tests, using the method of averaging local
  9161. FDR values
  9162. \begin_inset CommandInset citation
  9163. LatexCommand cite
  9164. key "Phipson2013Thesis"
  9165. literal "false"
  9166. \end_inset
  9167. .
  9168. \end_layout
  9169. \end_inset
  9170. \end_layout
  9171. \end_inset
  9172. \end_layout
  9173. \begin_layout Plain Layout
  9174. \begin_inset Caption Standard
  9175. \begin_layout Plain Layout
  9176. \series bold
  9177. Estimates of degree of differential methylation for each contrast in
  9178. each analysis.
  9179. \series default
  9180. For each of the analyses in Table
  9181. \begin_inset CommandInset ref
  9182. LatexCommand ref
  9183. reference "tab:Summary-of-meth-analysis"
  9184. plural "false"
  9185. caps "false"
  9186. noprefix "false"
  9187. \end_inset
  9188. , these tables show the number of probes called significantly differentially
  9189. methylated at a threshold of 10% FDR for each comparison between TX and
  9190. the other 3 transplant statuses (a) and the estimated total number of probes
  9191. that are differentially methylated (b).
  9192. \end_layout
  9193. \end_inset
  9194. \end_layout
  9195. \end_inset
  9196. \end_layout
  9197. \begin_layout Standard
  9198. \begin_inset Float figure
  9199. wide false
  9200. sideways false
  9201. status open
  9202. \begin_layout Plain Layout
  9203. \align center
  9204. \series bold
  9205. \begin_inset Float figure
  9206. wide false
  9207. sideways false
  9208. status collapsed
  9209. \begin_layout Plain Layout
  9210. \align center
  9211. \begin_inset Graphics
  9212. filename graphics/methylvoom/unadj.dupcor/pval-histograms-PAGE1.pdf
  9213. lyxscale 33
  9214. width 30col%
  9215. groupId meth-pval-hist
  9216. \end_inset
  9217. \end_layout
  9218. \begin_layout Plain Layout
  9219. \series bold
  9220. \begin_inset Caption Standard
  9221. \begin_layout Plain Layout
  9222. AR vs.
  9223. TX, Analysis A
  9224. \end_layout
  9225. \end_inset
  9226. \end_layout
  9227. \begin_layout Plain Layout
  9228. \end_layout
  9229. \end_inset
  9230. \begin_inset space \hfill{}
  9231. \end_inset
  9232. \begin_inset Float figure
  9233. wide false
  9234. sideways false
  9235. status collapsed
  9236. \begin_layout Plain Layout
  9237. \align center
  9238. \begin_inset Graphics
  9239. filename graphics/methylvoom/unadj.dupcor/pval-histograms-PAGE2.pdf
  9240. lyxscale 33
  9241. width 30col%
  9242. groupId meth-pval-hist
  9243. \end_inset
  9244. \end_layout
  9245. \begin_layout Plain Layout
  9246. \series bold
  9247. \begin_inset Caption Standard
  9248. \begin_layout Plain Layout
  9249. ADNR vs.
  9250. TX, Analysis A
  9251. \end_layout
  9252. \end_inset
  9253. \end_layout
  9254. \end_inset
  9255. \begin_inset space \hfill{}
  9256. \end_inset
  9257. \begin_inset Float figure
  9258. wide false
  9259. sideways false
  9260. status collapsed
  9261. \begin_layout Plain Layout
  9262. \align center
  9263. \begin_inset Graphics
  9264. filename graphics/methylvoom/unadj.dupcor/pval-histograms-PAGE3.pdf
  9265. lyxscale 33
  9266. width 30col%
  9267. groupId meth-pval-hist
  9268. \end_inset
  9269. \end_layout
  9270. \begin_layout Plain Layout
  9271. \series bold
  9272. \begin_inset Caption Standard
  9273. \begin_layout Plain Layout
  9274. CAN vs.
  9275. TX, Analysis A
  9276. \end_layout
  9277. \end_inset
  9278. \end_layout
  9279. \end_inset
  9280. \end_layout
  9281. \begin_layout Plain Layout
  9282. \align center
  9283. \series bold
  9284. \begin_inset Float figure
  9285. wide false
  9286. sideways false
  9287. status collapsed
  9288. \begin_layout Plain Layout
  9289. \align center
  9290. \begin_inset Graphics
  9291. filename graphics/methylvoom/unadj.dupcor.sva.aw/pval-histograms-PAGE1.pdf
  9292. lyxscale 33
  9293. width 30col%
  9294. groupId meth-pval-hist
  9295. \end_inset
  9296. \end_layout
  9297. \begin_layout Plain Layout
  9298. \series bold
  9299. \begin_inset Caption Standard
  9300. \begin_layout Plain Layout
  9301. AR vs.
  9302. TX, Analysis B
  9303. \end_layout
  9304. \end_inset
  9305. \end_layout
  9306. \end_inset
  9307. \begin_inset space \hfill{}
  9308. \end_inset
  9309. \begin_inset Float figure
  9310. wide false
  9311. sideways false
  9312. status collapsed
  9313. \begin_layout Plain Layout
  9314. \align center
  9315. \begin_inset Graphics
  9316. filename graphics/methylvoom/unadj.dupcor.sva.aw/pval-histograms-PAGE2.pdf
  9317. lyxscale 33
  9318. width 30col%
  9319. groupId meth-pval-hist
  9320. \end_inset
  9321. \end_layout
  9322. \begin_layout Plain Layout
  9323. \series bold
  9324. \begin_inset Caption Standard
  9325. \begin_layout Plain Layout
  9326. ADNR vs.
  9327. TX, Analysis B
  9328. \end_layout
  9329. \end_inset
  9330. \end_layout
  9331. \end_inset
  9332. \begin_inset space \hfill{}
  9333. \end_inset
  9334. \begin_inset Float figure
  9335. wide false
  9336. sideways false
  9337. status collapsed
  9338. \begin_layout Plain Layout
  9339. \align center
  9340. \begin_inset Graphics
  9341. filename graphics/methylvoom/unadj.dupcor.sva.aw/pval-histograms-PAGE3.pdf
  9342. lyxscale 33
  9343. width 30col%
  9344. groupId meth-pval-hist
  9345. \end_inset
  9346. \end_layout
  9347. \begin_layout Plain Layout
  9348. \series bold
  9349. \begin_inset Caption Standard
  9350. \begin_layout Plain Layout
  9351. CAN vs.
  9352. TX, Analysis B
  9353. \end_layout
  9354. \end_inset
  9355. \end_layout
  9356. \end_inset
  9357. \end_layout
  9358. \begin_layout Plain Layout
  9359. \align center
  9360. \series bold
  9361. \begin_inset Float figure
  9362. wide false
  9363. sideways false
  9364. status collapsed
  9365. \begin_layout Plain Layout
  9366. \align center
  9367. \begin_inset Graphics
  9368. filename graphics/methylvoom/unadj.dupcor.sva.voomaw/pval-histograms-PAGE1.pdf
  9369. lyxscale 33
  9370. width 30col%
  9371. groupId meth-pval-hist
  9372. \end_inset
  9373. \end_layout
  9374. \begin_layout Plain Layout
  9375. \series bold
  9376. \begin_inset Caption Standard
  9377. \begin_layout Plain Layout
  9378. AR vs.
  9379. TX, Analysis C
  9380. \end_layout
  9381. \end_inset
  9382. \end_layout
  9383. \end_inset
  9384. \begin_inset space \hfill{}
  9385. \end_inset
  9386. \begin_inset Float figure
  9387. wide false
  9388. sideways false
  9389. status collapsed
  9390. \begin_layout Plain Layout
  9391. \align center
  9392. \begin_inset Graphics
  9393. filename graphics/methylvoom/unadj.dupcor.sva.voomaw/pval-histograms-PAGE2.pdf
  9394. lyxscale 33
  9395. width 30col%
  9396. groupId meth-pval-hist
  9397. \end_inset
  9398. \end_layout
  9399. \begin_layout Plain Layout
  9400. \series bold
  9401. \begin_inset Caption Standard
  9402. \begin_layout Plain Layout
  9403. ADNR vs.
  9404. TX, Analysis C
  9405. \end_layout
  9406. \end_inset
  9407. \end_layout
  9408. \end_inset
  9409. \begin_inset space \hfill{}
  9410. \end_inset
  9411. \begin_inset Float figure
  9412. wide false
  9413. sideways false
  9414. status collapsed
  9415. \begin_layout Plain Layout
  9416. \align center
  9417. \begin_inset Graphics
  9418. filename graphics/methylvoom/unadj.dupcor.sva.voomaw/pval-histograms-PAGE3.pdf
  9419. lyxscale 33
  9420. width 30col%
  9421. groupId meth-pval-hist
  9422. \end_inset
  9423. \end_layout
  9424. \begin_layout Plain Layout
  9425. \series bold
  9426. \begin_inset Caption Standard
  9427. \begin_layout Plain Layout
  9428. CAN vs.
  9429. TX, Analysis C
  9430. \end_layout
  9431. \end_inset
  9432. \end_layout
  9433. \end_inset
  9434. \end_layout
  9435. \begin_layout Plain Layout
  9436. \begin_inset Caption Standard
  9437. \begin_layout Plain Layout
  9438. \series bold
  9439. \begin_inset CommandInset label
  9440. LatexCommand label
  9441. name "fig:meth-p-value-histograms"
  9442. \end_inset
  9443. Probe p-value histograms for each contrast in each analysis.
  9444. \series default
  9445. For each differential methylation test of interest, the distribution of
  9446. p-values across all probes is plotted as a histogram.
  9447. The red solid line indicates the density that would be expected under the
  9448. null hypothesis for all probes (a
  9449. \begin_inset Formula $\mathrm{Uniform}(0,1)$
  9450. \end_inset
  9451. distribution), while the blue dotted line indicates the fraction of p-values
  9452. that actually follow the null hypothesis (
  9453. \begin_inset Formula $\hat{\pi}_{0}$
  9454. \end_inset
  9455. ) estimated using the method of averaging local FDR values
  9456. \begin_inset CommandInset citation
  9457. LatexCommand cite
  9458. key "Phipson2013Thesis"
  9459. literal "false"
  9460. \end_inset
  9461. .
  9462. The blue line is only shown in each plot if the estimate of
  9463. \begin_inset Formula $\hat{\pi}_{0}$
  9464. \end_inset
  9465. for that p-value distribution is different from 1.
  9466. \end_layout
  9467. \end_inset
  9468. \end_layout
  9469. \end_inset
  9470. \end_layout
  9471. \begin_layout Standard
  9472. Table
  9473. \begin_inset CommandInset ref
  9474. LatexCommand ref
  9475. reference "tab:methyl-num-signif"
  9476. plural "false"
  9477. caps "false"
  9478. noprefix "false"
  9479. \end_inset
  9480. shows the number of significantly differentially methylated probes reported
  9481. by each analysis for each comparison of interest at an FDR of 10%.
  9482. As expected, the more elaborate analyses, B and C, report more significant
  9483. probes than the more basic analysis A, consistent with the conclusions
  9484. above that the data contain hidden systematic variations that must be modeled.
  9485. Table
  9486. \begin_inset CommandInset ref
  9487. LatexCommand ref
  9488. reference "tab:methyl-est-nonnull"
  9489. plural "false"
  9490. caps "false"
  9491. noprefix "false"
  9492. \end_inset
  9493. shows the estimated number of differentially methylated probes for each test
  9494. from each analysis.
  9495. This was computed by estimating the proportion of null hypotheses that
  9496. were true using the method of
  9497. \begin_inset CommandInset citation
  9498. LatexCommand cite
  9499. key "Phipson2013Thesis"
  9500. literal "false"
  9501. \end_inset
  9502. and subtracting the corresponding number of probes from the total number of probes, yielding
  9503. an estimate of the number of null hypotheses that are false based on the
  9504. distribution of p-values across the entire dataset.
  9505. Note that this does not identify which null hypotheses should be rejected
  9506. (i.e.
  9507. which probes are significant); it only estimates the true number of such
  9508. probes.
  9509. Once again, analyses B and C result in much larger estimates for the number
  9510. of differentially methylated probes.
  9511. In this case, analysis C, the only analysis that includes voom, estimates
  9512. the largest number of differentially methylated probes for all 3 contrasts.
  9513. If the assumptions of all the methods employed hold, then this represents
  9514. a gain in statistical power over the simpler analysis A.
  9515. Figure
  9516. \begin_inset CommandInset ref
  9517. LatexCommand ref
  9518. reference "fig:meth-p-value-histograms"
  9519. plural "false"
  9520. caps "false"
  9521. noprefix "false"
  9522. \end_inset
  9523. shows the p-value distributions for each test, from which the numbers in
  9524. Table
  9525. \begin_inset CommandInset ref
  9526. LatexCommand ref
  9527. reference "tab:methyl-est-nonnull"
  9528. plural "false"
  9529. caps "false"
  9530. noprefix "false"
  9531. \end_inset
  9532. were generated.
  9533. The distributions for analysis A all have a dip in density near zero, which
  9534. is a strong sign of a poor model fit.
  9535. The histograms for analyses B and C are more well-behaved, with a uniform
  9536. component stretching all the way from 0 to 1 representing the probes for
  9537. which the null hypothesis is true (no differential methylation), and a
  9538. zero-biased component representing the probes for which the null hypothesis
  9539. is false (differentially methylated).
  9540. These histograms do not indicate any major issues with the model fit.
  9541. \end_layout
  9542. \begin_layout Standard
  9543. \begin_inset Flex TODO Note (inline)
  9544. status open
  9545. \begin_layout Plain Layout
  9546. If time allows, maybe generate the PCA plots before/after SVA effect subtraction
  9547. ?
  9548. \end_layout
  9549. \end_inset
  9550. \end_layout
  9551. \begin_layout Section
  9552. Discussion
  9553. \end_layout
  9554. \begin_layout Subsection
  9555. fRMA achieves clinically applicable normalization without sacrificing classifica
  9556. tion performance
  9557. \end_layout
  9558. \begin_layout Standard
  9559. As shown in Figure
  9560. \begin_inset CommandInset ref
  9561. LatexCommand ref
  9562. reference "fig:Classifier-probabilities-RMA"
  9563. plural "false"
  9564. caps "false"
  9565. noprefix "false"
  9566. \end_inset
  9567. , improper normalization, particularly separate normalization of training
  9568. and test samples, leads to unwanted biases in classification.
  9569. In a controlled experimental context, it is always possible to correct
  9570. this issue by normalizing all experimental samples together.
  9571. However, because it is not feasible to normalize all samples together in
  9572. a clinical context, a single-channel normalization is required.
  9573. \end_layout
  9574. \begin_layout Standard
  9575. The major concern in using a single-channel normalization is that non-single-cha
  9576. nnel methods can share information between arrays to improve the normalization,
  9577. and single-channel methods risk sacrificing the gains in normalization
  9578. accuracy that come from this information sharing.
  9579. In the case of RMA, this information sharing is accomplished through quantile
  9580. normalization and median polish steps.
  9581. The need for information sharing in quantile normalization can easily be
  9582. removed by learning a fixed set of quantiles from external data and normalizing
  9583. each array to these fixed quantiles, instead of the quantiles of the data
  9584. itself.
  9585. As long as the fixed quantiles are reasonable, the result will be similar
  9586. to standard RMA.
  9587. However, there is no analogous way to eliminate cross-array information
  9588. sharing in the median polish step, so fRMA replaces this with a weighted
  9589. average of probes on each array, with the weights learned from external
  9590. data.
  9591. This step of fRMA has the greatest potential to diverge from RMA in undesirable
  9592. ways.
  9593. \end_layout
  9594. \begin_layout Standard
  9595. However, when run on real data, fRMA performed at least as well as RMA in
  9596. both the internal validation and external validation tests.
  9597. This shows that fRMA can be used to normalize individual clinical samples
  9598. in a class prediction context without sacrificing the classifier performance
  9599. that would be obtained by using the more well-established RMA for normalization.
  9600. The other single-channel normalization method considered, SCAN, showed
  9601. some loss of AUC in the external validation test.
  9602. Based on these results, fRMA is the preferred normalization for clinical
  9603. samples in a class prediction context.
  9604. \end_layout
  9605. \begin_layout Subsection
  9606. Robust fRMA vectors can be generated for new array platforms
  9607. \end_layout
  9608. \begin_layout Standard
  9609. \begin_inset Flex TODO Note (inline)
  9610. status open
  9611. \begin_layout Plain Layout
  9612. Look up the exact numbers, do a find & replace for
  9613. \begin_inset Quotes eld
  9614. \end_inset
  9615. 850
  9616. \begin_inset Quotes erd
  9617. \end_inset
  9618. \end_layout
  9619. \end_inset
  9620. \end_layout
  9621. \begin_layout Standard
  9622. The published fRMA normalization vectors for the hgu133plus2 platform were
  9623. generated from a set of about 850 samples chosen from a wide range of tissues,
  9624. which the authors determined was sufficient to generate a robust set of
  9625. normalization vectors that could be applied across all tissues
  9626. \begin_inset CommandInset citation
  9627. LatexCommand cite
  9628. key "McCall2010"
  9629. literal "false"
  9630. \end_inset
  9631. .
  9632. Since we only had hthgu133pluspm data for 2 tissues of interest, our needs were
  9633. more modest.
  9634. Even using only 130 samples in 26 batches of 5 samples each for kidney
  9635. biopsies, we were able to train a robust set of fRMA normalization vectors
  9636. that were not meaningfully affected by the random selection of 5 samples
  9637. from each batch.
  9638. As expected, the training process was just as robust for the blood samples
  9639. with 230 samples in 46 batches of 5 samples each.
  9640. Because these vectors were each generated using training samples from a
  9641. single tissue, they are not suitable for general use, unlike the vectors
  9642. provided with fRMA itself.
  9643. They are purpose-built for normalizing a specific type of sample on a specific
  9644. platform.
  9645. This is a mostly acceptable limitation in the context of developing a machine
  9646. learning classifier for diagnosing a disease based on samples of a specific
  9647. tissue.
  9648. \end_layout
  9649. \begin_layout Standard
  9650. \begin_inset Flex TODO Note (inline)
  9651. status open
  9652. \begin_layout Plain Layout
  9653. Talk about how these vectors can be used for any data from these tissues
  9654. on this platform even though they were custom made for this data set.
  9655. \end_layout
  9656. \end_inset
  9657. \end_layout
  9658. \begin_layout Standard
  9659. \begin_inset Flex TODO Note (inline)
  9660. status open
  9661. \begin_layout Plain Layout
  9662. How to bring up that these custom vectors were used in another project by
  9663. someone else that was never published?
  9664. \end_layout
  9665. \end_inset
  9666. \end_layout
  9667. \begin_layout Subsection
  9668. Methylation array data can be successfully analyzed using existing techniques,
  9669. but machine learning poses additional challenges
  9670. \end_layout
  9671. \begin_layout Standard
  9672. Analysis strategies B and C both yield a reasonable analysis, with
  9673. a mean-variance trend that matches the expected behavior for the non-linear
  9674. M-value transformation (Figure
  9675. \begin_inset CommandInset ref
  9676. LatexCommand ref
  9677. reference "fig:meanvar-sva-aw"
  9678. plural "false"
  9679. caps "false"
  9680. noprefix "false"
  9681. \end_inset
  9682. ) and well-behaved p-value distributions (Figure
  9683. \begin_inset CommandInset ref
  9684. LatexCommand ref
  9685. reference "fig:meth-p-value-histograms"
  9686. plural "false"
  9687. caps "false"
  9688. noprefix "false"
  9689. \end_inset
  9690. ).
  9691. These two analyses also yield similar numbers of significant probes (Table
  9692. \begin_inset CommandInset ref
  9693. LatexCommand ref
  9694. reference "tab:methyl-num-signif"
  9695. plural "false"
  9696. caps "false"
  9697. noprefix "false"
  9698. \end_inset
  9699. ) and similar estimates of the number of differentially methylated probes
  9700. (Table
  9701. \begin_inset CommandInset ref
  9702. LatexCommand ref
  9703. reference "tab:methyl-est-nonnull"
  9704. plural "false"
  9705. caps "false"
  9706. noprefix "false"
  9707. \end_inset
  9708. ).
  9709. The main difference between these two analyses is the method used to account
  9710. for the mean-variance trend.
  9711. In analysis B, the trend is estimated and applied at the probe level: each
  9712. probe's estimated variance is squeezed toward the trend using an empirical
  9713. Bayes procedure (Figure
  9714. \begin_inset CommandInset ref
  9715. LatexCommand ref
  9716. reference "fig:meanvar-sva-aw"
  9717. plural "false"
  9718. caps "false"
  9719. noprefix "false"
  9720. \end_inset
  9721. ).
  9722. In analysis C, the trend is still estimated at the probe level, but instead
  9723. of estimating a single variance value shared across all observations for
  9724. a given probe, the voom method computes an initial estimate of the variance
  9725. for each observation individually based on where its model-fitted M-value
  9726. falls on the trend line and then assigns inverse-variance weights to model
  9727. the difference in variance between observations.
  9728. An overall variance is still estimated for each probe using the same empirical
  9729. Bayes method, but now the residual trend is flat (Figure
  9730. \begin_inset CommandInset ref
  9731. LatexCommand ref
  9732. reference "fig:meanvar-sva-voomaw"
  9733. plural "false"
  9734. caps "false"
  9735. noprefix "false"
  9736. \end_inset
  9737. ), indicating that the mean-variance trend is adequately modeled by scaling
  9738. the estimated variance for each observation using the weights computed
  9739. by voom.
  9740. \end_layout
  9741. \begin_layout Standard
  9742. The difference between the standard empirical Bayes trended variance modeling
  9743. (analysis B) and voom (analysis C) is analogous to the difference between
  9744. a t-test with equal variance and a t-test with unequal variance, except
  9745. that the unequal group variances used in the latter test are estimated
  9746. based on the mean-variance trend from all the probes rather than the data
  9747. for the specific probe being tested, thus stabilizing the group variance
  9748. estimates by sharing information between probes.
  9749. Allowing voom to model the variance using observation weights in this manner
  9750. allows the linear model fit to concentrate statistical power where it will
  9751. do the most good.
  9752. For example, if a particular probe's M-values are always at the extreme
  9753. of the M-value range (e.g.
  9754. less than -4) for ADNR samples, but the M-values for that probe in TX and
  9755. CAN samples are within the flat region of the mean-variance trend (between
  9756. -3 and +3), voom is able to down-weight the contribution of the high-variance
  9757. M-values from the ADNR samples in order to gain more statistical power
  9758. while testing for differential methylation between TX and CAN.
  9759. In contrast, modeling the mean-variance trend only at the probe level would
  9760. combine the high-variance ADNR samples and lower-variance samples from
  9761. other conditions and estimate an intermediate variance for this probe.
  9762. In practice, analysis B shows that this approach is adequate, but the voom
  9763. approach in analysis C is at least as good on all model fit criteria and
  9764. yields a larger estimate for the number of differentially methylated genes,
  9765. \emph on
  9766. and
  9767. \emph default
  9768. it matches up better with the theoretical expectations for the data.
  9769. \end_layout
  9770. \begin_layout Standard
  9771. The significant association of diabetes diagnosis with sample quality is
  9772. interesting.
  9773. The samples with Type 2 diabetes tended to have more variation, averaged
  9774. across all probes, than those with Type 1 diabetes.
  9775. This is consistent with the consensus that Type 2 diabetes and the associated
  9776. metabolic syndrome represent a broad dysregulation of the body's endocrine
  9777. signaling related to metabolism [citation needed].
  9778. This dysregulation could easily manifest as a greater degree of variation
  9779. in the DNA methylation patterns of affected tissues.
  9780. In contrast, Type 1 diabetes has a more specific cause and effect, so a
  9781. less variable methylation signature is expected.
  9782. \end_layout
  9783. \begin_layout Standard
  9784. This preliminary analysis suggests that some degree of differential methylation
  9785. exists between TX and each of the three types of transplant dysfunction
  9786. studied.
  9787. Hence, it may be feasible to train a classifier to diagnose transplant
  9788. dysfunction from DNA methylation array data.
  9789. However, the major importance of both SVA and sample quality weighting
  9790. for proper modeling of this data poses significant challenges for any attempt
  9791. at machine learning on data of similar quality.
  9792. While these are easily used in a modeling context with full sample information,
  9793. neither of these methods is directly applicable in a machine learning context,
  9794. where the diagnosis is not known ahead of time.
  9795. If a machine learning approach for methylation-based diagnosis is to be
  9796. pursued, it will either require machine-learning-friendly methods to address
  9797. the same systematic trends in the data that SVA and sample quality weighting
  9798. address, or it will require higher quality data with substantially less
  9799. systematic perturbation of the data.
  9800. \end_layout
  9801. \begin_layout Section
  9802. Future Directions
  9803. \end_layout
  9804. \begin_layout Standard
  9805. \begin_inset Flex TODO Note (inline)
  9806. status open
  9807. \begin_layout Plain Layout
  9808. Some work was already being done with the existing fRMA vectors.
  9809. Do I mention that here?
  9810. \end_layout
  9811. \end_inset
  9812. \end_layout
  9813. \begin_layout Subsection
  9814. Improving fRMA to allow training from batches of unequal size
  9815. \end_layout
  9816. \begin_layout Standard
  9817. Because the tools for building fRMA normalization vectors require equal-size
  9818. batches, many samples must be discarded from the training data.
  9819. This is undesirable for a few reasons.
  9820. First, more data is simply better, all other things being equal.
  9821. In this case,
  9822. \begin_inset Quotes eld
  9823. \end_inset
  9824. better
  9825. \begin_inset Quotes erd
  9826. \end_inset
  9827. means a more precise estimate of normalization parameters.
  9828. In addition, the samples to be discarded must be chosen arbitrarily, which
  9829. introduces an unnecessary element of randomness into the estimation process.
  9830. While the randomness can be made deterministic by setting a consistent
  9831. random seed, the need for equal size batches also introduces a need for
  9832. the analyst to decide on the appropriate trade-off between batch size and
  9833. the number of batches.
  9834. This introduces an unnecessary and undesirable
  9835. \begin_inset Quotes eld
  9836. \end_inset
  9837. researcher degree of freedom
  9838. \begin_inset Quotes erd
  9839. \end_inset
  9840. into the analysis, since the generated normalization vectors now depend
  9841. on the choice of batch size based on vague selection criteria and instinct,
  9842. which can unintentionally introduce bias if the researcher chooses a batch
  9843. size based on what seems to yield the most favorable downstream results
  9844. \begin_inset CommandInset citation
  9845. LatexCommand cite
  9846. key "Simmons2011"
  9847. literal "false"
  9848. \end_inset
  9849. .
  9850. \end_layout
  9851. \begin_layout Standard
  9852. Fortunately, the requirement for equal-size batches is not inherent to the
  9853. fRMA algorithm but rather a limitation of the implementation in the
  9854. \begin_inset Flex Code
  9855. status open
  9856. \begin_layout Plain Layout
  9857. frmaTools
  9858. \end_layout
  9859. \end_inset
  9860. package.
  9861. In personal communication, the package's author, Matthew McCall, has indicated
  9862. that with some work, it should be possible to improve the implementation
  9863. to work with batches of unequal sizes.
  9864. The current implementation ignores the batch size when calculating within-batch
  9865. and between-batch residual variances, since the batch size constant cancels
  9866. out later in the calculations as long as all batches are of equal size.
  9867. Hence, the calculations of these parameters would need to be modified to
  9868. remove this optimization and properly calculate the variances using the
  9869. full formula.
  9870. Once this modification is made, a new strategy would need to be developed
  9871. for assessing the stability of parameter estimates, since the random subsamplin
  9872. g step is eliminated, meaning that different subsamplings can no longer
  9873. be compared as in Figures
  9874. \begin_inset CommandInset ref
  9875. LatexCommand ref
  9876. reference "fig:frma-violin"
  9877. plural "false"
  9878. caps "false"
  9879. noprefix "false"
  9880. \end_inset
  9881. and
  9882. \begin_inset CommandInset ref
  9883. LatexCommand ref
  9884. reference "fig:Representative-MA-plots"
  9885. plural "false"
  9886. caps "false"
  9887. noprefix "false"
  9888. \end_inset
  9889. .
  9890. Bootstrap resampling is likely a good candidate here: sample many training
  9891. sets of equal size from the existing training set with replacement, estimate
  9892. parameters from each resampled training set, and compare the estimated
  9893. parameters between bootstraps in order to quantify the variability in each
  9894. parameter's estimation.
  9895. \end_layout
  9896. \begin_layout Subsection
  9897. Developing methylation arrays as a diagnostic tool for kidney transplant
  9898. rejection
  9899. \end_layout
  9900. \begin_layout Standard
  9901. The current study has shown that DNA methylation, as assayed by Illumina
  9902. 450k methylation arrays, has some potential for diagnosing transplant dysfuncti
  9903. ons, including rejection.
  9904. However, very few probes could be confidently identified as differentially
  9905. methylated between healthy and dysfunctional transplants.
  9906. One likely explanation for this is the predominant influence of unobserved
  9907. confounding factors.
  9908. SVA can model and correct for such factors, but the correction can never
  9909. be perfect, so some degree of unwanted systematic variation will always
  9910. remain after SVA correction.
  9911. If the effect size of the confounding factors was similar to that of the
  9912. factor of interest (in this case, transplant status), this would be an
  9913. acceptable limitation, since removing most of the confounding factors'
  9914. effects would allow the main effect to stand out.
  9915. However, in this data set, the confounding factors have a much larger effect
  9916. size than transplant status, which means that the small degree of remaining
  9917. variation not removed by SVA can still swamp the effect of interest, making
  9918. it difficult to detect.
  9919. This is, of course, a major issue when the end goal is to develop a classifier
  9920. to diagnose transplant rejection from methylation data, since batch-correction
  9921. methods like SVA that work in a linear modeling context cannot be applied
  9922. in a machine learning context.
  9923. \end_layout
  9924. \begin_layout Standard
  9925. Currently, the source of these unwanted systematic variations in the data
  9926. is unknown.
  9927. The best solution would be to determine the cause of the variation and
  9928. eliminate it, thereby eliminating the need to model and remove that variation.
  9929. However, if this proves impractical, another option is to use SVA to identify
  9930. probes that are highly associated with the surrogate variables that describe
  9931. the unwanted variation in the data.
  9932. These probes could be discarded prior to classifier training, in order
  9933. to maximize the chance that the training algorithm will be able to identify
  9934. highly predictive probes from those remaining.
  9935. Lastly, it is possible that some of this unwanted variation is a result
  9936. of the array-based assay being used and would be eliminated by switching
  9937. to assaying DNA methylation using bisulphite sequencing.
  9938. However, this carries the risk that the sequencing assay will have its
  9939. own set of biases that must be corrected for in a different way.
  9940. \end_layout
  9941. \begin_layout Chapter
  9942. Globin-blocking for more effective blood RNA-seq analysis in primate animal
  9943. model
  9944. \end_layout
  9945. \begin_layout Standard
  9946. \begin_inset ERT
  9947. status collapsed
  9948. \begin_layout Plain Layout
  9949. \backslash
  9950. glsresetall
  9951. \end_layout
  9952. \end_inset
  9953. \end_layout
  9954. \begin_layout Standard
  9955. \begin_inset Flex TODO Note (inline)
  9956. status open
  9957. \begin_layout Plain Layout
  9958. Choose between above and the paper title: Optimizing yield of deep RNA sequencin
  9959. g for gene expression profiling by globin reduction of peripheral blood
  9960. samples from cynomolgus monkeys (Macaca fascicularis).
  9961. \end_layout
  9962. \end_inset
  9963. \end_layout
  9964. \begin_layout Standard
  9965. \begin_inset Flex TODO Note (inline)
  9966. status open
  9967. \begin_layout Plain Layout
  9968. Chapter author list:
  9969. \begin_inset CommandInset href
  9970. LatexCommand href
  9971. target "https://tex.stackexchange.com/questions/156862/displaying-author-for-each-chapter-in-book"
  9972. \end_inset
  9973. Every chapter gets an author list, which may or may not be part of a citation
  9974. to a published/preprinted paper.
  9975. \end_layout
  9976. \end_inset
  9977. \end_layout
  9978. \begin_layout Standard
  9979. \begin_inset Flex TODO Note (inline)
  9980. status open
  9981. \begin_layout Plain Layout
  9982. Preprint then cite the paper
  9983. \end_layout
  9984. \end_inset
  9985. \end_layout
  9986. \begin_layout Section*
  9987. Abstract
  9988. \end_layout
  9989. \begin_layout Paragraph
  9990. Background
  9991. \end_layout
  9992. \begin_layout Standard
  9993. Primate blood contains high concentrations of globin messenger RNA.
  9994. Globin reduction is a standard technique used to improve the expression
  9995. results obtained by DNA microarrays on RNA from blood samples.
  9996. However, with
  9997. \begin_inset Flex Glossary Term
  9998. status open
  9999. \begin_layout Plain Layout
  10000. RNA-seq
  10001. \end_layout
  10002. \end_inset
  10003. quickly replacing microarrays for many applications, the impact of globin
  10004. reduction for
  10005. \begin_inset Flex Glossary Term
  10006. status open
  10007. \begin_layout Plain Layout
  10008. RNA-seq
  10009. \end_layout
  10010. \end_inset
  10011. has not been previously studied.
  10012. Moreover, no off-the-shelf kits are available for globin reduction in nonhuman
  10013. primates.
  10014. \end_layout
  10015. \begin_layout Paragraph
  10016. Results
  10017. \end_layout
  10018. \begin_layout Standard
  10019. Here we report a protocol for
  10020. \begin_inset Flex Glossary Term
  10021. status open
  10022. \begin_layout Plain Layout
  10023. RNA-seq
  10024. \end_layout
  10025. \end_inset
  10026. in primate blood samples that uses complementary oligonucleotides to block
  10027. reverse transcription of the alpha and beta globin genes.
  10028. In test samples from cynomolgus monkeys (Macaca fascicularis), this globin
  10029. blocking protocol approximately doubles the yield of informative (non-globin)
  10030. reads by greatly reducing the fraction of globin reads, while also improving
  10031. the consistency in sequencing depth between samples.
  10032. The increased yield enables detection of about 2000 more genes, significantly
  10033. increases the correlation in measured gene expression levels between samples,
  10034. and increases the sensitivity of differential gene expression tests.
  10035. \end_layout
  10036. \begin_layout Paragraph
  10037. Conclusions
  10038. \end_layout
  10039. \begin_layout Standard
  10040. These results show that globin blocking significantly improves the cost-effectiv
  10041. eness of mRNA sequencing in primate blood samples by doubling the yield
  10042. of useful reads, allowing detection of more genes, and improving the precision
  10043. of gene expression measurements.
  10044. Based on these results, a globin reducing or blocking protocol is recommended
  10045. for all
  10046. \begin_inset Flex Glossary Term
  10047. status open
  10048. \begin_layout Plain Layout
  10049. RNA-seq
  10050. \end_layout
  10051. \end_inset
  10052. studies of primate blood samples.
  10053. \end_layout
  10054. \begin_layout Standard
  10055. \begin_inset ERT
  10056. status collapsed
  10057. \begin_layout Plain Layout
  10058. \backslash
  10059. glsresetall
  10060. \end_layout
  10061. \end_inset
  10062. \end_layout
  10063. \begin_layout Section
  10064. Approach
  10065. \end_layout
  10066. \begin_layout Standard
  10067. \begin_inset Note Note
  10068. status open
  10069. \begin_layout Plain Layout
  10070. Consider putting some of this in the Intro chapter
  10071. \end_layout
  10072. \begin_layout Itemize
  10073. Cynomolgus monkeys as a model organism
  10074. \end_layout
  10075. \begin_deeper
  10076. \begin_layout Itemize
  10077. Highly related to humans
  10078. \end_layout
  10079. \begin_layout Itemize
  10080. Small size and short life cycle - good research animal
  10081. \end_layout
  10082. \begin_layout Itemize
  10083. Genomics resources still in development
  10084. \end_layout
  10085. \end_deeper
  10086. \begin_layout Itemize
  10087. Inadequacy of existing blood RNA-seq protocols
  10088. \end_layout
  10089. \begin_deeper
  10090. \begin_layout Itemize
  10091. Existing protocols use a separate globin pulldown step, slowing down processing
  10092. \end_layout
  10093. \end_deeper
  10094. \end_inset
  10095. \end_layout
  10096. \begin_layout Standard
  10097. Increasingly, researchers are turning to
  10098. \begin_inset Flex Glossary Term
  10099. status open
  10100. \begin_layout Plain Layout
  10101. RNA-seq
  10102. \end_layout
  10103. \end_inset
  10104. in preference to expression microarrays for analysis of gene expression
  10105. \begin_inset CommandInset citation
  10106. LatexCommand cite
  10107. key "Mutz2012"
  10108. literal "false"
  10109. \end_inset
  10110. .
  10111. The advantages are even greater for study of model organisms with no well-estab
  10112. lished array platforms available, such as the cynomolgus monkey (Macaca
  10113. fascicularis).
  10114. High fractions of globin mRNA are naturally present in mammalian peripheral
  10115. blood samples (up to 70% of total mRNA) and these are known to interfere
  10116. with the results of array-based expression profiling
  10117. \begin_inset CommandInset citation
  10118. LatexCommand cite
  10119. key "Winn2010"
  10120. literal "false"
  10121. \end_inset
  10122. .
  10123. The importance of globin reduction for
  10124. \begin_inset Flex Glossary Term
  10125. status open
  10126. \begin_layout Plain Layout
  10127. RNA-seq
  10128. \end_layout
  10129. \end_inset
  10130. of blood has only been evaluated for a deepSAGE protocol on human samples
  10131. \begin_inset CommandInset citation
  10132. LatexCommand cite
  10133. key "Mastrokolias2012"
  10134. literal "false"
  10135. \end_inset
  10136. .
  10137. In the present report, we evaluated globin reduction using custom blocking
  10138. oligonucleotides for deep
  10139. \begin_inset Flex Glossary Term
  10140. status open
  10141. \begin_layout Plain Layout
  10142. RNA-seq
  10143. \end_layout
  10144. \end_inset
  10145. of peripheral blood samples from a nonhuman primate, cynomolgus monkey,
  10146. using the Illumina technology platform.
  10147. We demonstrate that globin reduction significantly improves the cost-effectiven
  10148. ess of
  10149. \begin_inset Flex Glossary Term
  10150. status open
  10151. \begin_layout Plain Layout
  10152. RNA-seq
  10153. \end_layout
  10154. \end_inset
  10155. in blood samples.
  10156. Thus, our protocol offers a significant advantage to any investigator planning
  10157. to use
  10158. \begin_inset Flex Glossary Term
  10159. status open
  10160. \begin_layout Plain Layout
  10161. RNA-seq
  10162. \end_layout
  10163. \end_inset
  10164. for gene expression profiling of nonhuman primate blood samples.
  10165. Our method can be generally applied to any species by designing complementary
  10166. oligonucleotide blocking probes to the globin gene sequences of that species.
  10167. Indeed, any highly expressed but biologically uninformative transcripts
  10168. can also be blocked to further increase sequencing efficiency and value
  10169. \begin_inset CommandInset citation
  10170. LatexCommand cite
  10171. key "Arnaud2016"
  10172. literal "false"
  10173. \end_inset
  10174. .
  10175. \end_layout
  10176. \begin_layout Section
  10177. Methods
  10178. \end_layout
  10179. \begin_layout Subsection
  10180. Sample collection
  10181. \end_layout
  10182. \begin_layout Standard
  10183. All research reported here was done under IACUC-approved protocols at the
  10184. University of Miami and complied with all applicable federal and state
  10185. regulations and ethical principles for nonhuman primate research.
  10186. Blood draws occurred between 16 April 2012 and 18 June 2015.
  10187. The experimental system involved intrahepatic pancreatic islet transplantation
  10188. into Cynomolgus monkeys with induced diabetes mellitus with or without
  10189. concomitant infusion of mesenchymal stem cells.
  10190. Blood was collected at serial time points before and after transplantation
  10191. into PAXgene Blood RNA tubes (PreAnalytiX/Qiagen, Valencia, CA) at the
  10192. precise volume:volume ratio of 2.5 ml whole blood into 6.9 ml of PAXgene
  10193. additive.
  10194. \end_layout
  10195. \begin_layout Subsection
  10196. Globin Blocking
  10197. \end_layout
  10198. \begin_layout Standard
  10199. Four oligonucleotides were designed to hybridize to the 3’ end of the transcript
  10200. s for Cynomolgus HBA1, HBA2 and HBB, with two hybridization sites for HBB
  10201. and 2 sites for HBA (the chosen sites were identical in both HBA genes).
  10202. All oligos were purchased from Sigma and were entirely composed of 2’O-Me
  10203. bases with a C3 spacer positioned at the 3’ ends to prevent any polymerase
  10204. mediated primer extension.
  10205. \end_layout
  10206. \begin_layout Quote
  10207. HBA1/2 site 1: GCCCACUCAGACUUUAUUCAAAG-C3spacer
  10208. \end_layout
  10209. \begin_layout Quote
  10210. HBA1/2 site 2: GGUGCAAGGAGGGGAGGAG-C3spacer
  10211. \end_layout
  10212. \begin_layout Quote
  10213. HBB site 1: AAUGAAAAUAAAUGUUUUUUAUUAG-C3spacer
  10214. \end_layout
  10215. \begin_layout Quote
  10216. HBB site 2: CUCAAGGCCCUUCAUAAUAUCCC-C3spacer
  10217. \end_layout
  10218. \begin_layout Subsection
  10219. RNA-seq Library Preparation
  10220. \end_layout
  10221. \begin_layout Standard
  10222. \begin_inset Flex TODO Note (inline)
  10223. status open
  10224. \begin_layout Plain Layout
  10225. Add protected spaces where appropriate to prevent unwanted line breaks.
  10226. \end_layout
  10227. \end_inset
  10228. \end_layout
  10229. \begin_layout Standard
  10230. Sequencing libraries were prepared with 200
  10231. \begin_inset space ~
  10232. \end_inset
  10233. ng total RNA from each sample.
  10234. Polyadenylated mRNA was selected from 200 ng aliquots of cynomolgus blood-deriv
  10235. ed total RNA using Ambion Dynabeads Oligo(dT)25 beads (Invitrogen) following
  10236. manufacturer’s recommended protocol.
  10237. PolyA selected RNA was then combined with 8 pmol of HBA1/2 (site 1), 8
  10238. pmol of HBA1/2 (site 2), 12 pmol of HBB (site 1) and 12 pmol of HBB (site
  10239. 2) oligonucleotides.
  10240. In addition, 20 pmol of RT primer containing a portion of the Illumina
  10241. adapter sequence (B-oligo-dTV: GAGTTCCTTGGCACCCGAGAATTCCATTTTTTTTTTTTTTTTTTTV)
  10242. and 4 µL of 5X First Strand buffer (250 mM Tris-HCl pH 8.3, 375 mM KCl,
  10243. 15mM MgCl2) were added in a total volume of 15 µL.
  10244. The RNA was fragmented by heating this cocktail for 3 minutes at 95°C and
  10245. then placed on ice.
  10246. This was followed by the addition of 2 µL 0.1 M DTT, 1 µL RNaseOUT, 1 µL
  10247. 10mM dNTPs 10% biotin-16 aminoallyl-2’- dUTP and 10% biotin-16 aminoallyl-2’-
  10248. dCTP (TriLink Biotech, San Diego, CA), 1 µL Superscript II (200U/ µL, Thermo-Fi
  10249. sher).
  10250. A second “unblocked” library was prepared in the same way for each sample
  10251. but replacing the blocking oligos with an equivalent volume of water.
  10252. The reaction was carried out at 25°C for 15 minutes and 42°C for 40 minutes,
  10253. followed by incubation at 75°C for 10 minutes to inactivate the reverse
  10254. transcriptase.
  10255. \end_layout
  10256. \begin_layout Standard
  10257. The cDNA/RNA hybrid molecules were purified using 1.8X Ampure XP beads (Agencourt
  10258. ) following supplier’s recommended protocol.
  10259. The cDNA/RNA hybrid was eluted in 25 µL of 10 mM Tris-HCl pH 8.0, and then
  10260. bound to 25 µL of M280 Magnetic Streptavidin beads washed per recommended
  10261. protocol (Thermo-Fisher).
  10262. After 30 minutes of binding, beads were washed one time in 100 µL 0.1N NaOH
  10263. to denature and remove the bound RNA, followed by two 100 µL washes with
  10264. 1X TE buffer.
  10265. \end_layout
  10266. \begin_layout Standard
  10267. Subsequent attachment of the 5-prime Illumina A adapter was performed by
  10268. on-bead random primer extension of the following sequence (A-N8 primer:
  10269. TTCAGAGTTCTACAGTCCGACGATCNNNNNNNN).
  10270. Briefly, beads were resuspended in a 20 µL reaction containing 5 µM A-N8
  10271. primer, 40mM Tris-HCl pH 7.5, 20mM MgCl2, 50mM NaCl, 0.325U/µL Sequenase
  10272. 2.0 (Affymetrix, Santa Clara, CA), 0.0025U/µL inorganic pyrophosphatase (Affymetr
  10273. ix) and 300 µM each dNTP.
  10274. Reaction was incubated at 22°C for 30 minutes, then beads were washed 2
  10275. times with 1X TE buffer (200µL).
  10276. \end_layout
  10277. \begin_layout Standard
  10278. The magnetic streptavidin beads were resuspended in 34 µL nuclease-free
  10279. water and added directly to a PCR tube.
  10280. The two Illumina protocol-specified PCR primers were added at 0.53 µM (Illumina
  10281. TruSeq Universal Primer 1 and Illumina TruSeq barcoded PCR primer 2), along
  10282. with 40 µL 2X KAPA HiFi Hotstart ReadyMix (KAPA, Wilmington, MA) and thermocycl
  10283. ed as follows: starting with 98°C (2 min-hold); 15 cycles of 98°C, 20sec;
  10284. 60°C, 30sec; 72°C, 30sec; and finished with a 72°C (2 min-hold).
  10285. \end_layout
  10286. \begin_layout Standard
  10287. PCR products were purified with 1X Ampure Beads following the manufacturer’s
  10288. recommended protocol.
  10289. Libraries were then analyzed using the Agilent TapeStation and quantitation
  10290. of desired size range was performed by “smear analysis”.
  10291. Samples were pooled in equimolar batches of 16 samples.
  10292. Pooled libraries were size selected on 2% agarose gels (E-Gel EX Agarose
  10293. Gels; Thermo-Fisher).
  10294. Products were cut between 250 and 350 bp (corresponding to insert sizes
  10295. of 130 to 230 bp).
  10296. Finished library pools were then sequenced on the Illumina NextSeq500 instrumen
  10297. t with 75 base read lengths.
  10298. \end_layout
  10299. \begin_layout Subsection
  10300. Read alignment and counting
  10301. \end_layout
  10302. \begin_layout Standard
  10303. Reads were aligned to the cynomolgus genome using STAR
  10304. \begin_inset CommandInset citation
  10305. LatexCommand cite
  10306. key "Dobin2013,Wilson2013"
  10307. literal "false"
  10308. \end_inset
  10309. .
  10310. Counts of uniquely mapped reads were obtained for every gene in each sample
  10311. with the
  10312. \begin_inset Flex Code
  10313. status open
  10314. \begin_layout Plain Layout
  10315. featureCounts
  10316. \end_layout
  10317. \end_inset
  10318. function from the
  10319. \begin_inset Flex Code
  10320. status open
  10321. \begin_layout Plain Layout
  10322. Rsubread
  10323. \end_layout
  10324. \end_inset
  10325. package, using each of the three possibilities for the
  10326. \begin_inset Flex Code
  10327. status open
  10328. \begin_layout Plain Layout
  10329. strandSpecific
  10330. \end_layout
  10331. \end_inset
  10332. option: sense, antisense, and unstranded
  10333. \begin_inset CommandInset citation
  10334. LatexCommand cite
  10335. key "Liao2014"
  10336. literal "false"
  10337. \end_inset
  10338. .
  10339. A few artifacts in the cynomolgus genome annotation complicated read counting.
  10340. First, no ortholog is annotated for alpha globin in the cynomolgus genome,
  10341. presumably because the human genome has two alpha globin genes with nearly
  10342. identical sequences, making the orthology relationship ambiguous.
  10343. However, two loci in the cynomolgus genome are annotated as “hemoglobin
  10344. subunit alpha-like” (LOC102136192 and LOC102136846).
  10345. LOC102136192 is annotated as a pseudogene while LOC102136846 is annotated
  10346. as protein-coding.
  10347. Our globin reduction protocol was designed to include blocking of these
  10348. two genes.
  10349. Indeed, these two genes have almost the same read counts in each library
  10350. as the properly-annotated HBB gene and much larger counts than any other
  10351. gene in the unblocked libraries, giving confidence that reads derived from
  10352. the real alpha globin are mapping to both genes.
  10353. Thus, reads from both of these loci were counted as alpha globin reads
  10354. in all further analyses.
  10355. The second artifact is a small, uncharacterized non-coding RNA gene (LOC1021365
  10356. 91), which overlaps the HBA-like gene (LOC102136192) on the opposite strand.
  10357. If counting is not performed in stranded mode (or if a non-strand-specific
  10358. sequencing protocol is used), many reads mapping to the globin gene will
  10359. be discarded as ambiguous due to their overlap with this ncRNA gene, resulting
  10360. in significant undercounting of globin reads.
  10361. Therefore, stranded sense counts were used for all further analysis in
  10362. the present study to ensure that we accurately accounted for globin transcript
  10363. reduction.
  10364. However, we note that stranded reads are not necessary for
  10365. \begin_inset Flex Glossary Term
  10366. status open
  10367. \begin_layout Plain Layout
  10368. RNA-seq
  10369. \end_layout
  10370. \end_inset
  10371. using our protocol in standard practice.
  10372. \end_layout
  10373. \begin_layout Subsection
  10374. Normalization and Exploratory Data Analysis
  10375. \end_layout
  10376. \begin_layout Standard
  10377. Libraries were normalized by computing scaling factors using the
  10378. \begin_inset Flex Code
  10379. status open
  10380. \begin_layout Plain Layout
  10381. edgeR
  10382. \end_layout
  10383. \end_inset
  10384. package’s Trimmed Mean of M-values method
  10385. \begin_inset CommandInset citation
  10386. LatexCommand cite
  10387. key "Robinson2010"
  10388. literal "false"
  10389. \end_inset
  10390. .
  10391. Log2 counts per million values (logCPM) were calculated using the cpm function
  10392. in
  10393. \begin_inset Flex Code
  10394. status open
  10395. \begin_layout Plain Layout
  10396. edgeR
  10397. \end_layout
  10398. \end_inset
  10399. for individual samples and the
  10400. \begin_inset Flex Code
  10401. status open
  10402. \begin_layout Plain Layout
  10403. aveLogCPM
  10404. \end_layout
  10405. \end_inset
  10406. function for averages across groups of samples, using those functions’
  10407. default prior count values to avoid taking the logarithm of 0.
  10408. Genes were considered “present” if their average normalized logCPM values
  10409. across all libraries were at least
  10410. \begin_inset Formula $-1$
  10411. \end_inset
  10412. .
  10413. Normalizing for gene length was unnecessary because the sequencing protocol
  10414. is 3’-biased and hence the expected read count for each gene is related
  10415. to the transcript’s copy number but not its length.
  10416. \end_layout
  10417. \begin_layout Standard
  10418. In order to assess the effect of blocking on reproducibility, Pearson and
  10419. Spearman correlation coefficients were computed between the logCPM values
  10420. for every pair of libraries within the globin-blocked (GB) and unblocked
  10421. (non-GB) groups, and
  10422. \begin_inset Flex Code
  10423. status open
  10424. \begin_layout Plain Layout
  10425. edgeR
  10426. \end_layout
  10427. \end_inset
  10428. 's
  10429. \begin_inset Flex Code
  10430. status open
  10431. \begin_layout Plain Layout
  10432. estimateDisp
  10433. \end_layout
  10434. \end_inset
  10435. function was used to compute negative binomial dispersions separately for
  10436. the two groups
  10437. \begin_inset CommandInset citation
  10438. LatexCommand cite
  10439. key "Chen2014"
  10440. literal "false"
  10441. \end_inset
  10442. .
  10443. \end_layout
  10444. \begin_layout Subsection
  10445. Differential Expression Analysis
  10446. \end_layout
  10447. \begin_layout Standard
  10448. All tests for differential gene expression were performed using
  10449. \begin_inset Flex Code
  10450. status open
  10451. \begin_layout Plain Layout
  10452. edgeR
  10453. \end_layout
  10454. \end_inset
  10455. , by first fitting a negative binomial generalized linear model to the counts
  10456. and normalization factors and then performing a quasi-likelihood F-test
  10457. with robust estimation of outlier gene dispersions
  10458. \begin_inset CommandInset citation
  10459. LatexCommand cite
  10460. key "Lund2012,Phipson2016"
  10461. literal "false"
  10462. \end_inset
  10463. .
  10464. To investigate the effects of globin blocking on each gene, an additive
  10465. model was fit to the full data with coefficients for globin blocking and
  10466. SampleID.
  10467. To test the effect of globin blocking on detection of differentially expressed
  10468. genes, the GB samples and non-GB samples were each analyzed independently
  10469. as follows: for each animal with both a pre-transplant and a post-transplant
  10470. time point in the data set, the pre-transplant sample and the earliest
  10471. post-transplant sample were selected, and all others were excluded, yielding
  10472. a pre-/post-transplant pair of samples for each animal (N=7 animals with
  10473. paired samples).
  10474. These samples were analyzed for pre-transplant vs.
  10475. post-transplant differential gene expression while controlling for inter-animal
  10476. variation using an additive model with coefficients for transplant and
  10477. animal ID.
  10478. In all analyses, p-values were adjusted using the Benjamini-Hochberg procedure
  10479. for FDR control
  10480. \begin_inset CommandInset citation
  10481. LatexCommand cite
  10482. key "Benjamini1995"
  10483. literal "false"
  10484. \end_inset
  10485. .
  10486. \end_layout
  10487. \begin_layout Standard
  10488. \begin_inset Note Note
  10489. status open
  10490. \begin_layout Itemize
  10491. New blood RNA-seq protocol to block reverse transcription of globin genes
  10492. \end_layout
  10493. \begin_layout Itemize
  10494. Blood RNA-seq time course after transplants with/without MSC infusion
  10495. \end_layout
  10496. \end_inset
  10497. \end_layout
  10498. \begin_layout Section
  10499. Results
  10500. \end_layout
  10501. \begin_layout Subsection
  10502. Globin blocking yields a larger and more consistent fraction of useful reads
  10503. \end_layout
  10504. \begin_layout Standard
  10505. \begin_inset ERT
  10506. status open
  10507. \begin_layout Plain Layout
  10508. \backslash
  10509. afterpage{
  10510. \end_layout
  10511. \begin_layout Plain Layout
  10512. \backslash
  10513. begin{landscape}
  10514. \end_layout
  10515. \end_inset
  10516. \end_layout
  10517. \begin_layout Standard
  10518. \begin_inset Float table
  10519. placement p
  10520. wide false
  10521. sideways false
  10522. status open
  10523. \begin_layout Plain Layout
  10524. \align center
  10525. \begin_inset Tabular
  10526. <lyxtabular version="3" rows="4" columns="7">
  10527. <features tabularvalignment="middle">
  10528. <column alignment="center" valignment="top">
  10529. <column alignment="center" valignment="top">
  10530. <column alignment="center" valignment="top">
  10531. <column alignment="center" valignment="top">
  10532. <column alignment="center" valignment="top">
  10533. <column alignment="center" valignment="top">
  10534. <column alignment="center" valignment="top">
  10535. <row>
  10536. <cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
  10537. \begin_inset Text
  10538. \begin_layout Plain Layout
  10539. \end_layout
  10540. \end_inset
  10541. </cell>
  10542. <cell multicolumn="1" alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
  10543. \begin_inset Text
  10544. \begin_layout Plain Layout
  10545. \family roman
  10546. \series medium
  10547. \shape up
  10548. \size normal
  10549. \emph off
  10550. \bar no
  10551. \strikeout off
  10552. \xout off
  10553. \uuline off
  10554. \uwave off
  10555. \noun off
  10556. \color none
  10557. Percent of Total Reads
  10558. \end_layout
  10559. \end_inset
  10560. </cell>
  10561. <cell multicolumn="2" alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
  10562. \begin_inset Text
  10563. \begin_layout Plain Layout
  10564. \end_layout
  10565. \end_inset
  10566. </cell>
  10567. <cell multicolumn="2" alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
  10568. \begin_inset Text
  10569. \begin_layout Plain Layout
  10570. \end_layout
  10571. \end_inset
  10572. </cell>
  10573. <cell multicolumn="2" alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
  10574. \begin_inset Text
  10575. \begin_layout Plain Layout
  10576. \end_layout
  10577. \end_inset
  10578. </cell>
  10579. <cell multicolumn="1" alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
  10580. \begin_inset Text
  10581. \begin_layout Plain Layout
  10582. \family roman
  10583. \series medium
  10584. \shape up
  10585. \size normal
  10586. \emph off
  10587. \bar no
  10588. \strikeout off
  10589. \xout off
  10590. \uuline off
  10591. \uwave off
  10592. \noun off
  10593. \color none
  10594. Percent of Genic Reads
  10595. \end_layout
  10596. \end_inset
  10597. </cell>
  10598. <cell multicolumn="2" alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
  10599. \begin_inset Text
  10600. \begin_layout Plain Layout
  10601. \end_layout
  10602. \end_inset
  10603. </cell>
  10604. </row>
  10605. <row>
  10606. <cell alignment="center" valignment="top" bottomline="true" leftline="true" usebox="none">
  10607. \begin_inset Text
  10608. \begin_layout Plain Layout
  10609. GB
  10610. \end_layout
  10611. \end_inset
  10612. </cell>
  10613. <cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" usebox="none">
  10614. \begin_inset Text
  10615. \begin_layout Plain Layout
  10616. \family roman
  10617. \series medium
  10618. \shape up
  10619. \size normal
  10620. \emph off
  10621. \bar no
  10622. \strikeout off
  10623. \xout off
  10624. \uuline off
  10625. \uwave off
  10626. \noun off
  10627. \color none
  10628. Non-globin Reads
  10629. \end_layout
  10630. \end_inset
  10631. </cell>
  10632. <cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" usebox="none">
  10633. \begin_inset Text
  10634. \begin_layout Plain Layout
  10635. \family roman
  10636. \series medium
  10637. \shape up
  10638. \size normal
  10639. \emph off
  10640. \bar no
  10641. \strikeout off
  10642. \xout off
  10643. \uuline off
  10644. \uwave off
  10645. \noun off
  10646. \color none
  10647. Globin Reads
  10648. \end_layout
  10649. \end_inset
  10650. </cell>
  10651. <cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" usebox="none">
  10652. \begin_inset Text
  10653. \begin_layout Plain Layout
  10654. \family roman
  10655. \series medium
  10656. \shape up
  10657. \size normal
  10658. \emph off
  10659. \bar no
  10660. \strikeout off
  10661. \xout off
  10662. \uuline off
  10663. \uwave off
  10664. \noun off
  10665. \color none
  10666. All Genic Reads
  10667. \end_layout
  10668. \end_inset
  10669. </cell>
  10670. <cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" usebox="none">
  10671. \begin_inset Text
  10672. \begin_layout Plain Layout
  10673. \family roman
  10674. \series medium
  10675. \shape up
  10676. \size normal
  10677. \emph off
  10678. \bar no
  10679. \strikeout off
  10680. \xout off
  10681. \uuline off
  10682. \uwave off
  10683. \noun off
  10684. \color none
  10685. All Aligned Reads
  10686. \end_layout
  10687. \end_inset
  10688. </cell>
  10689. <cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" usebox="none">
  10690. \begin_inset Text
  10691. \begin_layout Plain Layout
  10692. \family roman
  10693. \series medium
  10694. \shape up
  10695. \size normal
  10696. \emph off
  10697. \bar no
  10698. \strikeout off
  10699. \xout off
  10700. \uuline off
  10701. \uwave off
  10702. \noun off
  10703. \color none
  10704. Non-globin Reads
  10705. \end_layout
  10706. \end_inset
  10707. </cell>
  10708. <cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" rightline="true" usebox="none">
  10709. \begin_inset Text
  10710. \begin_layout Plain Layout
  10711. \family roman
  10712. \series medium
  10713. \shape up
  10714. \size normal
  10715. \emph off
  10716. \bar no
  10717. \strikeout off
  10718. \xout off
  10719. \uuline off
  10720. \uwave off
  10721. \noun off
  10722. \color none
  10723. Globin Reads
  10724. \end_layout
  10725. \end_inset
  10726. </cell>
  10727. </row>
  10728. <row>
  10729. <cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
  10730. \begin_inset Text
  10731. \begin_layout Plain Layout
  10732. \family roman
  10733. \series medium
  10734. \shape up
  10735. \size normal
  10736. \emph off
  10737. \bar no
  10738. \strikeout off
  10739. \xout off
  10740. \uuline off
  10741. \uwave off
  10742. \noun off
  10743. \color none
  10744. Yes
  10745. \end_layout
  10746. \end_inset
  10747. </cell>
  10748. <cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
  10749. \begin_inset Text
  10750. \begin_layout Plain Layout
  10751. \family roman
  10752. \series medium
  10753. \shape up
  10754. \size normal
  10755. \emph off
  10756. \bar no
  10757. \strikeout off
  10758. \xout off
  10759. \uuline off
  10760. \uwave off
  10761. \noun off
  10762. \color none
  10763. 50.4% ± 6.82
  10764. \end_layout
  10765. \end_inset
  10766. </cell>
  10767. <cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
  10768. \begin_inset Text
  10769. \begin_layout Plain Layout
  10770. \family roman
  10771. \series medium
  10772. \shape up
  10773. \size normal
  10774. \emph off
  10775. \bar no
  10776. \strikeout off
  10777. \xout off
  10778. \uuline off
  10779. \uwave off
  10780. \noun off
  10781. \color none
  10782. 3.48% ± 2.94
  10783. \end_layout
  10784. \end_inset
  10785. </cell>
  10786. <cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
  10787. \begin_inset Text
  10788. \begin_layout Plain Layout
  10789. \family roman
  10790. \series medium
  10791. \shape up
  10792. \size normal
  10793. \emph off
  10794. \bar no
  10795. \strikeout off
  10796. \xout off
  10797. \uuline off
  10798. \uwave off
  10799. \noun off
  10800. \color none
  10801. 53.9% ± 6.81
  10802. \end_layout
  10803. \end_inset
  10804. </cell>
  10805. <cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
  10806. \begin_inset Text
  10807. \begin_layout Plain Layout
  10808. \family roman
  10809. \series medium
  10810. \shape up
  10811. \size normal
  10812. \emph off
  10813. \bar no
  10814. \strikeout off
  10815. \xout off
  10816. \uuline off
  10817. \uwave off
  10818. \noun off
  10819. \color none
  10820. 89.7% ± 2.40
  10821. \end_layout
  10822. \end_inset
  10823. </cell>
  10824. <cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
  10825. \begin_inset Text
  10826. \begin_layout Plain Layout
  10827. \family roman
  10828. \series medium
  10829. \shape up
  10830. \size normal
  10831. \emph off
  10832. \bar no
  10833. \strikeout off
  10834. \xout off
  10835. \uuline off
  10836. \uwave off
  10837. \noun off
  10838. \color none
  10839. 93.5% ± 5.25
  10840. \end_layout
  10841. \end_inset
  10842. </cell>
  10843. <cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
  10844. \begin_inset Text
  10845. \begin_layout Plain Layout
  10846. \family roman
  10847. \series medium
  10848. \shape up
  10849. \size normal
  10850. \emph off
  10851. \bar no
  10852. \strikeout off
  10853. \xout off
  10854. \uuline off
  10855. \uwave off
  10856. \noun off
  10857. \color none
  10858. 6.49% ± 5.25
  10859. \end_layout
  10860. \end_inset
  10861. </cell>
  10862. </row>
  10863. <row>
  10864. <cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" usebox="none">
  10865. \begin_inset Text
  10866. \begin_layout Plain Layout
  10867. \family roman
  10868. \series medium
  10869. \shape up
  10870. \size normal
  10871. \emph off
  10872. \bar no
  10873. \strikeout off
  10874. \xout off
  10875. \uuline off
  10876. \uwave off
  10877. \noun off
  10878. \color none
  10879. No
  10880. \end_layout
  10881. \end_inset
  10882. </cell>
  10883. <cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" usebox="none">
  10884. \begin_inset Text
  10885. \begin_layout Plain Layout
  10886. \family roman
  10887. \series medium
  10888. \shape up
  10889. \size normal
  10890. \emph off
  10891. \bar no
  10892. \strikeout off
  10893. \xout off
  10894. \uuline off
  10895. \uwave off
  10896. \noun off
  10897. \color none
  10898. 26.3% ± 8.95
  10899. \end_layout
  10900. \end_inset
  10901. </cell>
  10902. <cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" usebox="none">
  10903. \begin_inset Text
  10904. \begin_layout Plain Layout
  10905. \family roman
  10906. \series medium
  10907. \shape up
  10908. \size normal
  10909. \emph off
  10910. \bar no
  10911. \strikeout off
  10912. \xout off
  10913. \uuline off
  10914. \uwave off
  10915. \noun off
  10916. \color none
  10917. 44.6% ± 16.6
  10918. \end_layout
  10919. \end_inset
  10920. </cell>
  10921. <cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" usebox="none">
  10922. \begin_inset Text
  10923. \begin_layout Plain Layout
  10924. \family roman
  10925. \series medium
  10926. \shape up
  10927. \size normal
  10928. \emph off
  10929. \bar no
  10930. \strikeout off
  10931. \xout off
  10932. \uuline off
  10933. \uwave off
  10934. \noun off
  10935. \color none
  10936. 70.1% ± 9.38
  10937. \end_layout
  10938. \end_inset
  10939. </cell>
  10940. <cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" usebox="none">
  10941. \begin_inset Text
  10942. \begin_layout Plain Layout
  10943. \family roman
  10944. \series medium
  10945. \shape up
  10946. \size normal
  10947. \emph off
  10948. \bar no
  10949. \strikeout off
  10950. \xout off
  10951. \uuline off
  10952. \uwave off
  10953. \noun off
  10954. \color none
  10955. 90.7% ± 5.16
  10956. \end_layout
  10957. \end_inset
  10958. </cell>
  10959. <cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" usebox="none">
  10960. \begin_inset Text
  10961. \begin_layout Plain Layout
  10962. \family roman
  10963. \series medium
  10964. \shape up
  10965. \size normal
  10966. \emph off
  10967. \bar no
  10968. \strikeout off
  10969. \xout off
  10970. \uuline off
  10971. \uwave off
  10972. \noun off
  10973. \color none
  10974. 38.8% ± 17.1
  10975. \end_layout
  10976. \end_inset
  10977. </cell>
  10978. <cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" rightline="true" usebox="none">
  10979. \begin_inset Text
  10980. \begin_layout Plain Layout
  10981. \family roman
  10982. \series medium
  10983. \shape up
  10984. \size normal
  10985. \emph off
  10986. \bar no
  10987. \strikeout off
  10988. \xout off
  10989. \uuline off
  10990. \uwave off
  10991. \noun off
  10992. \color none
  10993. 61.2% ± 17.1
  10994. \end_layout
  10995. \end_inset
  10996. </cell>
  10997. </row>
  10998. </lyxtabular>
  10999. \end_inset
  11000. \end_layout
  11001. \begin_layout Plain Layout
  11002. \begin_inset Caption Standard
  11003. \begin_layout Plain Layout
  11004. \series bold
  11005. \begin_inset Argument 1
  11006. status collapsed
  11007. \begin_layout Plain Layout
  11008. Fractions of reads mapping to genomic features in GB and non-GB samples.
  11009. \end_layout
  11010. \end_inset
  11011. \begin_inset CommandInset label
  11012. LatexCommand label
  11013. name "tab:Fractions-of-reads"
  11014. \end_inset
  11015. Fractions of reads mapping to genomic features in GB and non-GB samples.
  11016. \series default
  11017. All values are given as mean ± standard deviation.
  11018. \end_layout
  11019. \end_inset
  11020. \end_layout
  11021. \end_inset
  11022. \end_layout
  11023. \begin_layout Standard
  11024. \begin_inset ERT
  11025. status open
  11026. \begin_layout Plain Layout
  11027. \backslash
  11028. end{landscape}
  11029. \end_layout
  11030. \begin_layout Plain Layout
  11031. }
  11032. \end_layout
  11033. \end_inset
  11034. \end_layout
  11035. \begin_layout Standard
  11036. The objective of the present study was to validate a new protocol for deep
  11037. \begin_inset Flex Glossary Term
  11038. status open
  11039. \begin_layout Plain Layout
  11040. RNA-seq
  11041. \end_layout
  11042. \end_inset
  11043. of whole blood drawn into PaxGene tubes from cynomolgus monkeys undergoing
  11044. islet transplantation, with particular focus on minimizing the loss of
  11045. useful sequencing space to uninformative globin reads.
  11046. The details of the analysis with respect to transplant outcomes and the
  11047. impact of mesenchymal stem cell treatment will be reported in a separate
  11048. manuscript (in preparation).
  11049. To focus on the efficacy of our globin blocking protocol, 37 blood samples,
  11050. 16 from pre-transplant and 21 from post-transplant time points, were each
  11051. prepped once with and once without globin blocking oligos, and were then
  11052. sequenced on an Illumina NextSeq500 instrument.
  11053. The number of reads aligning to each gene in the cynomolgus genome was
  11054. counted.
  11055. Table 1 summarizes the distribution of read fractions among the GB and
  11056. non-GB libraries.
  11057. In the libraries with no globin blocking, globin reads made up an average
  11058. of 44.6% of total input reads, while reads assigned to all other genes made
  11059. up an average of 26.3%.
  11060. The remaining reads either aligned to intergenic regions (that include
  11061. long non-coding RNAs) or did not align with any annotated transcripts in
  11062. the current build of the cynomolgus genome.
  11063. In the GB libraries, globin reads made up only 3.48% and reads assigned
  11064. to all other genes increased to 50.4%.
  11065. Thus, globin blocking resulted in a 92.2% reduction in globin reads and
  11066. a 91.6% increase in yield of useful non-globin reads.
  11067. \end_layout
  11068. \begin_layout Standard
  11069. This reduction is not quite as efficient as the previous analysis showed
  11070. for human samples by DeepSAGE (<0.4% globin reads after globin reduction)
  11071. \begin_inset CommandInset citation
  11072. LatexCommand cite
  11073. key "Mastrokolias2012"
  11074. literal "false"
  11075. \end_inset
  11076. .
  11077. Nonetheless, this degree of globin reduction is sufficient to nearly double
  11078. the yield of useful reads.
  11079. Thus, globin blocking cuts the required sequencing effort (and costs) to
  11080. achieve a target coverage depth by almost 50%.
  11081. Consistent with this near doubling of yield, the average difference in
  11082. un-normalized logCPM across all genes between the GB libraries and non-GB
  11083. libraries is approximately 1 (mean = 1.01, median = 1.08), an overall 2-fold
  11084. increase.
  11085. Un-normalized values are used here because the TMM normalization correctly
  11086. identifies this 2-fold difference as biologically irrelevant and removes
  11087. it.
  11088. \end_layout
  11089. \begin_layout Standard
  11090. \begin_inset Float figure
  11091. wide false
  11092. sideways false
  11093. status collapsed
  11094. \begin_layout Plain Layout
  11095. \align center
  11096. \begin_inset Graphics
  11097. filename graphics/Globin Paper/figure1 - globin-fractions.pdf
  11098. lyxscale 50
  11099. width 75col%
  11100. \end_inset
  11101. \end_layout
  11102. \begin_layout Plain Layout
  11103. \begin_inset Caption Standard
  11104. \begin_layout Plain Layout
  11105. \series bold
  11106. \begin_inset Argument 1
  11107. status collapsed
  11108. \begin_layout Plain Layout
  11109. Fraction of genic reads in each sample aligned to non-globin genes, with
  11110. and without globin blocking (GB).
  11111. \end_layout
  11112. \end_inset
  11113. \begin_inset CommandInset label
  11114. LatexCommand label
  11115. name "fig:Fraction-of-genic-reads"
  11116. \end_inset
  11117. Fraction of genic reads in each sample aligned to non-globin genes, with
  11118. and without globin blocking (GB).
  11119. \series default
  11120. All reads in each sequencing library were aligned to the cynomolgus genome, and
  11121. the number of reads uniquely aligning to each gene was counted.
  11122. For each sample, counts were summed separately for all globin genes and
  11123. for the remainder of the genes (non-globin genes), and the fraction of
  11124. genic reads aligned to non-globin genes was computed.
  11125. Each point represents an individual sample.
  11126. Gray + signs indicate the means for globin-blocked libraries and unblocked
  11127. libraries.
  11128. The overall distribution for each group is represented as a notched box
  11129. plot.
  11130. Points are randomly spread vertically to avoid excessive overlapping.
  11131. \end_layout
  11132. \end_inset
  11133. \end_layout
  11134. \end_inset
  11135. \end_layout
  11136. \begin_layout Standard
  11137. Another important aspect is that the standard deviations in Table
  11138. \begin_inset CommandInset ref
  11139. LatexCommand ref
  11140. reference "tab:Fractions-of-reads"
  11141. plural "false"
  11142. caps "false"
  11143. noprefix "false"
  11144. \end_inset
  11145. are uniformly smaller in the GB samples than the non-GB ones, indicating
  11146. much greater consistency of yield.
  11147. This is best seen in the percentage of non-globin reads as a fraction of
  11148. total reads aligned to annotated genes (genic reads).
  11149. For the non-GB samples, this measure ranges from 10.9% to 80.9%, while for
  11150. the GB samples it ranges from 81.9% to 99.9% (Figure
  11151. \begin_inset CommandInset ref
  11152. LatexCommand ref
  11153. reference "fig:Fraction-of-genic-reads"
  11154. plural "false"
  11155. caps "false"
  11156. noprefix "false"
  11157. \end_inset
  11158. ).
  11159. This means that for applications where it is critical that each sample
  11160. achieve a specified minimum coverage in order to provide useful information,
  11161. it would be necessary to budget up to 10 times the sequencing depth per
  11162. sample without globin blocking, even though the average yield improvement
  11163. for globin blocking is only 2-fold, because every sample has a chance of
  11164. being 90% globin and 10% useful reads.
  11165. Hence, the more consistent behavior of GB samples makes planning an experiment
  11166. easier and more efficient because it eliminates the need to over-sequence
  11167. every sample in order to guard against the worst case of a high-globin
  11168. fraction.
  11169. \end_layout
  11170. \begin_layout Subsection
  11171. Globin blocking lowers the noise floor and allows detection of about 2000
  11172. more low-expression genes
  11173. \end_layout
  11174. \begin_layout Standard
  11175. \begin_inset Flex TODO Note (inline)
  11176. status open
  11177. \begin_layout Plain Layout
  11178. Remove redundant titles from figures
  11179. \end_layout
  11180. \end_inset
  11181. \end_layout
  11182. \begin_layout Standard
  11183. \begin_inset Float figure
  11184. wide false
  11185. sideways false
  11186. status collapsed
  11187. \begin_layout Plain Layout
  11188. \align center
  11189. \begin_inset Graphics
  11190. filename graphics/Globin Paper/figure2 - aveLogCPM-colored.pdf
  11191. lyxscale 50
  11192. height 60theight%
  11193. \end_inset
  11194. \end_layout
  11195. \begin_layout Plain Layout
  11196. \begin_inset Caption Standard
  11197. \begin_layout Plain Layout
  11198. \series bold
  11199. \begin_inset Argument 1
  11200. status collapsed
  11201. \begin_layout Plain Layout
  11202. Distributions of average group gene abundances when normalized separately
  11203. or together.
  11204. \end_layout
  11205. \end_inset
  11206. \begin_inset CommandInset label
  11207. LatexCommand label
  11208. name "fig:logcpm-dists"
  11209. \end_inset
  11210. Distributions of average group gene abundances when normalized separately
  11211. or together.
  11212. \series default
  11213. All reads in each sequencing library were aligned to the cyno genome, and
  11214. the number of reads uniquely aligning to each gene was counted.
  11215. Genes with zero counts in all libraries were discarded.
  11216. Libraries were normalized using the TMM method.
  11217. Libraries were split into globin-blocked (GB) and non-GB groups and the
  11218. average abundance for each gene in both groups, measured in log2 counts
  11219. per million reads counted, was computed using the aveLogCPM function.
  11220. The distribution of average gene logCPM values was plotted for both groups
  11221. using a kernel density plot to approximate a continuous distribution.
  11222. The logCPM GB distributions are marked in red, non-GB in blue.
  11223. The black vertical line denotes the chosen detection threshold of -1.
  11224. Top panel: Libraries were split into GB and non-GB groups first and normalized
  11225. separately.
  11226. Bottom panel: Libraries were all normalized together first and then split
  11227. into groups.
  11228. \end_layout
  11229. \end_inset
  11230. \end_layout
  11231. \begin_layout Plain Layout
  11232. \end_layout
  11233. \end_inset
  11234. \end_layout
  11235. \begin_layout Standard
  11236. Since globin blocking yields more usable sequencing depth, it should also
  11237. allow detection of more genes at any given threshold.
  11238. When we looked at the distribution of average normalized logCPM values
  11239. across all libraries for genes with at least one read assigned to them,
  11240. we observed the expected bimodal distribution, with a high-abundance "signal"
  11241. peak representing detected genes and a low-abundance "noise" peak representing
  11242. genes whose read count did not rise above the noise floor (Figure
  11243. \begin_inset CommandInset ref
  11244. LatexCommand ref
  11245. reference "fig:logcpm-dists"
  11246. plural "false"
  11247. caps "false"
  11248. noprefix "false"
  11249. \end_inset
  11250. ).
  11251. Consistent with the 2-fold increase in raw counts assigned to non-globin
  11252. genes, the signal peak for GB samples is shifted to the right relative
  11253. to the non-GB signal peak.
  11254. When all the samples are normalized together, this difference is normalized
  11255. out, lining up the signal peaks, and this reveals that, as expected, the
  11256. noise floor for the GB samples is about 2-fold lower.
  11257. This greater separation between signal and noise peaks in the GB samples
  11258. means that low-expression genes should be more easily detected and more
  11259. precisely quantified than in the non-GB samples.
  11260. \end_layout
  11261. \begin_layout Standard
  11262. \begin_inset Float figure
  11263. wide false
  11264. sideways false
  11265. status collapsed
  11266. \begin_layout Plain Layout
  11267. \align center
  11268. \begin_inset Graphics
  11269. filename graphics/Globin Paper/figure3 - detection.pdf
  11270. lyxscale 50
  11271. width 70col%
  11272. \end_inset
  11273. \end_layout
  11274. \begin_layout Plain Layout
  11275. \begin_inset Caption Standard
  11276. \begin_layout Plain Layout
  11277. \series bold
  11278. \begin_inset Argument 1
  11279. status collapsed
  11280. \begin_layout Plain Layout
  11281. Gene detections as a function of abundance thresholds in globin-blocked
  11282. (GB) and non-GB samples.
  11283. \end_layout
  11284. \end_inset
  11285. \begin_inset CommandInset label
  11286. LatexCommand label
  11287. name "fig:Gene-detections"
  11288. \end_inset
  11289. Gene detections as a function of abundance thresholds in globin-blocked
  11290. (GB) and non-GB samples.
  11291. \series default
  11292. Average abundance (logCPM,
  11293. \begin_inset Formula $\log_{2}$
  11294. \end_inset
  11295. counts per million reads counted) was computed by separate group normalization
  11296. as described in Figure
  11297. \begin_inset CommandInset ref
  11298. LatexCommand ref
  11299. reference "fig:logcpm-dists"
  11300. plural "false"
  11301. caps "false"
  11302. noprefix "false"
  11303. \end_inset
  11304. for both the GB and non-GB groups, as well as for all samples considered
  11305. as one large group.
  11306. For every integer threshold from -2 to 3, the number of genes detected
  11307. at or above that logCPM threshold was plotted for each group.
  11308. \end_layout
  11309. \end_inset
  11310. \end_layout
  11311. \begin_layout Plain Layout
  11312. \end_layout
  11313. \end_inset
  11314. \end_layout
  11315. \begin_layout Standard
  11316. Based on these distributions, we selected a detection threshold of
  11317. \begin_inset Formula $-1$
  11318. \end_inset
  11319. , which is approximately the leftmost edge of the trough between the signal
  11320. and noise peaks.
  11321. This represents the most liberal possible detection threshold that doesn't
  11322. call substantial numbers of noise genes as detected.
  11323. Among the full dataset, 13429 genes were detected at this threshold, and
  11324. 22276 were not.
  11325. When considering the GB libraries and non-GB libraries separately and re-comput
  11326. ing normalization factors independently within each group, 14535 genes were
  11327. detected in the GB libraries while only 12460 were detected in the non-GB
  11328. libraries.
  11329. Thus, GB allowed the detection of about 2000 extra genes that were buried under
  11330. the noise floor without GB.
  11331. This pattern of at least 2000 additional genes detected with GB was also
  11332. consistent across a wide range of possible detection thresholds, from -2
  11333. to 3 (see Figure
  11334. \begin_inset CommandInset ref
  11335. LatexCommand ref
  11336. reference "fig:Gene-detections"
  11337. plural "false"
  11338. caps "false"
  11339. noprefix "false"
  11340. \end_inset
  11341. ).
  11342. \end_layout
  11343. \begin_layout Subsection
  11344. Globin blocking does not add significant additional noise or decrease sample
  11345. quality
  11346. \end_layout
  11347. \begin_layout Standard
  11348. One potential worry is that the globin blocking protocol could perturb the
  11349. levels of non-globin genes.
  11350. There are two kinds of possible perturbations: systematic and random.
  11351. The former is not a major concern for detection of differential expression,
  11352. since a 2-fold change in every sample has no effect on the relative fold
  11353. change between samples.
  11354. In contrast, random perturbations would increase the noise and obscure
  11355. the signal in the dataset, reducing the capacity to detect differential
  11356. expression.
  11357. \end_layout
  11358. \begin_layout Standard
  11359. \begin_inset Float figure
  11360. wide false
  11361. sideways false
  11362. status collapsed
  11363. \begin_layout Plain Layout
  11364. \align center
  11365. \begin_inset Graphics
  11366. filename graphics/Globin Paper/figure4 - maplot-colored.pdf
  11367. lyxscale 50
  11368. width 60col%
  11369. groupId colwidth
  11370. \end_inset
  11371. \end_layout
  11372. \begin_layout Plain Layout
  11373. \begin_inset Caption Standard
  11374. \begin_layout Plain Layout
  11375. \begin_inset Argument 1
  11376. status collapsed
  11377. \begin_layout Plain Layout
  11378. MA plot showing effects of globin blocking on each gene's abundance.
  11379. \end_layout
  11380. \end_inset
  11381. \begin_inset CommandInset label
  11382. LatexCommand label
  11383. name "fig:MA-plot"
  11384. \end_inset
  11385. \series bold
  11386. MA plot showing effects of globin blocking on each gene's abundance.
  11387. \series default
  11388. All libraries were normalized together as described in Figure
  11389. \begin_inset CommandInset ref
  11390. LatexCommand ref
  11391. reference "fig:logcpm-dists"
  11392. plural "false"
  11393. caps "false"
  11394. noprefix "false"
  11395. \end_inset
  11396. , and genes with an average logCPM below -1 were filtered out.
  11397. Each remaining gene was tested for differential abundance with respect
  11398. to globin blocking (GB) using
  11399. \begin_inset Flex Code
  11400. status open
  11401. \begin_layout Plain Layout
  11402. edgeR
  11403. \end_layout
  11404. \end_inset
  11405. ’s quasi-likelihood F-test, fitting a negative binomial generalized linear
  11406. model to the table of read counts in each library.
  11407. For each gene,
  11408. \begin_inset Flex Code
  11409. status open
  11410. \begin_layout Plain Layout
  11411. edgeR
  11412. \end_layout
  11413. \end_inset
  11414. reported average abundance (logCPM),
  11415. \begin_inset Formula $\log_{2}$
  11416. \end_inset
  11417. fold change (logFC), p-value, and Benjamini-Hochberg adjusted false discovery
  11418. rate (FDR).
  11419. Each gene's logFC was plotted against its logCPM, colored by FDR.
  11420. Red points are significant at ≤10% FDR, and blue are not significant at
  11421. that threshold.
  11422. The alpha and beta globin genes targeted for blocking are marked with large
  11423. triangles, while all other genes are represented as small points.
  11424. \end_layout
  11425. \end_inset
  11426. \end_layout
  11427. \begin_layout Plain Layout
  11428. \end_layout
  11429. \end_inset
  11430. \end_layout
  11431. \begin_layout Standard
  11432. \begin_inset Flex TODO Note (inline)
  11433. status open
  11434. \begin_layout Plain Layout
  11435. Standardize on
  11436. \begin_inset Quotes eld
  11437. \end_inset
  11438. log2
  11439. \begin_inset Quotes erd
  11440. \end_inset
  11441. notation
  11442. \end_layout
  11443. \end_inset
  11444. \end_layout
  11445. \begin_layout Standard
  11446. The data do indeed show small systematic perturbations in gene levels (Figure
  11447. \begin_inset CommandInset ref
  11448. LatexCommand ref
  11449. reference "fig:MA-plot"
  11450. plural "false"
  11451. caps "false"
  11452. noprefix "false"
  11453. \end_inset
  11454. ).
  11455. Other than the 3 designated alpha and beta globin genes, two other genes
  11456. stand out as having especially large negative log fold changes: HBD and
  11457. LOC1021365.
  11458. HBD, delta globin, is most likely targeted by the blocking oligos due to
  11459. high sequence homology with the other globin genes.
  11460. LOC1021365 is the aforementioned ncRNA that is reverse-complementary to
  11461. one of the alpha-like genes and that would be expected to be removed during
  11462. the globin blocking step.
  11463. All other genes appear in a cluster centered vertically at 0, and the vast
  11464. majority of genes in this cluster show an absolute log2(FC) of 0.5 or less.
  11465. Nevertheless, many of these small perturbations are still statistically
  11466. significant, indicating that the globin blocking oligos likely cause very
  11467. small but non-zero systematic perturbations in measured gene expression
  11468. levels.
  11469. \end_layout
  11470. \begin_layout Standard
  11471. \begin_inset Float figure
  11472. wide false
  11473. sideways false
  11474. status collapsed
  11475. \begin_layout Plain Layout
  11476. \align center
  11477. \begin_inset Graphics
  11478. filename graphics/Globin Paper/figure5 - corrplot.pdf
  11479. lyxscale 50
  11480. width 70col%
  11481. \end_inset
  11482. \end_layout
  11483. \begin_layout Plain Layout
  11484. \begin_inset Caption Standard
  11485. \begin_layout Plain Layout
  11486. \series bold
  11487. \begin_inset Argument 1
  11488. status collapsed
  11489. \begin_layout Plain Layout
  11490. Comparison of inter-sample gene abundance correlations with and without
  11491. globin blocking.
  11492. \end_layout
  11493. \end_inset
  11494. \begin_inset CommandInset label
  11495. LatexCommand label
  11496. name "fig:gene-abundance-correlations"
  11497. \end_inset
  11498. Comparison of inter-sample gene abundance correlations with and without
  11499. globin blocking (GB).
  11500. \series default
  11501. All libraries were normalized together as described in Figure 2, and genes
  11502. with an average abundance (logCPM, log2 counts per million reads counted)
  11503. less than -1 were filtered out.
  11504. Each gene’s logCPM was computed in each library using the
  11505. \begin_inset Flex Code
  11506. status open
  11507. \begin_layout Plain Layout
  11508. edgeR
  11509. \end_layout
  11510. \end_inset
  11511. cpm function.
  11512. For each pair of biological samples, the Pearson correlation between those
  11513. samples' GB libraries was plotted against the correlation between the same
  11514. samples’ non-GB libraries.
  11515. Each point represents a unique pair of samples.
  11516. The solid gray line shows a quantile-quantile plot of the distribution of GB
  11517. correlations vs.
  11518. that of non-GB correlations.
  11519. The thin dashed line is the identity line, provided for reference.
  11520. \end_layout
  11521. \end_inset
  11522. \end_layout
  11523. \begin_layout Plain Layout
  11524. \end_layout
  11525. \end_inset
  11526. \end_layout
  11527. \begin_layout Standard
  11528. \begin_inset Flex TODO Note (inline)
  11529. status open
  11530. \begin_layout Plain Layout
  11531. Give these numbers the LaTeX math treatment
  11532. \end_layout
  11533. \end_inset
  11534. \end_layout
  11535. \begin_layout Standard
  11536. To evaluate the possibility of globin blocking causing random perturbations
  11537. and reducing sample quality, we computed the Pearson correlation between
  11538. logCPM values for every pair of samples with and without GB and plotted
  11539. them against each other (Figure
  11540. \begin_inset CommandInset ref
  11541. LatexCommand ref
  11542. reference "fig:gene-abundance-correlations"
  11543. plural "false"
  11544. caps "false"
  11545. noprefix "false"
  11546. \end_inset
  11547. ).
  11548. The plot indicated that the GB libraries have higher sample-to-sample correlati
  11549. ons than the non-GB libraries.
  11550. Parametric and nonparametric tests for differences between the correlations
  11551. with and without GB both confirmed that this difference was highly significant
  11552. (2-sided paired t-test: t = 37.2, df = 665, P ≪ 2.2e-16; 2-sided Wilcoxon
  11553. sign-rank test: V = 2195, P ≪ 2.2e-16).
  11554. Performing the same tests on the Spearman correlations gave the same conclusion
  11555. (t-test: t = 26.8, df = 665, P ≪ 2.2e-16; sign-rank test: V = 8781, P ≪ 2.2e-16).
  11556. The
  11557. \begin_inset Flex Code
  11558. status open
  11559. \begin_layout Plain Layout
  11560. edgeR
  11561. \end_layout
  11562. \end_inset
  11563. package was used to compute the overall biological coefficient of variation
  11564. (BCV) for GB and non-GB libraries, which showed that globin blocking resulted
  11565. in a negligible increase in the BCV (0.417 with GB vs.
  11566. 0.400 without).
  11567. The near equality of the BCVs for both sets indicates that the higher correlati
  11568. ons in the GB libraries are most likely a result of the increased yield
  11569. of useful reads, which reduces the contribution of Poisson counting uncertainty
  11570. to the overall variance of the logCPM values
  11571. \begin_inset CommandInset citation
  11572. LatexCommand cite
  11573. key "McCarthy2012"
  11574. literal "false"
  11575. \end_inset
  11576. .
  11577. This improves the precision of expression measurements and more than offsets
  11578. the negligible increase in BCV.
  11579. \end_layout
  11580. \begin_layout Subsection
  11581. More differentially expressed genes are detected with globin blocking
  11582. \end_layout
  11583. \begin_layout Standard
  11584. \begin_inset Float table
  11585. wide false
  11586. sideways false
  11587. status collapsed
  11588. \begin_layout Plain Layout
  11589. \align center
  11590. \begin_inset Tabular
  11591. <lyxtabular version="3" rows="5" columns="5">
  11592. <features tabularvalignment="middle">
  11593. <column alignment="center" valignment="top">
  11594. <column alignment="center" valignment="top">
  11595. <column alignment="center" valignment="top">
  11596. <column alignment="center" valignment="top">
  11597. <column alignment="center" valignment="top">
  11598. <row>
  11599. <cell alignment="center" valignment="top" usebox="none">
  11600. \begin_inset Text
  11601. \begin_layout Plain Layout
  11602. \end_layout
  11603. \end_inset
  11604. </cell>
  11605. <cell alignment="center" valignment="top" usebox="none">
  11606. \begin_inset Text
  11607. \begin_layout Plain Layout
  11608. \end_layout
  11609. \end_inset
  11610. </cell>
  11611. <cell multicolumn="1" alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
  11612. \begin_inset Text
  11613. \begin_layout Plain Layout
  11614. \series bold
  11615. No Globin Blocking
  11616. \end_layout
  11617. \end_inset
  11618. </cell>
  11619. <cell multicolumn="2" alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" usebox="none">
  11620. \begin_inset Text
  11621. \begin_layout Plain Layout
  11622. \end_layout
  11623. \end_inset
  11624. </cell>
  11625. <cell multicolumn="2" alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" rightline="true" usebox="none">
  11626. \begin_inset Text
  11627. \begin_layout Plain Layout
  11628. \end_layout
  11629. \end_inset
  11630. </cell>
  11631. </row>
  11632. <row>
  11633. <cell alignment="center" valignment="top" usebox="none">
  11634. \begin_inset Text
  11635. \begin_layout Plain Layout
  11636. \end_layout
  11637. \end_inset
  11638. </cell>
  11639. <cell alignment="center" valignment="top" usebox="none">
  11640. \begin_inset Text
  11641. \begin_layout Plain Layout
  11642. \end_layout
  11643. \end_inset
  11644. </cell>
  11645. <cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
  11646. \begin_inset Text
  11647. \begin_layout Plain Layout
  11648. \series bold
  11649. Up
  11650. \end_layout
  11651. \end_inset
  11652. </cell>
  11653. <cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
  11654. \begin_inset Text
  11655. \begin_layout Plain Layout
  11656. \series bold
  11657. NS
  11658. \end_layout
  11659. \end_inset
  11660. </cell>
  11661. <cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
  11662. \begin_inset Text
  11663. \begin_layout Plain Layout
  11664. \series bold
  11665. Down
  11666. \end_layout
  11667. \end_inset
  11668. </cell>
  11669. </row>
  11670. <row>
  11671. <cell multirow="3" alignment="center" valignment="middle" topline="true" bottomline="true" leftline="true" usebox="none">
  11672. \begin_inset Text
  11673. \begin_layout Plain Layout
  11674. \series bold
  11675. Globin Blocking
  11676. \end_layout
  11677. \end_inset
  11678. </cell>
  11679. <cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
  11680. \begin_inset Text
  11681. \begin_layout Plain Layout
  11682. \series bold
  11683. Up
  11684. \end_layout
  11685. \end_inset
  11686. </cell>
  11687. <cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
  11688. \begin_inset Text
  11689. \begin_layout Plain Layout
  11690. \family roman
  11691. \series medium
  11692. \shape up
  11693. \size normal
  11694. \emph off
  11695. \bar no
  11696. \strikeout off
  11697. \xout off
  11698. \uuline off
  11699. \uwave off
  11700. \noun off
  11701. \color none
  11702. 231
  11703. \end_layout
  11704. \end_inset
  11705. </cell>
  11706. <cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
  11707. \begin_inset Text
  11708. \begin_layout Plain Layout
  11709. \family roman
  11710. \series medium
  11711. \shape up
  11712. \size normal
  11713. \emph off
  11714. \bar no
  11715. \strikeout off
  11716. \xout off
  11717. \uuline off
  11718. \uwave off
  11719. \noun off
  11720. \color none
  11721. 515
  11722. \end_layout
  11723. \end_inset
  11724. </cell>
  11725. <cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
  11726. \begin_inset Text
  11727. \begin_layout Plain Layout
  11728. \family roman
  11729. \series medium
  11730. \shape up
  11731. \size normal
  11732. \emph off
  11733. \bar no
  11734. \strikeout off
  11735. \xout off
  11736. \uuline off
  11737. \uwave off
  11738. \noun off
  11739. \color none
  11740. 2
  11741. \end_layout
  11742. \end_inset
  11743. </cell>
  11744. </row>
  11745. <row>
  11746. <cell multirow="4" alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
  11747. \begin_inset Text
  11748. \begin_layout Plain Layout
  11749. \end_layout
  11750. \end_inset
  11751. </cell>
  11752. <cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
  11753. \begin_inset Text
  11754. \begin_layout Plain Layout
  11755. \series bold
  11756. NS
  11757. \end_layout
  11758. \end_inset
  11759. </cell>
  11760. <cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
  11761. \begin_inset Text
  11762. \begin_layout Plain Layout
  11763. \family roman
  11764. \series medium
  11765. \shape up
  11766. \size normal
  11767. \emph off
  11768. \bar no
  11769. \strikeout off
  11770. \xout off
  11771. \uuline off
  11772. \uwave off
  11773. \noun off
  11774. \color none
  11775. 160
  11776. \end_layout
  11777. \end_inset
  11778. </cell>
  11779. <cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
  11780. \begin_inset Text
  11781. \begin_layout Plain Layout
  11782. \family roman
  11783. \series medium
  11784. \shape up
  11785. \size normal
  11786. \emph off
  11787. \bar no
  11788. \strikeout off
  11789. \xout off
  11790. \uuline off
  11791. \uwave off
  11792. \noun off
  11793. \color none
  11794. 11235
  11795. \end_layout
  11796. \end_inset
  11797. </cell>
  11798. <cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
  11799. \begin_inset Text
  11800. \begin_layout Plain Layout
  11801. \family roman
  11802. \series medium
  11803. \shape up
  11804. \size normal
  11805. \emph off
  11806. \bar no
  11807. \strikeout off
  11808. \xout off
  11809. \uuline off
  11810. \uwave off
  11811. \noun off
  11812. \color none
  11813. 136
  11814. \end_layout
  11815. \end_inset
  11816. </cell>
  11817. </row>
  11818. <row>
  11819. <cell multirow="4" alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" usebox="none">
  11820. \begin_inset Text
  11821. \begin_layout Plain Layout
  11822. \end_layout
  11823. \end_inset
  11824. </cell>
  11825. <cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" usebox="none">
  11826. \begin_inset Text
  11827. \begin_layout Plain Layout
  11828. \series bold
  11829. Down
  11830. \end_layout
  11831. \end_inset
  11832. </cell>
  11833. <cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" usebox="none">
  11834. \begin_inset Text
  11835. \begin_layout Plain Layout
  11836. \family roman
  11837. \series medium
  11838. \shape up
  11839. \size normal
  11840. \emph off
  11841. \bar no
  11842. \strikeout off
  11843. \xout off
  11844. \uuline off
  11845. \uwave off
  11846. \noun off
  11847. \color none
  11848. 0
  11849. \end_layout
  11850. \end_inset
  11851. </cell>
  11852. <cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" usebox="none">
  11853. \begin_inset Text
  11854. \begin_layout Plain Layout
  11855. \family roman
  11856. \series medium
  11857. \shape up
  11858. \size normal
  11859. \emph off
  11860. \bar no
  11861. \strikeout off
  11862. \xout off
  11863. \uuline off
  11864. \uwave off
  11865. \noun off
  11866. \color none
  11867. 548
  11868. \end_layout
  11869. \end_inset
  11870. </cell>
  11871. <cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" rightline="true" usebox="none">
  11872. \begin_inset Text
  11873. \begin_layout Plain Layout
  11874. \family roman
  11875. \series medium
  11876. \shape up
  11877. \size normal
  11878. \emph off
  11879. \bar no
  11880. \strikeout off
  11881. \xout off
  11882. \uuline off
  11883. \uwave off
  11884. \noun off
  11885. \color none
  11886. 127
  11887. \end_layout
  11888. \end_inset
  11889. </cell>
  11890. </row>
  11891. </lyxtabular>
  11892. \end_inset
  11893. \end_layout
  11894. \begin_layout Plain Layout
  11895. \begin_inset Caption Standard
  11896. \begin_layout Plain Layout
  11897. \series bold
  11898. \begin_inset Argument 1
  11899. status open
  11900. \begin_layout Plain Layout
  11901. Comparison of significantly differentially expressed genes with and without
  11902. globin blocking.
  11903. \end_layout
  11904. \end_inset
  11905. \begin_inset CommandInset label
  11906. LatexCommand label
  11907. name "tab:Comparison-of-significant"
  11908. \end_inset
  11909. Comparison of significantly differentially expressed genes with and without
  11910. globin blocking.
  11911. \series default
  11912. Up, Down: Genes significantly up/down-regulated in post-transplant samples
  11913. relative to pre-transplant samples, with a false discovery rate of 10%
  11914. or less.
  11915. NS: Non-significant genes (false discovery rate greater than 10%).
  11916. \end_layout
  11917. \end_inset
  11918. \end_layout
  11919. \begin_layout Plain Layout
  11920. \end_layout
  11921. \end_inset
  11922. \end_layout
  11923. \begin_layout Standard
  11924. To compare performance on differential gene expression tests, we took subsets
  11925. of both the GB and non-GB libraries with exactly one pre-transplant and
  11926. one post-transplant sample for each animal that had paired samples available
  11927. for analysis (N=7 animals, N=14 samples in each subset).
  11928. The same test for pre- vs.
  11929. post-transplant differential gene expression was performed on the same
  11930. 7 pairs of samples from GB libraries and non-GB libraries, in each case
  11931. using an FDR of 10% as the threshold of significance.
  11932. Out of 12954 genes that passed the detection threshold in both subsets,
  11933. 358 were called significantly differentially expressed in the same direction
  11934. in both sets; 1063 were differentially expressed in the GB set only; 296
  11935. were differentially expressed in the non-GB set only; 2 genes were called
  11936. significantly up in the GB set but significantly down in the non-GB set;
  11937. and the remaining 11235 were not called differentially expressed in either
  11938. set.
  11939. These data are summarized in Table
  11940. \begin_inset CommandInset ref
  11941. LatexCommand ref
  11942. reference "tab:Comparison-of-significant"
  11943. plural "false"
  11944. caps "false"
  11945. noprefix "false"
  11946. \end_inset
  11947. .
  11948. The differences in BCV calculated by edgeR for these subsets of samples
  11949. were negligible (BCV = 0.302 for GB and 0.297 for non-GB).
  11950. \end_layout
  11951. \begin_layout Standard
  11952. The key point is that the GB data result in substantially more differential
  11953. expression calls than the non-GB data.
  11954. Since there is no gold standard for this dataset, it is impossible to be
  11955. certain whether this is due to under-calling of differential expression
  11956. in the non-GB samples or over-calling in the GB samples.
  11957. However, given that both datasets are derived from the same biological
  11958. samples and have nearly equal BCVs, it is more likely that the larger number
  11959. of DE calls in the GB samples are genuine detections that were enabled
  11960. by the higher sequencing depth and measurement precision of the GB samples.
  11961. Note that the same set of genes was considered in both subsets, so the
  11962. larger number of differentially expressed gene calls in the GB data set
  11963. reflects a greater sensitivity to detect significant differential gene
  11964. expression and not simply the larger total number of detected genes in
  11965. GB samples described earlier.
  11966. \end_layout
  11967. \begin_layout Section
  11968. Discussion
  11969. \end_layout
  11970. \begin_layout Standard
  11971. The original experience with whole blood gene expression profiling on DNA
  11972. microarrays demonstrated that the high concentration of globin transcripts
  11973. significantly reduced the sensitivity to detect genes with relatively low
  11974. expression levels.
  11975. To address this limitation, commercial protocols for globin reduction were
  11976. developed based on strategies to block globin transcript amplification
  11977. during labeling or physically removing globin transcripts by affinity bead
  11978. methods
  11979. \begin_inset CommandInset citation
  11980. LatexCommand cite
  11981. key "Winn2010"
  11982. literal "false"
  11983. \end_inset
  11984. .
  11985. More recently, using the latest generation of labeling protocols and arrays,
  11986. it was determined that globin reduction was no longer necessary to obtain
  11987. sufficient sensitivity to detect differential transcript expression
  11988. \begin_inset CommandInset citation
  11989. LatexCommand cite
  11990. key "NuGEN2010"
  11991. literal "false"
  11992. \end_inset
  11993. .
  11994. However, we are not aware of any publications using these currently available
  11995. protocols with the latest generation of microarrays that actually compare
  11996. the detection sensitivity with and without globin reduction.
  11997. Nevertheless, in practice, omitting globin reduction has now been generally
  11998. adopted, primarily driven by concerns for cost control.
  11999. The main objective of our work was to directly test the impact of globin
  12000. gene transcripts and a new globin blocking protocol for application to
  12001. the newest generation of differential gene expression profiling determined
  12002. using next generation sequencing.
  12003. \end_layout
  12004. \begin_layout Standard
  12005. The challenge of doing global gene expression profiling in cynomolgus monkeys
  12006. is that the currently available arrays were never designed to comprehensively
  12007. cover this genome and have not been updated since the first assemblies
  12008. of the cynomolgus genome were published.
  12009. Therefore, we determined that the best strategy for peripheral blood profiling
  12010. was to do deep
  12011. \begin_inset Flex Glossary Term
  12012. status open
  12013. \begin_layout Plain Layout
  12014. RNA-seq
  12015. \end_layout
  12016. \end_inset
  12017. and inform the workflow using the latest available genome assembly and
  12018. annotation
  12019. \begin_inset CommandInset citation
  12020. LatexCommand cite
  12021. key "Wilson2013"
  12022. literal "false"
  12023. \end_inset
  12024. .
  12025. However, it was not immediately clear whether globin reduction was necessary
  12026. for
  12027. \begin_inset Flex Glossary Term
  12028. status open
  12029. \begin_layout Plain Layout
  12030. RNA-seq
  12031. \end_layout
  12032. \end_inset
  12033. or how much improvement in efficiency or sensitivity to detect differential
  12034. gene expression would be achieved for the added cost and work.
  12035. \end_layout
  12036. \begin_layout Standard
  12037. We only found one report that demonstrated that globin reduction significantly
  12038. improved the effective read yields for sequencing of human peripheral blood
  12039. cell RNA using a DeepSAGE protocol
  12040. \begin_inset CommandInset citation
  12041. LatexCommand cite
  12042. key "Mastrokolias2012"
  12043. literal "false"
  12044. \end_inset
  12045. .
  12046. The approach to DeepSAGE involves two different restriction enzymes that
  12047. purify and then tag small fragments of transcripts at specific locations
  12048. and thus significantly reduces the complexity of the transcriptome.
  12049. Therefore, we could not determine how DeepSAGE results would translate
  12050. to the common strategy in the field for assaying the entire transcript
  12051. population by whole-transcriptome 3’-end
  12052. \begin_inset Flex Glossary Term
  12053. status open
  12054. \begin_layout Plain Layout
  12055. RNA-seq
  12056. \end_layout
  12057. \end_inset
  12058. .
  12059. Furthermore, if globin reduction is necessary, we also needed a globin
  12060. reduction method specific to cynomolgus globin sequences that would work
  12061. in an organism for which no kit is available off the shelf.
  12062. \end_layout
  12063. \begin_layout Standard
  12064. As mentioned above, the addition of globin blocking oligos has a very small
  12065. impact on measured gene expression levels.
  12066. However, this is a non-issue for the purposes of differential expression
  12067. testing, since a systematic change in a gene in all samples does not affect
  12068. relative expression levels between samples.
  12069. Nevertheless, we must acknowledge that simple comparisons of gene expression
  12070. data obtained by GB and non-GB protocols are not possible without additional
  12071. normalization.
  12072. \end_layout
  12073. \begin_layout Standard
  12074. More importantly, globin blocking not only nearly doubles the yield of usable
  12075. reads, it also increases inter-sample correlation and sensitivity to detect
  12076. differential gene expression relative to the same set of samples profiled
  12077. without blocking.
  12078. In addition, globin blocking does not add a significant amount of random
  12079. noise to the data.
  12080. Globin blocking thus represents a cost-effective way to squeeze more data
  12081. and statistical power out of the same blood samples and the same amount
  12082. of sequencing.
  12083. In conclusion, globin reduction greatly increases the yield of useful
  12084. \begin_inset Flex Glossary Term
  12085. status open
  12086. \begin_layout Plain Layout
  12087. RNA-seq
  12088. \end_layout
  12089. \end_inset
  12090. reads mapping to the rest of the genome, with minimal perturbations in
  12091. the relative levels of non-globin genes.
  12092. Based on these results, globin transcript reduction using sequence-specific,
  12093. complementary blocking oligonucleotides is recommended for all deep
  12094. \begin_inset Flex Glossary Term
  12095. status open
  12096. \begin_layout Plain Layout
  12097. RNA-seq
  12098. \end_layout
  12099. \end_inset
  12100. of cynomolgus and other nonhuman primate blood samples.
  12101. \end_layout
  12102. \begin_layout Section
  12103. Future Directions
  12104. \end_layout
  12105. \begin_layout Standard
  12106. One drawback of the globin blocking method presented in this analysis is
  12107. a poor yield of genic reads, only around 50%.
  12108. In a separate experiment, the reagent mixture was modified so as to address
  12109. this drawback, resulting in a method that produces an even better reduction
  12110. in globin reads without reducing the overall fraction of genic reads.
  12111. However, the data showing this improvement consists of only a few test
  12112. samples, so the larger data set analyzed above was chosen in order to demonstra
  12113. te the effectiveness of the method in reducing globin reads while preserving
  12114. the biological signal.
  12115. \end_layout
  12116. \begin_layout Standard
  12117. The motivation for developing a fast, practical way to enrich for non-globin
  12118. reads in cyno blood samples was to enable a large-scale
  12119. \begin_inset Flex Glossary Term
  12120. status open
  12121. \begin_layout Plain Layout
  12122. RNA-seq
  12123. \end_layout
  12124. \end_inset
  12125. experiment investigating the effects of mesenchymal stem cell infusion
  12126. on blood gene expression in cynomolgus transplant recipients in a time
  12127. course after transplantation.
  12128. With the globin blocking method in place, the way is now clear for this
  12129. experiment to proceed.
  12130. \end_layout
  12131. \begin_layout Chapter
  12132. Future Directions
  12133. \end_layout
  12134. \begin_layout Standard
  12135. \begin_inset Flex TODO Note (inline)
  12136. status open
  12137. \begin_layout Plain Layout
  12138. If there are any chapter-independent future directions, put them here.
  12139. Otherwise, delete this section.
  12140. Check in the directions if this is OK.
  12141. \end_layout
  12142. \end_inset
  12143. \end_layout
  12144. \begin_layout Chapter
  12145. Closing remarks
  12146. \end_layout
  12147. \begin_layout Standard
  12148. \begin_inset ERT
  12149. status collapsed
  12150. \begin_layout Plain Layout
  12151. % Use "References" as the title of the Bibliography
  12152. \end_layout
  12153. \begin_layout Plain Layout
  12154. \backslash
  12155. renewcommand{
  12156. \backslash
  12157. bibname}{References}
  12158. \end_layout
  12159. \end_inset
  12160. \end_layout
  12161. \begin_layout Standard
  12162. \begin_inset CommandInset bibtex
  12163. LatexCommand bibtex
  12164. btprint "btPrintCited"
  12165. bibfiles "code-refs,refs-PROCESSED"
  12166. options "bibtotoc,unsrt"
  12167. \end_inset
  12168. \end_layout
  12169. \begin_layout Standard
  12170. \begin_inset Flex TODO Note (inline)
  12171. status open
  12172. \begin_layout Plain Layout
  12173. Check bib entry formatting & sort order
  12174. \end_layout
  12175. \end_inset
  12176. \end_layout
  12177. \begin_layout Standard
  12178. \begin_inset Flex TODO Note (inline)
  12179. status open
  12180. \begin_layout Plain Layout
  12181. Check in-text citation format.
  12182. Probably don't just want [1], [2], etc.
  12183. \end_layout
  12184. \end_inset
  12185. \end_layout
  12186. \end_body
  12187. \end_document