thesis.lyx 181 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177
71778177917801781178217831784178517861787178817891790179117921793179417951796179717981799180018011802180318041805180618071808180918101811181218131814181518161817181818191820182118221823182418251826182718281829183018311832183318341835183618371838183918401841184218431844184518461847184818491850185118521853185418551856185718581859186018611862186318641865186618671868186918701871187218731874187518761877187818791880188118821883188418851886188718881889189018911892189318941895189618971898189919001901190219031904190519061907190819091910191119121913191419151916191719181919192019211922192319241925192619271928192919301931193219331934193519361937193819391940194119421943194419451946194719481949195019511952195319541955195619571958195919601961196219631964196519661967196819691970197119721973197419751976197719781979198019811982198319841985198619871988198919901991199219931994199519961997199819992000200120022003200420052006200720082009201020112012201320142015201620172018201920202021202220232024202520262027202820292030203120322033203420352036203720382039204020412042204320442045204620472048204920502051205220532054205520562057205820592060206120622063206420652066206720682069207020712072207320742075207620772078207920802081208220832084208520862087208820892090209120922093209420952096209720982099210021012102210321042105210621072108210921102111211221132114211521162117211821192120212121222123212421252126212721282129213021312132213321342135213621372138213921402141214221432144214521462147214821492150215121522153215421552156215721582159216021612162216321642165216621672168216921702171217221732174217521762177217821792180218121822183218421852186218721882189219021912192219321942195219621972198219922002201220222032204220522062207220822092210221122122213221422152216221722182219222022212222222322242225222622272228222922302231223222332234223522362237223822392240224122422243224422452246224722482249225022512252225322542255225622572258225922602261226222632264226522662267226822692270227122722273227422752276227
72278227922802281228222832284228522862287228822892290229122922293229422952296229722982299230023012302230323042305230623072308230923102311231223132314231523162317231823192320232123222323232423252326232723282329233023312332233323342335233623372338233923402341234223432344234523462347234823492350235123522353235423552356235723582359236023612362236323642365236623672368236923702371237223732374237523762377237823792380238123822383238423852386238723882389239023912392239323942395239623972398239924002401240224032404240524062407240824092410241124122413241424152416241724182419242024212422242324242425242624272428242924302431243224332434243524362437243824392440244124422443244424452446244724482449245024512452245324542455245624572458245924602461246224632464246524662467246824692470247124722473247424752476247724782479248024812482248324842485248624872488248924902491249224932494249524962497249824992500250125022503250425052506250725082509251025112512251325142515251625172518251925202521252225232524252525262527252825292530253125322533253425352536253725382539254025412542254325442545254625472548254925502551255225532554255525562557255825592560256125622563256425652566256725682569257025712572257325742575257625772578257925802581258225832584258525862587258825892590259125922593259425952596259725982599260026012602260326042605260626072608260926102611261226132614261526162617261826192620262126222623262426252626262726282629263026312632263326342635263626372638263926402641264226432644264526462647264826492650265126522653265426552656265726582659266026612662266326642665266626672668266926702671267226732674267526762677267826792680268126822683268426852686268726882689269026912692269326942695269626972698269927002701270227032704270527062707270827092710271127122713271427152716271727182719272027212722272327242725272627272728272927302731273227332734273527362737273827392740274127422743274427452746274727482749275027512752275327542755275627572758275927602761276227632764276527662767276827692770277127722773277427752776277
72778277927802781278227832784278527862787278827892790279127922793279427952796279727982799280028012802280328042805280628072808280928102811281228132814281528162817281828192820282128222823282428252826282728282829283028312832283328342835283628372838283928402841284228432844284528462847284828492850285128522853285428552856285728582859286028612862286328642865286628672868286928702871287228732874287528762877287828792880288128822883288428852886288728882889289028912892289328942895289628972898289929002901290229032904290529062907290829092910291129122913291429152916291729182919292029212922292329242925292629272928292929302931293229332934293529362937293829392940294129422943294429452946294729482949295029512952295329542955295629572958295929602961296229632964296529662967296829692970297129722973297429752976297729782979298029812982298329842985298629872988298929902991299229932994299529962997299829993000300130023003300430053006300730083009301030113012301330143015301630173018301930203021302230233024302530263027302830293030303130323033303430353036303730383039304030413042304330443045304630473048304930503051305230533054305530563057305830593060306130623063306430653066306730683069307030713072307330743075307630773078307930803081308230833084308530863087308830893090309130923093309430953096309730983099310031013102310331043105310631073108310931103111311231133114311531163117311831193120312131223123312431253126312731283129313031313132313331343135313631373138313931403141314231433144314531463147314831493150315131523153315431553156315731583159316031613162316331643165316631673168316931703171317231733174317531763177317831793180318131823183318431853186318731883189319031913192319331943195319631973198319932003201320232033204320532063207320832093210321132123213321432153216321732183219322032213222322332243225322632273228322932303231323232333234323532363237323832393240324132423243324432453246324732483249325032513252325332543255325632573258325932603261326232633264326532663267326832693270327132723273327432753276327
73278327932803281328232833284328532863287328832893290329132923293329432953296329732983299330033013302330333043305330633073308330933103311331233133314331533163317331833193320332133223323332433253326332733283329333033313332333333343335333633373338333933403341334233433344334533463347334833493350335133523353335433553356335733583359336033613362336333643365336633673368336933703371337233733374337533763377337833793380338133823383338433853386338733883389339033913392339333943395339633973398339934003401340234033404340534063407340834093410341134123413341434153416341734183419342034213422342334243425342634273428342934303431343234333434343534363437343834393440344134423443344434453446344734483449345034513452345334543455345634573458345934603461346234633464346534663467346834693470347134723473347434753476347734783479348034813482348334843485348634873488348934903491349234933494349534963497349834993500350135023503350435053506350735083509351035113512351335143515351635173518351935203521352235233524352535263527352835293530353135323533353435353536353735383539354035413542354335443545354635473548354935503551355235533554355535563557355835593560356135623563356435653566356735683569357035713572357335743575357635773578357935803581358235833584358535863587358835893590359135923593359435953596359735983599360036013602360336043605360636073608360936103611361236133614361536163617361836193620362136223623362436253626362736283629363036313632363336343635363636373638363936403641364236433644364536463647364836493650365136523653365436553656365736583659366036613662366336643665366636673668366936703671367236733674367536763677367836793680368136823683368436853686368736883689369036913692369336943695369636973698369937003701370237033704370537063707370837093710371137123713371437153716371737183719372037213722372337243725372637273728372937303731373237333734373537363737373837393740374137423743374437453746374737483749375037513752375337543755375637573758375937603761376237633764376537663767376837693770377137723773377437753776377
73778377937803781378237833784378537863787378837893790379137923793379437953796379737983799380038013802380338043805380638073808380938103811381238133814381538163817381838193820382138223823382438253826382738283829383038313832383338343835383638373838383938403841384238433844384538463847384838493850385138523853385438553856385738583859386038613862386338643865386638673868386938703871387238733874387538763877387838793880388138823883388438853886388738883889389038913892389338943895389638973898389939003901390239033904390539063907390839093910391139123913391439153916391739183919392039213922392339243925392639273928392939303931393239333934393539363937393839393940394139423943394439453946394739483949395039513952395339543955395639573958395939603961396239633964396539663967396839693970397139723973397439753976397739783979398039813982398339843985398639873988398939903991399239933994399539963997399839994000400140024003400440054006400740084009401040114012401340144015401640174018401940204021402240234024402540264027402840294030403140324033403440354036403740384039404040414042404340444045404640474048404940504051405240534054405540564057405840594060406140624063406440654066406740684069407040714072407340744075407640774078407940804081408240834084408540864087408840894090409140924093409440954096409740984099410041014102410341044105410641074108410941104111411241134114411541164117411841194120412141224123412441254126412741284129413041314132413341344135413641374138413941404141414241434144414541464147414841494150415141524153415441554156415741584159416041614162416341644165416641674168416941704171417241734174417541764177417841794180418141824183418441854186418741884189419041914192419341944195419641974198419942004201420242034204420542064207420842094210421142124213421442154216421742184219422042214222422342244225422642274228422942304231423242334234423542364237423842394240424142424243424442454246424742484249425042514252425342544255425642574258425942604261426242634264426542664267426842694270427142724273427442754276427
74278427942804281428242834284428542864287428842894290429142924293429442954296429742984299430043014302430343044305430643074308430943104311431243134314431543164317431843194320432143224323432443254326432743284329433043314332433343344335433643374338433943404341434243434344434543464347434843494350435143524353435443554356435743584359436043614362436343644365436643674368436943704371437243734374437543764377437843794380438143824383438443854386438743884389439043914392439343944395439643974398439944004401440244034404440544064407440844094410441144124413441444154416441744184419442044214422442344244425442644274428442944304431443244334434443544364437443844394440444144424443444444454446444744484449445044514452445344544455445644574458445944604461446244634464446544664467446844694470447144724473447444754476447744784479448044814482448344844485448644874488448944904491449244934494449544964497449844994500450145024503450445054506450745084509451045114512451345144515451645174518451945204521452245234524452545264527452845294530453145324533453445354536453745384539454045414542454345444545454645474548454945504551455245534554455545564557455845594560456145624563456445654566456745684569457045714572457345744575457645774578457945804581458245834584458545864587458845894590459145924593459445954596459745984599460046014602460346044605460646074608460946104611461246134614461546164617461846194620462146224623462446254626462746284629463046314632463346344635463646374638463946404641464246434644464546464647464846494650465146524653465446554656465746584659466046614662466346644665466646674668466946704671467246734674467546764677467846794680468146824683468446854686468746884689469046914692469346944695469646974698469947004701470247034704470547064707470847094710471147124713471447154716471747184719472047214722472347244725472647274728472947304731473247334734473547364737473847394740474147424743474447454746474747484749475047514752475347544755475647574758475947604761476247634764476547664767476847694770477147724773477447754776477
74778477947804781478247834784478547864787478847894790479147924793479447954796479747984799480048014802480348044805480648074808480948104811481248134814481548164817481848194820482148224823482448254826482748284829483048314832483348344835483648374838483948404841484248434844484548464847484848494850485148524853485448554856485748584859486048614862486348644865486648674868486948704871487248734874487548764877487848794880488148824883488448854886488748884889489048914892489348944895489648974898489949004901490249034904490549064907490849094910491149124913491449154916491749184919492049214922492349244925492649274928492949304931493249334934493549364937493849394940494149424943494449454946494749484949495049514952495349544955495649574958495949604961496249634964496549664967496849694970497149724973497449754976497749784979498049814982498349844985498649874988498949904991499249934994499549964997499849995000500150025003500450055006500750085009501050115012501350145015501650175018501950205021502250235024502550265027502850295030503150325033503450355036503750385039504050415042504350445045504650475048504950505051505250535054505550565057505850595060506150625063506450655066506750685069507050715072507350745075507650775078507950805081508250835084508550865087508850895090509150925093509450955096509750985099510051015102510351045105510651075108510951105111511251135114511551165117511851195120512151225123512451255126512751285129513051315132513351345135513651375138513951405141514251435144514551465147514851495150515151525153515451555156515751585159516051615162516351645165516651675168516951705171517251735174517551765177517851795180518151825183518451855186518751885189519051915192519351945195519651975198519952005201520252035204520552065207520852095210521152125213521452155216521752185219522052215222522352245225522652275228522952305231523252335234523552365237523852395240524152425243524452455246524752485249525052515252525352545255525652575258525952605261526252635264526552665267526852695270527152725273527452755276527
75278527952805281528252835284528552865287528852895290529152925293529452955296529752985299530053015302530353045305530653075308530953105311531253135314531553165317531853195320532153225323532453255326532753285329533053315332533353345335533653375338533953405341534253435344534553465347534853495350535153525353535453555356535753585359536053615362536353645365536653675368536953705371537253735374537553765377537853795380538153825383538453855386538753885389539053915392539353945395539653975398539954005401540254035404540554065407540854095410541154125413541454155416541754185419542054215422542354245425542654275428542954305431543254335434543554365437543854395440544154425443544454455446544754485449545054515452545354545455545654575458545954605461546254635464546554665467546854695470547154725473547454755476547754785479548054815482548354845485548654875488548954905491549254935494549554965497549854995500550155025503550455055506550755085509551055115512551355145515551655175518551955205521552255235524552555265527552855295530553155325533553455355536553755385539554055415542554355445545554655475548554955505551555255535554555555565557555855595560556155625563556455655566556755685569557055715572557355745575557655775578557955805581558255835584558555865587558855895590559155925593559455955596559755985599560056015602560356045605560656075608560956105611561256135614561556165617561856195620562156225623562456255626562756285629563056315632563356345635563656375638563956405641564256435644564556465647564856495650565156525653565456555656565756585659566056615662566356645665566656675668566956705671567256735674567556765677567856795680568156825683568456855686568756885689569056915692569356945695569656975698569957005701570257035704570557065707570857095710571157125713571457155716571757185719572057215722572357245725572657275728572957305731573257335734573557365737573857395740574157425743574457455746574757485749575057515752575357545755575657575758575957605761576257635764576557665767576857695770577157725773577457755776577
75778577957805781578257835784578557865787578857895790579157925793579457955796579757985799580058015802580358045805580658075808580958105811581258135814581558165817581858195820582158225823582458255826582758285829583058315832583358345835583658375838583958405841584258435844584558465847584858495850585158525853585458555856585758585859586058615862586358645865586658675868586958705871587258735874587558765877587858795880588158825883588458855886588758885889589058915892589358945895589658975898589959005901590259035904590559065907590859095910591159125913591459155916591759185919592059215922592359245925592659275928592959305931593259335934593559365937593859395940594159425943594459455946594759485949595059515952595359545955595659575958595959605961596259635964596559665967596859695970597159725973597459755976597759785979598059815982598359845985598659875988598959905991599259935994599559965997599859996000600160026003600460056006600760086009601060116012601360146015601660176018601960206021602260236024602560266027602860296030603160326033603460356036603760386039604060416042604360446045604660476048604960506051605260536054605560566057605860596060606160626063606460656066606760686069607060716072607360746075607660776078607960806081608260836084608560866087608860896090609160926093609460956096609760986099610061016102610361046105610661076108610961106111611261136114611561166117611861196120612161226123612461256126612761286129613061316132613361346135613661376138613961406141614261436144614561466147614861496150615161526153615461556156615761586159616061616162616361646165616661676168616961706171617261736174617561766177617861796180618161826183618461856186618761886189619061916192619361946195619661976198619962006201620262036204620562066207620862096210621162126213621462156216621762186219622062216222622362246225622662276228622962306231623262336234623562366237623862396240624162426243624462456246624762486249625062516252625362546255625662576258625962606261626262636264626562666267626862696270627162726273627462756276627
76278627962806281628262836284628562866287628862896290629162926293629462956296629762986299630063016302630363046305630663076308630963106311631263136314631563166317631863196320632163226323632463256326632763286329633063316332633363346335633663376338633963406341634263436344634563466347634863496350635163526353635463556356635763586359636063616362636363646365636663676368636963706371637263736374637563766377637863796380638163826383638463856386638763886389639063916392639363946395639663976398639964006401640264036404640564066407640864096410641164126413641464156416641764186419642064216422642364246425642664276428642964306431643264336434643564366437643864396440644164426443644464456446644764486449645064516452645364546455645664576458645964606461646264636464646564666467646864696470647164726473647464756476647764786479648064816482648364846485648664876488648964906491649264936494649564966497649864996500650165026503650465056506650765086509651065116512651365146515651665176518651965206521652265236524652565266527652865296530653165326533653465356536653765386539654065416542654365446545654665476548654965506551655265536554655565566557655865596560656165626563656465656566656765686569657065716572657365746575657665776578657965806581658265836584658565866587658865896590659165926593659465956596659765986599660066016602660366046605660666076608660966106611661266136614661566166617661866196620662166226623662466256626662766286629663066316632663366346635663666376638663966406641664266436644664566466647664866496650665166526653665466556656665766586659666066616662666366646665666666676668666966706671667266736674667566766677667866796680668166826683668466856686668766886689669066916692669366946695669666976698669967006701670267036704670567066707670867096710671167126713671467156716671767186719672067216722672367246725672667276728672967306731673267336734673567366737673867396740674167426743674467456746674767486749675067516752675367546755675667576758675967606761676267636764676567666767676867696770677167726773677467756776677
76778677967806781678267836784678567866787678867896790679167926793679467956796679767986799680068016802680368046805680668076808680968106811681268136814681568166817681868196820682168226823682468256826682768286829683068316832683368346835683668376838683968406841684268436844684568466847684868496850685168526853685468556856685768586859686068616862686368646865686668676868686968706871687268736874687568766877687868796880688168826883688468856886688768886889689068916892689368946895689668976898689969006901690269036904690569066907690869096910691169126913691469156916691769186919692069216922692369246925692669276928692969306931693269336934693569366937693869396940694169426943694469456946694769486949695069516952695369546955695669576958695969606961696269636964696569666967696869696970697169726973697469756976697769786979698069816982698369846985698669876988698969906991699269936994699569966997699869997000700170027003700470057006700770087009701070117012701370147015701670177018701970207021702270237024702570267027702870297030703170327033703470357036703770387039704070417042704370447045704670477048704970507051705270537054705570567057705870597060706170627063706470657066706770687069707070717072707370747075707670777078707970807081708270837084708570867087708870897090709170927093709470957096709770987099710071017102710371047105710671077108710971107111711271137114711571167117711871197120712171227123712471257126712771287129713071317132713371347135713671377138713971407141714271437144714571467147714871497150715171527153715471557156715771587159716071617162716371647165716671677168716971707171717271737174717571767177717871797180718171827183718471857186718771887189719071917192719371947195719671977198719972007201720272037204720572067207720872097210721172127213721472157216721772187219722072217222722372247225722672277228722972307231723272337234723572367237723872397240724172427243724472457246724772487249725072517252725372547255725672577258725972607261726272637264726572667267726872697270727172727273727472757276727
77278727972807281728272837284728572867287728872897290729172927293729472957296729772987299730073017302730373047305730673077308730973107311731273137314731573167317731873197320732173227323732473257326732773287329733073317332733373347335733673377338733973407341734273437344734573467347734873497350735173527353735473557356735773587359736073617362736373647365736673677368736973707371737273737374737573767377737873797380738173827383738473857386738773887389739073917392739373947395739673977398739974007401740274037404740574067407740874097410741174127413741474157416741774187419742074217422742374247425742674277428742974307431743274337434743574367437743874397440744174427443744474457446744774487449745074517452745374547455745674577458745974607461746274637464746574667467746874697470747174727473747474757476747774787479748074817482748374847485748674877488748974907491749274937494749574967497749874997500750175027503750475057506750775087509751075117512751375147515751675177518751975207521752275237524752575267527752875297530753175327533753475357536753775387539754075417542754375447545754675477548754975507551755275537554755575567557755875597560756175627563756475657566756775687569757075717572757375747575757675777578757975807581758275837584758575867587758875897590759175927593759475957596759775987599760076017602760376047605760676077608760976107611761276137614761576167617761876197620762176227623762476257626762776287629763076317632763376347635763676377638763976407641764276437644764576467647764876497650765176527653765476557656765776587659766076617662766376647665766676677668766976707671767276737674767576767677767876797680768176827683768476857686768776887689769076917692769376947695769676977698769977007701770277037704770577067707770877097710771177127713771477157716771777187719772077217722772377247725772677277728772977307731773277337734773577367737773877397740774177427743774477457746774777487749775077517752775377547755775677577758775977607761776277637764776577667767776877697770777177727773777477757776777
777787779778077817782778377847785778677877788778977907791779277937794779577967797779877997800780178027803780478057806780778087809781078117812781378147815781678177818781978207821782278237824782578267827782878297830783178327833783478357836783778387839784078417842784378447845784678477848784978507851785278537854785578567857785878597860786178627863786478657866786778687869787078717872787378747875787678777878787978807881788278837884788578867887788878897890789178927893789478957896789778987899790079017902790379047905790679077908790979107911791279137914791579167917791879197920792179227923792479257926792779287929793079317932793379347935793679377938793979407941794279437944794579467947794879497950795179527953795479557956795779587959796079617962796379647965796679677968796979707971797279737974797579767977797879797980798179827983798479857986798779887989799079917992799379947995799679977998799980008001800280038004800580068007800880098010801180128013801480158016801780188019802080218022802380248025802680278028802980308031803280338034803580368037803880398040804180428043804480458046804780488049805080518052805380548055805680578058805980608061806280638064806580668067806880698070807180728073807480758076807780788079808080818082808380848085808680878088808980908091809280938094809580968097809880998100810181028103810481058106810781088109811081118112811381148115811681178118811981208121812281238124812581268127812881298130813181328133813481358136813781388139814081418142814381448145814681478148814981508151815281538154815581568157815881598160816181628163816481658166816781688169817081718172817381748175817681778178
  1. #LyX 2.3 created this file. For more info see http://www.lyx.org/
  2. \lyxformat 544
  3. \begin_document
  4. \begin_header
  5. \save_transient_properties true
  6. \origin unavailable
  7. \textclass extbook
  8. \begin_preamble
  9. % Print the list of all files loaded during the run in the log output
  10. \listfiles
  11. % Add a DRAFT watermark (very light: lightness 0.97, at scale 1)
  12. \usepackage{draftwatermark}
  13. \SetWatermarkLightness{0.97}
  14. \SetWatermarkScale{1}
  15. % Set up required header format: no header rule, left/right header and footer fields cleared
  16. \usepackage{fancyhdr}
  17. \pagestyle{fancy}
  18. \renewcommand{\headrulewidth}{0pt}
  19. \rhead{}
  20. \lhead{}
  21. \rfoot{}
  22. \lfoot{}
  23. \cfoot{\thepage} % Page number bottom center
  24. % Bold the first sentence of every float caption; approach taken from https://tex.stackexchange.com/questions/65680/automatically-bold-first-sentence-of-a-floats-caption
  25. \usepackage{xstring}
  26. \usepackage{etoolbox}
  27. \usepackage{caption}
  28. \captionsetup{labelfont=bf,tableposition=top}
  29. \makeatletter
  30. \newcommand\formatlabel[1]{% #1 = caption text; split at its first period
  31. \noexpandarg
  32. \IfSubStr{#1}{.}{% caption contains a period: bold everything up to the first one
  33. \StrBefore{#1}{.}[\firstcaption]% \firstcaption = text before the first period
  34. \StrBehind{#1}{.}[\secondcaption]% \secondcaption = text after the first period
  35. \textbf{\firstcaption.} \secondcaption}{% otherwise (no period): emit the caption unchanged
  36. #1}%
  37. }
  38. \patchcmd{\@caption}{#3}{\formatlabel{#3}}
  39. \makeatother
  40. % Provide the \FloatBarrier command
  41. \usepackage{placeins}
  42. \end_preamble
  43. \use_default_options true
  44. \begin_modules
  45. todonotes
  46. \end_modules
  47. \maintain_unincluded_children false
  48. \language english
  49. \language_package default
  50. \inputencoding utf8
  51. \fontencoding default
  52. \font_roman "default" "default"
  53. \font_sans "default" "default"
  54. \font_typewriter "default" "default"
  55. \font_math "auto" "auto"
  56. \font_default_family default
  57. \use_non_tex_fonts false
  58. \font_sc false
  59. \font_osf false
  60. \font_sf_scale 100 100
  61. \font_tt_scale 100 100
  62. \use_microtype false
  63. \use_dash_ligatures true
  64. \graphics default
  65. \default_output_format pdf4
  66. \output_sync 0
  67. \bibtex_command default
  68. \index_command default
  69. \paperfontsize 12
  70. \spacing double
  71. \use_hyperref true
  72. \pdf_bookmarks true
  73. \pdf_bookmarksnumbered false
  74. \pdf_bookmarksopen false
  75. \pdf_bookmarksopenlevel 1
  76. \pdf_breaklinks false
  77. \pdf_pdfborder false
  78. \pdf_colorlinks false
  79. \pdf_backref false
  80. \pdf_pdfusetitle true
  81. \papersize letterpaper
  82. \use_geometry true
  83. \use_package amsmath 1
  84. \use_package amssymb 1
  85. \use_package cancel 1
  86. \use_package esint 1
  87. \use_package mathdots 1
  88. \use_package mathtools 1
  89. \use_package mhchem 1
  90. \use_package stackrel 1
  91. \use_package stmaryrd 1
  92. \use_package undertilde 1
  93. \cite_engine basic
  94. \cite_engine_type default
  95. \biblio_style plain
  96. \use_bibtopic false
  97. \use_indices false
  98. \paperorientation portrait
  99. \suppress_date false
  100. \justification true
  101. \use_refstyle 1
  102. \use_minted 0
  103. \index Index
  104. \shortcut idx
  105. \color #008000
  106. \end_index
  107. \leftmargin 1.5in
  108. \topmargin 1in
  109. \rightmargin 1in
  110. \bottommargin 1in
  111. \secnumdepth 3
  112. \tocdepth 3
  113. \paragraph_separation indent
  114. \paragraph_indentation default
  115. \is_math_indent 0
  116. \math_numbering_side default
  117. \quotes_style english
  118. \dynamic_quotes 0
  119. \papercolumns 1
  120. \papersides 2
  121. \paperpagestyle default
  122. \tracking_changes false
  123. \output_changes false
  124. \html_math_output 0
  125. \html_css_as_file 0
  126. \html_be_strict false
  127. \end_header
  128. \begin_body
  129. \begin_layout Title
  130. Bioinformatic analysis of complex, high-throughput genomic and epigenomic
  131. data in the context of immunology and transplant rejection
  132. \end_layout
  133. \begin_layout Author
  134. A thesis presented
  135. \begin_inset Newline newline
  136. \end_inset
  137. by
  138. \begin_inset Newline newline
  139. \end_inset
  140. Ryan C.
  141. Thompson
  142. \begin_inset Newline newline
  143. \end_inset
  144. to
  145. \begin_inset Newline newline
  146. \end_inset
  147. The Scripps Research Institute Graduate Program
  148. \begin_inset Newline newline
  149. \end_inset
  150. in partial fulfillment of the requirements for the degree of
  151. \begin_inset Newline newline
  152. \end_inset
  153. Doctor of Philosophy in the subject of Biology
  154. \begin_inset Newline newline
  155. \end_inset
  156. for
  157. \begin_inset Newline newline
  158. \end_inset
  159. The Scripps Research Institute
  160. \begin_inset Newline newline
  161. \end_inset
  162. La Jolla, California
  163. \end_layout
  164. \begin_layout Date
  165. May 2019
  166. \end_layout
  167. \begin_layout Standard
  168. [Copyright notice]
  169. \end_layout
  170. \begin_layout Standard
  171. [Thesis acceptance form]
  172. \end_layout
  173. \begin_layout Standard
  174. [Dedication]
  175. \end_layout
  176. \begin_layout Standard
  177. [Acknowledgements]
  178. \end_layout
  179. \begin_layout Standard
  180. \begin_inset CommandInset toc
  181. LatexCommand tableofcontents
  182. \end_inset
  183. \end_layout
  184. \begin_layout Standard
  185. \begin_inset FloatList table
  186. \end_inset
  187. \end_layout
  188. \begin_layout Standard
  189. \begin_inset FloatList figure
  190. \end_inset
  191. \end_layout
  192. \begin_layout Standard
  193. [List of Abbreviations]
  194. \end_layout
  195. \begin_layout Standard
  196. \begin_inset Flex TODO Note (inline)
  197. status open
  198. \begin_layout Plain Layout
  199. Look into auto-generated nomenclature list: https://wiki.lyx.org/Tips/Nomenclature
  200. \end_layout
  201. \end_inset
  202. \end_layout
  203. \begin_layout List of TODOs
  204. \end_layout
  205. \begin_layout Standard
  206. [Abstract]
  207. \end_layout
  208. \begin_layout Chapter*
  209. Abstract
  210. \end_layout
  211. \begin_layout Chapter
  212. Introduction
  213. \end_layout
  214. \begin_layout Section
  215. Background & Significance
  216. \end_layout
  217. \begin_layout Subsection
  218. Biological motivation
  219. \end_layout
  220. \begin_layout Itemize
  221. Rejection is the major long-term threat to organ and tissue grafts
  222. \end_layout
  223. \begin_deeper
  224. \begin_layout Itemize
  225. Common mechanisms of rejection
  226. \end_layout
  227. \begin_layout Itemize
  228. Effective immune suppression requires monitoring for rejection and tuning
  229. \end_layout
  230. \begin_layout Itemize
  231. Current tests for rejection (tissue biopsy) are invasive and biased
  232. \end_layout
  233. \begin_layout Itemize
  234. A blood test based on microarrays would be less biased and invasive
  235. \end_layout
  236. \end_deeper
  237. \begin_layout Itemize
  238. Memory cells are resistant to immune suppression
  239. \end_layout
  240. \begin_deeper
  241. \begin_layout Itemize
  242. Mechanisms of resistance in memory cells are poorly understood
  243. \end_layout
  244. \begin_layout Itemize
  245. A better understanding of immune memory formation is needed
  246. \end_layout
  247. \end_deeper
  248. \begin_layout Itemize
  249. Mesenchymal stem cell infusion is a promising new treatment to prevent/delay
  250. rejection
  251. \end_layout
  252. \begin_deeper
  253. \begin_layout Itemize
  254. Demonstrated in mice, but not yet in primates
  255. \end_layout
  256. \begin_layout Itemize
  257. Mechanism currently unknown, but MSC are known to be immune modulatory
  258. \end_layout
  259. \end_deeper
  260. \begin_layout Subsection
  261. Overview of bioinformatic analysis methods
  262. \end_layout
  263. \begin_layout Standard
  264. An overview of all the methods used, including what problem they solve,
  265. what assumptions they make, and a basic description of how they work.
  266. \end_layout
  267. \begin_layout Itemize
  268. ChIP-seq Peak calling
  269. \end_layout
  270. \begin_deeper
  271. \begin_layout Itemize
  272. Cross-correlation analysis to determine fragment size
  273. \end_layout
  274. \begin_layout Itemize
  275. Broad vs narrow peaks
  276. \end_layout
  277. \begin_layout Itemize
  278. SICER for broad peaks
  279. \end_layout
  280. \begin_layout Itemize
  281. IDR for biologically reproducible peaks
  282. \end_layout
  283. \begin_layout Itemize
  284. csaw peak filtering guidelines for unbiased downstream analysis
  285. \end_layout
  286. \end_deeper
  287. \begin_layout Itemize
  288. Normalization is non-trivial and application-dependent
  289. \end_layout
  290. \begin_deeper
  291. \begin_layout Itemize
  292. Expression arrays: RMA & fRMA; why fRMA is needed
  293. \end_layout
  294. \begin_layout Itemize
  295. Methylation arrays: M-value transformation approximates normal data but
  296. induces heteroskedasticity
  297. \end_layout
  298. \begin_layout Itemize
  299. RNA-seq: normalize based on assumption that the average gene is not changing
  300. \end_layout
  301. \begin_layout Itemize
  302. ChIP-seq: complex with many considerations, dependent on experimental methods,
  303. biological system, and analysis goals
  304. \end_layout
  305. \end_deeper
  306. \begin_layout Itemize
  307. Limma: The standard linear modeling framework for genomics
  308. \end_layout
  309. \begin_deeper
  310. \begin_layout Itemize
  311. empirical Bayes variance modeling: limma's core feature
  312. \end_layout
  313. \begin_layout Itemize
  314. edgeR & DESeq2: Extend with negative binomial GLM for RNA-seq and other
  315. count data
  316. \end_layout
  317. \begin_layout Itemize
  318. voom: Extend with precision weights to model mean-variance trend
  319. \end_layout
  320. \begin_layout Itemize
  321. arrayWeights and duplicateCorrelation to handle complex variance structures
  322. \end_layout
  323. \end_deeper
  324. \begin_layout Itemize
  325. sva and ComBat for batch correction
  326. \end_layout
  327. \begin_layout Itemize
  328. Factor analysis: PCA, MDS, MOFA
  329. \end_layout
  330. \begin_deeper
  331. \begin_layout Itemize
  332. Batch-corrected PCA is informative, but careful application is required
  333. to avoid bias
  334. \end_layout
  335. \end_deeper
  336. \begin_layout Itemize
  337. Gene set analysis: camera and SPIA
  338. \end_layout
  339. \begin_layout Section
  340. Innovation
  341. \end_layout
  342. \begin_layout Itemize
  343. MSC infusion to improve transplant outcomes (prevent/delay rejection)
  344. \end_layout
  345. \begin_deeper
  346. \begin_layout Itemize
  347. Characterize MSC response to interferon gamma
  348. \end_layout
  349. \begin_layout Itemize
  350. IFN-g is thought to stimulate their function
  351. \end_layout
  352. \begin_layout Itemize
  353. Test IFN-g treated MSC infusion as a therapy to delay graft rejection in
  354. cynomolgus monkeys
  355. \end_layout
  356. \begin_layout Itemize
  357. Monitor animals post-transplant using blood RNA-seq at serial time points
  358. \end_layout
  359. \end_deeper
  360. \begin_layout Itemize
  361. Investigate dynamics of histone marks in CD4 T-cell activation and memory
  362. \end_layout
  363. \begin_deeper
  364. \begin_layout Itemize
  365. Previous studies have looked at single snapshots of histone marks
  366. \end_layout
  367. \begin_layout Itemize
  368. Instead, look at changes in histone marks across activation and memory
  369. \end_layout
  370. \end_deeper
  371. \begin_layout Itemize
  372. High-throughput sequencing and microarray technologies
  373. \end_layout
  374. \begin_deeper
  375. \begin_layout Itemize
  376. Powerful methods for assaying gene expression and epigenetics across entire
  377. genomes
  378. \end_layout
  379. \begin_layout Itemize
  380. Proper analysis requires finding and exploiting systematic genome-wide trends
  381. \end_layout
  382. \end_deeper
  383. \begin_layout Chapter
  384. Reproducible genome-wide epigenetic analysis of H3K4 and H3K27 methylation
  385. in naive and memory CD4 T-cell activation
  386. \end_layout
  387. \begin_layout Standard
  388. \begin_inset Flex TODO Note (inline)
  389. status open
  390. \begin_layout Plain Layout
  391. Author list: Me, Sarah, Dan
  392. \end_layout
  393. \end_inset
  394. \end_layout
  395. \begin_layout Section
  396. Approach
  397. \end_layout
  398. \begin_layout Itemize
  399. CD4 T-cells are central to all adaptive immune responses and memory
  400. \end_layout
  401. \begin_layout Itemize
  402. H3K4 and H3K27 methylation are major epigenetic regulators of gene expression
  403. \end_layout
  404. \begin_layout Itemize
  405. Canonically, H3K4 is activating and H3K27 is inhibitory, but the reality
  406. is complex
  407. \end_layout
  408. \begin_layout Itemize
  409. Looking at these marks during CD4 activation and memory should reveal new
  410. mechanistic details
  411. \end_layout
  412. \begin_layout Itemize
  413. Test
  414. \begin_inset Quotes eld
  415. \end_inset
  416. poised promoter
  417. \begin_inset Quotes erd
  418. \end_inset
  419. hypothesis in which H3K4 and H3K27 are both methylated
  420. \end_layout
  421. \begin_layout Itemize
  422. Expand scope of analysis beyond simple promoter counts
  423. \end_layout
  424. \begin_deeper
  425. \begin_layout Itemize
  426. Analyze peaks genome-wide, including in intergenic regions
  427. \end_layout
  428. \begin_layout Itemize
  429. Analysis of coverage distribution shape within promoters, e.g.
  430. upstream vs downstream coverage
  431. \end_layout
  432. \end_deeper
  433. \begin_layout Section
  434. Methods
  435. \end_layout
  436. \begin_layout Standard
  437. \begin_inset Float figure
  438. wide false
  439. sideways true
  440. status open
  441. \begin_layout Plain Layout
  442. \align center
  443. \begin_inset Graphics
  444. filename graphics/CD4-csaw/rulegraphs/rulegraph-all.pdf
  445. width 100theight%
  446. \end_inset
  447. \end_layout
  448. \begin_layout Plain Layout
  449. \begin_inset Caption Standard
  450. \begin_layout Plain Layout
  451. \begin_inset CommandInset label
  452. LatexCommand label
  453. name "fig:rulegraph"
  454. \end_inset
  455. \series bold
  456. Dependency graph of steps in reproducible workflow
  457. \end_layout
  458. \end_inset
  459. \end_layout
  460. \end_inset
  461. \end_layout
  462. \begin_layout Standard
  463. A reproducible workflow
  464. \begin_inset CommandInset citation
  465. LatexCommand cite
  466. key "gh-cd4-csaw"
  467. literal "false"
  468. \end_inset
  469. was written to analyze the raw ChIP-seq and RNA-seq data from previous
  470. studies
  471. \begin_inset CommandInset citation
  472. LatexCommand cite
  473. key "LaMere2016,LaMere2017"
  474. literal "true"
  475. \end_inset
  476. .
  477. Briefly, this data consists of RNA-seq and ChIP-seq from CD4 T-cells cultured
  478. from 4 donors.
  479. From each donor, naive and memory CD4 T-cells were isolated separately.
  480. Then cultures of both cells were activated [how?], and samples were taken
  481. at 4 time points: Day 0 (pre-activation), Day 1 (early activation), Day
  482. 5 (peak activation), and Day 14 (post-activation).
  483. For each combination of cell type and time point, RNA was isolated, and
  484. ChIP-seq was performed for each of 3 histone marks: H3K4me2, H3K4me3, and
  485. H3K27me3.
  486. The ChIP-seq input was also sequenced for each sample.
  487. The result was 32 samples for each assay.
  488. \end_layout
  489. \begin_layout Standard
  490. Sequence reads were retrieved from the Sequence Read Archive (SRA)
  491. \begin_inset CommandInset citation
  492. LatexCommand cite
  493. key "Leinonen2011"
  494. literal "false"
  495. \end_inset
  496. .
  497. ChIP-seq (and input) reads were aligned to the GRCh38 genome assembly using
  498. Bowtie 2
  499. \begin_inset CommandInset citation
  500. LatexCommand cite
  501. key "Langmead2012,Schneider2017,gh-hg38-ref"
  502. literal "false"
  503. \end_inset
  504. .
  505. Artifact regions were annotated using a custom implementation of the GreyListCh
  506. IP algorithm, and these
  507. \begin_inset Quotes eld
  508. \end_inset
  509. greylists
  510. \begin_inset Quotes erd
  511. \end_inset
  512. were merged with the ENCODE blacklist
  513. \begin_inset CommandInset citation
  514. LatexCommand cite
  515. key "greylistchip,Amemiya2019,Dunham2012"
  516. literal "false"
  517. \end_inset
  518. .
  519. Any read or peak overlapping one of these regions was regarded as artifactual
  520. and excluded from downstream analyses.
  521. \end_layout
  522. \begin_layout Standard
  523. Peaks were called using epic, an implementation of the SICER algorithm
  524. \begin_inset CommandInset citation
  525. LatexCommand cite
  526. key "Zang2009,gh-epic"
  527. literal "false"
  528. \end_inset
  529. .
  530. Peaks were also called separately using MACS, but MACS was determined to
  531. be a poor fit for the data, and these peak calls were not used further
  532. \begin_inset CommandInset citation
  533. LatexCommand cite
  534. key "Zhang2008"
  535. literal "false"
  536. \end_inset
  537. .
  538. \end_layout
  539. \begin_layout Itemize
  540. Re-analyze previously published CD4 ChIP-seq & RNA-seq data
  541. \end_layout
  542. \begin_deeper
  543. \begin_layout Itemize
  544. Completely reimplement analysis from scratch as a reproducible workflow
  545. \end_layout
  546. \begin_layout Itemize
  547. Use newly published methods & algorithms not available during the original
  548. analysis: SICER, csaw, MOFA, ComBat, sva, GREAT, and more
  549. \end_layout
  550. \end_deeper
  551. \begin_layout Itemize
  552. SICER, IDR, csaw, & GREAT to call ChIP-seq peaks genome-wide, perform differenti
  553. al abundance analysis, and relate those peaks to gene expression
  554. \end_layout
  555. \begin_layout Itemize
  556. Promoter counts in sliding windows around each gene's highest-expressed
  557. TSS to investigate coverage distribution within promoters
  558. \end_layout
  559. \begin_layout Section
  560. Results
  561. \end_layout
  562. \begin_layout Standard
  563. \begin_inset Note Note
  564. status open
  565. \begin_layout Plain Layout
  566. Focus on what hypotheses were tested, then select figures that show how
  567. those hypotheses were tested, even if the result is a negative.
  568. \end_layout
  569. \end_inset
  570. \end_layout
  571. \begin_layout Subsection
  572. H3K4 and H3K27 methylation occur in broad regions and are enriched near
  573. promoters
  574. \end_layout
  575. \begin_layout Standard
  576. \begin_inset Float figure
  577. wide false
  578. sideways false
  579. status open
  580. \begin_layout Plain Layout
  581. \begin_inset Flex TODO Note (inline)
  582. status open
  583. \begin_layout Plain Layout
  584. Re-generate IDR rank consistency plots for SICER and MACS side-by-side
  585. \end_layout
  586. \end_inset
  587. \end_layout
  588. \begin_layout Plain Layout
  589. \begin_inset Caption Standard
  590. \begin_layout Plain Layout
  591. \series bold
  592. \begin_inset CommandInset label
  593. LatexCommand label
  594. name "fig:IDR-RC-H3K4me2"
  595. \end_inset
  596. Irreproducible Discovery Rate consistency plots for H3K4me2
  597. \end_layout
  598. \end_inset
  599. \end_layout
  600. \end_inset
  601. \end_layout
  602. \begin_layout Standard
  603. \begin_inset Float figure
  604. wide false
  605. sideways false
  606. status open
  607. \begin_layout Plain Layout
  608. \begin_inset Flex TODO Note (inline)
  609. status open
  610. \begin_layout Plain Layout
  611. Re-generate IDR rank consistency plots for SICER and MACS side-by-side
  612. \end_layout
  613. \end_inset
  614. \end_layout
  615. \begin_layout Plain Layout
  616. \begin_inset Caption Standard
  617. \begin_layout Plain Layout
  618. \series bold
  619. \begin_inset CommandInset label
  620. LatexCommand label
  621. name "fig:IDR-RC-H3K4me3"
  622. \end_inset
  623. Irreproducible Discovery Rate consistency plots for H3K4me3
  624. \end_layout
  625. \end_inset
  626. \end_layout
  627. \end_inset
  628. \end_layout
  629. \begin_layout Standard
  630. \begin_inset Float figure
  631. wide false
  632. sideways false
  633. status open
  634. \begin_layout Plain Layout
  635. \begin_inset Flex TODO Note (inline)
  636. status open
  637. \begin_layout Plain Layout
  638. Re-generate IDR rank consistency plots for SICER and MACS side-by-side
  639. \end_layout
  640. \end_inset
  641. \end_layout
  642. \begin_layout Plain Layout
  643. \begin_inset Caption Standard
  644. \begin_layout Plain Layout
  645. \series bold
  646. \begin_inset CommandInset label
  647. LatexCommand label
  648. name "fig:IDR-RC-H3K27me3"
  649. \end_inset
  650. Irreproducible Discovery Rate consistency plots for H3K27me3
  651. \end_layout
  652. \end_inset
  653. \end_layout
  654. \end_inset
  655. \end_layout
  656. \begin_layout Standard
  657. \begin_inset Float table
  658. wide false
  659. sideways false
  660. status open
  661. \begin_layout Plain Layout
  662. \align center
  663. \begin_inset Flex TODO Note (inline)
  664. status open
  665. \begin_layout Plain Layout
  666. Need
  667. \emph on
  668. median
  669. \emph default
  670. peak width, not mean
  671. \end_layout
  672. \end_inset
  673. \end_layout
  674. \begin_layout Plain Layout
  675. \align center
  676. \begin_inset Tabular
  677. <lyxtabular version="3" rows="4" columns="5">
  678. <features tabularvalignment="middle">
  679. <column alignment="center" valignment="top">
  680. <column alignment="center" valignment="top">
  681. <column alignment="center" valignment="top">
  682. <column alignment="center" valignment="top">
  683. <column alignment="center" valignment="top">
  684. <row>
  685. <cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" usebox="none">
  686. \begin_inset Text
  687. \begin_layout Plain Layout
  688. Histone Mark
  689. \end_layout
  690. \end_inset
  691. </cell>
  692. <cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" usebox="none">
  693. \begin_inset Text
  694. \begin_layout Plain Layout
  695. # Peaks
  696. \end_layout
  697. \end_inset
  698. </cell>
  699. <cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" usebox="none">
  700. \begin_inset Text
  701. \begin_layout Plain Layout
  702. Mean peak width
  703. \end_layout
  704. \end_inset
  705. </cell>
  706. <cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" usebox="none">
  707. \begin_inset Text
  708. \begin_layout Plain Layout
  709. genome coverage
  710. \end_layout
  711. \end_inset
  712. </cell>
  713. <cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" rightline="true" usebox="none">
  714. \begin_inset Text
  715. \begin_layout Plain Layout
  716. read coverage
  717. \end_layout
  718. \end_inset
  719. </cell>
  720. </row>
  721. <row>
  722. <cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
  723. \begin_inset Text
  724. \begin_layout Plain Layout
  725. H3K4me2
  726. \end_layout
  727. \end_inset
  728. </cell>
  729. <cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
  730. \begin_inset Text
  731. \begin_layout Plain Layout
  732. 14965
  733. \end_layout
  734. \end_inset
  735. </cell>
  736. <cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
  737. \begin_inset Text
  738. \begin_layout Plain Layout
  739. 3970
  740. \end_layout
  741. \end_inset
  742. </cell>
  743. <cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
  744. \begin_inset Text
  745. \begin_layout Plain Layout
  746. 1.92%
  747. \end_layout
  748. \end_inset
  749. </cell>
  750. <cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
  751. \begin_inset Text
  752. \begin_layout Plain Layout
  753. 14.2%
  754. \end_layout
  755. \end_inset
  756. </cell>
  757. </row>
  758. <row>
  759. <cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
  760. \begin_inset Text
  761. \begin_layout Plain Layout
  762. H3K4me3
  763. \end_layout
  764. \end_inset
  765. </cell>
  766. <cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
  767. \begin_inset Text
  768. \begin_layout Plain Layout
  769. 6163
  770. \end_layout
  771. \end_inset
  772. </cell>
  773. <cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
  774. \begin_inset Text
  775. \begin_layout Plain Layout
  776. 2946
  777. \end_layout
  778. \end_inset
  779. </cell>
  780. <cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
  781. \begin_inset Text
  782. \begin_layout Plain Layout
  783. 0.588%
  784. \end_layout
  785. \end_inset
  786. </cell>
  787. <cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
  788. \begin_inset Text
  789. \begin_layout Plain Layout
  790. 6.57%
  791. \end_layout
  792. \end_inset
  793. </cell>
  794. </row>
  795. <row>
  796. <cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" usebox="none">
  797. \begin_inset Text
  798. \begin_layout Plain Layout
  799. H3K27me3
  800. \end_layout
  801. \end_inset
  802. </cell>
  803. <cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" usebox="none">
  804. \begin_inset Text
  805. \begin_layout Plain Layout
  806. 18139
  807. \end_layout
  808. \end_inset
  809. </cell>
  810. <cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" usebox="none">
  811. \begin_inset Text
  812. \begin_layout Plain Layout
  813. 18967
  814. \end_layout
  815. \end_inset
  816. </cell>
  817. <cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" usebox="none">
  818. \begin_inset Text
  819. \begin_layout Plain Layout
  820. 11.1%
  821. \end_layout
  822. \end_inset
  823. </cell>
  824. <cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" rightline="true" usebox="none">
  825. \begin_inset Text
  826. \begin_layout Plain Layout
  827. 22.5%
  828. \end_layout
  829. \end_inset
  830. </cell>
  831. </row>
  832. </lyxtabular>
  833. \end_inset
  834. \end_layout
  835. \begin_layout Plain Layout
  836. \begin_inset Caption Standard
  837. \begin_layout Plain Layout
  838. \series bold
  839. \begin_inset CommandInset label
  840. LatexCommand label
  841. name "tab:peak-calling-summary"
  842. \end_inset
  843. SICER+IDR peak-calling summary
  844. \end_layout
  845. \end_inset
  846. \end_layout
  847. \end_inset
  848. \end_layout
  849. \begin_layout Standard
  850. Figures
  851. \begin_inset CommandInset ref
  852. LatexCommand ref
  853. reference "fig:IDR-RC-H3K4me2"
  854. plural "false"
  855. caps "false"
  856. noprefix "false"
  857. \end_inset
  858. ,
  859. \begin_inset CommandInset ref
  860. LatexCommand ref
  861. reference "fig:IDR-RC-H3K4me3"
  862. plural "false"
  863. caps "false"
  864. noprefix "false"
  865. \end_inset
  866. , and
  867. \begin_inset CommandInset ref
  868. LatexCommand ref
  869. reference "fig:IDR-RC-H3K27me3"
  870. plural "false"
  871. caps "false"
  872. noprefix "false"
  873. \end_inset
  874. show the IDR rank-consistency plots for peaks called in an arbitrarily-chosen
  875. pair of donors.
  876. For all 3 histone marks, when the peaks for each donor are ranked according
  877. to their scores, SICER produces much more reproducible results between
  878. donors than MACS.
  879. This is consistent with SICER's stated goal of identifying broad peaks,
  880. in contrast to MACS, which is designed for identifying sharp peaks.
  881. Based on this observation, the SICER peak calls were used for all downstream
  882. analyses that involved ChIP-seq peaks.
  883. Table
  884. \begin_inset CommandInset ref
  885. LatexCommand ref
  886. reference "tab:peak-calling-summary"
  887. plural "false"
  888. caps "false"
  889. noprefix "false"
  890. \end_inset
  891. gives a summary of the peak calling statistics for each histone mark.
  892. \end_layout
  893. \begin_layout Standard
  894. \begin_inset Float figure
  895. wide false
  896. sideways false
  897. status open
  898. \begin_layout Plain Layout
  899. \align center
  900. \begin_inset Graphics
  901. filename graphics/CD4-csaw/Promoter Peak Distance Profile-PAGE1-CROP.pdf
  902. width 100col%
  903. groupId colwidth
  904. \end_inset
  905. \end_layout
  906. \begin_layout Plain Layout
  907. \begin_inset Caption Standard
  908. \begin_layout Plain Layout
  909. \series bold
  910. \begin_inset CommandInset label
  911. LatexCommand label
  912. name "fig:effective-promoter-radius"
  913. \end_inset
  914. Enrichment of peaks in promoter neighborhoods.
  915. \end_layout
  916. \end_inset
  917. \end_layout
  918. \begin_layout Plain Layout
  919. \end_layout
  920. \end_inset
  921. \end_layout
  922. \begin_layout Itemize
  923. Each histone mark is enriched within a certain radius of gene TSS positions,
  924. but that radius is different for each mark (figure
  925. \begin_inset CommandInset ref
  926. LatexCommand ref
  927. reference "fig:effective-promoter-radius"
  928. plural "false"
  929. caps "false"
  930. noprefix "false"
  931. \end_inset
  932. , previously in
  933. \begin_inset CommandInset citation
  934. LatexCommand cite
  935. key "LaMere2016"
  936. literal "false"
  937. \end_inset
  938. Fig.
  939. S2)
  940. \end_layout
  941. \begin_layout Subsection
  942. RNA-seq align+quant method selection
  943. \end_layout
  944. \begin_layout Standard
  945. \begin_inset Flex TODO Note (inline)
  946. status open
  947. \begin_layout Plain Layout
  948. Maybe fix up the axis ranges for these plots?
  949. \end_layout
  950. \end_inset
  951. \end_layout
  952. \begin_layout Standard
  953. \begin_inset Float figure
  954. wide false
  955. sideways false
  956. status collapsed
  957. \begin_layout Plain Layout
  958. \align center
  959. \begin_inset Graphics
  960. filename graphics/CD4-csaw/rnaseq-compare/ensmebl-vs-entrez-star-CROP.png
  961. lyxscale 25
  962. width 100col%
  963. groupId colwidth-raster
  964. \end_inset
  965. \end_layout
  966. \begin_layout Plain Layout
  967. \begin_inset Caption Standard
  968. \begin_layout Plain Layout
  969. Comparison of STAR quantification between Ensembl and Entrez gene identifiers
  970. \end_layout
  971. \end_inset
  972. \end_layout
  973. \begin_layout Plain Layout
  974. \end_layout
  975. \end_inset
  976. \end_layout
  977. \begin_layout Standard
  978. \begin_inset Float figure
  979. wide false
  980. sideways false
  981. status collapsed
  982. \begin_layout Plain Layout
  983. \align center
  984. \begin_inset Graphics
  985. filename graphics/CD4-csaw/rnaseq-compare/ensmebl-vs-entrez-shoal-CROP.png
  986. lyxscale 25
  987. width 100col%
  988. groupId colwidth-raster
  989. \end_inset
  990. \end_layout
  991. \begin_layout Plain Layout
  992. \begin_inset Caption Standard
  993. \begin_layout Plain Layout
  994. Comparison of Salmon+Shoal quantification between Ensembl and Entrez gene
  995. identifiers
  996. \end_layout
  997. \end_inset
  998. \end_layout
  999. \end_inset
  1000. \end_layout
  1001. \begin_layout Standard
  1002. \begin_inset Float figure
  1003. wide false
  1004. sideways false
  1005. status collapsed
  1006. \begin_layout Plain Layout
  1007. \align center
  1008. \begin_inset Graphics
  1009. filename graphics/CD4-csaw/rnaseq-compare/star-vs-hisat2-CROP.png
  1010. lyxscale 25
  1011. width 100col%
  1012. groupId colwidth-raster
  1013. \end_inset
  1014. \end_layout
  1015. \begin_layout Plain Layout
  1016. \begin_inset Caption Standard
  1017. \begin_layout Plain Layout
  1018. Comparison of quantification between STAR and HISAT2 for identical annotation
  1019. \end_layout
  1020. \end_inset
  1021. \end_layout
  1022. \end_inset
  1023. \end_layout
  1024. \begin_layout Standard
  1025. \begin_inset Float figure
  1026. wide false
  1027. sideways false
  1028. status collapsed
  1029. \begin_layout Plain Layout
  1030. \align center
  1031. \begin_inset Graphics
  1032. filename graphics/CD4-csaw/rnaseq-compare/star-vs-salmon-CROP.png
  1033. lyxscale 25
  1034. width 100col%
  1035. groupId colwidth-raster
  1036. \end_inset
  1037. \end_layout
  1038. \begin_layout Plain Layout
  1039. \begin_inset Caption Standard
  1040. \begin_layout Plain Layout
  1041. Comparison of quantification between STAR and Salmon for identical annotation
  1042. \end_layout
  1043. \end_inset
  1044. \end_layout
  1045. \end_inset
  1046. \end_layout
  1047. \begin_layout Standard
  1048. \begin_inset Float figure
  1049. wide false
  1050. sideways false
  1051. status open
  1052. \begin_layout Plain Layout
  1053. \align center
  1054. \begin_inset Graphics
  1055. filename graphics/CD4-csaw/rnaseq-compare/salmon-vs-kallisto-CROP.png
  1056. lyxscale 25
  1057. width 100col%
  1058. groupId colwidth-raster
  1059. \end_inset
  1060. \end_layout
  1061. \begin_layout Plain Layout
  1062. \begin_inset Caption Standard
  1063. \begin_layout Plain Layout
  1064. Comparison of quantification between Salmon and Kallisto for identical annotatio
  1065. n
  1066. \end_layout
  1067. \end_inset
  1068. \end_layout
  1069. \end_inset
  1070. \end_layout
  1071. \begin_layout Standard
  1072. \begin_inset Float figure
  1073. wide false
  1074. sideways false
  1075. status open
  1076. \begin_layout Plain Layout
  1077. \align center
  1078. \begin_inset Graphics
  1079. filename graphics/CD4-csaw/rnaseq-compare/salmon-vs-shoal-CROP.png
  1080. lyxscale 25
  1081. width 100col%
  1082. groupId colwidth-raster
  1083. \end_inset
  1084. \end_layout
  1085. \begin_layout Plain Layout
  1086. \begin_inset Caption Standard
  1087. \begin_layout Plain Layout
  1088. Comparison of quantification between Salmon with and without Shoal for identical
  1089. annotation
  1090. \end_layout
  1091. \end_inset
  1092. \end_layout
  1093. \end_inset
  1094. \end_layout
  1095. \begin_layout Standard
  1096. \end_layout
  1097. \begin_layout Subsection
  1098. RNA-seq has a large confounding batch effect
  1099. \end_layout
  1100. \begin_layout Itemize
  1101. RNA-seq batch effect can be partially corrected, but still induces uncorrectable
  1102. biases in downstream analysis
  1103. \end_layout
  1104. \begin_deeper
  1105. \begin_layout Itemize
  1106. Figure showing MDS plot before & after ComBat
  1107. \end_layout
  1108. \begin_layout Itemize
  1109. Figure relating sample weights to batches, cell types, time points, etc.,
  1110. showing that one batch is significantly worse quality
  1111. \end_layout
  1112. \begin_layout Itemize
  1113. Figures showing p-value histograms for within-batch and cross-batch contrasts,
  1114. showing that cross-batch contrasts have attenuated signal, as do comparisons
  1115. within the bad batch
  1116. \end_layout
  1117. \end_deeper
  1118. \begin_layout Subsection
  1119. ChIP-seq must be corrected for hidden confounding factors
  1120. \end_layout
  1121. \begin_layout Itemize
  1122. Figures showing pre- and post-SVA MDS plots for each histone mark
  1123. \end_layout
  1124. \begin_layout Itemize
  1125. Figures showing BCV plots with and without SVA for each histone mark
  1126. \end_layout
  1127. \begin_layout Subsection
  1128. H3K4 and H3K27 promoter methylation has broadly the expected correlation
  1129. with gene expression
  1130. \end_layout
  1131. \begin_layout Itemize
  1132. H3K4 is correlated with higher expression, and H3K27 is correlated with
  1133. lower expression genome-wide
  1134. \end_layout
  1135. \begin_layout Itemize
  1136. Figures showing these correlations: box/violin plots of expression distributions
  1137. with every combination of peak presence/absence in promoter
  1138. \end_layout
  1139. \begin_layout Itemize
  1140. Appropriate statistical tests showing significant differences in expected
  1141. directions
  1142. \end_layout
  1143. \begin_layout Subsection
  1144. MOFA recovers biologically relevant variation from blind analysis by correlating
  1145. across datasets
  1146. \end_layout
  1147. \begin_layout Itemize
  1148. MOFA
  1149. \begin_inset CommandInset citation
  1150. LatexCommand cite
  1151. key "Argelaguet2018"
  1152. literal "false"
  1153. \end_inset
  1154. successfully separates biologically relevant patterns of variation from
  1155. technical confounding factors without knowing the sample labels, by finding
  1156. latent factors that explain variation across multiple data sets.
  1157. \end_layout
  1158. \begin_deeper
  1159. \begin_layout Itemize
  1160. Figure: show percent-variance-explained plot from MOFA and PCA-like plots
  1161. for the relevant latent factors
  1162. \end_layout
  1163. \begin_layout Itemize
  1164. MOFA analysis also shows that batch effect correction can't get much better
  1165. than it already is (Figure comparing blind MOFA batch correction to ComBat
  1166. correction)
  1167. \end_layout
  1168. \end_deeper
  1169. \begin_layout Subsection
  1170. Naive-to-memory convergence observed in H3K4 and RNA-seq data, not in H3K27me3
  1171. \end_layout
  1172. \begin_layout Itemize
  1173. H3K4 and RNA-seq data show clear evidence of naive convergence with memory
  1174. between days 1 and 5 (MDS plot figure, also compare with last figure from
  1175. \begin_inset CommandInset citation
  1176. LatexCommand cite
  1177. key "LaMere2016"
  1178. literal "false"
  1179. \end_inset
  1180. )
  1181. \end_layout
  1182. \begin_layout Standard
  1183. \begin_inset Flex TODO Note (inline)
  1184. status open
  1185. \begin_layout Plain Layout
  1186. Note that Sarah has granted permission to use her figures
  1187. \end_layout
  1188. \end_inset
  1189. \end_layout
  1190. \begin_layout Itemize
  1191. Table of numbers of genes different between N & M at each time point, showing
  1192. dwindling differences at later time points, consistent with convergence
  1193. \end_layout
  1194. \begin_layout Itemize
  1195. Similar figure for H3K27me3 showing lack of convergence
  1196. \end_layout
  1197. \begin_layout Subsection
  1198. Effect of promoter coverage upstream vs downstream of TSS
  1199. \end_layout
  1200. \begin_layout Itemize
  1201. H3K4me peaks seem to correlate with increased expression as long as they
  1202. are anywhere near the TSS
  1203. \end_layout
  1204. \begin_layout Itemize
  1205. H3K27me3 peaks can have different correlations to gene expression depending
  1206. on their position relative to TSS (e.g.
  1207. upstream vs downstream). Results are consistent with
  1208. \begin_inset CommandInset citation
  1209. LatexCommand cite
  1210. key "Young2011"
  1211. literal "false"
  1212. \end_inset
  1213. \end_layout
  1214. \begin_layout Section
  1215. Discussion
  1216. \end_layout
  1217. \begin_layout Itemize
  1218. "Promoter radius" is not constant and must be defined empirically for a
  1219. given data set
  1220. \end_layout
  1221. \begin_layout Itemize
  1222. MOFA shows great promise for accelerating discovery of major biological
  1223. effects in multi-omics datasets
  1224. \end_layout
  1225. \begin_deeper
  1226. \begin_layout Itemize
  1227. MOFA was added to this analysis late and played primarily a confirmatory
  1228. role, but it was able to confirm earlier conclusions with much less prior
  1229. information (no sample labels) and much less analyst effort
  1230. \end_layout
  1231. \begin_layout Itemize
  1232. MOFA confirmed that the already-implemented batch correction in the RNA-seq
  1233. data was already performing as well as possible given the limitations of
  1234. the data
  1235. \end_layout
  1236. \end_deeper
  1237. \begin_layout Itemize
  1238. Naive-to-memory convergence implies that naive cells are differentiating
  1239. into memory cells, and that gene expression and H3K4 methylation are involved
  1240. in this differentiation while H3K27me3 is less involved
  1241. \end_layout
  1242. \begin_layout Itemize
  1243. H3K27me3, canonically regarded as a deactivating mark, seems to have a more
  1244. complex relationship with gene expression
  1245. \end_layout
  1246. \begin_layout Itemize
  1247. Discuss advantages of developing using a reproducible workflow
  1248. \end_layout
  1249. \begin_layout Chapter
  1250. Improving array-based analyses of transplant rejection by optimizing data
  1251. preprocessing
  1252. \end_layout
  1253. \begin_layout Standard
  1254. \begin_inset Note Note
  1255. status open
  1256. \begin_layout Plain Layout
  1257. Author list: Me, Sunil, Tom, Padma, Dan
  1258. \end_layout
  1259. \end_inset
  1260. \end_layout
  1261. \begin_layout Section
  1262. Approach
  1263. \end_layout
  1264. \begin_layout Subsection
  1265. Proper pre-processing is essential for array data
  1266. \end_layout
  1267. \begin_layout Standard
  1268. \begin_inset Flex TODO Note (inline)
  1269. status open
  1270. \begin_layout Plain Layout
  1271. This section could probably use some citations
  1272. \end_layout
  1273. \end_inset
  1274. \end_layout
  1275. \begin_layout Standard
  1276. Microarrays, bead arrays, and similar assays produce raw data in the form
  1277. of fluorescence intensity measurements, with each intensity measurement
  1278. proportional to the abundance of some fluorescently-labelled target DNA
  1279. or RNA sequence that base pairs to a specific probe sequence.
  1280. However, these measurements for each probe are also affected by many technical
  1281. confounding factors, such as the concentration of target material, strength
  1282. of off-target binding, and the sensitivity of the imaging sensor.
  1283. Some array designs also use multiple probe sequences for each target.
  1284. Hence, extensive pre-processing of array data is necessary to normalize
  1285. out the effects of these technical factors and summarize the information
  1286. from multiple probes to arrive at a single usable estimate of abundance
  1287. or other relevant quantity, such as a ratio of two abundances, for each
  1288. target.
  1289. \end_layout
  1290. \begin_layout Standard
  1291. The choice of pre-processing algorithms used in the analysis of an array
  1292. data set can have a large effect on the results of that analysis.
  1293. However, despite their importance, these steps are often neglected or rushed
  1294. in order to get to the more scientifically interesting analysis steps involving
  1295. the actual biology of the system under study.
  1296. Hence, it is often possible to achieve substantial gains in statistical
  1297. power, model goodness-of-fit, or other relevant performance measures, by
  1298. checking the assumptions made by each preprocessing step and choosing specific
  1299. normalization methods tailored to the specific goals of the current analysis.
  1300. \end_layout
  1301. \begin_layout Subsection
  1302. Normalization for clinical microarray classifiers must be single-channel
  1303. \end_layout
  1304. \begin_layout Subsubsection
  1305. Standard normalization methods are unsuitable for clinical application
  1306. \end_layout
  1307. \begin_layout Standard
  1308. As the cost of performing microarray assays falls, there is increasing interest
  1309. in using genomic assays for diagnostic purposes, such as distinguishing
  1310. healthy transplants (TX) from transplants undergoing acute rejection (AR)
  1311. or acute dysfunction with no rejection (ADNR).
  1312. However, the standard normalization algorithm used for microarray data,
  1313. Robust Multi-chip Average (RMA)
  1314. \begin_inset CommandInset citation
  1315. LatexCommand cite
  1316. key "Irizarry2003a"
  1317. literal "false"
  1318. \end_inset
  1319. , is not applicable in a clinical setting.
  1320. Two of the steps in RMA, quantile normalization and probe summarization
  1321. by median polish, depend on every array in the data set being normalized.
  1322. This means that adding or removing any arrays from a data set changes the
  1323. normalized values for all arrays, and data sets that have been normalized
  1324. separately cannot be compared to each other.
  1325. Hence, when using RMA, any arrays to be analyzed together must also be
  1326. normalized together, and the set of arrays included in the data set must
  1327. be held constant throughout an analysis.
  1328. \end_layout
  1329. \begin_layout Standard
  1330. These limitations present serious impediments to the use of arrays as a
  1331. diagnostic tool.
  1332. When training a classifier, the samples to be classified must not be involved
  1333. in any step of the training process, lest their inclusion bias the training
  1334. process.
  1335. Once a classifier is deployed in a clinical setting, the samples to be
  1336. classified will not even
  1337. \emph on
  1338. exist
  1339. \emph default
  1340. at the time of training, so including them would be impossible even if
  1341. it were statistically justifiable.
  1342. Therefore, any machine learning application for microarrays demands that
  1343. the normalized expression values computed for an array must depend only
  1344. on information contained within that array.
  1345. This would ensure that each array's normalization is independent of every
  1346. other array, and that arrays normalized separately can still be compared
  1347. to each other without bias.
  1348. Such a normalization is commonly referred to as
  1349. \begin_inset Quotes eld
  1350. \end_inset
  1351. single-channel normalization
  1352. \begin_inset Quotes erd
  1353. \end_inset
  1354. .
  1355. \end_layout
  1356. \begin_layout Subsubsection
  1357. Several strategies are available to meet clinical normalization requirements
  1358. \end_layout
  1359. \begin_layout Standard
  1360. Frozen RMA (fRMA) addresses these concerns by replacing the quantile normalizati
  1361. on and median polish with alternatives that do not introduce inter-array
  1362. dependence, allowing each array to be normalized independently of all others
  1363. \begin_inset CommandInset citation
  1364. LatexCommand cite
  1365. key "McCall2010"
  1366. literal "false"
  1367. \end_inset
  1368. .
  1369. Quantile normalization is performed against a pre-generated set of quantiles
  1370. learned from a collection of 850 publicly available arrays sampled from
  1371. a wide variety of tissues in the Gene Expression Omnibus (GEO).
  1372. Each array's probe intensity distribution is normalized against these pre-gener
  1373. ated quantiles.
  1374. The median polish step is replaced with a robust weighted average of probe
  1375. intensities, using inverse variance weights learned from the same public
  1376. GEO data.
  1377. The result is a normalization that satisfies the requirements mentioned
  1378. above: each array is normalized independently of all others, and any two
  1379. normalized arrays can be compared directly to each other.
  1380. \end_layout
  1381. \begin_layout Standard
  1382. One important limitation of fRMA is that it requires a separate reference
  1383. data set from which to learn the parameters (reference quantiles and probe
  1384. weights) that will be used to normalize each array.
  1385. These parameters are specific to a given array platform, and pre-generated
  1386. parameters are only provided for the most common platforms, such as Affymetrix
  1387. hgu133plus2.
  1388. For a less common platform, such as hthgu133pluspm, it is necessary to
  1389. learn custom parameters from in-house data before fRMA can be used to normalize
  1390. samples on that platform
  1391. \begin_inset CommandInset citation
  1392. LatexCommand cite
  1393. key "McCall2011"
  1394. literal "false"
  1395. \end_inset
  1396. .
  1397. \end_layout
  1398. \begin_layout Standard
  1399. One other option is the aptly-named Single Channel Array Normalization (SCAN),
  1400. which adapts a normalization method originally designed for tiling arrays
  1401. \begin_inset CommandInset citation
  1402. LatexCommand cite
  1403. key "Piccolo2012"
  1404. literal "false"
  1405. \end_inset
  1406. .
  1407. SCAN is truly single-channel in that it does not require a set of normalization
  1408. parameters estimated from an external set of reference samples like fRMA
  1409. does.
  1410. \end_layout
  1411. \begin_layout Subsection
  1412. Heteroskedasticity must be accounted for in methylation array data
  1413. \end_layout
  1414. \begin_layout Subsubsection
  1415. Methylation array preprocessing induces heteroskedasticity
  1416. \end_layout
  1417. \begin_layout Standard
  1418. DNA methylation arrays are a relatively new kind of assay that uses microarrays
  1419. to measure the degree of methylation on cytosines in specific regions arrayed
  1420. across the genome.
  1421. First, bisulfite treatment converts all unmethylated cytosines to uracil
  1422. (which then become thymine after amplification) while leaving methylated
  1423. cytosines unaffected.
  1424. Then, each target region is interrogated with two probes: one binds to
  1425. the original genomic sequence and interrogates the level of methylated
  1426. DNA, and the other binds to the same sequence with all cytosines replaced
  1427. by thymines and interrogates the level of unmethylated DNA.
  1428. \end_layout
  1429. \begin_layout Standard
  1430. \begin_inset Float figure
  1431. wide false
  1432. sideways false
  1433. status collapsed
  1434. \begin_layout Plain Layout
  1435. \align center
  1436. \begin_inset Graphics
  1437. filename graphics/methylvoom/sigmoid.pdf
  1438. \end_inset
  1439. \end_layout
  1440. \begin_layout Plain Layout
  1441. \begin_inset Caption Standard
  1442. \begin_layout Plain Layout
  1443. \begin_inset CommandInset label
  1444. LatexCommand label
  1445. name "fig:Sigmoid-beta-m-mapping"
  1446. \end_inset
  1447. \series bold
  1448. Sigmoid shape of the mapping between β and M values
  1449. \end_layout
  1450. \end_inset
  1451. \end_layout
  1452. \end_inset
  1453. \end_layout
  1454. \begin_layout Standard
  1455. After normalization, these two probe intensities are summarized in one of
  1456. two ways, each with advantages and disadvantages.
  1457. β
  1458. \series bold
  1459. \series default
  1460. values, interpreted as fraction of DNA copies methylated, range from 0 to
  1461. 1.
  1462. β
  1463. \series bold
  1464. \series default
  1465. values are conceptually easy to interpret, but the constrained range makes
  1466. them unsuitable for linear modeling, and their error distributions are
  1467. highly non-normal, which also frustrates linear modeling.
  1468. M-values, interpreted as the log ratio of methylated to unmethylated copies,
  1469. are computed by mapping the beta values from
  1470. \begin_inset Formula $[0,1]$
  1471. \end_inset
  1472. onto
  1473. \begin_inset Formula $(-\infty,+\infty)$
  1474. \end_inset
  1475. using a sigmoid curve (Figure
  1476. \begin_inset CommandInset ref
  1477. LatexCommand ref
  1478. reference "fig:Sigmoid-beta-m-mapping"
  1479. plural "false"
  1480. caps "false"
  1481. noprefix "false"
  1482. \end_inset
  1483. ).
  1484. This transformation results in values with better statistical properties:
  1485. the unconstrained range is suitable for linear modeling, and the error
  1486. distributions are more normal.
  1487. Hence, most linear modeling and other statistical testing on methylation
  1488. arrays is performed using M-values.
  1489. \end_layout
  1490. \begin_layout Standard
  1491. However, the steep slope of the sigmoid transformation near 0 and 1 tends
  1492. to exaggerate small differences in β values near those extremes, which
  1493. in turn amplifies the error in those values, leading to a U-shaped trend
  1494. in the mean-variance curve: extreme values have higher variances than values
  1495. near the middle.
  1496. This mean-variance dependency must be accounted for when fitting the linear
  1497. model for differential methylation, or else the variance will be systematically
  1498. overestimated for probes with moderate M-values and underestimated for
  1499. probes with extreme M-values.
  1500. \end_layout
  1501. \begin_layout Subsubsection
  1502. The voom method for RNA-seq data can model M-value heteroskedasticity
  1503. \end_layout
  1504. \begin_layout Standard
  1505. RNA-seq read count data are also known to show heteroskedasticity, and the
  1506. voom method was developed for modeling this heteroskedasticity by estimating
  1507. the mean-variance trend in the data and using this trend to assign precision
  1508. weights to each observation
  1509. \begin_inset CommandInset citation
  1510. LatexCommand cite
  1511. key "Law2013"
  1512. literal "false"
  1513. \end_inset
  1514. .
  1515. While methylation array data are not derived from counts and have a very
  1516. different mean-variance relationship from that of typical RNA-seq data,
  1517. the voom method makes no specific assumptions on the shape of the mean-variance
  1518. relationship - it only assumes that the relationship is smooth enough to
  1519. model using a lowess curve.
  1520. Hence, the method is sufficiently general to model the mean-variance relationsh
  1521. ip in methylation array data.
  1522. However, the standard implementation of voom assumes that the input is
  1523. given in raw read counts, and it must be adapted to run on methylation
  1524. M-values.
  1525. \end_layout
  1526. \begin_layout Section
  1527. Methods
  1528. \end_layout
  1529. \begin_layout Subsection
  1530. Evaluation of classifier performance with different normalization methods
  1531. \end_layout
  1532. \begin_layout Standard
  1533. For testing different expression microarray normalizations, a data set of
  1534. 157 hgu133plus2 arrays was used, consisting of blood samples from kidney
  1535. transplant patients whose grafts had been graded as TX, AR, or ADNR via
  1536. biopsy and histology (46 TX, 69 AR, 42 ADNR)
  1537. \begin_inset CommandInset citation
  1538. LatexCommand cite
  1539. key "Kurian2014"
  1540. literal "true"
  1541. \end_inset
  1542. .
  1543. Additionally, an external validation set of 75 samples was gathered from
  1544. public GEO data (37 TX, 38 AR, no ADNR).
  1545. \end_layout
  1546. \begin_layout Standard
  1547. \begin_inset Flex TODO Note (inline)
  1548. status collapsed
  1549. \begin_layout Plain Layout
  1550. Find appropriate GEO identifiers if possible.
  1551. Kurian 2014 says GSE15296, but this seems to be different data.
  1552. I also need to look up the GEO accession for the external validation set.
  1553. \end_layout
  1554. \end_inset
  1555. \end_layout
  1556. \begin_layout Standard
  1557. To evaluate the effect of each normalization on classifier performance,
  1558. the same classifier training and validation procedure was used after each
  1559. normalization method.
  1560. The PAM package was used to train a nearest shrunken centroid classifier
  1561. on the training set and select the appropriate threshold for centroid shrinking.
  1562. Then the trained classifier was used to predict the class probabilities
  1563. of each validation sample.
  1564. From these class probabilities, ROC curves and area-under-curve (AUC) values
  1565. were generated
  1566. \begin_inset CommandInset citation
  1567. LatexCommand cite
  1568. key "Turck2011"
  1569. literal "false"
  1570. \end_inset
  1571. .
  1572. Each normalization was tested on two different sets of training and validation
  1573. samples.
  1574. For internal validation, the 115 TX and AR arrays in the internal set were
  1575. split at random into two equal sized sets, one for training and one for
  1576. validation, each containing the same numbers of TX and AR samples as the
  1577. other set.
  1578. For external validation, the full set of 115 TX and AR samples was used
  1579. as a training set, and the 75 external TX and AR samples were used as the
  1580. validation set.
  1581. Thus, 2 ROC curves and AUC values were generated for each normalization
  1582. method: one internal and one external.
  1583. Because the external validation set contains no ADNR samples, only classificati
  1584. on of TX and AR samples was considered.
  1585. The ADNR samples were included during normalization but excluded from all
  1586. classifier training and validation.
  1587. This ensures that the performance on internal and external validation sets
  1588. is directly comparable, since both are performing the same task: distinguishing
  1589. TX from AR.
  1590. \end_layout
  1591. \begin_layout Standard
  1592. \begin_inset Flex TODO Note (inline)
  1593. status collapsed
  1594. \begin_layout Plain Layout
  1595. Summarize the get.best.threshold algorithm for PAM threshold selection
  1596. \end_layout
  1597. \end_inset
  1598. \end_layout
  1599. \begin_layout Standard
  1600. Six different normalization strategies were evaluated.
  1601. First, 2 well-known non-single-channel normalization methods were considered:
  1602. RMA and dChip
  1603. \begin_inset CommandInset citation
  1604. LatexCommand cite
  1605. key "Li2001,Irizarry2003a"
  1606. literal "false"
  1607. \end_inset
  1608. .
  1609. Since RMA produces expression values on a log2 scale and dChip does not,
  1610. the values from dChip were log2 transformed after normalization.
  1611. Next, RMA and dChip followed by Global Rank-invariant Set Normalization
  1612. (GRSN) were tested
  1613. \begin_inset CommandInset citation
  1614. LatexCommand cite
  1615. key "Pelz2008"
  1616. literal "false"
  1617. \end_inset
  1618. .
  1619. Post-processing with GRSN does not turn RMA or dChip into single-channel
  1620. methods, but it may help mitigate batch effects and is therefore useful
  1621. as a benchmark.
  1622. Lastly, the two single-channel normalization methods, fRMA and SCAN, were
  1623. tested
  1624. \begin_inset CommandInset citation
  1625. LatexCommand cite
  1626. key "McCall2010,Piccolo2012"
  1627. literal "false"
  1628. \end_inset
  1629. .
  1630. When evaluating internal validation performance, only the 157 internal samples
  1631. were normalized; when evaluating external validation performance, all 157
  1632. internal samples and 75 external samples were normalized together.
  1633. \end_layout
  1634. \begin_layout Standard
  1635. For demonstrating the problem with separate normalization of training and
  1636. validation data, one additional normalization was performed: the internal
  1637. and external sets were each normalized separately using RMA, and the normalized
  1638. data for each set were combined into a single set with no further attempts
  1639. at normalizing between the two sets.
  1640. This represents approximately how RMA would have to be used in a clinical
  1641. setting, where the samples to be classified are not available at the time
  1642. the classifier is trained.
  1643. \end_layout
  1644. \begin_layout Subsection
  1645. Generating custom fRMA vectors for hthgu133pluspm array platform
  1646. \end_layout
  1647. \begin_layout Standard
  1648. In order to enable fRMA normalization for the hthgu133pluspm array platform,
  1649. custom fRMA normalization vectors were trained using the frmaTools package
  1650. \begin_inset CommandInset citation
  1651. LatexCommand cite
  1652. key "McCall2011"
  1653. literal "false"
  1654. \end_inset
  1655. .
  1656. Separate vectors were created for two types of samples: kidney graft biopsy
  1657. samples and blood samples from graft recipients.
  1658. For training, 341 kidney biopsy samples from 2 data sets and 965 blood
  1659. samples from 5 data sets were used as the reference set.
  1660. Arrays were grouped into batches based on unique combinations of sample
  1661. type (blood or biopsy), diagnosis (TX, AR, etc.), data set, and scan date.
  1662. Thus, each batch represents arrays of the same kind that were run together
  1663. on the same day.
  1664. For estimating the probe inverse variance weights, frmaTools requires equal-siz
  1665. ed batches, which means a batch size must be chosen, and then batches smaller
  1666. than that size must be ignored, while batches larger than the chosen size
  1667. must be downsampled.
  1668. This downsampling is performed randomly, so the sampling process is repeated
  1669. 5 times and the resulting normalizations are compared to each other.
  1670. \end_layout
  1671. \begin_layout Standard
  1672. To evaluate the consistency of the generated normalization vectors, the
  1673. 5 fRMA vector sets generated from 5 random batch samplings were each used
  1674. to normalize the same 20 randomly selected samples from each tissue.
  1675. Then the normalized expression values for each probe on each array were
  1676. compared across all normalizations.
  1677. Each fRMA normalization was also compared against the normalized expression
  1678. values obtained by normalizing the same 20 samples with ordinary RMA.
  1679. \end_layout
  1680. \begin_layout Subsection
  1681. Modeling methylation array M-value heteroskedasticity in linear models with
  1682. modified voom implementation
  1683. \end_layout
  1684. \begin_layout Standard
  1685. \begin_inset Flex TODO Note (inline)
  1686. status open
  1687. \begin_layout Plain Layout
  1688. Put code on Github and reference it.
  1689. \end_layout
  1690. \end_inset
  1691. \end_layout
  1692. \begin_layout Standard
  1693. To investigate whether DNA methylation could be used to distinguish
  1694. between healthy and dysfunctional transplants, a data set of 78 Illumina
  1695. 450k methylation arrays from human kidney graft biopsies was analyzed for
  1696. differential methylation between 4 transplant statuses: healthy transplant
  1697. (TX), transplants undergoing acute rejection (AR), acute dysfunction with
  1698. no rejection (ADNR), and chronic allograft nephropathy (CAN).
  1699. The data consisted of 33 TX, 9 AR, 8 ADNR, and 28 CAN samples.
  1700. The uneven group sizes are a result of taking the biopsy samples before
  1701. the eventual fate of the transplant was known.
  1702. Each sample was additionally annotated with a donor ID (anonymized), Sex,
  1703. Age, Ethnicity, Creatinine Level, and Diabetes diagnosis (all samples
  1704. in this data set came from patients with either Type 1 or Type 2 diabetes).
  1705. \end_layout
  1706. \begin_layout Standard
  1707. The intensity data were first normalized using subset-quantile within array
  1708. normalization (SWAN)
  1709. \begin_inset CommandInset citation
  1710. LatexCommand cite
  1711. key "Maksimovic2012"
  1712. literal "false"
  1713. \end_inset
  1714. , then converted to intensity ratios (beta values)
  1715. \begin_inset CommandInset citation
  1716. LatexCommand cite
  1717. key "Aryee2014"
  1718. literal "false"
  1719. \end_inset
  1720. .
  1721. Any probes binding to loci that overlapped annotated SNPs were dropped,
  1722. and the annotated sex of each sample was verified against the sex inferred
  1723. from the ratio of median probe intensities for the X and Y chromosomes.
  1724. Then, the ratios were transformed to M-values.
  1725. \end_layout
  1726. \begin_layout Standard
  1727. \begin_inset Float table
  1728. wide false
  1729. sideways false
  1730. status collapsed
  1731. \begin_layout Plain Layout
  1732. \begin_inset Tabular
  1733. <lyxtabular version="3" rows="4" columns="6">
  1734. <features tabularvalignment="middle">
  1735. <column alignment="center" valignment="top">
  1736. <column alignment="center" valignment="top">
  1737. <column alignment="center" valignment="top">
  1738. <column alignment="center" valignment="top">
  1739. <column alignment="center" valignment="top">
  1740. <column alignment="center" valignment="top">
  1741. <row>
  1742. <cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" usebox="none">
  1743. \begin_inset Text
  1744. \begin_layout Plain Layout
  1745. Analysis
  1746. \end_layout
  1747. \end_inset
  1748. </cell>
  1749. <cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" usebox="none">
  1750. \begin_inset Text
  1751. \begin_layout Plain Layout
  1752. patient random effect
  1753. \end_layout
  1754. \end_inset
  1755. </cell>
  1756. <cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" usebox="none">
  1757. \begin_inset Text
  1758. \begin_layout Plain Layout
  1759. empirical Bayes
  1760. \end_layout
  1761. \end_inset
  1762. </cell>
  1763. <cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" usebox="none">
  1764. \begin_inset Text
  1765. \begin_layout Plain Layout
  1766. SVA
  1767. \end_layout
  1768. \end_inset
  1769. </cell>
  1770. <cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" usebox="none">
  1771. \begin_inset Text
  1772. \begin_layout Plain Layout
  1773. sample weights
  1774. \end_layout
  1775. \end_inset
  1776. </cell>
  1777. <cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" rightline="true" usebox="none">
  1778. \begin_inset Text
  1779. \begin_layout Plain Layout
  1780. voom
  1781. \end_layout
  1782. \end_inset
  1783. </cell>
  1784. </row>
  1785. <row>
  1786. <cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
  1787. \begin_inset Text
  1788. \begin_layout Plain Layout
  1789. A
  1790. \end_layout
  1791. \end_inset
  1792. </cell>
  1793. <cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
  1794. \begin_inset Text
  1795. \begin_layout Plain Layout
  1796. Yes
  1797. \end_layout
  1798. \end_inset
  1799. </cell>
  1800. <cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
  1801. \begin_inset Text
  1802. \begin_layout Plain Layout
  1803. Yes
  1804. \end_layout
  1805. \end_inset
  1806. </cell>
  1807. <cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
  1808. \begin_inset Text
  1809. \begin_layout Plain Layout
  1810. No
  1811. \end_layout
  1812. \end_inset
  1813. </cell>
  1814. <cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
  1815. \begin_inset Text
  1816. \begin_layout Plain Layout
  1817. No
  1818. \end_layout
  1819. \end_inset
  1820. </cell>
  1821. <cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
  1822. \begin_inset Text
  1823. \begin_layout Plain Layout
  1824. No
  1825. \end_layout
  1826. \end_inset
  1827. </cell>
  1828. </row>
  1829. <row>
  1830. <cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
  1831. \begin_inset Text
  1832. \begin_layout Plain Layout
  1833. B
  1834. \end_layout
  1835. \end_inset
  1836. </cell>
  1837. <cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
  1838. \begin_inset Text
  1839. \begin_layout Plain Layout
  1840. Yes
  1841. \end_layout
  1842. \end_inset
  1843. </cell>
  1844. <cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
  1845. \begin_inset Text
  1846. \begin_layout Plain Layout
  1847. Yes
  1848. \end_layout
  1849. \end_inset
  1850. </cell>
  1851. <cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
  1852. \begin_inset Text
  1853. \begin_layout Plain Layout
  1854. Yes
  1855. \end_layout
  1856. \end_inset
  1857. </cell>
  1858. <cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
  1859. \begin_inset Text
  1860. \begin_layout Plain Layout
  1861. Yes
  1862. \end_layout
  1863. \end_inset
  1864. </cell>
  1865. <cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
  1866. \begin_inset Text
  1867. \begin_layout Plain Layout
  1868. No
  1869. \end_layout
  1870. \end_inset
  1871. </cell>
  1872. </row>
  1873. <row>
  1874. <cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" usebox="none">
  1875. \begin_inset Text
  1876. \begin_layout Plain Layout
  1877. C
  1878. \end_layout
  1879. \end_inset
  1880. </cell>
  1881. <cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" usebox="none">
  1882. \begin_inset Text
  1883. \begin_layout Plain Layout
  1884. Yes
  1885. \end_layout
  1886. \end_inset
  1887. </cell>
  1888. <cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" usebox="none">
  1889. \begin_inset Text
  1890. \begin_layout Plain Layout
  1891. Yes
  1892. \end_layout
  1893. \end_inset
  1894. </cell>
  1895. <cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" usebox="none">
  1896. \begin_inset Text
  1897. \begin_layout Plain Layout
  1898. Yes
  1899. \end_layout
  1900. \end_inset
  1901. </cell>
  1902. <cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" usebox="none">
  1903. \begin_inset Text
  1904. \begin_layout Plain Layout
  1905. Yes
  1906. \end_layout
  1907. \end_inset
  1908. </cell>
  1909. <cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" rightline="true" usebox="none">
  1910. \begin_inset Text
  1911. \begin_layout Plain Layout
  1912. Yes
  1913. \end_layout
  1914. \end_inset
  1915. </cell>
  1916. </row>
  1917. </lyxtabular>
  1918. \end_inset
  1919. \end_layout
  1920. \begin_layout Plain Layout
  1921. \begin_inset Caption Standard
  1922. \begin_layout Plain Layout
  1923. \series bold
  1924. \begin_inset CommandInset label
  1925. LatexCommand label
  1926. name "tab:Summary-of-meth-analysis"
  1927. \end_inset
  1928. Summary of analysis variants for methylation array data.
  1929. \series default
  1930. Each analysis included a different set of steps to adjust or account for
  1931. various systematic features of the data.
  1932. See the text for a more detailed explanation of each step.
  1933. \end_layout
  1934. \end_inset
  1935. \end_layout
  1936. \end_inset
  1937. \end_layout
  1938. \begin_layout Standard
  1939. From the M-values, a series of parallel analyses was performed, each adding
  1940. additional steps into the model fit to accommodate a feature of the data
  1941. (see Table
  1942. \begin_inset CommandInset ref
  1943. LatexCommand ref
  1944. reference "tab:Summary-of-meth-analysis"
  1945. plural "false"
  1946. caps "false"
  1947. noprefix "false"
  1948. \end_inset
  1949. ).
  1950. For analysis A, a
  1951. \begin_inset Quotes eld
  1952. \end_inset
  1953. basic
  1954. \begin_inset Quotes erd
  1955. \end_inset
  1956. linear modeling analysis was performed, compensating for known confounders
  1957. by including terms for the factor of interest (transplant status) as well
  1958. as the known biological confounders: sex, age, ethnicity, and diabetes.
  1959. Since some samples came from the same patients at different times, the
  1960. intra-patient correlation was modeled as a random effect, estimating a
  1961. shared correlation value across all probes
  1962. \begin_inset CommandInset citation
  1963. LatexCommand cite
  1964. key "Smyth2005a"
  1965. literal "false"
  1966. \end_inset
  1967. .
  1968. Then the linear model was fit, and the variance was modeled using empirical
  1969. Bayes squeezing toward the mean-variance trend
  1970. \begin_inset CommandInset citation
  1971. LatexCommand cite
  1972. key "Ritchie2015"
  1973. literal "false"
  1974. \end_inset
  1975. .
  1976. Finally, t-tests or F-tests were performed as appropriate for each test:
  1977. t-tests for single contrasts, and F-tests for multiple contrasts.
  1978. P-values were corrected for multiple testing using the Benjamini-Hochberg
  1979. procedure for FDR control
  1980. \begin_inset CommandInset citation
  1981. LatexCommand cite
  1982. key "Benjamini1995"
  1983. literal "false"
  1984. \end_inset
  1985. .
  1986. \end_layout
  1987. \begin_layout Standard
  1988. For analysis B, surrogate variable analysis (SVA) was used to infer
  1989. additional unobserved sources of heterogeneity in the data
  1990. \begin_inset CommandInset citation
  1991. LatexCommand cite
  1992. key "Leek2007"
  1993. literal "false"
  1994. \end_inset
  1995. .
  1996. These surrogate variables were added to the design matrix before fitting
  1997. the linear model.
  1998. In addition, sample quality weights were estimated from the data and used
  1999. during linear modeling to down-weight the contribution of highly variable
  2000. arrays while increasing the weight to arrays with lower variability
  2001. \begin_inset CommandInset citation
  2002. LatexCommand cite
  2003. key "Ritchie2006"
  2004. literal "false"
  2005. \end_inset
  2006. .
  2007. The remainder of the analysis proceeded as in analysis A.
  2008. For analysis C, the voom method was adapted to run on methylation array
  2009. data and used to model and correct for the mean-variance trend using individual
  2010. observation weights
  2011. \begin_inset CommandInset citation
  2012. LatexCommand cite
  2013. key "Law2013"
  2014. literal "false"
  2015. \end_inset
  2016. , which were combined with the sample weights
  2017. \begin_inset CommandInset citation
  2018. LatexCommand cite
  2019. key "Liu2015"
  2020. literal "false"
  2021. \end_inset
  2022. .
  2023. Each time weights were used, they were estimated once before estimating
  2024. the random effect correlation value, and then the weights were re-estimated
  2025. taking the random effect into account.
  2026. The remainder of the analysis proceeded as in analysis B.
  2027. \end_layout
  2028. \begin_layout Section
  2029. Results
  2030. \end_layout
  2031. \begin_layout Standard
  2032. \begin_inset Flex TODO Note (inline)
  2033. status open
  2034. \begin_layout Plain Layout
  2035. Improve subsection titles in this section
  2036. \end_layout
  2037. \end_inset
  2038. \end_layout
  2039. \begin_layout Subsection
  2040. fRMA eliminates unwanted dependence of classifier training on normalization
  2041. strategy caused by RMA
  2042. \end_layout
  2043. \begin_layout Standard
  2044. \begin_inset Flex TODO Note (inline)
  2045. status open
  2046. \begin_layout Plain Layout
  2047. Write figure legends
  2048. \end_layout
  2049. \end_inset
  2050. \end_layout
  2051. \begin_layout Subsubsection
  2052. Separate normalization with RMA introduces unwanted biases in classification
  2053. \end_layout
  2054. \begin_layout Standard
  2055. \begin_inset Float figure
  2056. wide false
  2057. sideways false
  2058. status collapsed
  2059. \begin_layout Plain Layout
  2060. \align center
  2061. \begin_inset Graphics
  2062. filename graphics/PAM/predplot.pdf
  2063. width 100col%
  2064. groupId colwidth
  2065. \end_inset
  2066. \end_layout
  2067. \begin_layout Plain Layout
  2068. \begin_inset Caption Standard
  2069. \begin_layout Plain Layout
  2070. \begin_inset CommandInset label
  2071. LatexCommand label
  2072. name "fig:Classifier-probabilities-RMA"
  2073. \end_inset
  2074. \series bold
  2075. Classifier probabilities on validation samples when normalized with RMA
  2076. together vs.
  2077. separately.
  2078. \end_layout
  2079. \end_inset
  2080. \end_layout
  2081. \end_inset
  2082. \end_layout
  2083. \begin_layout Standard
  2084. To demonstrate the problem with non-single-channel normalization methods,
  2085. we considered the problem of training a classifier to distinguish TX from
  2086. AR using the samples from the internal set as training data, evaluating
  2087. performance on the external set.
  2088. First, training and evaluation were performed after normalizing all array
  2089. samples together as a single set using RMA, and second, the internal samples
  2090. were normalized separately from the external samples and the training and
  2091. evaluation were repeated.
  2092. For each sample in the validation set, the classifier probabilities from
  2093. both classifiers were plotted against each other (Fig.
  2094. \begin_inset CommandInset ref
  2095. LatexCommand ref
  2096. reference "fig:Classifier-probabilities-RMA"
  2097. plural "false"
  2098. caps "false"
  2099. noprefix "false"
  2100. \end_inset
  2101. ).
  2102. As expected, separate normalization biases the classifier probabilities,
  2103. resulting in several misclassifications.
  2104. In this case, the bias from separate normalization causes the classifier
  2105. to assign a lower probability of AR to every sample.
  2106. \end_layout
  2107. \begin_layout Subsubsection
  2108. fRMA and SCAN maintain classification performance while eliminating
  2109. dependence on normalization strategy
  2110. \end_layout
  2111. \begin_layout Standard
  2112. \begin_inset Float figure
  2113. placement tb
  2114. wide false
  2115. sideways false
  2116. status collapsed
  2117. \begin_layout Plain Layout
  2118. \align center
  2119. \begin_inset Graphics
  2120. filename graphics/PAM/ROC-TXvsAR-internal.pdf
  2121. width 100col%
  2122. groupId colwidth
  2123. \end_inset
  2124. \end_layout
  2125. \begin_layout Plain Layout
  2126. \begin_inset Caption Standard
  2127. \begin_layout Plain Layout
  2128. \series bold
  2129. \begin_inset CommandInset label
  2130. LatexCommand label
  2131. name "fig:ROC-PAM-int"
  2132. \end_inset
  2133. ROC curves for PAM on internal validation data using different normalization
  2134. strategies
  2135. \end_layout
  2136. \end_inset
  2137. \end_layout
  2138. \end_inset
  2139. \end_layout
  2140. \begin_layout Standard
  2141. \begin_inset Float table
  2142. wide false
  2143. sideways false
  2144. status collapsed
  2145. \begin_layout Plain Layout
  2146. \align center
  2147. \begin_inset Tabular
  2148. <lyxtabular version="3" rows="7" columns="4">
  2149. <features tabularvalignment="middle">
  2150. <column alignment="center" valignment="top">
  2151. <column alignment="center" valignment="top">
  2152. <column alignment="center" valignment="top">
  2153. <column alignment="center" valignment="top">
  2154. <row>
  2155. <cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" usebox="none">
  2156. \begin_inset Text
  2157. \begin_layout Plain Layout
  2158. \family roman
  2159. \series medium
  2160. \shape up
  2161. \size normal
  2162. \emph off
  2163. \bar no
  2164. \strikeout off
  2165. \xout off
  2166. \uuline off
  2167. \uwave off
  2168. \noun off
  2169. \color none
  2170. Normalization
  2171. \end_layout
  2172. \end_inset
  2173. </cell>
  2174. <cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" usebox="none">
  2175. \begin_inset Text
  2176. \begin_layout Plain Layout
  2177. Single-channel?
  2178. \end_layout
  2179. \end_inset
  2180. </cell>
  2181. <cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" usebox="none">
  2182. \begin_inset Text
  2183. \begin_layout Plain Layout
  2184. \family roman
  2185. \series medium
  2186. \shape up
  2187. \size normal
  2188. \emph off
  2189. \bar no
  2190. \strikeout off
  2191. \xout off
  2192. \uuline off
  2193. \uwave off
  2194. \noun off
  2195. \color none
  2196. Internal Val.
  2197. AUC
  2198. \end_layout
  2199. \end_inset
  2200. </cell>
  2201. <cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" rightline="true" usebox="none">
  2202. \begin_inset Text
  2203. \begin_layout Plain Layout
  2204. External Val.
  2205. AUC
  2206. \end_layout
  2207. \end_inset
  2208. </cell>
  2209. </row>
  2210. <row>
  2211. <cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
  2212. \begin_inset Text
  2213. \begin_layout Plain Layout
  2214. \family roman
  2215. \series medium
  2216. \shape up
  2217. \size normal
  2218. \emph off
  2219. \bar no
  2220. \strikeout off
  2221. \xout off
  2222. \uuline off
  2223. \uwave off
  2224. \noun off
  2225. \color none
  2226. RMA
  2227. \end_layout
  2228. \end_inset
  2229. </cell>
  2230. <cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
  2231. \begin_inset Text
  2232. \begin_layout Plain Layout
  2233. No
  2234. \end_layout
  2235. \end_inset
  2236. </cell>
  2237. <cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
  2238. \begin_inset Text
  2239. \begin_layout Plain Layout
  2240. \family roman
  2241. \series medium
  2242. \shape up
  2243. \size normal
  2244. \emph off
  2245. \bar no
  2246. \strikeout off
  2247. \xout off
  2248. \uuline off
  2249. \uwave off
  2250. \noun off
  2251. \color none
  2252. 0.852
  2253. \end_layout
  2254. \end_inset
  2255. </cell>
  2256. <cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
  2257. \begin_inset Text
  2258. \begin_layout Plain Layout
  2259. \family roman
  2260. \series medium
  2261. \shape up
  2262. \size normal
  2263. \emph off
  2264. \bar no
  2265. \strikeout off
  2266. \xout off
  2267. \uuline off
  2268. \uwave off
  2269. \noun off
  2270. \color none
  2271. 0.713
  2272. \end_layout
  2273. \end_inset
  2274. </cell>
  2275. </row>
  2276. <row>
  2277. <cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
  2278. \begin_inset Text
  2279. \begin_layout Plain Layout
  2280. \family roman
  2281. \series medium
  2282. \shape up
  2283. \size normal
  2284. \emph off
  2285. \bar no
  2286. \strikeout off
  2287. \xout off
  2288. \uuline off
  2289. \uwave off
  2290. \noun off
  2291. \color none
  2292. dChip
  2293. \end_layout
  2294. \end_inset
  2295. </cell>
  2296. <cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
  2297. \begin_inset Text
  2298. \begin_layout Plain Layout
  2299. No
  2300. \end_layout
  2301. \end_inset
  2302. </cell>
  2303. <cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
  2304. \begin_inset Text
  2305. \begin_layout Plain Layout
  2306. \family roman
  2307. \series medium
  2308. \shape up
  2309. \size normal
  2310. \emph off
  2311. \bar no
  2312. \strikeout off
  2313. \xout off
  2314. \uuline off
  2315. \uwave off
  2316. \noun off
  2317. \color none
  2318. 0.891
  2319. \end_layout
  2320. \end_inset
  2321. </cell>
  2322. <cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
  2323. \begin_inset Text
  2324. \begin_layout Plain Layout
  2325. \family roman
  2326. \series medium
  2327. \shape up
  2328. \size normal
  2329. \emph off
  2330. \bar no
  2331. \strikeout off
  2332. \xout off
  2333. \uuline off
  2334. \uwave off
  2335. \noun off
  2336. \color none
  2337. 0.657
  2338. \end_layout
  2339. \end_inset
  2340. </cell>
  2341. </row>
  2342. <row>
  2343. <cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
  2344. \begin_inset Text
  2345. \begin_layout Plain Layout
  2346. \family roman
  2347. \series medium
  2348. \shape up
  2349. \size normal
  2350. \emph off
  2351. \bar no
  2352. \strikeout off
  2353. \xout off
  2354. \uuline off
  2355. \uwave off
  2356. \noun off
  2357. \color none
  2358. RMA + GRSN
  2359. \end_layout
  2360. \end_inset
  2361. </cell>
  2362. <cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
  2363. \begin_inset Text
  2364. \begin_layout Plain Layout
  2365. No
  2366. \end_layout
  2367. \end_inset
  2368. </cell>
  2369. <cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
  2370. \begin_inset Text
  2371. \begin_layout Plain Layout
  2372. \family roman
  2373. \series medium
  2374. \shape up
  2375. \size normal
  2376. \emph off
  2377. \bar no
  2378. \strikeout off
  2379. \xout off
  2380. \uuline off
  2381. \uwave off
  2382. \noun off
  2383. \color none
  2384. 0.816
  2385. \end_layout
  2386. \end_inset
  2387. </cell>
  2388. <cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
  2389. \begin_inset Text
  2390. \begin_layout Plain Layout
  2391. \family roman
  2392. \series medium
  2393. \shape up
  2394. \size normal
  2395. \emph off
  2396. \bar no
  2397. \strikeout off
  2398. \xout off
  2399. \uuline off
  2400. \uwave off
  2401. \noun off
  2402. \color none
  2403. 0.750
  2404. \end_layout
  2405. \end_inset
  2406. </cell>
  2407. </row>
  2408. <row>
  2409. <cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
  2410. \begin_inset Text
  2411. \begin_layout Plain Layout
  2412. \family roman
  2413. \series medium
  2414. \shape up
  2415. \size normal
  2416. \emph off
  2417. \bar no
  2418. \strikeout off
  2419. \xout off
  2420. \uuline off
  2421. \uwave off
  2422. \noun off
  2423. \color none
  2424. dChip + GRSN
  2425. \end_layout
  2426. \end_inset
  2427. </cell>
  2428. <cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
  2429. \begin_inset Text
  2430. \begin_layout Plain Layout
  2431. No
  2432. \end_layout
  2433. \end_inset
  2434. </cell>
  2435. <cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
  2436. \begin_inset Text
  2437. \begin_layout Plain Layout
  2438. \family roman
  2439. \series medium
  2440. \shape up
  2441. \size normal
  2442. \emph off
  2443. \bar no
  2444. \strikeout off
  2445. \xout off
  2446. \uuline off
  2447. \uwave off
  2448. \noun off
  2449. \color none
  2450. 0.875
  2451. \end_layout
  2452. \end_inset
  2453. </cell>
  2454. <cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
  2455. \begin_inset Text
  2456. \begin_layout Plain Layout
  2457. \family roman
  2458. \series medium
  2459. \shape up
  2460. \size normal
  2461. \emph off
  2462. \bar no
  2463. \strikeout off
  2464. \xout off
  2465. \uuline off
  2466. \uwave off
  2467. \noun off
  2468. \color none
  2469. 0.642
  2470. \end_layout
  2471. \end_inset
  2472. </cell>
  2473. </row>
  2474. <row>
  2475. <cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
  2476. \begin_inset Text
  2477. \begin_layout Plain Layout
  2478. \family roman
  2479. \series medium
  2480. \shape up
  2481. \size normal
  2482. \emph off
  2483. \bar no
  2484. \strikeout off
  2485. \xout off
  2486. \uuline off
  2487. \uwave off
  2488. \noun off
  2489. \color none
  2490. fRMA
  2491. \end_layout
  2492. \end_inset
  2493. </cell>
  2494. <cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
  2495. \begin_inset Text
  2496. \begin_layout Plain Layout
  2497. Yes
  2498. \end_layout
  2499. \end_inset
  2500. </cell>
  2501. <cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
  2502. \begin_inset Text
  2503. \begin_layout Plain Layout
  2504. \family roman
  2505. \series medium
  2506. \shape up
  2507. \size normal
  2508. \emph off
  2509. \bar no
  2510. \strikeout off
  2511. \xout off
  2512. \uuline off
  2513. \uwave off
  2514. \noun off
  2515. \color none
  2516. 0.863
  2517. \end_layout
  2518. \end_inset
  2519. </cell>
  2520. <cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
  2521. \begin_inset Text
  2522. \begin_layout Plain Layout
  2523. \family roman
  2524. \series medium
  2525. \shape up
  2526. \size normal
  2527. \emph off
  2528. \bar no
  2529. \strikeout off
  2530. \xout off
  2531. \uuline off
  2532. \uwave off
  2533. \noun off
  2534. \color none
  2535. 0.718
  2536. \end_layout
  2537. \end_inset
  2538. </cell>
  2539. </row>
  2540. <row>
  2541. <cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" usebox="none">
  2542. \begin_inset Text
  2543. \begin_layout Plain Layout
  2544. \family roman
  2545. \series medium
  2546. \shape up
  2547. \size normal
  2548. \emph off
  2549. \bar no
  2550. \strikeout off
  2551. \xout off
  2552. \uuline off
  2553. \uwave off
  2554. \noun off
  2555. \color none
  2556. SCAN
  2557. \end_layout
  2558. \end_inset
  2559. </cell>
  2560. <cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" usebox="none">
  2561. \begin_inset Text
  2562. \begin_layout Plain Layout
  2563. Yes
  2564. \end_layout
  2565. \end_inset
  2566. </cell>
  2567. <cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" usebox="none">
  2568. \begin_inset Text
  2569. \begin_layout Plain Layout
  2570. \family roman
  2571. \series medium
  2572. \shape up
  2573. \size normal
  2574. \emph off
  2575. \bar no
  2576. \strikeout off
  2577. \xout off
  2578. \uuline off
  2579. \uwave off
  2580. \noun off
  2581. \color none
  2582. 0.853
  2583. \end_layout
  2584. \end_inset
  2585. </cell>
  2586. <cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" rightline="true" usebox="none">
  2587. \begin_inset Text
  2588. \begin_layout Plain Layout
  2589. \family roman
  2590. \series medium
  2591. \shape up
  2592. \size normal
  2593. \emph off
  2594. \bar no
  2595. \strikeout off
  2596. \xout off
  2597. \uuline off
  2598. \uwave off
  2599. \noun off
  2600. \color none
  2601. 0.689
  2602. \end_layout
  2603. \end_inset
  2604. </cell>
  2605. </row>
  2606. </lyxtabular>
  2607. \end_inset
  2608. \end_layout
  2609. \begin_layout Plain Layout
  2610. \begin_inset Caption Standard
  2611. \begin_layout Plain Layout
  2612. \begin_inset CommandInset label
  2613. LatexCommand label
  2614. name "tab:AUC-PAM"
  2615. \end_inset
  2616. \series bold
  2617. AUC values for internal and external validation with 6 different normalization
  2618. strategies.
  2619. \series default
  2620. Only fRMA and SCAN are single-channel normalizations.
  2621. The other 4 normalizations are for comparison.
  2622. \end_layout
  2623. \end_inset
  2624. \end_layout
  2625. \end_inset
  2626. \end_layout
  2627. \begin_layout Standard
  2628. For internal validation, the 6 methods' AUC values ranged from 0.816 to 0.891,
  2629. as shown in Table
  2630. \begin_inset CommandInset ref
  2631. LatexCommand ref
  2632. reference "tab:AUC-PAM"
  2633. plural "false"
  2634. caps "false"
  2635. noprefix "false"
  2636. \end_inset
  2637. .
  2638. Among the non-single-channel normalizations, dChip outperformed RMA, while
  2639. GRSN reduced the AUC values for both dChip and RMA.
  2640. Both single-channel methods, fRMA and SCAN, slightly outperformed RMA,
  2641. with fRMA ahead of SCAN.
  2642. However, the difference between RMA and fRMA is still quite small.
  2643. Figure
  2644. \begin_inset CommandInset ref
  2645. LatexCommand ref
  2646. reference "fig:ROC-PAM-int"
  2647. plural "false"
  2648. caps "false"
  2649. noprefix "false"
  2650. \end_inset
  2651. shows that the ROC curves for RMA, dChip, and fRMA look very similar and
  2652. relatively smooth, while both GRSN curves and the curve for SCAN have a
  2653. more jagged appearance.
  2654. \end_layout
  2655. \begin_layout Standard
  2656. \begin_inset Float figure
  2657. placement tb
  2658. wide false
  2659. sideways false
  2660. status collapsed
  2661. \begin_layout Plain Layout
  2662. \align center
  2663. \begin_inset Graphics
  2664. filename graphics/PAM/ROC-TXvsAR-external.pdf
  2665. width 100col%
  2666. groupId colwidth
  2667. \end_inset
  2668. \end_layout
  2669. \begin_layout Plain Layout
  2670. \begin_inset Caption Standard
  2671. \begin_layout Plain Layout
  2672. \series bold
  2673. \begin_inset CommandInset label
  2674. LatexCommand label
  2675. name "fig:ROC-PAM-ext"
  2676. \end_inset
  2677. ROC curves for PAM on external validation data using different normalization
  2678. strategies
  2679. \end_layout
  2680. \end_inset
  2681. \end_layout
  2682. \end_inset
  2683. \end_layout
  2684. \begin_layout Standard
  2685. For external validation, as expected, all the AUC values are lower than
  2686. the internal validations, ranging from 0.642 to 0.750 (Table
  2687. \begin_inset CommandInset ref
  2688. LatexCommand ref
  2689. reference "tab:AUC-PAM"
  2690. plural "false"
  2691. caps "false"
  2692. noprefix "false"
  2693. \end_inset
  2694. ).
  2695. With or without GRSN, RMA shows its dominance over dChip in this more challengi
  2696. ng test.
  2697. Unlike in the internal validation, GRSN actually improves the classifier
  2698. performance for RMA, although it does not for dChip.
  2699. Once again, both single-channel methods perform about on par with RMA,
  2700. with fRMA performing slightly better and SCAN performing a bit worse.
  2701. Figure
  2702. \begin_inset CommandInset ref
  2703. LatexCommand ref
  2704. reference "fig:ROC-PAM-ext"
  2705. plural "false"
  2706. caps "false"
  2707. noprefix "false"
  2708. \end_inset
  2709. shows the ROC curves for the external validation test.
  2710. As expected, none of them are as clean-looking as the internal validation
  2711. ROC curves.
  2712. The curves for RMA, RMA+GRSN, and fRMA all look similar, while the other
  2713. curves look more divergent.
  2714. \end_layout
  2715. \begin_layout Standard
  2716. \begin_inset ERT
  2717. status collapsed
  2718. \begin_layout Plain Layout
  2719. \backslash
  2720. FloatBarrier
  2721. \end_layout
  2722. \end_inset
  2723. \end_layout
  2724. \begin_layout Subsection
  2725. fRMA with custom-generated vectors enables normalization on hthgu133pluspm
  2726. \end_layout
  2727. \begin_layout Standard
  2728. \begin_inset Float figure
  2729. placement tb
  2730. wide false
  2731. sideways false
  2732. status collapsed
  2733. \begin_layout Plain Layout
  2734. \align center
  2735. \begin_inset Graphics
  2736. filename graphics/frma-pax-bx/batchsize_batches.pdf
  2737. \end_inset
  2738. \end_layout
  2739. \begin_layout Plain Layout
  2740. \begin_inset Caption Standard
  2741. \begin_layout Plain Layout
  2742. \begin_inset CommandInset label
  2743. LatexCommand label
  2744. name "fig:batch-size-batches"
  2745. \end_inset
  2746. \series bold
  2747. Effect of batch size selection on number of batches included in fRMA probe
  2748. weight learning.
  2749. \series default
  2750. For batch sizes ranging from 3 to 15, the number of batches with at least
  2751. that many samples was plotted for biopsy (BX) and blood (PAX) samples.
  2752. The selected batch size, 5, is marked with a dotted vertical line.
  2753. \end_layout
  2754. \end_inset
  2755. \end_layout
  2756. \end_inset
  2757. \end_layout
  2758. \begin_layout Standard
  2759. \begin_inset Float figure
  2760. placement tb
  2761. wide false
  2762. sideways false
  2763. status collapsed
  2764. \begin_layout Plain Layout
  2765. \align center
  2766. \begin_inset Graphics
  2767. filename graphics/frma-pax-bx/batchsize_samples.pdf
  2768. \end_inset
  2769. \end_layout
  2770. \begin_layout Plain Layout
  2771. \begin_inset Caption Standard
  2772. \begin_layout Plain Layout
  2773. \begin_inset CommandInset label
  2774. LatexCommand label
  2775. name "fig:batch-size-samples"
  2776. \end_inset
  2777. \series bold
  2778. Effect of batch size selection on number of samples included in fRMA probe
  2779. weight learning.
  2780. \series default
  2781. For batch sizes ranging from 3 to 15, the number of samples included in
  2782. probe weight training was plotted for biopsy (BX) and blood (PAX) samples.
  2783. The selected batch size, 5, is marked with a dotted vertical line.
  2784. \end_layout
  2785. \end_inset
  2786. \end_layout
  2787. \end_inset
  2788. \end_layout
  2789. \begin_layout Standard
  2790. In order to enable use of fRMA to normalize hthgu133pluspm, a custom set
  2791. of fRMA vectors was created.
  2792. First, an appropriate batch size was chosen by looking at the number of
  2793. batches and number of samples included as a function of batch size (Figures
  2794. \begin_inset CommandInset ref
  2795. LatexCommand ref
  2796. reference "fig:batch-size-batches"
  2797. plural "false"
  2798. caps "false"
  2799. noprefix "false"
  2800. \end_inset
  2801. and
  2802. \begin_inset CommandInset ref
  2803. LatexCommand ref
  2804. reference "fig:batch-size-samples"
  2805. plural "false"
  2806. caps "false"
  2807. noprefix "false"
  2808. \end_inset
  2809. , respectively).
  2810. For a given batch size, all batches with fewer samples than the chosen
  2811. size must be ignored during training, while larger batches must be randomly
  2812. downsampled to the chosen size.
  2813. Hence, the number of samples included for a given batch size equals the
  2814. batch size times the number of batches with at least that many samples.
  2815. From Figure
  2816. \begin_inset CommandInset ref
  2817. LatexCommand ref
  2818. reference "fig:batch-size-samples"
  2819. plural "false"
  2820. caps "false"
  2821. noprefix "false"
  2822. \end_inset
  2823. , it is apparent that a batch size of 8 maximizes the number of samples
  2824. included in training.
  2825. Increasing the batch size beyond this causes too many smaller batches to
  2826. be excluded, reducing the total number of samples for both tissue types.
  2827. However, a batch size of 8 is not necessarily optimal.
  2828. The article introducing frmaTools concluded that it was highly advantageous
  2829. to use a smaller batch size in order to include more batches, even at the
  2830. expense of including fewer total samples in training
  2831. \begin_inset CommandInset citation
  2832. LatexCommand cite
  2833. key "McCall2011"
  2834. literal "false"
  2835. \end_inset
  2836. .
  2837. To strike an appropriate balance between more batches and more samples,
  2838. a batch size of 5 was chosen.
  2839. For both blood and biopsy samples, this increased the number of batches
  2840. included by 10, with only a modest reduction in the number of samples compared
  2841. to a batch size of 8.
  2842. With a batch size of 5, 26 batches of biopsy samples and 46 batches of
  2843. blood samples were available.
  2844. \end_layout
  2845. \begin_layout Standard
  2846. \begin_inset Float figure
  2847. wide false
  2848. sideways false
  2849. status collapsed
  2850. \begin_layout Plain Layout
  2851. \align center
  2852. \begin_inset Graphics
  2853. filename graphics/frma-pax-bx/M-BX-violin.pdf
  2854. lyxscale 40
  2855. height 80theight%
  2856. groupId m-violin
  2857. \end_inset
  2858. \end_layout
  2859. \begin_layout Plain Layout
  2860. \begin_inset Caption Standard
  2861. \begin_layout Plain Layout
  2862. \begin_inset CommandInset label
  2863. LatexCommand label
  2864. name "fig:m-bx-violin"
  2865. \end_inset
  2866. \series bold
  2867. Violin plot of log ratios between normalizations for 20 biopsy samples.
  2868. \series default
  2869. Each of 20 randomly selected biopsy samples was normalized with RMA and
  2870. with 5 different sets of fRMA vectors.
  2871. This shows the distribution of log ratios between normalized expression
  2872. values, aggregated across all 20 arrays.
  2873. \end_layout
  2874. \end_inset
  2875. \end_layout
  2876. \end_inset
  2877. \end_layout
  2878. \begin_layout Standard
  2879. Since fRMA training requires equal-size batches, larger batches are downsampled
  2880. randomly.
  2881. This introduces a nondeterministic step in the generation of normalization
  2882. vectors.
  2883. To show that this randomness does not substantially change the outcome,
  2884. the random downsampling and subsequent vector learning was repeated 5 times,
  2885. with a different random seed each time.
  2886. 20 samples were selected at random as a test set and normalized with each
  2887. of the 5 sets of fRMA normalization vectors as well as ordinary RMA, and
  2888. the normalized expression values were compared across normalizations.
  2889. Figure
  2890. \begin_inset CommandInset ref
  2891. LatexCommand ref
  2892. reference "fig:m-bx-violin"
  2893. plural "false"
  2894. caps "false"
  2895. noprefix "false"
  2896. \end_inset
  2897. shows a summary of these comparisons for biopsy samples.
  2898. Comparing RMA to each of the 5 fRMA normalizations, the distribution of
  2899. log ratios is somewhat wide, indicating that the normalizations disagree
  2900. on the expression values of a fair number of probe sets.
  2901. In contrast, in comparisons of fRMA against fRMA, the vast majority of probe
  2902. sets have very small log ratios, indicating a very high agreement between
  2903. the normalized values generated by the two normalizations.
  2904. This shows that the fRMA normalization's behavior is not very sensitive
  2905. to the random downsampling of larger batches during training.
  2906. \end_layout
  2907. \begin_layout Standard
  2908. \begin_inset Float figure
  2909. wide false
  2910. sideways false
  2911. status collapsed
  2912. \begin_layout Plain Layout
  2913. \align center
  2914. \begin_inset Graphics
  2915. filename graphics/frma-pax-bx/MA-BX-RMA.fRMA.pdf
  2916. lyxscale 50
  2917. width 100text%
  2918. groupId ma-frma
  2919. \end_inset
  2920. \end_layout
  2921. \begin_layout Plain Layout
  2922. \begin_inset Caption Standard
  2923. \begin_layout Plain Layout
  2924. \begin_inset CommandInset label
  2925. LatexCommand label
  2926. name "fig:ma-bx-rma-frma"
  2927. \end_inset
  2928. \series bold
  2929. Representative MA plot comparing RMA against fRMA for 20 biopsy samples.
  2930. \series default
  2931. Averages and log ratios were computed for every probe in each of 20 biopsy
  2932. samples between RMA normalization and fRMA.
  2933. Density of points is represented by darkness of shading, and individual
  2934. outlier points are plotted.
  2935. \end_layout
  2936. \end_inset
  2937. \end_layout
  2938. \end_inset
  2939. \end_layout
  2940. \begin_layout Standard
  2941. \begin_inset Float figure
  2942. wide false
  2943. sideways false
  2944. status collapsed
  2945. \begin_layout Plain Layout
  2946. \align center
  2947. \begin_inset Graphics
  2948. filename graphics/frma-pax-bx/MA-BX-fRMA.fRMA.pdf
  2949. lyxscale 50
  2950. width 100text%
  2951. groupId ma-frma
  2952. \end_inset
  2953. \end_layout
  2954. \begin_layout Plain Layout
  2955. \begin_inset Caption Standard
  2956. \begin_layout Plain Layout
  2957. \begin_inset CommandInset label
  2958. LatexCommand label
  2959. name "fig:ma-bx-frma-frma"
  2960. \end_inset
  2961. \series bold
  2962. Representative MA plot comparing different fRMA vectors for 20 biopsy samples.
  2963. \series default
  2964. Averages and log ratios were computed for every probe in each of 20 biopsy
  2965. samples between fRMA normalizations using vectors from two different batch
  2966. samplings.
  2967. Density of points is represented by darkness of shading, and individual
  2968. outlier points are plotted.
  2969. \end_layout
  2970. \end_inset
  2971. \end_layout
  2972. \end_inset
  2973. \end_layout
  2974. \begin_layout Standard
  2975. Figure
  2976. \begin_inset CommandInset ref
  2977. LatexCommand ref
  2978. reference "fig:ma-bx-rma-frma"
  2979. plural "false"
  2980. caps "false"
  2981. noprefix "false"
  2982. \end_inset
  2983. shows an MA plot of the RMA-normalized values against the fRMA-normalized
  2984. values for the same probe sets and arrays, corresponding to the first row
  2985. of Figure
  2986. \begin_inset CommandInset ref
  2987. LatexCommand ref
  2988. reference "fig:m-bx-violin"
  2989. plural "false"
  2990. caps "false"
  2991. noprefix "false"
  2992. \end_inset
  2993. .
  2994. This MA plot shows that not only is there a wide distribution of M-values,
  2995. but the trend of M-values is dependent on the average normalized intensity.
  2996. This is expected, since the overall trend represents the differences in
  2997. the quantile normalization step.
  2998. When running RMA, only the quantiles for these specific 20 arrays are used,
  2999. while for fRMA the quantile distribution is taken from all arrays used
  3000. in training.
  3001. Figure
  3002. \begin_inset CommandInset ref
  3003. LatexCommand ref
  3004. reference "fig:ma-bx-frma-frma"
  3005. plural "false"
  3006. caps "false"
  3007. noprefix "false"
  3008. \end_inset
  3009. shows a similar MA plot comparing 2 different fRMA normalizations, correspondin
  3010. g to the 6th row of Figure
  3011. \begin_inset CommandInset ref
  3012. LatexCommand ref
  3013. reference "fig:m-bx-violin"
  3014. plural "false"
  3015. caps "false"
  3016. noprefix "false"
  3017. \end_inset
  3018. .
  3019. The MA plot is very tightly centered around zero with no visible trend.
  3020. Figures
  3021. \begin_inset CommandInset ref
  3022. LatexCommand ref
  3023. reference "fig:m-pax-violin"
  3024. plural "false"
  3025. caps "false"
  3026. noprefix "false"
  3027. \end_inset
  3028. ,
  3029. \begin_inset CommandInset ref
  3030. LatexCommand ref
  3031. reference "fig:MA-PAX-rma-frma"
  3032. plural "false"
  3033. caps "false"
  3034. noprefix "false"
  3035. \end_inset
  3036. , and
  3037. \begin_inset CommandInset ref
  3038. LatexCommand ref
  3039. reference "fig:MA-PAX-frma-frma"
  3040. plural "false"
  3041. caps "false"
  3042. noprefix "false"
  3043. \end_inset
  3044. show exactly the same information for the blood samples, once again comparing
  3045. the normalized expression values between normalizations for all probe sets
  3046. across 20 randomly selected test arrays.
  3047. Once again, there is a wider distribution of log ratios between RMA-normalized
  3048. values and fRMA-normalized values, and a much tighter distribution when comparing
  3049. different fRMA normalizations to each other, indicating that the fRMA training
  3050. process is robust to random batch downsampling for the blood samples as
  3051. well.
  3052. \end_layout
  3053. \begin_layout Standard
  3054. \begin_inset Float figure
  3055. wide false
  3056. sideways false
  3057. status collapsed
  3058. \begin_layout Plain Layout
  3059. \align center
  3060. \begin_inset Graphics
  3061. filename graphics/frma-pax-bx/M-PAX-violin.pdf
  3062. lyxscale 40
  3063. height 80theight%
  3064. groupId m-violin
  3065. \end_inset
  3066. \end_layout
  3067. \begin_layout Plain Layout
  3068. \begin_inset Caption Standard
  3069. \begin_layout Plain Layout
  3070. \begin_inset CommandInset label
  3071. LatexCommand label
  3072. name "fig:m-pax-violin"
  3073. \end_inset
  3074. \series bold
  3075. Violin plot of log ratios between normalizations for 20 blood samples.
  3076. \series default
  3077. Each of 20 randomly selected blood samples was normalized with RMA and with
  3078. 5 different sets of fRMA vectors.
  3079. This shows the distribution of log ratios between normalized expression
  3080. values, aggregated across all 20 arrays.
  3081. \end_layout
  3082. \end_inset
  3083. \end_layout
  3084. \end_inset
  3085. \end_layout
  3086. \begin_layout Standard
  3087. \begin_inset Float figure
  3088. wide false
  3089. sideways false
  3090. status collapsed
  3091. \begin_layout Plain Layout
  3092. \align center
  3093. \begin_inset Graphics
  3094. filename graphics/frma-pax-bx/MA-PAX-RMA.fRMA.pdf
  3095. lyxscale 50
  3096. width 100text%
  3097. groupId ma-frma
  3098. \end_inset
  3099. \end_layout
  3100. \begin_layout Plain Layout
  3101. \begin_inset Caption Standard
  3102. \begin_layout Plain Layout
  3103. \begin_inset CommandInset label
  3104. LatexCommand label
  3105. name "fig:MA-PAX-rma-frma"
  3106. \end_inset
  3107. \series bold
  3108. Representative MA plot comparing RMA against fRMA for 20 blood samples.
  3109. \series default
  3110. Averages and log ratios were computed for every probe in each of 20 blood
  3111. samples between RMA normalization and fRMA.
  3112. Density of points is represented by darkness of shading, and individual
  3113. outlier points are plotted.
  3114. \end_layout
  3115. \end_inset
  3116. \end_layout
  3117. \begin_layout Plain Layout
  3118. \end_layout
  3119. \end_inset
  3120. \end_layout
  3121. \begin_layout Standard
  3122. \begin_inset Float figure
  3123. wide false
  3124. sideways false
  3125. status collapsed
  3126. \begin_layout Plain Layout
  3127. \align center
  3128. \begin_inset Graphics
  3129. filename graphics/frma-pax-bx/MA-PAX-fRMA.fRMA.pdf
  3130. lyxscale 50
  3131. width 100text%
  3132. groupId ma-frma
  3133. \end_inset
  3134. \end_layout
  3135. \begin_layout Plain Layout
  3136. \begin_inset Caption Standard
  3137. \begin_layout Plain Layout
  3138. \begin_inset CommandInset label
  3139. LatexCommand label
  3140. name "fig:MA-PAX-frma-frma"
  3141. \end_inset
  3142. \series bold
  3143. Representative MA plot comparing different fRMA vectors for 20 blood samples.
  3144. \series default
  3145. Averages and log ratios were computed for every probe in each of 20 blood
  3146. samples between fRMA normalizations using vectors from two different batch
  3147. samplings.
  3148. Density of points is represented by darkness of shading, and individual
  3149. outlier points are plotted.
  3150. \end_layout
  3151. \end_inset
  3152. \end_layout
  3153. \end_inset
  3154. \end_layout
  3155. \begin_layout Standard
  3156. \begin_inset ERT
  3157. status collapsed
  3158. \begin_layout Plain Layout
  3159. \backslash
  3160. FloatBarrier
  3161. \end_layout
  3162. \end_inset
  3163. \end_layout
  3164. \begin_layout Subsection
  3165. SVA, voom, and array weights improve model fit for methylation array data
  3166. \end_layout
  3167. \begin_layout Standard
  3168. \begin_inset Float figure
  3169. wide false
  3170. sideways false
  3171. status collapsed
  3172. \begin_layout Plain Layout
  3173. \align center
  3174. \begin_inset Flex TODO Note (inline)
  3175. status open
  3176. \begin_layout Plain Layout
  3177. Fix axis labels:
  3178. \begin_inset Quotes eld
  3179. \end_inset
  3180. log2 M-value
  3181. \begin_inset Quotes erd
  3182. \end_inset
  3183. is redundant because M-values are already log scale
  3184. \end_layout
  3185. \end_inset
  3186. \end_layout
  3187. \begin_layout Plain Layout
  3188. \align center
  3189. \begin_inset Graphics
  3190. filename graphics/methylvoom/unadj.dupcor/meanvar-trends-PAGE1-CROP-RASTER.png
  3191. lyxscale 15
  3192. width 100col%
  3193. groupId raster-600ppi
  3194. \end_inset
  3195. \end_layout
  3196. \begin_layout Plain Layout
  3197. \begin_inset Caption Standard
  3198. \begin_layout Plain Layout
  3199. \series bold
  3200. \begin_inset CommandInset label
  3201. LatexCommand label
  3202. name "fig:meanvar-basic"
  3203. \end_inset
  3204. Mean-variance trend for analysis A.
  3205. \series default
  3206. The log2(standard deviation) for each probe is plotted against the probe's
  3207. average M-value across all samples as a black point, with some transparency
  3208. to make overplotting more visible, since there are about 450,000 points.
  3209. Density of points is also indicated by the dark blue contour lines.
  3210. The prior variance trend estimated by eBayes is shown in light blue, while
  3211. the lowess trend of the points is shown in red.
  3212. \end_layout
  3213. \end_inset
  3214. \end_layout
  3215. \end_inset
  3216. \end_layout
  3217. \begin_layout Standard
  3218. Figure
  3219. \begin_inset CommandInset ref
  3220. LatexCommand ref
  3221. reference "fig:meanvar-basic"
  3222. plural "false"
  3223. caps "false"
  3224. noprefix "false"
  3225. \end_inset
  3226. shows the relationship between the mean M-value and the standard deviation
  3227. calculated for each probe in the methylation array data set.
  3228. A few features of the data are apparent.
  3229. First, the data are very strongly bimodal, with peaks in the density around
  3230. M-values of +4 and -4.
  3231. These modes correspond to methylation sites that are nearly 100% methylated
  3232. and nearly 100% unmethylated, respectively.
  3233. The strong bimodality indicates that a majority of probes interrogate sites
  3234. that fall into one of these two categories.
  3235. The points in between these modes represent sites that are either partially
  3236. methylated in many samples, or are fully methylated in some samples and
  3237. fully unmethylated in other samples, or some combination.
  3238. The next visible feature of the data is the W-shaped variance trend.
  3239. The upticks in the variance trend on either side are expected, based on
  3240. the sigmoid transformation exaggerating small differences at extreme M-values
  3241. (Figure
  3242. \begin_inset CommandInset ref
  3243. LatexCommand ref
  3244. reference "fig:Sigmoid-beta-m-mapping"
  3245. plural "false"
  3246. caps "false"
  3247. noprefix "false"
  3248. \end_inset
  3249. ).
  3250. However, the uptick in the center is interesting: it indicates that sites
  3251. that are not constitutively methylated or unmethylated have a higher
  3252. variance.
  3253. This could be a genuine biological effect, or it could be spurious noise
  3254. that is only observable at sites with varying methylation.
  3255. \end_layout
  3256. \begin_layout Standard
  3257. \begin_inset Float figure
  3258. wide false
  3259. sideways false
  3260. status open
  3261. \begin_layout Plain Layout
  3262. \begin_inset Graphics
  3263. filename graphics/methylvoom/unadj.dupcor.sva.aw/meanvar-trends-PAGE1-CROP-RASTER.png
  3264. lyxscale 15
  3265. width 100col%
  3266. groupId raster-600ppi
  3267. \end_inset
  3268. \end_layout
  3269. \begin_layout Plain Layout
  3270. \begin_inset Caption Standard
  3271. \begin_layout Plain Layout
  3272. \series bold
  3273. \begin_inset CommandInset label
  3274. LatexCommand label
  3275. name "fig:meanvar-sva-aw"
  3276. \end_inset
  3277. Mean-variance trend for analysis B.
  3278. \series default
  3279. Interpretation is as in Figure
  3280. \begin_inset CommandInset ref
  3281. LatexCommand ref
  3282. reference "fig:meanvar-basic"
  3283. plural "false"
  3284. caps "false"
  3285. noprefix "false"
  3286. \end_inset
  3287. .
  3288. \end_layout
  3289. \end_inset
  3290. \end_layout
  3291. \end_inset
  3292. \end_layout
  3293. \begin_layout Standard
  3294. In Figure
  3295. \begin_inset CommandInset ref
  3296. LatexCommand ref
  3297. reference "fig:meanvar-sva-aw"
  3298. plural "false"
  3299. caps "false"
  3300. noprefix "false"
  3301. \end_inset
  3302. , we see the mean-variance trend for the same methylation array data, this
  3303. time with surrogate variables and sample quality weights estimated from
  3304. the data and included in the model.
  3305. As expected, the overall average variance is smaller, since the surrogate
  3306. variables account for some of the variance.
  3307. In addition, the uptick in variance in the middle of the M-value range
  3308. has disappeared, turning the W shape into a wide U shape.
  3309. This indicates that the excess variance in the probes with intermediate
  3310. M-values was explained by systematic variations not correlated with known
  3311. covariates, and these variations were modeled by the surrogate variables.
  3312. The result is a nearly flat variance trend for the entire intermediate
  3313. M-value range from about -3 to +3.
  3314. In contrast, the excess variance at the extremes was not
  3315. \begin_inset Quotes eld
  3316. \end_inset
  3317. absorbed
  3318. \begin_inset Quotes erd
  3319. \end_inset
  3320. by the surrogate variables and remains in the plot, indicating that this
  3321. variation has no systematic component: probes with extreme M-values are
  3322. uniformly more variable across all samples, as expected.
  3323. \end_layout
  3324. \begin_layout Standard
  3325. \begin_inset Float figure
  3326. wide false
  3327. sideways false
  3328. status collapsed
  3329. \begin_layout Plain Layout
  3330. \begin_inset Graphics
  3331. filename graphics/methylvoom/unadj.dupcor.sva.voomaw/meanvar-trends-PAGE2-CROP-RASTER.png
  3332. lyxscale 15
  3333. width 100col%
  3334. groupId raster-600ppi
  3335. \end_inset
  3336. \end_layout
  3337. \begin_layout Plain Layout
  3338. \begin_inset Caption Standard
  3339. \begin_layout Plain Layout
  3340. \series bold
  3341. \begin_inset CommandInset label
  3342. LatexCommand label
  3343. name "fig:meanvar-sva-voomaw"
  3344. \end_inset
  3345. Mean-variance trend after voom modeling in analysis C.
  3346. \series default
  3347. Interpretation is as in Figure
  3348. \begin_inset CommandInset ref
  3349. LatexCommand ref
  3350. reference "fig:meanvar-basic"
  3351. plural "false"
  3352. caps "false"
  3353. noprefix "false"
  3354. \end_inset
  3355. .
  3356. \end_layout
  3357. \end_inset
  3358. \end_layout
  3359. \end_inset
  3360. \end_layout
  3361. \begin_layout Standard
  3362. Figure
  3363. \begin_inset CommandInset ref
  3364. LatexCommand ref
  3365. reference "fig:meanvar-sva-voomaw"
  3366. plural "false"
  3367. caps "false"
  3368. noprefix "false"
  3369. \end_inset
  3370. shows the mean-variance trend after fitting the model with the observation
  3371. weights assigned by voom based on the mean-variance trend shown in Figure
  3372. \begin_inset CommandInset ref
  3373. LatexCommand ref
  3374. reference "fig:meanvar-sva-aw"
  3375. plural "false"
  3376. caps "false"
  3377. noprefix "false"
  3378. \end_inset
  3379. .
  3380. As expected, the weights exactly counteract the trend in the data, resulting
  3381. in a nearly flat trend centered vertically at 1 (i.e.
  3382. 0 on the log scale).
  3383. This shows that the observations with extreme M-values have been appropriately
  3384. down-weighted to account for the fact that the noise in those observations
  3385. has been amplified by the non-linear M-value transformation.
  3386. In turn, this gives relatively more weight to observations in the middle
  3387. region, which are more likely to correspond to probes measuring interesting
  3388. biology (not constitutively methylated or unmethylated).
  3389. \end_layout
  3390. \begin_layout Standard
  3391. \begin_inset Float table
  3392. wide false
  3393. sideways false
  3394. status collapsed
  3395. \begin_layout Plain Layout
  3396. \align center
  3397. \begin_inset Tabular
  3398. <lyxtabular version="3" rows="5" columns="3">
  3399. <features tabularvalignment="middle">
  3400. <column alignment="center" valignment="top">
  3401. <column alignment="center" valignment="top">
  3402. <column alignment="center" valignment="top">
  3403. <row>
  3404. <cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" usebox="none">
  3405. \begin_inset Text
  3406. \begin_layout Plain Layout
  3407. Covariate
  3408. \end_layout
  3409. \end_inset
  3410. </cell>
  3411. <cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" usebox="none">
  3412. \begin_inset Text
  3413. \begin_layout Plain Layout
  3414. Test used
  3415. \end_layout
  3416. \end_inset
  3417. </cell>
  3418. <cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" rightline="true" usebox="none">
  3419. \begin_inset Text
  3420. \begin_layout Plain Layout
  3421. p-value
  3422. \end_layout
  3423. \end_inset
  3424. </cell>
  3425. </row>
  3426. <row>
  3427. <cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
  3428. \begin_inset Text
  3429. \begin_layout Plain Layout
  3430. Transplant Status
  3431. \end_layout
  3432. \end_inset
  3433. </cell>
  3434. <cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
  3435. \begin_inset Text
  3436. \begin_layout Plain Layout
  3437. F-test
  3438. \end_layout
  3439. \end_inset
  3440. </cell>
  3441. <cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
  3442. \begin_inset Text
  3443. \begin_layout Plain Layout
  3444. 0.404
  3445. \end_layout
  3446. \end_inset
  3447. </cell>
  3448. </row>
  3449. <row>
  3450. <cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
  3451. \begin_inset Text
  3452. \begin_layout Plain Layout
  3453. Diabetes Diagnosis
  3454. \end_layout
  3455. \end_inset
  3456. </cell>
  3457. <cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
  3458. \begin_inset Text
  3459. \begin_layout Plain Layout
  3460. t-test
  3461. \end_layout
  3462. \end_inset
  3463. </cell>
  3464. <cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
  3465. \begin_inset Text
  3466. \begin_layout Plain Layout
  3467. 0.00106
  3468. \end_layout
  3469. \end_inset
  3470. </cell>
  3471. </row>
  3472. <row>
  3473. <cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
  3474. \begin_inset Text
  3475. \begin_layout Plain Layout
  3476. Sex
  3477. \end_layout
  3478. \end_inset
  3479. </cell>
  3480. <cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
  3481. \begin_inset Text
  3482. \begin_layout Plain Layout
  3483. t-test
  3484. \end_layout
  3485. \end_inset
  3486. </cell>
  3487. <cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
  3488. \begin_inset Text
  3489. \begin_layout Plain Layout
  3490. 0.148
  3491. \end_layout
  3492. \end_inset
  3493. </cell>
  3494. </row>
  3495. <row>
  3496. <cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" usebox="none">
  3497. \begin_inset Text
  3498. \begin_layout Plain Layout
  3499. Age
  3500. \end_layout
  3501. \end_inset
  3502. </cell>
  3503. <cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" usebox="none">
  3504. \begin_inset Text
  3505. \begin_layout Plain Layout
  3506. linear regression
  3507. \end_layout
  3508. \end_inset
  3509. </cell>
  3510. <cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" rightline="true" usebox="none">
  3511. \begin_inset Text
  3512. \begin_layout Plain Layout
  3513. 0.212
  3514. \end_layout
  3515. \end_inset
  3516. </cell>
  3517. </row>
  3518. </lyxtabular>
  3519. \end_inset
  3520. \end_layout
  3521. \begin_layout Plain Layout
  3522. \begin_inset Caption Standard
  3523. \begin_layout Plain Layout
  3524. \series bold
  3525. \begin_inset CommandInset label
  3526. LatexCommand label
  3527. name "tab:weight-covariate-tests"
  3528. \end_inset
  3529. Association of sample weights with clinical covariates in methylation array
  3530. data.
  3531. \series default
  3532. Computed sample quality log weights were tested for significant association
  3533. with each of the variables in the model (1st column).
  3534. An appropriate test was selected for each variable (2nd column).
  3535. P-values for significant association are shown in the 3rd column.
  3536. \end_layout
  3537. \end_inset
  3538. \end_layout
  3539. \end_inset
  3540. \end_layout
  3541. \begin_layout Standard
  3542. \begin_inset Flex TODO Note (inline)
  3543. status open
  3544. \begin_layout Plain Layout
  3545. Redo the sample weight boxplot with notches and without fill colors (and
  3546. update the legend)
  3547. \end_layout
  3548. \end_inset
  3549. \end_layout
  3550. \begin_layout Standard
  3551. \begin_inset Float figure
  3552. wide false
  3553. sideways false
  3554. status collapsed
  3555. \begin_layout Plain Layout
  3556. \begin_inset Graphics
  3557. filename graphics/methylvoom/unadj.dupcor.sva.voomaw/sample-weights-PAGE3-CROP.pdf
  3558. \end_inset
  3559. \end_layout
  3560. \begin_layout Plain Layout
  3561. \begin_inset Caption Standard
  3562. \begin_layout Plain Layout
  3563. \begin_inset CommandInset label
  3564. LatexCommand label
  3565. name "fig:diabetes-sample-weights"
  3566. \end_inset
  3567. \series bold
  3568. Boxplot of sample quality weights grouped by diabetes diagnosis.
  3569. \series default
  3570. Samples were grouped based on diabetes diagnosis, and the distribution of
  3571. sample quality weights for each diagnosis was plotted.
  3572. \end_layout
  3573. \end_inset
  3574. \end_layout
  3575. \begin_layout Plain Layout
  3576. \end_layout
  3577. \end_inset
  3578. \end_layout
  3579. \begin_layout Standard
  3580. To determine whether any of the known experimental factors had an impact
  3581. on data quality, the sample quality weights estimated from the data were
  3582. tested for association with each of the experimental factors (Table
  3583. \begin_inset CommandInset ref
  3584. LatexCommand ref
  3585. reference "tab:weight-covariate-tests"
  3586. plural "false"
  3587. caps "false"
  3588. noprefix "false"
  3589. \end_inset
  3590. ).
  3591. Diabetes diagnosis was found to have a potentially significant association
  3592. with the sample weights, with a t-test p-value of
  3593. \begin_inset Formula $1.06\times10^{-3}$
  3594. \end_inset
  3595. .
  3596. Figure
  3597. \begin_inset CommandInset ref
  3598. LatexCommand ref
  3599. reference "fig:diabetes-sample-weights"
  3600. plural "false"
  3601. caps "false"
  3602. noprefix "false"
  3603. \end_inset
  3604. shows the distribution of sample weights grouped by diabetes diagnosis.
  3605. The samples from patients with Type 2 diabetes were assigned significantly
  3606. lower weights than those from patients with Type 1 diabetes.
  3607. This indicates that the Type 2 diabetes samples had an overall higher variance
  3608. on average across all probes.
  3609. \end_layout
  3610. \begin_layout Standard
  3611. \begin_inset Float table
  3612. wide false
  3613. sideways false
  3614. status collapsed
  3615. \begin_layout Plain Layout
  3616. \align center
  3617. \begin_inset Flex TODO Note (inline)
  3618. status open
  3619. \begin_layout Plain Layout
  3620. Consider transposing this table and the next one
  3621. \end_layout
  3622. \end_inset
  3623. \end_layout
  3624. \begin_layout Plain Layout
  3625. \align center
  3626. \begin_inset Tabular
  3627. <lyxtabular version="3" rows="5" columns="4">
  3628. <features tabularvalignment="middle">
  3629. <column alignment="center" valignment="top">
  3630. <column alignment="center" valignment="top">
  3631. <column alignment="center" valignment="top">
  3632. <column alignment="center" valignment="top">
  3633. <row>
  3634. <cell alignment="center" valignment="top" usebox="none">
  3635. \begin_inset Text
  3636. \begin_layout Plain Layout
  3637. \end_layout
  3638. \end_inset
  3639. </cell>
  3640. <cell multicolumn="1" alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
  3641. \begin_inset Text
  3642. \begin_layout Plain Layout
  3643. Analysis
  3644. \end_layout
  3645. \end_inset
  3646. </cell>
  3647. <cell multicolumn="2" alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
  3648. \begin_inset Text
  3649. \begin_layout Plain Layout
  3650. \end_layout
  3651. \end_inset
  3652. </cell>
  3653. <cell multicolumn="2" alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
  3654. \begin_inset Text
  3655. \begin_layout Plain Layout
  3656. \end_layout
  3657. \end_inset
  3658. </cell>
  3659. </row>
  3660. <row>
  3661. <cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" usebox="none">
  3662. \begin_inset Text
  3663. \begin_layout Plain Layout
  3664. Contrast
  3665. \end_layout
  3666. \end_inset
  3667. </cell>
  3668. <cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" usebox="none">
  3669. \begin_inset Text
  3670. \begin_layout Plain Layout
  3671. A
  3672. \end_layout
  3673. \end_inset
  3674. </cell>
  3675. <cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" usebox="none">
  3676. \begin_inset Text
  3677. \begin_layout Plain Layout
  3678. B
  3679. \end_layout
  3680. \end_inset
  3681. </cell>
  3682. <cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" rightline="true" usebox="none">
  3683. \begin_inset Text
  3684. \begin_layout Plain Layout
  3685. C
  3686. \end_layout
  3687. \end_inset
  3688. </cell>
  3689. </row>
  3690. <row>
  3691. <cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
  3692. \begin_inset Text
  3693. \begin_layout Plain Layout
  3694. TX vs AR
  3695. \end_layout
  3696. \end_inset
  3697. </cell>
  3698. <cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
  3699. \begin_inset Text
  3700. \begin_layout Plain Layout
  3701. 0
  3702. \end_layout
  3703. \end_inset
  3704. </cell>
  3705. <cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
  3706. \begin_inset Text
  3707. \begin_layout Plain Layout
  3708. 25
  3709. \end_layout
  3710. \end_inset
  3711. </cell>
  3712. <cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
  3713. \begin_inset Text
  3714. \begin_layout Plain Layout
  3715. 22
  3716. \end_layout
  3717. \end_inset
  3718. </cell>
  3719. </row>
  3720. <row>
  3721. <cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
  3722. \begin_inset Text
  3723. \begin_layout Plain Layout
  3724. TX vs ADNR
  3725. \end_layout
  3726. \end_inset
  3727. </cell>
  3728. <cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
  3729. \begin_inset Text
  3730. \begin_layout Plain Layout
  3731. 7
  3732. \end_layout
  3733. \end_inset
  3734. </cell>
  3735. <cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
  3736. \begin_inset Text
  3737. \begin_layout Plain Layout
  3738. 338
  3739. \end_layout
  3740. \end_inset
  3741. </cell>
  3742. <cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
  3743. \begin_inset Text
  3744. \begin_layout Plain Layout
  3745. 369
  3746. \end_layout
  3747. \end_inset
  3748. </cell>
  3749. </row>
  3750. <row>
  3751. <cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" usebox="none">
  3752. \begin_inset Text
  3753. \begin_layout Plain Layout
  3754. TX vs CAN
  3755. \end_layout
  3756. \end_inset
  3757. </cell>
  3758. <cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" usebox="none">
  3759. \begin_inset Text
  3760. \begin_layout Plain Layout
  3761. 0
  3762. \end_layout
  3763. \end_inset
  3764. </cell>
  3765. <cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" usebox="none">
  3766. \begin_inset Text
  3767. \begin_layout Plain Layout
  3768. 231
  3769. \end_layout
  3770. \end_inset
  3771. </cell>
  3772. <cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" rightline="true" usebox="none">
  3773. \begin_inset Text
  3774. \begin_layout Plain Layout
  3775. 278
  3776. \end_layout
  3777. \end_inset
  3778. </cell>
  3779. </row>
  3780. </lyxtabular>
  3781. \end_inset
  3782. \end_layout
  3783. \begin_layout Plain Layout
  3784. \begin_inset Caption Standard
  3785. \begin_layout Plain Layout
  3786. \begin_inset CommandInset label
  3787. LatexCommand label
  3788. name "tab:methyl-num-signif"
  3789. \end_inset
  3790. \series bold
  3791. Number of probes significant at 10% FDR for each contrast in each analysis.
  3792. \series default
  3793. For each of the analyses in Table
  3794. \begin_inset CommandInset ref
  3795. LatexCommand ref
  3796. reference "tab:Summary-of-meth-analysis"
  3797. plural "false"
  3798. caps "false"
  3799. noprefix "false"
  3800. \end_inset
  3801. , the table shows the number of probes called significantly differentially
  3802. methylated at a threshold of 10% FDR for each comparison between TX and
  3803. the other 3 transplant statuses.
  3804. \end_layout
  3805. \end_inset
  3806. \end_layout
  3807. \end_inset
  3808. \end_layout
  3809. \begin_layout Standard
  3810. \begin_inset Float table
  3811. wide false
  3812. sideways false
  3813. status collapsed
  3814. \begin_layout Plain Layout
  3815. \align center
  3816. \begin_inset Tabular
  3817. <lyxtabular version="3" rows="5" columns="4">
  3818. <features tabularvalignment="middle">
  3819. <column alignment="center" valignment="top">
  3820. <column alignment="center" valignment="top">
  3821. <column alignment="center" valignment="top">
  3822. <column alignment="center" valignment="top">
  3823. <row>
  3824. <cell alignment="center" valignment="top" usebox="none">
  3825. \begin_inset Text
  3826. \begin_layout Plain Layout
  3827. \end_layout
  3828. \end_inset
  3829. </cell>
  3830. <cell multicolumn="1" alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
  3831. \begin_inset Text
  3832. \begin_layout Plain Layout
  3833. Analysis
  3834. \end_layout
  3835. \end_inset
  3836. </cell>
  3837. <cell multicolumn="2" alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
  3838. \begin_inset Text
  3839. \begin_layout Plain Layout
  3840. \end_layout
  3841. \end_inset
  3842. </cell>
  3843. <cell multicolumn="2" alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
  3844. \begin_inset Text
  3845. \begin_layout Plain Layout
  3846. \end_layout
  3847. \end_inset
  3848. </cell>
  3849. </row>
  3850. <row>
  3851. <cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" usebox="none">
  3852. \begin_inset Text
  3853. \begin_layout Plain Layout
  3854. Contrast
  3855. \end_layout
  3856. \end_inset
  3857. </cell>
  3858. <cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" usebox="none">
  3859. \begin_inset Text
  3860. \begin_layout Plain Layout
  3861. A
  3862. \end_layout
  3863. \end_inset
  3864. </cell>
  3865. <cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" usebox="none">
  3866. \begin_inset Text
  3867. \begin_layout Plain Layout
  3868. B
  3869. \end_layout
  3870. \end_inset
  3871. </cell>
  3872. <cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" rightline="true" usebox="none">
  3873. \begin_inset Text
  3874. \begin_layout Plain Layout
  3875. C
  3876. \end_layout
  3877. \end_inset
  3878. </cell>
  3879. </row>
  3880. <row>
  3881. <cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
  3882. \begin_inset Text
  3883. \begin_layout Plain Layout
  3884. TX vs AR
  3885. \end_layout
  3886. \end_inset
  3887. </cell>
  3888. <cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
  3889. \begin_inset Text
  3890. \begin_layout Plain Layout
  3891. 0
  3892. \end_layout
  3893. \end_inset
  3894. </cell>
  3895. <cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
  3896. \begin_inset Text
  3897. \begin_layout Plain Layout
  3898. 10,063
  3899. \end_layout
  3900. \end_inset
  3901. </cell>
  3902. <cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
  3903. \begin_inset Text
  3904. \begin_layout Plain Layout
  3905. 11,225
  3906. \end_layout
  3907. \end_inset
  3908. </cell>
  3909. </row>
  3910. <row>
  3911. <cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
  3912. \begin_inset Text
  3913. \begin_layout Plain Layout
  3914. TX vs ADNR
  3915. \end_layout
  3916. \end_inset
  3917. </cell>
  3918. <cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
  3919. \begin_inset Text
  3920. \begin_layout Plain Layout
  3921. 27
  3922. \end_layout
  3923. \end_inset
  3924. </cell>
  3925. <cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
  3926. \begin_inset Text
  3927. \begin_layout Plain Layout
  3928. 12,674
  3929. \end_layout
  3930. \end_inset
  3931. </cell>
  3932. <cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
  3933. \begin_inset Text
  3934. \begin_layout Plain Layout
  3935. 13,086
  3936. \end_layout
  3937. \end_inset
  3938. </cell>
  3939. </row>
  3940. <row>
  3941. <cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" usebox="none">
  3942. \begin_inset Text
  3943. \begin_layout Plain Layout
  3944. TX vs CAN
  3945. \end_layout
  3946. \end_inset
  3947. </cell>
  3948. <cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" usebox="none">
  3949. \begin_inset Text
  3950. \begin_layout Plain Layout
  3951. 966
  3952. \end_layout
  3953. \end_inset
  3954. </cell>
  3955. <cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" usebox="none">
  3956. \begin_inset Text
  3957. \begin_layout Plain Layout
  3958. 20,039
  3959. \end_layout
  3960. \end_inset
  3961. </cell>
  3962. <cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" rightline="true" usebox="none">
  3963. \begin_inset Text
  3964. \begin_layout Plain Layout
  3965. 20,955
  3966. \end_layout
  3967. \end_inset
  3968. </cell>
  3969. </row>
  3970. </lyxtabular>
  3971. \end_inset
  3972. \end_layout
  3973. \begin_layout Plain Layout
  3974. \begin_inset Caption Standard
  3975. \begin_layout Plain Layout
  3976. \begin_inset CommandInset label
  3977. LatexCommand label
  3978. name "tab:methyl-est-nonnull"
  3979. \end_inset
  3980. \series bold
  3981. Estimated number of non-null tests for each contrast in each analysis.
  3982. \series default
  3983. For each of the analyses in Table
  3984. \begin_inset CommandInset ref
  3985. LatexCommand ref
  3986. reference "tab:Summary-of-meth-analysis"
  3987. plural "false"
  3988. caps "false"
  3989. noprefix "false"
  3990. \end_inset
  3991. , the table shows the number of probes estimated to be differentially methylated
  3992. between TX and the other 3 transplant statuses.
  3993. \end_layout
  3994. \end_inset
  3995. \end_layout
  3996. \end_inset
  3997. \end_layout
  3998. \begin_layout Standard
  3999. \begin_inset Float figure
  4000. wide false
  4001. sideways false
  4002. status collapsed
  4003. \begin_layout Plain Layout
  4004. \begin_inset Flex TODO Note (inline)
  4005. status open
  4006. \begin_layout Plain Layout
  4007. Re-generate p-value histograms for all relevant contrasts in a single page,
  4008. then write an appropriate legend.
  4009. \end_layout
  4010. \end_inset
  4011. \end_layout
  4012. \begin_layout Plain Layout
  4013. \align center
  4014. \series bold
  4015. [Figure goes here]
  4016. \end_layout
  4017. \begin_layout Plain Layout
  4018. \begin_inset Caption Standard
  4019. \begin_layout Plain Layout
  4020. \series bold
  4021. \begin_inset CommandInset label
  4022. LatexCommand label
  4023. name "fig:meth-p-value-histograms"
  4024. \end_inset
  4025. Probe p-value histograms for each contrast in each analysis.
  4026. \end_layout
  4027. \end_inset
  4028. \end_layout
  4029. \begin_layout Plain Layout
  4030. \end_layout
  4031. \end_inset
  4032. \end_layout
  4033. \begin_layout Standard
  4034. Table
  4035. \begin_inset CommandInset ref
  4036. LatexCommand ref
  4037. reference "tab:methyl-num-signif"
  4038. plural "false"
  4039. caps "false"
  4040. noprefix "false"
  4041. \end_inset
  4042. shows the number of significantly differentially methylated probes reported
  4043. by each analysis for each comparison of interest at an FDR of 10%.
  4044. As expected, the more elaborate analyses, B and C, report more significant
  4045. probes than the more basic analysis A, consistent with the conclusions
  4046. above that the data contain hidden systematic variations that must be modeled.
  4047. Table
  4048. \begin_inset CommandInset ref
  4049. LatexCommand ref
  4050. reference "tab:methyl-est-nonnull"
  4051. plural "false"
  4052. caps "false"
  4053. noprefix "false"
  4054. \end_inset
  4055. shows the estimated number of differentially methylated probes for each test
  4056. from each analysis.
  4057. This was computed by estimating the proportion of null hypotheses that
  4058. were true using the method of
  4059. \begin_inset CommandInset citation
  4060. LatexCommand cite
  4061. key "Phipson2013"
  4062. literal "false"
  4063. \end_inset
  4064. and subtracting the implied number of true nulls from the total number of probes, yielding
  4065. an estimate of the number of null hypotheses that are false based on the
  4066. distribution of p-values across the entire dataset.
  4067. Note that this does not identify which null hypotheses should be rejected
  4068. (i.e.
  4069. which probes are significant); it only estimates the true number of such
  4070. probes.
  4071. Once again, analyses B and C result in much larger estimates for the number
  4072. of differentially methylated probes.
  4073. In this case, analysis C, the only analysis that includes voom, estimates
  4074. the largest number of differentially methylated probes for all 3 contrasts.
  4075. If the assumptions of all the methods employed hold, then this represents
  4076. a gain in statistical power over the simpler analysis A.
  4077. Figure
  4078. \begin_inset CommandInset ref
  4079. LatexCommand ref
  4080. reference "fig:meth-p-value-histograms"
  4081. plural "false"
  4082. caps "false"
  4083. noprefix "false"
  4084. \end_inset
  4085. shows the p-value distributions for each test, from which the numbers in
  4086. Table
  4087. \begin_inset CommandInset ref
  4088. LatexCommand ref
  4089. reference "tab:methyl-est-nonnull"
  4090. plural "false"
  4091. caps "false"
  4092. noprefix "false"
  4093. \end_inset
  4094. were generated.
  4095. The distributions for analysis A all have a dip in density near zero, which
  4096. is a strong sign of a poor model fit.
  4097. The histograms for analyses B and C are more well-behaved, with a uniform
  4098. component stretching all the way from 0 to 1 representing the probes for
  4099. which the null hypothesis is true (no differential methylation), and a
  4100. zero-biased component representing the probes for which the null hypothesis
  4101. is false (differentially methylated).
  4102. These histograms do not indicate any major issues with the model fit.
  4103. \end_layout
  4104. \begin_layout Standard
  4105. \begin_inset Flex TODO Note (inline)
  4106. status open
  4107. \begin_layout Plain Layout
  4108. Maybe include the PCA plots before/after SVA effect subtraction?
  4109. \end_layout
  4110. \end_inset
  4111. \end_layout
  4112. \begin_layout Standard
  4113. \begin_inset ERT
  4114. status collapsed
  4115. \begin_layout Plain Layout
  4116. \backslash
  4117. FloatBarrier
  4118. \end_layout
  4119. \end_inset
  4120. \end_layout
  4121. \begin_layout Section
  4122. Discussion
  4123. \end_layout
  4124. \begin_layout Subsection
  4125. fRMA achieves clinically applicable normalization without sacrificing classifica
  4126. tion performance
  4127. \end_layout
  4128. \begin_layout Standard
  4129. As shown in Figure
  4130. \begin_inset CommandInset ref
  4131. LatexCommand ref
  4132. reference "fig:Classifier-probabilities-RMA"
  4133. plural "false"
  4134. caps "false"
  4135. noprefix "false"
  4136. \end_inset
  4137. , improper normalization, particularly separate normalization of training
  4138. and test samples, leads to unwanted biases in classification.
  4139. In a controlled experimental context, it is always possible to correct
  4140. this issue by normalizing all experimental samples together.
  4141. However, because it is not feasible to normalize all samples together in
  4142. a clinical context, a single-channel normalization is required.
  4143. \end_layout
  4144. \begin_layout Standard
  4145. The major concern in using a single-channel normalization is that non-single-cha
  4146. nnel methods can share information between arrays to improve the normalization,
  4147. and single-channel methods risk sacrificing the gains in normalization
  4148. accuracy that come from this information sharing.
  4149. In the case of RMA, this information sharing is accomplished through quantile
  4150. normalization and median polish steps.
  4151. The need for information sharing in quantile normalization can easily be
  4152. removed by learning a fixed set of quantiles from external data and normalizing
  4153. each array to these fixed quantiles, instead of the quantiles of the data
  4154. itself.
  4155. As long as the fixed quantiles are reasonable, the result will be similar
  4156. to standard RMA.
  4157. However, there is no analogous way to eliminate cross-array information
  4158. sharing in the median polish step, so fRMA replaces this with a weighted
  4159. average of probes on each array, with the weights learned from external
  4160. data.
  4161. This step of fRMA has the greatest potential to diverge from RMA in undesirable
  4162. ways.
  4163. \end_layout
  4164. \begin_layout Standard
  4165. However, when run on real data, fRMA performed at least as well as RMA in
  4166. both the internal validation and external validation tests.
  4167. This shows that fRMA can be used to normalize individual clinical samples
  4168. in a class prediction context without sacrificing the classifier performance
  4169. that would be obtained by using the more well-established RMA for normalization.
  4170. The other single-channel normalization method considered, SCAN, showed
  4171. some loss of AUC in the external validation test.
  4172. Based on these results, fRMA is the preferred normalization for clinical
  4173. samples in a class prediction context.
  4174. \end_layout
  4175. \begin_layout Subsection
  4176. Robust fRMA vectors can be generated for new array platforms
  4177. \end_layout
  4178. \begin_layout Standard
  4179. \begin_inset Flex TODO Note (inline)
  4180. status open
  4181. \begin_layout Plain Layout
  4182. Look up the exact numbers, do a find & replace for
  4183. \begin_inset Quotes eld
  4184. \end_inset
  4185. 850
  4186. \begin_inset Quotes erd
  4187. \end_inset
  4188. \end_layout
  4189. \end_inset
  4190. \end_layout
  4191. \begin_layout Standard
  4192. The published fRMA normalization vectors for the hgu133plus2 platform were
  4193. generated from a set of about 850 samples chosen from a wide range of tissues,
  4194. which the authors determined was sufficient to generate a robust set of
  4195. normalization vectors that could be applied across all tissues
  4196. \begin_inset CommandInset citation
  4197. LatexCommand cite
  4198. key "McCall2010"
  4199. literal "false"
  4200. \end_inset
  4201. .
  4202. Since we only had hthgu133pluspm data for 2 tissues of interest, our needs were
  4203. more modest.
  4204. Even using only 130 samples in 26 batches of 5 samples each for kidney
  4205. biopsies, we were able to train a robust set of fRMA normalization vectors
  4206. that were not meaningfully affected by the random selection of 5 samples
  4207. from each batch.
  4208. As expected, the training process was just as robust for the blood samples
  4209. with 230 samples in 46 batches of 5 samples each.
  4210. Because these vectors were each generated using training samples from a
  4211. single tissue, they are not suitable for general use, unlike the vectors
  4212. provided with fRMA itself.
  4213. They are purpose-built for normalizing a specific type of sample on a specific
  4214. platform.
  4215. This is a mostly acceptable limitation in the context of developing a machine
  4216. learning classifier for diagnosing a disease based on samples of a specific
  4217. tissue.
  4218. \end_layout
  4219. \begin_layout Standard
  4220. \begin_inset Flex TODO Note (inline)
  4221. status open
  4222. \begin_layout Plain Layout
  4223. How to bring up that these custom vectors were used in another project by
  4224. someone else that was never published?
  4225. \end_layout
  4226. \end_inset
  4227. \end_layout
  4228. \begin_layout Subsection
  4229. Methylation array data can be successfully analyzed using existing techniques,
  4230. but machine learning poses additional challenges
  4231. \end_layout
  4232. \begin_layout Standard
  4233. Analysis strategies B and C both yield a reasonable analysis, with
  4234. a mean-variance trend that matches the expected behavior for the non-linear
  4235. M-value transformation (Figure
  4236. \begin_inset CommandInset ref
  4237. LatexCommand ref
  4238. reference "fig:meanvar-sva-aw"
  4239. plural "false"
  4240. caps "false"
  4241. noprefix "false"
  4242. \end_inset
  4243. ) and well-behaved p-value distributions (Figure
  4244. \begin_inset CommandInset ref
  4245. LatexCommand ref
  4246. reference "fig:meth-p-value-histograms"
  4247. plural "false"
  4248. caps "false"
  4249. noprefix "false"
  4250. \end_inset
  4251. ).
  4252. These two analyses also yield similar numbers of significant probes (Table
  4253. \begin_inset CommandInset ref
  4254. LatexCommand ref
  4255. reference "tab:methyl-num-signif"
  4256. plural "false"
  4257. caps "false"
  4258. noprefix "false"
  4259. \end_inset
  4260. ) and similar estimates of the number of differentially methylated probes
  4261. (Table
  4262. \begin_inset CommandInset ref
  4263. LatexCommand ref
  4264. reference "tab:methyl-est-nonnull"
  4265. plural "false"
  4266. caps "false"
  4267. noprefix "false"
  4268. \end_inset
  4269. ).
  4270. The main difference between these two analyses is the method used to account
  4271. for the mean-variance trend.
  4272. In analysis B, the trend is estimated and applied at the probe level: each
  4273. probe's estimated variance is squeezed toward the trend using an empirical
  4274. Bayes procedure (Figure
  4275. \begin_inset CommandInset ref
  4276. LatexCommand ref
  4277. reference "fig:meanvar-sva-aw"
  4278. plural "false"
  4279. caps "false"
  4280. noprefix "false"
  4281. \end_inset
  4282. ).
  4283. In analysis C, the trend is still estimated at the probe level, but instead
  4284. of estimating a single variance value shared across all observations for
  4285. a given probe, the voom method computes an initial estimate of the variance
  4286. for each observation individually based on where its model-fitted M-value
  4287. falls on the trend line and then assigns inverse-variance weights to model
  4288. the difference in variance between observations.
  4289. An overall variance is still estimated for each probe using the same empirical
  4290. Bayes method, but now the residual trend is flat (Figure
  4291. \begin_inset CommandInset ref
  4292. LatexCommand ref
  4293. reference "fig:meanvar-sva-voomaw"
  4294. plural "false"
  4295. caps "false"
  4296. noprefix "false"
  4297. \end_inset
  4298. ), and the mean-variance trend is modeled by scaling the probe's estimated
  4299. variance for each observation using the weights computed by voom.
  4300. The difference between these two methods is analogous to the difference
  4301. between a t-test with equal variance and a t-test with unequal variance,
  4302. except that the unequal group variances used in the latter test are estimated
  4303. based on the mean-variance trend from all the probes rather than the data
  4304. for the specific probe being tested, thus stabilizing the group variance
  4305. estimates by sharing information between probes.
  4306. In practice, allowing voom to model the variance using observation weights
  4307. in this manner allows the linear model fit to concentrate statistical power
  4308. where it will do the most good.
  4309. For example, if a particular probe's M-values are always at the extreme
  4310. of the M-value range (e.g.
  4311. less than -4) for ADNR samples, but the M-values for that probe in TX and
  4312. CAN samples are within the flat region of the mean-variance trend (between
  4313. -3 and +3), voom is able to down-weight the contribution of the high-variance
  4314. M-values from the ADNR samples in order to gain more statistical power
  4315. while testing for differential methylation between TX and CAN.
  4316. In contrast, modeling the mean-variance trend only at the probe level would
  4317. combine the high-variance ADNR samples and lower-variance samples from
  4318. other conditions and estimate an intermediate variance for this probe.
  4319. In practice, analysis B shows that this approach is adequate, but the voom
  4320. approach in analysis C is at least as good on all model fit criteria and
  4321. yields a larger estimate for the number of differentially methylated probes.
  4322. \end_layout
  4323. \begin_layout Standard
  4324. The significant association of diabetes diagnosis with sample quality is
  4325. interesting.
  4326. The samples with Type 2 diabetes tended to have more variation, averaged
  4327. across all probes, than those with Type 1 diabetes.
  4328. This is consistent with the consensus that Type 2 diabetes and the associated
  4329. metabolic syndrome represent a broad dysregulation of the body's endocrine
  4330. signalling related to metabolism [citation needed].
  4331. This dysregulation could easily manifest as a greater degree of variation
  4332. in the DNA methylation patterns of affected tissues.
  4333. In contrast, Type 1 diabetes has a more specific cause and effect, so a
  4334. less variable methylation signature is expected.
  4335. \end_layout
  4336. \begin_layout Standard
  4337. This preliminary analysis suggests that some degree of differential methylation
  4338. exists between TX and each of the three types of transplant dysfunction
  4339. studied.
  4340. Hence, it may be feasible to train a classifier to diagnose transplant
  4341. dysfunction from DNA methylation array data.
  4342. However, the major importance of both SVA and sample quality weighting
  4343. for proper modeling of this data poses significant challenges for any attempt
  4344. at machine learning on data of similar quality.
  4345. While these are easily used in a modeling context with full sample information,
  4346. neither of these methods is directly applicable in a machine learning context,
  4347. where the diagnosis is not known ahead of time.
  4348. If a machine learning approach for methylation-based diagnosis is to be
  4349. pursued, it will either require machine-learning-friendly methods to address
  4350. the same systematic trends in the data that SVA and sample quality weighting
  4351. address, or it will require higher quality data with substantially less
  4352. systematic perturbation of the data.
  4353. \end_layout
  4354. \begin_layout Chapter
  4355. Globin-blocking for more effective blood RNA-seq analysis in primate animal
  4356. model
  4357. \end_layout
  4358. \begin_layout Standard
  4359. \begin_inset Flex TODO Note (inline)
  4360. status open
  4361. \begin_layout Plain Layout
  4362. Choose between above and the paper title: Optimizing yield of deep RNA sequencin
  4363. g for gene expression profiling by globin reduction of peripheral blood
  4364. samples from cynomolgus monkeys (Macaca fascicularis).
  4365. \end_layout
  4366. \end_inset
  4367. \end_layout
  4368. \begin_layout Standard
  4369. \begin_inset Flex TODO Note (inline)
  4370. status open
  4371. \begin_layout Plain Layout
  4372. Chapter author list: https://tex.stackexchange.com/questions/156862/displaying-aut
  4373. hor-for-each-chapter-in-book Every chapter gets an author list, which may
  4374. or may not be part of a citation to a published/preprinted paper.
  4375. \end_layout
  4376. \end_inset
  4377. \end_layout
  4378. \begin_layout Standard
  4379. \begin_inset Flex TODO Note (inline)
  4380. status open
  4381. \begin_layout Plain Layout
  4382. Preprint then cite the paper
  4383. \end_layout
  4384. \end_inset
  4385. \end_layout
  4386. \begin_layout Section*
  4387. Abstract
  4388. \end_layout
  4389. \begin_layout Paragraph
  4390. Background
  4391. \end_layout
  4392. \begin_layout Standard
  4393. Primate blood contains high concentrations of globin messenger RNA.
  4394. Globin reduction is a standard technique used to improve the expression
  4395. results obtained by DNA microarrays on RNA from blood samples.
  4396. However, with whole transcriptome RNA-sequencing (RNA-seq) quickly replacing
  4397. microarrays for many applications, the impact of globin reduction for RNA-seq
  4398. has not been previously studied.
  4399. Moreover, no off-the-shelf kits are available for globin reduction in nonhuman
  4400. primates.
  4401. \end_layout
  4402. \begin_layout Paragraph
  4403. Results
  4404. \end_layout
  4405. \begin_layout Standard
  4406. Here we report a protocol for RNA-seq in primate blood samples that uses
  4407. complementary oligonucleotides to block reverse transcription of the alpha
  4408. and beta globin genes.
  4409. In test samples from cynomolgus monkeys (Macaca fascicularis), this globin
  4410. blocking protocol approximately doubles the yield of informative (non-globin)
  4411. reads by greatly reducing the fraction of globin reads, while also improving
  4412. the consistency in sequencing depth between samples.
  4413. The increased yield enables detection of about 2000 more genes, significantly
  4414. increases the correlation in measured gene expression levels between samples,
  4415. and increases the sensitivity of differential gene expression tests.
  4416. \end_layout
  4417. \begin_layout Paragraph
  4418. Conclusions
  4419. \end_layout
  4420. \begin_layout Standard
  4421. These results show that globin blocking significantly improves the cost-effectiv
  4422. eness of mRNA sequencing in primate blood samples by doubling the yield
  4423. of useful reads, allowing detection of more genes, and improving the precision
  4424. of gene expression measurements.
  4425. Based on these results, a globin reducing or blocking protocol is recommended
  4426. for all RNA-seq studies of primate blood samples.
  4427. \end_layout
  4428. \begin_layout Section
  4429. Approach
  4430. \end_layout
  4431. \begin_layout Standard
  4432. \begin_inset Note Note
  4433. status open
  4434. \begin_layout Plain Layout
  4435. Consider putting some of this in the Intro chapter
  4436. \end_layout
  4437. \begin_layout Itemize
  4438. Cynomolgus monkeys as a model organism
  4439. \end_layout
  4440. \begin_deeper
  4441. \begin_layout Itemize
  4442. Highly related to humans
  4443. \end_layout
  4444. \begin_layout Itemize
  4445. Small size and short life cycle - good research animal
  4446. \end_layout
  4447. \begin_layout Itemize
  4448. Genomics resources still in development
  4449. \end_layout
  4450. \end_deeper
  4451. \begin_layout Itemize
  4452. Inadequacy of existing blood RNA-seq protocols
  4453. \end_layout
  4454. \begin_deeper
  4455. \begin_layout Itemize
  4456. Existing protocols use a separate globin pulldown step, slowing down processing
  4457. \end_layout
  4458. \end_deeper
  4459. \end_inset
  4460. \end_layout
  4461. \begin_layout Standard
  4462. Increasingly, researchers are turning to high-throughput mRNA sequencing
  4463. technologies (RNA-seq) in preference to expression microarrays for analysis
  4464. of gene expression
  4465. \begin_inset CommandInset citation
  4466. LatexCommand cite
  4467. key "Mutz2012"
  4468. literal "false"
  4469. \end_inset
  4470. .
  4471. The advantages are even greater for study of model organisms with no well-estab
  4472. lished array platforms available, such as the cynomolgus monkey (Macaca
  4473. fascicularis).
  4474. High fractions of globin mRNA are naturally present in mammalian peripheral
  4475. blood samples (up to 70% of total mRNA) and these are known to interfere
  4476. with the results of array-based expression profiling
  4477. \begin_inset CommandInset citation
  4478. LatexCommand cite
  4479. key "Winn2010"
  4480. literal "false"
  4481. \end_inset
  4482. .
  4483. The importance of globin reduction for RNA-seq of blood has only been evaluated
  4484. for a DeepSAGE protocol on human samples
  4485. \begin_inset CommandInset citation
  4486. LatexCommand cite
  4487. key "Mastrokolias2012"
  4488. literal "false"
  4489. \end_inset
  4490. .
  4491. In the present report, we evaluated globin reduction using custom blocking
  4492. oligonucleotides for deep RNA-seq of peripheral blood samples from a nonhuman
  4493. primate, cynomolgus monkey, using the Illumina technology platform.
  4494. We demonstrate that globin reduction significantly improves the cost-effectiven
  4495. ess of RNA-seq in blood samples.
  4496. Thus, our protocol offers a significant advantage to any investigator planning
  4497. to use RNA-seq for gene expression profiling of nonhuman primate blood
  4498. samples.
  4499. Our method can be generally applied to any species by designing complementary
  4500. oligonucleotide blocking probes to the globin gene sequences of that species.
  4501. Indeed, any highly expressed but biologically uninformative transcripts
  4502. can also be blocked to further increase sequencing efficiency and value
  4503. \begin_inset CommandInset citation
  4504. LatexCommand cite
  4505. key "Arnaud2016"
  4506. literal "false"
  4507. \end_inset
  4508. .
  4509. \end_layout
  4510. \begin_layout Section
  4511. Methods
  4512. \end_layout
  4513. \begin_layout Subsection
  4514. Sample collection
  4515. \end_layout
  4516. \begin_layout Standard
  4517. All research reported here was done under IACUC-approved protocols at the
  4518. University of Miami and complied with all applicable federal and state
  4519. regulations and ethical principles for nonhuman primate research.
  4520. Blood draws occurred between 16 April 2012 and 18 June 2015.
  4521. The experimental system involved intrahepatic pancreatic islet transplantation
  4522. into Cynomolgus monkeys with induced diabetes mellitus with or without
  4523. concomitant infusion of mesenchymal stem cells.
  4524. Blood was collected at serial time points before and after transplantation
  4525. into PAXgene Blood RNA tubes (PreAnalytiX/Qiagen, Valencia, CA) at the
  4526. precise volume:volume ratio of 2.5 ml whole blood into 6.9 ml of PAXgene
  4527. additive.
  4528. \end_layout
  4529. \begin_layout Subsection
  4530. Globin Blocking
  4531. \end_layout
  4532. \begin_layout Standard
  4533. Four oligonucleotides were designed to hybridize to the 3’ end of the transcript
  4534. s for Cynomolgus HBA1, HBA2 and HBB, with two hybridization sites for HBB
  4535. and two sites for HBA (the chosen sites were identical in both HBA genes).
  4536. All oligos were purchased from Sigma and were entirely composed of 2’O-Me
  4537. bases with a C3 spacer positioned at the 3’ ends to prevent any polymerase
  4538. mediated primer extension.
  4539. \end_layout
  4540. \begin_layout Quote
  4541. HBA1/2 site 1: GCCCACUCAGACUUUAUUCAAAG-C3spacer
  4542. \end_layout
  4543. \begin_layout Quote
  4544. HBA1/2 site 2: GGUGCAAGGAGGGGAGGAG-C3spacer
  4545. \end_layout
  4546. \begin_layout Quote
  4547. HBB site 1: AAUGAAAAUAAAUGUUUUUUAUUAG-C3spacer
  4548. \end_layout
  4549. \begin_layout Quote
  4550. HBB site 2: CUCAAGGCCCUUCAUAAUAUCCC-C3spacer
  4551. \end_layout
  4552. \begin_layout Subsection
  4553. RNA-seq Library Preparation
  4554. \end_layout
  4555. \begin_layout Standard
  4556. Sequencing libraries were prepared with 200 ng total RNA from each sample.
  4557. Polyadenylated mRNA was selected from 200 ng aliquots of cynomolgus blood-deri
  4558. ved total RNA using Ambion Dynabeads Oligo(dT)25 beads (Invitrogen) following
  4559. manufacturer’s recommended protocol.
  4560. PolyA selected RNA was then combined with 8 pmol of HBA1/2 (site 1), 8
  4561. pmol of HBA1/2 (site 2), 12 pmol of HBB (site 1) and 12 pmol of HBB (site
  4562. 2) oligonucleotides.
  4563. In addition, 20 pmol of RT primer containing a portion of the Illumina
  4564. adapter sequence (B-oligo-dTV: GAGTTCCTTGGCACCCGAGAATTCCATTTTTTTTTTTTTTTTTTTV)
  4565. and 4 µL of 5X First Strand buffer (250 mM Tris-HCl pH 8.3, 375 mM KCl,
  4566. 15mM MgCl2) were added in a total volume of 15 µL.
  4567. The RNA was fragmented by heating this cocktail for 3 minutes at 95°C and
  4568. then placed on ice.
  4569. This was followed by the addition of 2 µL 0.1 M DTT, 1 µL RNaseOUT, 1 µL
  4570. 10mM dNTPs 10% biotin-16 aminoallyl-2’- dUTP and 10% biotin-16 aminoallyl-2’-
  4571. dCTP (TriLink Biotech, San Diego, CA), 1 µL Superscript II (200U/ µL, Thermo-Fi
  4572. sher).
  4573. A second “unblocked” library was prepared in the same way for each sample
  4574. but replacing the blocking oligos with an equivalent volume of water.
  4575. The reaction was carried out at 25°C for 15 minutes and 42°C for 40 minutes,
  4576. followed by incubation at 75°C for 10 minutes to inactivate the reverse
  4577. transcriptase.
  4578. \end_layout
  4579. \begin_layout Standard
  4580. The cDNA/RNA hybrid molecules were purified using 1.8X Ampure XP beads (Agencourt
  4581. ) following supplier’s recommended protocol.
  4582. The cDNA/RNA hybrid was eluted in 25 µL of 10 mM Tris-HCl pH 8.0, and then
  4583. bound to 25 µL of M280 Magnetic Streptavidin beads washed per recommended
  4584. protocol (Thermo-Fisher).
  4585. After 30 minutes of binding, beads were washed one time in 100 µL 0.1N NaOH
  4586. to denature and remove the bound RNA, followed by two 100 µL washes with
  4587. 1X TE buffer.
  4588. \end_layout
  4589. \begin_layout Standard
  4590. Subsequent attachment of the 5-prime Illumina A adapter was performed by
  4591. on-bead random primer extension of the following sequence (A-N8 primer:
  4592. TTCAGAGTTCTACAGTCCGACGATCNNNNNNNN).
  4593. Briefly, beads were resuspended in a 20 µL reaction containing 5 µM A-N8
  4594. primer, 40mM Tris-HCl pH 7.5, 20mM MgCl2, 50mM NaCl, 0.325U/µL Sequenase
  4595. 2.0 (Affymetrix, Santa Clara, CA), 0.0025U/µL inorganic pyrophosphatase (Affymetr
  4596. ix) and 300 µM each dNTP.
  4597. Reaction was incubated at 22°C for 30 minutes, then beads were washed 2
  4598. times with 1X TE buffer (200µL).
  4599. \end_layout
  4600. \begin_layout Standard
  4601. The magnetic streptavidin beads were resuspended in 34 µL nuclease-free
  4602. water and added directly to a PCR tube.
  4603. The two Illumina protocol-specified PCR primers were added at 0.53 µM (Illumina
  4604. TruSeq Universal Primer 1 and Illumina TruSeq barcoded PCR primer 2), along
  4605. with 40 µL 2X KAPA HiFi Hotstart ReadyMix (KAPA, Wilmington, MA) and thermocycl
  4606. ed as follows: starting with 98°C (2 min-hold); 15 cycles of 98°C, 20sec;
  4607. 60°C, 30sec; 72°C, 30sec; and finished with a 72°C (2 min-hold).
  4608. \end_layout
  4609. \begin_layout Standard
  4610. PCR products were purified with 1X Ampure Beads following manufacturer’s
  4611. recommended protocol.
  4612. Libraries were then analyzed using the Agilent TapeStation and quantitation
  4613. of desired size range was performed by “smear analysis”.
  4614. Samples were pooled in equimolar batches of 16 samples.
  4615. Pooled libraries were size selected on 2% agarose gels (E-Gel EX Agarose
  4616. Gels; Thermo-Fisher).
  4617. Products were cut between 250 and 350 bp (corresponding to insert sizes
  4618. of 130 to 230 bp).
  4619. Finished library pools were then sequenced on the Illumina NextSeq500 instrumen
  4620. t with 75 base read lengths.
  4621. \end_layout
  4622. \begin_layout Subsection
  4623. Read alignment and counting
  4624. \end_layout
  4625. \begin_layout Standard
  4626. Reads were aligned to the cynomolgus genome using STAR
  4627. \begin_inset CommandInset citation
  4628. LatexCommand cite
  4629. key "Dobin2013,Wilson2013"
  4630. literal "false"
  4631. \end_inset
  4632. .
  4633. Counts of uniquely mapped reads were obtained for every gene in each sample
  4634. with the “featureCounts” function from the Rsubread package, using each
  4635. of the three possibilities for the “strandSpecific” option: sense, antisense,
  4636. and unstranded
  4637. \begin_inset CommandInset citation
  4638. LatexCommand cite
  4639. key "Liao2014"
  4640. literal "false"
  4641. \end_inset
  4642. .
  4643. A few artifacts in the cynomolgus genome annotation complicated read counting.
  4644. First, no ortholog is annotated for alpha globin in the cynomolgus genome,
  4645. presumably because the human genome has two alpha globin genes with nearly
  4646. identical sequences, making the orthology relationship ambiguous.
  4647. However, two loci in the cynomolgus genome are annotated as “hemoglobin subunit alpha-lik
  4648. e” (LOC102136192 and LOC102136846).
  4649. LOC102136192 is annotated as a pseudogene while LOC102136846 is annotated
  4650. as protein-coding.
  4651. Our globin reduction protocol was designed to include blocking of these
  4652. two genes.
  4653. Indeed, these two genes have almost the same read counts in each library
  4654. as the properly-annotated HBB gene and much larger counts than any other
  4655. gene in the unblocked libraries, giving confidence that reads derived from
  4656. the real alpha globin are mapping to both genes.
  4657. Thus, reads from both of these loci were counted as alpha globin reads
  4658. in all further analyses.
  4659. The second artifact is a small, uncharacterized non-coding RNA gene (LOC1021365
  4660. 91), which overlaps the HBA-like gene (LOC102136192) on the opposite strand.
  4661. If counting is not performed in stranded mode (or if a non-strand-specific
  4662. sequencing protocol is used), many reads mapping to the globin gene will
  4663. be discarded as ambiguous due to their overlap with this ncRNA gene, resulting
  4664. in significant undercounting of globin reads.
  4665. Therefore, stranded sense counts were used for all further analysis in
  4666. the present study to ensure that we accurately accounted for globin transcript
  4667. reduction.
  4668. However, we note that stranded reads are not necessary for RNA-seq using
  4669. our protocol in standard practice.
  4670. \end_layout
  4671. \begin_layout Subsection
  4672. Normalization and Exploratory Data Analysis
  4673. \end_layout
  4674. \begin_layout Standard
  4675. Libraries were normalized by computing scaling factors using the edgeR package’s
  4676. Trimmed Mean of M-values method
  4677. \begin_inset CommandInset citation
  4678. LatexCommand cite
  4679. key "Robinson2010"
  4680. literal "false"
  4681. \end_inset
  4682. .
  4683. Log2 counts per million values (logCPM) were calculated using the cpm function
  4684. in edgeR for individual samples and aveLogCPM function for averages across
  4685. groups of samples, using those functions’ default prior count values to
  4686. avoid taking the logarithm of 0.
  4687. Genes were considered “present” if their average normalized logCPM values
  4688. across all libraries were at least -1.
  4689. Normalizing for gene length was unnecessary because the sequencing protocol
  4690. is 3’-biased and hence the expected read count for each gene is related
  4691. to the transcript’s copy number but not its length.
  4692. \end_layout
  4693. \begin_layout Standard
  4694. In order to assess the effect of blocking on reproducibility, Pearson and
  4695. Spearman correlation coefficients were computed between the logCPM values
  4696. for every pair of libraries within the globin-blocked (GB) and unblocked
  4697. (non-GB) groups, and edgeR's “estimateDisp” function was used to compute
  4698. negative binomial dispersions separately for the two groups
  4699. \begin_inset CommandInset citation
  4700. LatexCommand cite
  4701. key "Chen2014"
  4702. literal "false"
  4703. \end_inset
  4704. .
  4705. \end_layout
  4706. \begin_layout Subsection
  4707. Differential Expression Analysis
  4708. \end_layout
  4709. \begin_layout Standard
  4710. All tests for differential gene expression were performed using edgeR, by
  4711. first fitting a negative binomial generalized linear model to the counts
  4712. and normalization factors and then performing a quasi-likelihood F-test
  4713. with robust estimation of outlier gene dispersions
  4714. \begin_inset CommandInset citation
  4715. LatexCommand cite
  4716. key "Lund2012,Phipson2016"
  4717. literal "false"
  4718. \end_inset
  4719. .
  4720. To investigate the effects of globin blocking on each gene, an additive
  4721. model was fit to the full data with coefficients for globin blocking and
  4722. SampleID.
  4723. To test the effect of globin blocking on detection of differentially expressed
  4724. genes, the GB samples and non-GB samples were each analyzed independently
  4725. as follows: for each animal with both a pre-transplant and a post-transplant
  4726. time point in the data set, the pre-transplant sample and the earliest
  4727. post-transplant sample were selected, and all others were excluded, yielding
  4728. a pre-/post-transplant pair of samples for each animal (N=7 animals with
  4729. paired samples).
  4730. These samples were analyzed for pre-transplant vs.
  4731. post-transplant differential gene expression while controlling for inter-animal
  4732. variation using an additive model with coefficients for transplant and
  4733. animal ID.
  4734. In all analyses, p-values were adjusted using the Benjamini-Hochberg procedure
  4735. for FDR control
  4736. \begin_inset CommandInset citation
  4737. LatexCommand cite
  4738. key "Benjamini1995"
  4739. literal "false"
  4740. \end_inset
  4741. .
  4742. \end_layout
  4743. \begin_layout Standard
  4744. \begin_inset Note Note
  4745. status open
  4746. \begin_layout Itemize
  4747. New blood RNA-seq protocol to block reverse transcription of globin genes
  4748. \end_layout
  4749. \begin_layout Itemize
  4750. Blood RNA-seq time course after transplants with/without MSC infusion
  4751. \end_layout
  4752. \end_inset
  4753. \end_layout
  4754. \begin_layout Section
  4755. Results
  4756. \end_layout
  4757. \begin_layout Subsection
  4758. Globin blocking yields a larger and more consistent fraction of useful reads
  4759. \end_layout
  4760. \begin_layout Standard
  4761. The objective of the present study was to validate a new protocol for deep
  4762. RNA-seq of whole blood drawn into PAXgene tubes from cynomolgus monkeys
  4763. undergoing islet transplantation, with particular focus on minimizing the
  4764. loss of useful sequencing space to uninformative globin reads.
  4765. The details of the analysis with respect to transplant outcomes and the
  4766. impact of mesenchymal stem cell treatment will be reported in a separate
  4767. manuscript (in preparation).
  4768. To focus on the efficacy of our globin blocking protocol, 37 blood samples,
  4769. 16 from pre-transplant and 21 from post-transplant time points, were each
  4770. prepped once with and once without globin blocking oligos, and were then
  4771. sequenced on an Illumina NextSeq500 instrument.
  4772. The number of reads aligning to each gene in the cynomolgus genome was
  4773. counted.
  4774. Table 1 summarizes the distribution of read fractions among the GB and
  4775. non-GB libraries.
  4776. In the libraries with no globin blocking, globin reads made up an average
  4777. of 44.6% of total input reads, while reads assigned to all other genes made
  4778. up an average of 26.3%.
  4779. The remaining reads either aligned to intergenic regions (that include
  4780. long non-coding RNAs) or did not align with any annotated transcripts in
  4781. the current build of the cynomolgus genome.
  4782. In the GB libraries, globin reads made up only 3.48% and reads assigned
  4783. to all other genes increased to 50.4%.
  4784. Thus, globin blocking resulted in a 92.2% reduction in globin reads and
  4785. a 91.6% increase in yield of useful non-globin reads.
  4786. \end_layout
  4787. \begin_layout Standard
  4788. This reduction is not quite as efficient as the previous analysis showed
  4789. for human samples by DeepSAGE (<0.4% globin reads after globin reduction)
  4790. \begin_inset CommandInset citation
  4791. LatexCommand cite
  4792. key "Mastrokolias2012"
  4793. literal "false"
  4794. \end_inset
  4795. .
  4796. Nonetheless, this degree of globin reduction is sufficient to nearly double
  4797. the yield of useful reads.
  4798. Thus, globin blocking cuts the required sequencing effort (and costs) to
  4799. achieve a target coverage depth by almost 50%.
  4800. Consistent with this near doubling of yield, the average difference in
  4801. un-normalized logCPM across all genes between the GB libraries and non-GB
  4802. libraries is approximately 1 (mean = 1.01, median = 1.08), an overall 2-fold
  4803. increase.
  4804. Un-normalized values are used here because the TMM normalization correctly
  4805. identifies this 2-fold difference as biologically irrelevant and removes
  4806. it.
  4807. \end_layout
  4808. \begin_layout Standard
  4809. \begin_inset Float figure
  4810. wide false
  4811. sideways false
  4812. status open
  4813. \begin_layout Plain Layout
  4814. \align center
  4815. \begin_inset Graphics
  4816. filename graphics/Globin Paper/figure1 - globin-fractions.pdf
  4817. \end_inset
  4818. \end_layout
  4819. \begin_layout Plain Layout
  4820. \begin_inset Caption Standard
  4821. \begin_layout Plain Layout
  4822. \series bold
  4823. \begin_inset Argument 1
  4824. status collapsed
  4825. \begin_layout Plain Layout
  4826. Fraction of genic reads in each sample aligned to non-globin genes, with
  4827. and without globin blocking (GB).
  4828. \end_layout
  4829. \end_inset
  4830. \begin_inset CommandInset label
  4831. LatexCommand label
  4832. name "fig:Fraction-of-genic-reads"
  4833. \end_inset
  4834. Fraction of genic reads in each sample aligned to non-globin genes, with
  4835. and without globin blocking (GB).
  4836. \series default
  4837. All reads in each sequencing library were aligned to the cynomolgus genome, and
  4838. the number of reads uniquely aligning to each gene was counted.
  4839. For each sample, counts were summed separately for all globin genes and
  4840. for the remainder of the genes (non-globin genes), and the fraction of
  4841. genic reads aligned to non-globin genes was computed.
  4842. Each point represents an individual sample.
  4843. Gray + signs indicate the means for globin-blocked libraries and unblocked
  4844. libraries.
  4845. The overall distribution for each group is represented as a notched box
  4846. plot.
  4847. Points are randomly spread vertically to avoid excessive overlapping.
  4848. \end_layout
  4849. \end_inset
  4850. \end_layout
  4851. \begin_layout Plain Layout
  4852. \end_layout
  4853. \end_inset
  4854. \end_layout
  4855. \begin_layout Standard
  4856. \begin_inset Float table
  4857. placement p
  4858. wide false
  4859. sideways true
  4860. status open
  4861. \begin_layout Plain Layout
  4862. \align center
  4863. \begin_inset Tabular
  4864. <lyxtabular version="3" rows="4" columns="7">
  4865. <features tabularvalignment="middle">
  4866. <column alignment="center" valignment="top">
  4867. <column alignment="center" valignment="top">
  4868. <column alignment="center" valignment="top">
  4869. <column alignment="center" valignment="top">
  4870. <column alignment="center" valignment="top">
  4871. <column alignment="center" valignment="top">
  4872. <column alignment="center" valignment="top">
  4873. <row>
  4874. <cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
  4875. \begin_inset Text
  4876. \begin_layout Plain Layout
  4877. \end_layout
  4878. \end_inset
  4879. </cell>
  4880. <cell multicolumn="1" alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
  4881. \begin_inset Text
  4882. \begin_layout Plain Layout
  4883. \family roman
  4884. \series medium
  4885. \shape up
  4886. \size normal
  4887. \emph off
  4888. \bar no
  4889. \strikeout off
  4890. \xout off
  4891. \uuline off
  4892. \uwave off
  4893. \noun off
  4894. \color none
  4895. Percent of Total Reads
  4896. \end_layout
  4897. \end_inset
  4898. </cell>
  4899. <cell multicolumn="2" alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
  4900. \begin_inset Text
  4901. \begin_layout Plain Layout
  4902. \end_layout
  4903. \end_inset
  4904. </cell>
  4905. <cell multicolumn="2" alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
  4906. \begin_inset Text
  4907. \begin_layout Plain Layout
  4908. \end_layout
  4909. \end_inset
  4910. </cell>
  4911. <cell multicolumn="2" alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
  4912. \begin_inset Text
  4913. \begin_layout Plain Layout
  4914. \end_layout
  4915. \end_inset
  4916. </cell>
  4917. <cell multicolumn="1" alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
  4918. \begin_inset Text
  4919. \begin_layout Plain Layout
  4920. \family roman
  4921. \series medium
  4922. \shape up
  4923. \size normal
  4924. \emph off
  4925. \bar no
  4926. \strikeout off
  4927. \xout off
  4928. \uuline off
  4929. \uwave off
  4930. \noun off
  4931. \color none
  4932. Percent of Genic Reads
  4933. \end_layout
  4934. \end_inset
  4935. </cell>
  4936. <cell multicolumn="2" alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
  4937. \begin_inset Text
  4938. \begin_layout Plain Layout
  4939. \end_layout
  4940. \end_inset
  4941. </cell>
  4942. </row>
  4943. <row>
  4944. <cell alignment="center" valignment="top" bottomline="true" leftline="true" usebox="none">
  4945. \begin_inset Text
  4946. \begin_layout Plain Layout
  4947. GB
  4948. \end_layout
  4949. \end_inset
  4950. </cell>
  4951. <cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" usebox="none">
  4952. \begin_inset Text
  4953. \begin_layout Plain Layout
  4954. \family roman
  4955. \series medium
  4956. \shape up
  4957. \size normal
  4958. \emph off
  4959. \bar no
  4960. \strikeout off
  4961. \xout off
  4962. \uuline off
  4963. \uwave off
  4964. \noun off
  4965. \color none
  4966. Non-globin Reads
  4967. \end_layout
  4968. \end_inset
  4969. </cell>
  4970. <cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" usebox="none">
  4971. \begin_inset Text
  4972. \begin_layout Plain Layout
  4973. \family roman
  4974. \series medium
  4975. \shape up
  4976. \size normal
  4977. \emph off
  4978. \bar no
  4979. \strikeout off
  4980. \xout off
  4981. \uuline off
  4982. \uwave off
  4983. \noun off
  4984. \color none
  4985. Globin Reads
  4986. \end_layout
  4987. \end_inset
  4988. </cell>
  4989. <cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" usebox="none">
  4990. \begin_inset Text
  4991. \begin_layout Plain Layout
  4992. \family roman
  4993. \series medium
  4994. \shape up
  4995. \size normal
  4996. \emph off
  4997. \bar no
  4998. \strikeout off
  4999. \xout off
  5000. \uuline off
  5001. \uwave off
  5002. \noun off
  5003. \color none
  5004. All Genic Reads
  5005. \end_layout
  5006. \end_inset
  5007. </cell>
  5008. <cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" usebox="none">
  5009. \begin_inset Text
  5010. \begin_layout Plain Layout
  5011. \family roman
  5012. \series medium
  5013. \shape up
  5014. \size normal
  5015. \emph off
  5016. \bar no
  5017. \strikeout off
  5018. \xout off
  5019. \uuline off
  5020. \uwave off
  5021. \noun off
  5022. \color none
  5023. All Aligned Reads
  5024. \end_layout
  5025. \end_inset
  5026. </cell>
  5027. <cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" usebox="none">
  5028. \begin_inset Text
  5029. \begin_layout Plain Layout
  5030. \family roman
  5031. \series medium
  5032. \shape up
  5033. \size normal
  5034. \emph off
  5035. \bar no
  5036. \strikeout off
  5037. \xout off
  5038. \uuline off
  5039. \uwave off
  5040. \noun off
  5041. \color none
  5042. Non-globin Reads
  5043. \end_layout
  5044. \end_inset
  5045. </cell>
  5046. <cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" rightline="true" usebox="none">
  5047. \begin_inset Text
  5048. \begin_layout Plain Layout
  5049. \family roman
  5050. \series medium
  5051. \shape up
  5052. \size normal
  5053. \emph off
  5054. \bar no
  5055. \strikeout off
  5056. \xout off
  5057. \uuline off
  5058. \uwave off
  5059. \noun off
  5060. \color none
  5061. Globin Reads
  5062. \end_layout
  5063. \end_inset
  5064. </cell>
  5065. </row>
  5066. <row>
  5067. <cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
  5068. \begin_inset Text
  5069. \begin_layout Plain Layout
  5070. \family roman
  5071. \series medium
  5072. \shape up
  5073. \size normal
  5074. \emph off
  5075. \bar no
  5076. \strikeout off
  5077. \xout off
  5078. \uuline off
  5079. \uwave off
  5080. \noun off
  5081. \color none
  5082. Yes
  5083. \end_layout
  5084. \end_inset
  5085. </cell>
  5086. <cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
  5087. \begin_inset Text
  5088. \begin_layout Plain Layout
  5089. \family roman
  5090. \series medium
  5091. \shape up
  5092. \size normal
  5093. \emph off
  5094. \bar no
  5095. \strikeout off
  5096. \xout off
  5097. \uuline off
  5098. \uwave off
  5099. \noun off
  5100. \color none
  5101. 50.4% ± 6.82
  5102. \end_layout
  5103. \end_inset
  5104. </cell>
  5105. <cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
  5106. \begin_inset Text
  5107. \begin_layout Plain Layout
  5108. \family roman
  5109. \series medium
  5110. \shape up
  5111. \size normal
  5112. \emph off
  5113. \bar no
  5114. \strikeout off
  5115. \xout off
  5116. \uuline off
  5117. \uwave off
  5118. \noun off
  5119. \color none
  5120. 3.48% ± 2.94
  5121. \end_layout
  5122. \end_inset
  5123. </cell>
  5124. <cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
  5125. \begin_inset Text
  5126. \begin_layout Plain Layout
  5127. \family roman
  5128. \series medium
  5129. \shape up
  5130. \size normal
  5131. \emph off
  5132. \bar no
  5133. \strikeout off
  5134. \xout off
  5135. \uuline off
  5136. \uwave off
  5137. \noun off
  5138. \color none
  5139. 53.9% ± 6.81
  5140. \end_layout
  5141. \end_inset
  5142. </cell>
  5143. <cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
  5144. \begin_inset Text
  5145. \begin_layout Plain Layout
  5146. \family roman
  5147. \series medium
  5148. \shape up
  5149. \size normal
  5150. \emph off
  5151. \bar no
  5152. \strikeout off
  5153. \xout off
  5154. \uuline off
  5155. \uwave off
  5156. \noun off
  5157. \color none
  5158. 89.7% ± 2.40
  5159. \end_layout
  5160. \end_inset
  5161. </cell>
  5162. <cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
  5163. \begin_inset Text
  5164. \begin_layout Plain Layout
  5165. \family roman
  5166. \series medium
  5167. \shape up
  5168. \size normal
  5169. \emph off
  5170. \bar no
  5171. \strikeout off
  5172. \xout off
  5173. \uuline off
  5174. \uwave off
  5175. \noun off
  5176. \color none
  5177. 93.5% ± 5.25
  5178. \end_layout
  5179. \end_inset
  5180. </cell>
  5181. <cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
  5182. \begin_inset Text
  5183. \begin_layout Plain Layout
  5184. \family roman
  5185. \series medium
  5186. \shape up
  5187. \size normal
  5188. \emph off
  5189. \bar no
  5190. \strikeout off
  5191. \xout off
  5192. \uuline off
  5193. \uwave off
  5194. \noun off
  5195. \color none
  5196. 6.49% ± 5.25
  5197. \end_layout
  5198. \end_inset
  5199. </cell>
  5200. </row>
  5201. <row>
  5202. <cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" usebox="none">
  5203. \begin_inset Text
  5204. \begin_layout Plain Layout
  5205. \family roman
  5206. \series medium
  5207. \shape up
  5208. \size normal
  5209. \emph off
  5210. \bar no
  5211. \strikeout off
  5212. \xout off
  5213. \uuline off
  5214. \uwave off
  5215. \noun off
  5216. \color none
  5217. No
  5218. \end_layout
  5219. \end_inset
  5220. </cell>
  5221. <cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" usebox="none">
  5222. \begin_inset Text
  5223. \begin_layout Plain Layout
  5224. \family roman
  5225. \series medium
  5226. \shape up
  5227. \size normal
  5228. \emph off
  5229. \bar no
  5230. \strikeout off
  5231. \xout off
  5232. \uuline off
  5233. \uwave off
  5234. \noun off
  5235. \color none
  5236. 26.3% ± 8.95
  5237. \end_layout
  5238. \end_inset
  5239. </cell>
  5240. <cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" usebox="none">
  5241. \begin_inset Text
  5242. \begin_layout Plain Layout
  5243. \family roman
  5244. \series medium
  5245. \shape up
  5246. \size normal
  5247. \emph off
  5248. \bar no
  5249. \strikeout off
  5250. \xout off
  5251. \uuline off
  5252. \uwave off
  5253. \noun off
  5254. \color none
  5255. 44.6% ± 16.6
  5256. \end_layout
  5257. \end_inset
  5258. </cell>
  5259. <cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" usebox="none">
  5260. \begin_inset Text
  5261. \begin_layout Plain Layout
  5262. \family roman
  5263. \series medium
  5264. \shape up
  5265. \size normal
  5266. \emph off
  5267. \bar no
  5268. \strikeout off
  5269. \xout off
  5270. \uuline off
  5271. \uwave off
  5272. \noun off
  5273. \color none
  5274. 70.1% ± 9.38
  5275. \end_layout
  5276. \end_inset
  5277. </cell>
  5278. <cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" usebox="none">
  5279. \begin_inset Text
  5280. \begin_layout Plain Layout
  5281. \family roman
  5282. \series medium
  5283. \shape up
  5284. \size normal
  5285. \emph off
  5286. \bar no
  5287. \strikeout off
  5288. \xout off
  5289. \uuline off
  5290. \uwave off
  5291. \noun off
  5292. \color none
  5293. 90.7% ± 5.16
  5294. \end_layout
  5295. \end_inset
  5296. </cell>
  5297. <cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" usebox="none">
  5298. \begin_inset Text
  5299. \begin_layout Plain Layout
  5300. \family roman
  5301. \series medium
  5302. \shape up
  5303. \size normal
  5304. \emph off
  5305. \bar no
  5306. \strikeout off
  5307. \xout off
  5308. \uuline off
  5309. \uwave off
  5310. \noun off
  5311. \color none
  5312. 38.8% ± 17.1
  5313. \end_layout
  5314. \end_inset
  5315. </cell>
  5316. <cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" rightline="true" usebox="none">
  5317. \begin_inset Text
  5318. \begin_layout Plain Layout
  5319. \family roman
  5320. \series medium
  5321. \shape up
  5322. \size normal
  5323. \emph off
  5324. \bar no
  5325. \strikeout off
  5326. \xout off
  5327. \uuline off
  5328. \uwave off
  5329. \noun off
  5330. \color none
  5331. 61.2% ± 17.1
  5332. \end_layout
  5333. \end_inset
  5334. </cell>
  5335. </row>
  5336. </lyxtabular>
  5337. \end_inset
  5338. \end_layout
  5339. \begin_layout Plain Layout
  5340. \begin_inset Caption Standard
  5341. \begin_layout Plain Layout
  5342. \series bold
  5343. \begin_inset Argument 1
  5344. status collapsed
  5345. \begin_layout Plain Layout
  5346. Fractions of reads mapping to genomic features in GB and non-GB samples.
  5347. \end_layout
  5348. \end_inset
  5349. \begin_inset CommandInset label
  5350. LatexCommand label
  5351. name "tab:Fractions-of-reads"
  5352. \end_inset
  5353. Fractions of reads mapping to genomic features in GB and non-GB samples.
  5354. \series default
  5355. All values are given as mean ± standard deviation.
  5356. \end_layout
  5357. \end_inset
  5358. \end_layout
  5359. \begin_layout Plain Layout
  5360. \end_layout
  5361. \end_inset
  5362. \end_layout
  5363. \begin_layout Standard
  5364. Another important aspect is that the standard deviations in Table
  5365. \begin_inset CommandInset ref
  5366. LatexCommand ref
  5367. reference "tab:Fractions-of-reads"
  5368. plural "false"
  5369. caps "false"
  5370. noprefix "false"
  5371. \end_inset
  5372. are uniformly smaller in the GB samples than the non-GB ones, indicating
  5373. much greater consistency of yield.
  5374. This is best seen in the percentage of non-globin reads as a fraction of
  5375. total reads aligned to annotated genes (genic reads).
  5376. For the non-GB samples, this measure ranges from 10.9% to 80.9%, while for
  5377. the GB samples it ranges from 81.9% to 99.9% (Figure
  5378. \begin_inset CommandInset ref
  5379. LatexCommand ref
  5380. reference "fig:Fraction-of-genic-reads"
  5381. plural "false"
  5382. caps "false"
  5383. noprefix "false"
  5384. \end_inset
  5385. ).
  5386. This means that for applications where it is critical that each sample
  5387. achieve a specified minimum coverage in order to provide useful information,
  5388. it would be necessary to budget up to 10 times the sequencing depth per
  5389. sample without globin blocking, even though the average yield improvement
  5390. for globin blocking is only 2-fold, because every sample has a chance of
  5391. being 90% globin and 10% useful reads.
  5392. Hence, the more consistent behavior of GB samples makes planning an experiment
  5393. easier and more efficient because it eliminates the need to over-sequence
  5394. every sample in order to guard against the worst case of a high-globin
  5395. fraction.
  5396. \end_layout
  5397. \begin_layout Subsection
  5398. Globin blocking lowers the noise floor and allows detection of about 2000
  5399. more genes
  5400. \end_layout
  5401. \begin_layout Standard
  5402. \begin_inset Flex TODO Note (inline)
  5403. status open
  5404. \begin_layout Plain Layout
  5405. Remove redundant titles from figures
  5406. \end_layout
  5407. \end_inset
  5408. \end_layout
  5409. \begin_layout Standard
  5410. \begin_inset Float figure
  5411. wide false
  5412. sideways false
  5413. status open
  5414. \begin_layout Plain Layout
  5415. \align center
  5416. \begin_inset Graphics
  5417. filename graphics/Globin Paper/figure2 - aveLogCPM-colored.pdf
  5418. \end_inset
  5419. \end_layout
  5420. \begin_layout Plain Layout
  5421. \begin_inset Caption Standard
  5422. \begin_layout Plain Layout
  5423. \series bold
  5424. \begin_inset Argument 1
  5425. status collapsed
  5426. \begin_layout Plain Layout
  5427. Distributions of average group gene abundances when normalized separately
  5428. or together.
  5429. \end_layout
  5430. \end_inset
  5431. \begin_inset CommandInset label
  5432. LatexCommand label
  5433. name "fig:logcpm-dists"
  5434. \end_inset
  5435. Distributions of average group gene abundances when normalized separately
  5436. or together.
  5437. \series default
  5438. All reads in each sequencing library were aligned to the cyno genome, and
  5439. the number of reads uniquely aligning to each gene was counted.
  5440. Genes with zero counts in all libraries were discarded.
  5441. Libraries were normalized using the TMM method.
  5442. Libraries were split into globin-blocked (GB) and non-GB groups and the
  5443. average abundance for each gene in both groups, measured in log2 counts
  5444. per million reads counted, was computed using the aveLogCPM function.
  5445. The distribution of average gene logCPM values was plotted for both groups
  5446. using a kernel density plot to approximate a continuous distribution.
  5447. The logCPM GB distributions are marked in red, non-GB in blue.
  5448. The black vertical line denotes the chosen detection threshold of -1.
  5449. Top panel: Libraries were split into GB and non-GB groups first and normalized
  5450. separately.
  5451. Bottom panel: Libraries were all normalized together first and then split
  5452. into groups.
  5453. \end_layout
  5454. \end_inset
  5455. \end_layout
  5456. \begin_layout Plain Layout
  5457. \end_layout
  5458. \end_inset
  5459. \end_layout
  5460. \begin_layout Standard
  5461. Since globin blocking yields more usable sequencing depth, it should also
  5462. allow detection of more genes at any given threshold.
  5463. When we looked at the distribution of average normalized logCPM values
  5464. across all libraries for genes with at least one read assigned to them,
  5465. we observed the expected bimodal distribution, with a high-abundance "signal"
  5466. peak representing detected genes and a low-abundance "noise" peak representing
  5467. genes whose read count did not rise above the noise floor (Figure
  5468. \begin_inset CommandInset ref
  5469. LatexCommand ref
  5470. reference "fig:logcpm-dists"
  5471. plural "false"
  5472. caps "false"
  5473. noprefix "false"
  5474. \end_inset
  5475. ).
  5476. Consistent with the 2-fold increase in raw counts assigned to non-globin
  5477. genes, the signal peak for GB samples is shifted to the right relative
  5478. to the non-GB signal peak.
  5479. When all the samples are normalized together, this difference is normalized
  5480. out, lining up the signal peaks, and this reveals that, as expected, the
  5481. noise floor for the GB samples is about 2-fold lower.
  5482. This greater separation between signal and noise peaks in the GB samples
  5483. means that low-expression genes should be more easily detected and more
  5484. precisely quantified than in the non-GB samples.
  5485. \end_layout
  5486. \begin_layout Standard
  5487. \begin_inset Float figure
  5488. wide false
  5489. sideways false
  5490. status open
  5491. \begin_layout Plain Layout
  5492. \align center
  5493. \begin_inset Graphics
  5494. filename graphics/Globin Paper/figure3 - detection.pdf
  5495. \end_inset
  5496. \end_layout
  5497. \begin_layout Plain Layout
  5498. \begin_inset Caption Standard
  5499. \begin_layout Plain Layout
  5500. \series bold
  5501. \begin_inset Argument 1
  5502. status collapsed
  5503. \begin_layout Plain Layout
  5504. Gene detections as a function of abundance thresholds in globin-blocked
  5505. (GB) and non-GB samples.
  5506. \end_layout
  5507. \end_inset
  5508. \begin_inset CommandInset label
  5509. LatexCommand label
  5510. name "fig:Gene-detections"
  5511. \end_inset
  5512. Gene detections as a function of abundance thresholds in globin-blocked
  5513. (GB) and non-GB samples.
  5514. \series default
  5515. Average abundance (logCPM,
  5516. \begin_inset Formula $\log_{2}$
  5517. \end_inset
  5518. counts per million reads counted) was computed by separate group normalization
  5519. as described in Figure
  5520. \begin_inset CommandInset ref
  5521. LatexCommand ref
  5522. reference "fig:logcpm-dists"
  5523. plural "false"
  5524. caps "false"
  5525. noprefix "false"
  5526. \end_inset
  5527. for both the GB and non-GB groups, as well as for all samples considered
  5528. as one large group.
  5529. For every integer threshold from -2 to 3, the number of genes detected
  5530. at or above that logCPM threshold was plotted for each group.
  5531. \end_layout
  5532. \end_inset
  5533. \end_layout
  5534. \begin_layout Plain Layout
  5535. \end_layout
  5536. \end_inset
  5537. \end_layout
  5538. \begin_layout Standard
  5539. Based on these distributions, we selected a detection threshold of -1, which
  5540. is approximately the leftmost edge of the trough between the signal and
  5541. noise peaks.
  5542. This represents the most liberal possible detection threshold that doesn't
  5543. call substantial numbers of noise genes as detected.
  5544. Among the full dataset, 13429 genes were detected at this threshold, and
  5545. 22276 were not.
  5546. When considering the GB libraries and non-GB libraries separately and re-comput
  5547. ing normalization factors independently within each group, 14535 genes were
  5548. detected in the GB libraries while only 12460 were detected in the non-GB
  5549. libraries.
  5550. Thus, GB allowed the detection of about 2000 extra genes that were buried under
  5551. the noise floor without GB.
  5552. This pattern of at least 2000 additional genes detected with GB was also
  5553. consistent across a wide range of possible detection thresholds, from -2
  5554. to 3 (see Figure
  5555. \begin_inset CommandInset ref
  5556. LatexCommand ref
  5557. reference "fig:Gene-detections"
  5558. plural "false"
  5559. caps "false"
  5560. noprefix "false"
  5561. \end_inset
  5562. ).
  5563. \end_layout
  5564. \begin_layout Subsection
  5565. Globin blocking does not add significant additional noise or decrease sample
  5566. quality
  5567. \end_layout
  5568. \begin_layout Standard
  5569. One potential worry is that the globin blocking protocol could perturb the
  5570. levels of non-globin genes.
  5571. There are two kinds of possible perturbations: systematic and random.
  5572. The former is not a major concern for detection of differential expression,
  5573. since a 2-fold change in every sample has no effect on the relative fold
  5574. change between samples.
  5575. In contrast, random perturbations would increase the noise and obscure
  5576. the signal in the dataset, reducing the capacity to detect differential
  5577. expression.
  5578. \end_layout
  5579. \begin_layout Standard
  5580. \begin_inset Float figure
  5581. wide false
  5582. sideways false
  5583. status open
  5584. \begin_layout Plain Layout
  5585. \align center
  5586. \begin_inset Graphics
  5587. filename graphics/Globin Paper/figure4 - maplot-colored.pdf
  5588. \end_inset
  5589. \end_layout
  5590. \begin_layout Plain Layout
  5591. \begin_inset Caption Standard
  5592. \begin_layout Plain Layout
  5593. \begin_inset Argument 1
  5594. status collapsed
  5595. \begin_layout Plain Layout
  5596. MA plot showing effects of globin blocking on each gene's abundance.
  5597. \end_layout
  5598. \end_inset
  5599. \begin_inset CommandInset label
  5600. LatexCommand label
  5601. name "fig:MA-plot"
  5602. \end_inset
  5603. \series bold
  5604. MA plot showing effects of globin blocking on each gene's abundance.
  5605. \series default
  5606. All libraries were normalized together as described in Figure
  5607. \begin_inset CommandInset ref
  5608. LatexCommand ref
  5609. reference "fig:logcpm-dists"
  5610. plural "false"
  5611. caps "false"
  5612. noprefix "false"
  5613. \end_inset
  5614. , and genes with an average logCPM below -1 were filtered out.
  5615. Each remaining gene was tested for differential abundance with respect
  5616. to globin blocking (GB) using edgeR’s quasi-likelihood F-test, fitting a
  5617. negative binomial generalized linear model to the table of read counts in each
  5618. library.
  5619. For each gene, edgeR reported average abundance (logCPM),
  5620. \begin_inset Formula $\log_{2}$
  5621. \end_inset
  5622. fold change (logFC), p-value, and Benjamini-Hochberg adjusted false discovery
  5623. rate (FDR).
  5624. Each gene's logFC was plotted against its logCPM, colored by FDR.
  5625. Red points are significant at ≤10% FDR, and blue are not significant at
  5626. that threshold.
  5627. The alpha and beta globin genes targeted for blocking are marked with large
  5628. triangles, while all other genes are represented as small points.
  5629. \end_layout
  5630. \end_inset
  5631. \end_layout
  5632. \begin_layout Plain Layout
  5633. \end_layout
  5634. \end_inset
  5635. \end_layout
  5636. \begin_layout Standard
  5637. \begin_inset Flex TODO Note (inline)
  5638. status open
  5639. \begin_layout Plain Layout
  5640. Standardize on
  5641. \begin_inset Quotes eld
  5642. \end_inset
  5643. log2
  5644. \begin_inset Quotes erd
  5645. \end_inset
  5646. notation
  5647. \end_layout
  5648. \end_inset
  5649. \end_layout
  5650. \begin_layout Standard
  5651. The data do indeed show small systematic perturbations in gene levels (Figure
  5652. \begin_inset CommandInset ref
  5653. LatexCommand ref
  5654. reference "fig:MA-plot"
  5655. plural "false"
  5656. caps "false"
  5657. noprefix "false"
  5658. \end_inset
  5659. ).
  5660. Other than the 3 designated alpha and beta globin genes, two other genes
  5661. stand out as having especially large negative log fold changes: HBD and
  5662. LOC1021365.
  5663. HBD, delta globin, is most likely targeted by the blocking oligos due to
  5664. high sequence homology with the other globin genes.
  5665. LOC1021365 is the aforementioned ncRNA that is reverse-complementary to
  5666. one of the alpha-like genes and that would be expected to be removed during
  5667. the globin blocking step.
  5668. All other genes appear in a cluster centered vertically at 0, and the vast
  5669. majority of genes in this cluster show an absolute log2(FC) of 0.5 or less.
  5670. Nevertheless, many of these small perturbations are still statistically
  5671. significant, indicating that the globin blocking oligos likely cause very
  5672. small but non-zero systematic perturbations in measured gene expression
  5673. levels.
  5674. \end_layout
  5675. \begin_layout Standard
  5676. \begin_inset Float figure
  5677. wide false
  5678. sideways false
  5679. status open
  5680. \begin_layout Plain Layout
  5681. \align center
  5682. \begin_inset Graphics
  5683. filename graphics/Globin Paper/figure5 - corrplot.pdf
  5684. \end_inset
  5685. \end_layout
  5686. \begin_layout Plain Layout
  5687. \begin_inset Caption Standard
  5688. \begin_layout Plain Layout
  5689. \series bold
  5690. \begin_inset Argument 1
  5691. status collapsed
  5692. \begin_layout Plain Layout
  5693. Comparison of inter-sample gene abundance correlations with and without
  5694. globin blocking.
  5695. \end_layout
  5696. \end_inset
  5697. \begin_inset CommandInset label
  5698. LatexCommand label
  5699. name "fig:gene-abundance-correlations"
  5700. \end_inset
  5701. Comparison of inter-sample gene abundance correlations with and without
  5702. globin blocking (GB).
  5703. \series default
  5704. All libraries were normalized together as described in Figure 2, and genes
  5705. with an average abundance (logCPM, log2 counts per million reads counted)
  5706. less than -1 were filtered out.
  5707. Each gene’s logCPM was computed in each library using the edgeR cpm function.
  5708. For each pair of biological samples, the Pearson correlation between those
  5709. samples' GB libraries was plotted against the correlation between the same
  5710. samples’ non-GB libraries.
  5711. Each point represents a unique pair of samples.
  5712. The solid gray line shows a quantile-quantile plot of the distribution of GB
  5713. correlations vs.
  5714. that of non-GB correlations.
  5715. The thin dashed line is the identity line, provided for reference.
  5716. \end_layout
  5717. \end_inset
  5718. \end_layout
  5719. \begin_layout Plain Layout
  5720. \end_layout
  5721. \end_inset
  5722. \end_layout
  5723. \begin_layout Standard
  5724. To evaluate the possibility of globin blocking causing random perturbations
  5725. and reducing sample quality, we computed the Pearson correlation between
  5726. logCPM values for every pair of samples with and without GB and plotted
  5727. them against each other (Figure
  5728. \begin_inset CommandInset ref
  5729. LatexCommand ref
  5730. reference "fig:gene-abundance-correlations"
  5731. plural "false"
  5732. caps "false"
  5733. noprefix "false"
  5734. \end_inset
  5735. ).
  5736. The plot indicated that the GB libraries have higher sample-to-sample correlati
  5737. ons than the non-GB libraries.
  5738. Parametric and nonparametric tests for differences between the correlations
  5739. with and without GB both confirmed that this difference was highly significant
  5740. (2-sided paired t-test: t = 37.2, df = 665, P ≪ 2.2e-16; 2-sided Wilcoxon
  5741. sign-rank test: V = 2195, P ≪ 2.2e-16).
  5742. Performing the same tests on the Spearman correlations gave the same conclusion
  5743. (t-test: t = 26.8, df = 665, P ≪ 2.2e-16; sign-rank test: V = 8781, P ≪ 2.2e-16).
  5744. We used the edgeR package to compute the overall biological coefficient
  5745. of variation (BCV) for GB and non-GB libraries and found that globin blocking
  5746. resulted in a negligible increase in the BCV (0.417 with GB vs.
  5747. 0.400 without).
  5748. The near equality of the BCVs for both sets indicates that the higher correlati
  5749. ons in the GB libraries are most likely a result of the increased yield
  5750. of useful reads, which reduces the contribution of Poisson counting uncertainty
  5751. to the overall variance of the logCPM values
  5752. \begin_inset CommandInset citation
  5753. LatexCommand cite
  5754. key "McCarthy2012"
  5755. literal "false"
  5756. \end_inset
  5757. .
  5758. This improves the precision of expression measurements and more than offsets
  5759. the negligible increase in BCV.
  5760. \end_layout
  5761. \begin_layout Subsection
  5762. More differentially expressed genes are detected with globin blocking
  5763. \end_layout
  5764. \begin_layout Standard
  5765. \begin_inset Float table
  5766. wide false
  5767. sideways false
  5768. status open
  5769. \begin_layout Plain Layout
  5770. \align center
  5771. \begin_inset Tabular
  5772. <lyxtabular version="3" rows="5" columns="5">
  5773. <features tabularvalignment="middle">
  5774. <column alignment="center" valignment="top">
  5775. <column alignment="center" valignment="top">
  5776. <column alignment="center" valignment="top">
  5777. <column alignment="center" valignment="top">
  5778. <column alignment="center" valignment="top">
  5779. <row>
  5780. <cell alignment="center" valignment="top" usebox="none">
  5781. \begin_inset Text
  5782. \begin_layout Plain Layout
  5783. \end_layout
  5784. \end_inset
  5785. </cell>
  5786. <cell alignment="center" valignment="top" usebox="none">
  5787. \begin_inset Text
  5788. \begin_layout Plain Layout
  5789. \end_layout
  5790. \end_inset
  5791. </cell>
  5792. <cell multicolumn="1" alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
  5793. \begin_inset Text
  5794. \begin_layout Plain Layout
  5795. \series bold
  5796. No Globin Blocking
  5797. \end_layout
  5798. \end_inset
  5799. </cell>
  5800. <cell multicolumn="2" alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" usebox="none">
  5801. \begin_inset Text
  5802. \begin_layout Plain Layout
  5803. \end_layout
  5804. \end_inset
  5805. </cell>
  5806. <cell multicolumn="2" alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" rightline="true" usebox="none">
  5807. \begin_inset Text
  5808. \begin_layout Plain Layout
  5809. \end_layout
  5810. \end_inset
  5811. </cell>
  5812. </row>
  5813. <row>
  5814. <cell alignment="center" valignment="top" usebox="none">
  5815. \begin_inset Text
  5816. \begin_layout Plain Layout
  5817. \end_layout
  5818. \end_inset
  5819. </cell>
  5820. <cell alignment="center" valignment="top" usebox="none">
  5821. \begin_inset Text
  5822. \begin_layout Plain Layout
  5823. \end_layout
  5824. \end_inset
  5825. </cell>
  5826. <cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
  5827. \begin_inset Text
  5828. \begin_layout Plain Layout
  5829. \series bold
  5830. Up
  5831. \end_layout
  5832. \end_inset
  5833. </cell>
  5834. <cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
  5835. \begin_inset Text
  5836. \begin_layout Plain Layout
  5837. \series bold
  5838. NS
  5839. \end_layout
  5840. \end_inset
  5841. </cell>
  5842. <cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
  5843. \begin_inset Text
  5844. \begin_layout Plain Layout
  5845. \series bold
  5846. Down
  5847. \end_layout
  5848. \end_inset
  5849. </cell>
  5850. </row>
  5851. <row>
  5852. <cell multirow="3" alignment="center" valignment="middle" topline="true" bottomline="true" leftline="true" usebox="none">
  5853. \begin_inset Text
  5854. \begin_layout Plain Layout
  5855. \series bold
  5856. Globin Blocking
  5857. \end_layout
  5858. \end_inset
  5859. </cell>
  5860. <cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
  5861. \begin_inset Text
  5862. \begin_layout Plain Layout
  5863. \series bold
  5864. Up
  5865. \end_layout
  5866. \end_inset
  5867. </cell>
  5868. <cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
  5869. \begin_inset Text
  5870. \begin_layout Plain Layout
  5871. \family roman
  5872. \series medium
  5873. \shape up
  5874. \size normal
  5875. \emph off
  5876. \bar no
  5877. \strikeout off
  5878. \xout off
  5879. \uuline off
  5880. \uwave off
  5881. \noun off
  5882. \color none
  5883. 231
  5884. \end_layout
  5885. \end_inset
  5886. </cell>
  5887. <cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
  5888. \begin_inset Text
  5889. \begin_layout Plain Layout
  5890. \family roman
  5891. \series medium
  5892. \shape up
  5893. \size normal
  5894. \emph off
  5895. \bar no
  5896. \strikeout off
  5897. \xout off
  5898. \uuline off
  5899. \uwave off
  5900. \noun off
  5901. \color none
  5902. 515
  5903. \end_layout
  5904. \end_inset
  5905. </cell>
  5906. <cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
  5907. \begin_inset Text
  5908. \begin_layout Plain Layout
  5909. \family roman
  5910. \series medium
  5911. \shape up
  5912. \size normal
  5913. \emph off
  5914. \bar no
  5915. \strikeout off
  5916. \xout off
  5917. \uuline off
  5918. \uwave off
  5919. \noun off
  5920. \color none
  5921. 2
  5922. \end_layout
  5923. \end_inset
  5924. </cell>
  5925. </row>
  5926. <row>
  5927. <cell multirow="4" alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
  5928. \begin_inset Text
  5929. \begin_layout Plain Layout
  5930. \end_layout
  5931. \end_inset
  5932. </cell>
  5933. <cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
  5934. \begin_inset Text
  5935. \begin_layout Plain Layout
  5936. \series bold
  5937. NS
  5938. \end_layout
  5939. \end_inset
  5940. </cell>
  5941. <cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
  5942. \begin_inset Text
  5943. \begin_layout Plain Layout
  5944. \family roman
  5945. \series medium
  5946. \shape up
  5947. \size normal
  5948. \emph off
  5949. \bar no
  5950. \strikeout off
  5951. \xout off
  5952. \uuline off
  5953. \uwave off
  5954. \noun off
  5955. \color none
  5956. 160
  5957. \end_layout
  5958. \end_inset
  5959. </cell>
  5960. <cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
  5961. \begin_inset Text
  5962. \begin_layout Plain Layout
  5963. \family roman
  5964. \series medium
  5965. \shape up
  5966. \size normal
  5967. \emph off
  5968. \bar no
  5969. \strikeout off
  5970. \xout off
  5971. \uuline off
  5972. \uwave off
  5973. \noun off
  5974. \color none
  5975. 11235
  5976. \end_layout
  5977. \end_inset
  5978. </cell>
  5979. <cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
  5980. \begin_inset Text
  5981. \begin_layout Plain Layout
  5982. \family roman
  5983. \series medium
  5984. \shape up
  5985. \size normal
  5986. \emph off
  5987. \bar no
  5988. \strikeout off
  5989. \xout off
  5990. \uuline off
  5991. \uwave off
  5992. \noun off
  5993. \color none
  5994. 136
  5995. \end_layout
  5996. \end_inset
  5997. </cell>
  5998. </row>
  5999. <row>
  6000. <cell multirow="4" alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" usebox="none">
  6001. \begin_inset Text
  6002. \begin_layout Plain Layout
  6003. \end_layout
  6004. \end_inset
  6005. </cell>
  6006. <cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" usebox="none">
  6007. \begin_inset Text
  6008. \begin_layout Plain Layout
  6009. \series bold
  6010. Down
  6011. \end_layout
  6012. \end_inset
  6013. </cell>
  6014. <cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" usebox="none">
  6015. \begin_inset Text
  6016. \begin_layout Plain Layout
  6017. \family roman
  6018. \series medium
  6019. \shape up
  6020. \size normal
  6021. \emph off
  6022. \bar no
  6023. \strikeout off
  6024. \xout off
  6025. \uuline off
  6026. \uwave off
  6027. \noun off
  6028. \color none
  6029. 0
  6030. \end_layout
  6031. \end_inset
  6032. </cell>
  6033. <cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" usebox="none">
  6034. \begin_inset Text
  6035. \begin_layout Plain Layout
  6036. \family roman
  6037. \series medium
  6038. \shape up
  6039. \size normal
  6040. \emph off
  6041. \bar no
  6042. \strikeout off
  6043. \xout off
  6044. \uuline off
  6045. \uwave off
  6046. \noun off
  6047. \color none
  6048. 548
  6049. \end_layout
  6050. \end_inset
  6051. </cell>
  6052. <cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" rightline="true" usebox="none">
  6053. \begin_inset Text
  6054. \begin_layout Plain Layout
  6055. \family roman
  6056. \series medium
  6057. \shape up
  6058. \size normal
  6059. \emph off
  6060. \bar no
  6061. \strikeout off
  6062. \xout off
  6063. \uuline off
  6064. \uwave off
  6065. \noun off
  6066. \color none
  6067. 127
  6068. \end_layout
  6069. \end_inset
  6070. </cell>
  6071. </row>
  6072. </lyxtabular>
  6073. \end_inset
  6074. \end_layout
  6075. \begin_layout Plain Layout
  6076. \begin_inset Caption Standard
  6077. \begin_layout Plain Layout
  6078. \series bold
  6079. \begin_inset Argument 1
  6080. status open
  6081. \begin_layout Plain Layout
  6082. Comparison of significantly differentially expressed genes with and without
  6083. globin blocking.
  6084. \end_layout
  6085. \end_inset
  6086. \begin_inset CommandInset label
  6087. LatexCommand label
  6088. name "tab:Comparison-of-significant"
  6089. \end_inset
  6090. Comparison of significantly differentially expressed genes with and without
  6091. globin blocking.
  6092. \series default
  6093. Up, Down: Genes significantly up/down-regulated in post-transplant samples
  6094. relative to pre-transplant samples, with a false discovery rate of 10%
  6095. or less.
  6096. NS: Non-significant genes (false discovery rate greater than 10%).
  6097. \end_layout
  6098. \end_inset
  6099. \end_layout
  6100. \begin_layout Plain Layout
  6101. \end_layout
  6102. \end_inset
  6103. \end_layout
  6104. \begin_layout Standard
  6105. To compare performance on differential gene expression tests, we took subsets
  6106. of both the GB and non-GB libraries with exactly one pre-transplant and
  6107. one post-transplant sample for each animal that had paired samples available
  6108. for analysis (N=7 animals, N=14 samples in each subset).
  6109. The same test for pre- vs.
  6110. post-transplant differential gene expression was performed on the same
  6111. 7 pairs of samples from GB libraries and non-GB libraries, in each case
  6112. using an FDR of 10% as the threshold of significance.
  6113. Out of 12954 genes that passed the detection threshold in both subsets,
  6114. 358 were called significantly differentially expressed in the same direction
  6115. in both sets; 1063 were differentially expressed in the GB set only; 296
  6116. were differentially expressed in the non-GB set only; 2 genes were called
  6117. significantly up in the GB set but significantly down in the non-GB set;
  6118. and the remaining 11235 were not called differentially expressed in either
  6119. set.
  6120. These data are summarized in Table
  6121. \begin_inset CommandInset ref
  6122. LatexCommand ref
  6123. reference "tab:Comparison-of-significant"
  6124. plural "false"
  6125. caps "false"
  6126. noprefix "false"
  6127. \end_inset
  6128. .
  6129. The differences in BCV calculated by edgeR for these subsets of samples
  6130. were negligible (BCV = 0.302 for GB and 0.297 for non-GB).
  6131. \end_layout
  6132. \begin_layout Standard
  6133. The key point is that the GB data result in substantially more differentially
  6134. expressed calls than the non-GB data.
  6135. Since there is no gold standard for this dataset, it is impossible to be
  6136. certain whether this is due to under-calling of differential expression
  6137. in the non-GB samples or over-calling in the GB samples.
  6138. However, given that both datasets are derived from the same biological
  6139. samples and have nearly equal BCVs, it is more likely that the larger number
  6140. of DE calls in the GB samples are genuine detections that were enabled
  6141. by the higher sequencing depth and measurement precision of the GB samples.
  6142. Note that the same set of genes was considered in both subsets, so the
  6143. larger number of differentially expressed gene calls in the GB data set
  6144. reflects a greater sensitivity to detect significant differential gene
  6145. expression and not simply the larger total number of detected genes in
  6146. GB samples described earlier.
  6147. \end_layout
  6148. \begin_layout Section
  6149. Discussion
  6150. \end_layout
  6151. \begin_layout Standard
  6152. The original experience with whole blood gene expression profiling on DNA
  6153. microarrays demonstrated that the high concentration of globin transcripts
  6154. interfered with the detection of genes with relatively low expression
  6155. levels, in effect significantly reducing the sensitivity.
  6156. To address this limitation, commercial protocols for globin reduction were
  6157. developed based on strategies to block globin transcript amplification
  6158. during labeling or physically removing globin transcripts by affinity bead
  6159. methods
  6160. \begin_inset CommandInset citation
  6161. LatexCommand cite
  6162. key "Winn2010"
  6163. literal "false"
  6164. \end_inset
  6165. .
  6166. More recently, using the latest generation of labeling protocols and arrays,
  6167. it was determined that globin reduction was no longer necessary to obtain
  6168. sufficient sensitivity to detect differential transcript expression
  6169. \begin_inset CommandInset citation
  6170. LatexCommand cite
  6171. key "NuGEN2010"
  6172. literal "false"
  6173. \end_inset
  6174. .
  6175. However, we are not aware of any publications using these currently available
  6176. protocols with the latest generation of microarrays that actually compare
  6177. the detection sensitivity with and without globin reduction.
  6178. Nevertheless, omitting globin reduction has now been generally adopted in
  6179. practice, primarily driven by concerns for cost control.
  6180. The main objective of our work was to directly test the impact of globin
  6181. gene transcripts and a new globin blocking protocol for application to
  6182. the newest generation of differential gene expression profiling determined
  6183. using next-generation sequencing.
  6184. \end_layout
  6185. \begin_layout Standard
  6186. The challenge of doing global gene expression profiling in cynomolgus monkeys
  6187. is that the currently available arrays were never designed to comprehensively
  6188. cover this genome and have not been updated since the first assemblies
  6189. of the cynomolgus genome were published.
  6190. Therefore, we determined that the best strategy for peripheral blood profiling
  6191. was to do deep RNA-seq and inform the workflow using the latest available
  6192. genome assembly and annotation
  6193. \begin_inset CommandInset citation
  6194. LatexCommand cite
  6195. key "Wilson2013"
  6196. literal "false"
  6197. \end_inset
  6198. .
  6199. However, it was not immediately clear whether globin reduction was necessary
  6200. for RNA-seq or how much improvement in efficiency or sensitivity to detect
  6201. differential gene expression would be achieved for the added cost and work.
  6202. \end_layout
  6203. \begin_layout Standard
  6204. We only found one report that demonstrated that globin reduction significantly
  6205. improved the effective read yields for sequencing of human peripheral blood
  6206. cell RNA using a DeepSAGE protocol
  6207. \begin_inset CommandInset citation
  6208. LatexCommand cite
  6209. key "Mastrokolias2012"
  6210. literal "false"
  6211. \end_inset
  6212. .
  6213. The approach to DeepSAGE involves two different restriction enzymes that
  6214. purify and then tag small fragments of transcripts at specific locations
  6215. and thus significantly reduces the complexity of the transcriptome.
  6216. Therefore, we could not determine how DeepSAGE results would translate
  6217. to the common strategy in the field for assaying the entire transcript
  6218. population by whole-transcriptome 3’-end RNA-seq.
  6219. Furthermore, if globin reduction is necessary, we also needed a globin
  6220. reduction method specific to cynomolgus globin sequences that would work
  6221. in an organism for which no kit is available off the shelf.
  6222. \end_layout
  6223. \begin_layout Standard
  6224. As mentioned above, the addition of globin blocking oligos has a very small
  6225. impact on measured gene expression levels.
  6226. However, this is a non-issue for the purposes of differential expression
  6227. testing, since a systematic change in a gene in all samples does not affect
  6228. relative expression levels between samples.
  6229. However, we must acknowledge that simple comparisons of gene expression
  6230. data obtained by GB and non-GB protocols are not possible without additional
  6231. normalization.
  6232. \end_layout
  6233. \begin_layout Standard
  6234. More importantly, globin blocking not only nearly doubles the yield of usable
  6235. reads, it also increases inter-sample correlation and sensitivity to detect
  6236. differential gene expression relative to the same set of samples profiled
  6237. without blocking.
  6238. In addition, globin blocking does not add a significant amount of random
  6239. noise to the data.
  6240. Globin blocking thus represents a cost-effective way to squeeze more data
  6241. and statistical power out of the same blood samples and the same amount
  6242. of sequencing.
  6243. In conclusion, globin reduction greatly increases the yield of useful RNA-seq
  6244. reads mapping to the rest of the genome, with minimal perturbations in
  6245. the relative levels of non-globin genes.
  6246. Based on these results, globin transcript reduction using sequence-specific,
  6247. complementary blocking oligonucleotides is recommended for all deep RNA-seq
  6248. of cynomolgus and other nonhuman primate blood samples.
  6249. \end_layout
  6250. \begin_layout Chapter
  6251. Future Directions
  6252. \end_layout
  6253. \begin_layout Standard
  6254. \begin_inset Flex TODO Note (inline)
  6255. status open
  6256. \begin_layout Plain Layout
  6257. Consider per-chapter future directions.
  6258. Check instructions.
  6259. \end_layout
  6260. \end_inset
  6261. \end_layout
  6262. \begin_layout Itemize
  6263. Study other epigenetic marks in more contexts
  6264. \end_layout
  6265. \begin_deeper
  6266. \begin_layout Itemize
  6267. DNA methylation, histone marks, chromatin accessibility & conformation in
  6268. CD4 T-cells
  6269. \end_layout
  6270. \begin_layout Itemize
  6271. Also look at other types of lymphocytes: CD8 T-cells, B-cells, NK cells
  6272. \end_layout
  6273. \end_deeper
  6274. \begin_layout Itemize
  6275. Use CV or bootstrap to better evaluate classifiers
  6276. \end_layout
  6277. \begin_layout Itemize
  6278. fRMAtools could be adapted to not require equal-sized groups
  6279. \end_layout
  6280. \begin_layout Standard
  6281. \begin_inset ERT
  6282. status open
  6283. \begin_layout Plain Layout
  6284. % Call it "References" instead of "Bibliography"
  6285. \end_layout
  6286. \begin_layout Plain Layout
  6287. \backslash
  6288. renewcommand{
  6289. \backslash
  6290. bibname}{References}
  6291. \end_layout
  6292. \end_inset
  6293. \end_layout
  6294. \begin_layout Standard
  6295. \begin_inset Flex TODO Note (inline)
  6296. status open
  6297. \begin_layout Plain Layout
  6298. Check bib entry formatting & sort order
  6299. \end_layout
  6300. \end_inset
  6301. \end_layout
  6302. \begin_layout Standard
  6303. \begin_inset CommandInset bibtex
  6304. LatexCommand bibtex
  6305. btprint "btPrintCited"
  6306. bibfiles "refs,code-refs"
  6307. options "bibtotoc,unsrt"
  6308. \end_inset
  6309. \end_layout
  6310. \end_body
  6311. \end_document