12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177
71778177917801781178217831784178517861787178817891790179117921793179417951796179717981799180018011802180318041805180618071808180918101811181218131814181518161817181818191820182118221823182418251826182718281829183018311832183318341835183618371838183918401841184218431844184518461847184818491850185118521853185418551856185718581859186018611862186318641865186618671868186918701871187218731874187518761877187818791880188118821883188418851886188718881889189018911892189318941895189618971898189919001901190219031904190519061907190819091910191119121913191419151916191719181919192019211922192319241925192619271928192919301931193219331934193519361937193819391940194119421943194419451946194719481949195019511952195319541955195619571958195919601961196219631964196519661967196819691970197119721973197419751976197719781979198019811982198319841985198619871988198919901991199219931994199519961997199819992000200120022003200420052006200720082009201020112012201320142015201620172018201920202021202220232024202520262027202820292030203120322033203420352036203720382039204020412042204320442045204620472048204920502051205220532054205520562057205820592060206120622063206420652066206720682069207020712072207320742075207620772078207920802081208220832084208520862087208820892090209120922093209420952096209720982099210021012102210321042105210621072108210921102111211221132114211521162117211821192120212121222123212421252126212721282129213021312132213321342135213621372138213921402141214221432144214521462147214821492150215121522153215421552156215721582159216021612162216321642165216621672168216921702171217221732174217521762177217821792180218121822183218421852186218721882189219021912192219321942195219621972198219922002201220222032204220522062207220822092210221122122213221422152216221722182219222022212222222322242225222622272228222922302231223222332234223522362237223822392240224122422243224422452246224722482249225022512252225322542255225622572258225922602261226222632264226522662267226822692270227122722273227422752276227
72278227922802281228222832284228522862287228822892290229122922293229422952296229722982299230023012302230323042305230623072308230923102311231223132314231523162317231823192320232123222323232423252326232723282329233023312332233323342335233623372338233923402341234223432344234523462347234823492350235123522353235423552356235723582359236023612362236323642365236623672368236923702371237223732374237523762377237823792380238123822383238423852386238723882389239023912392239323942395239623972398239924002401240224032404240524062407240824092410241124122413241424152416241724182419242024212422242324242425242624272428242924302431243224332434243524362437243824392440244124422443244424452446244724482449245024512452245324542455245624572458245924602461246224632464246524662467246824692470247124722473247424752476247724782479248024812482248324842485248624872488248924902491249224932494249524962497249824992500250125022503250425052506250725082509251025112512251325142515251625172518251925202521252225232524252525262527252825292530253125322533253425352536253725382539254025412542254325442545254625472548254925502551255225532554255525562557255825592560256125622563256425652566256725682569257025712572257325742575257625772578257925802581258225832584258525862587258825892590259125922593259425952596259725982599260026012602260326042605260626072608260926102611261226132614261526162617261826192620262126222623262426252626262726282629263026312632263326342635263626372638263926402641264226432644264526462647264826492650265126522653265426552656265726582659266026612662266326642665266626672668266926702671267226732674267526762677267826792680268126822683268426852686268726882689269026912692269326942695269626972698269927002701270227032704270527062707270827092710271127122713271427152716271727182719272027212722272327242725272627272728272927302731273227332734273527362737273827392740274127422743274427452746274727482749275027512752275327542755275627572758275927602761276227632764276527662767276827692770277127722773277427752776277
72778277927802781278227832784278527862787278827892790279127922793279427952796279727982799280028012802280328042805280628072808280928102811281228132814281528162817281828192820282128222823282428252826282728282829283028312832283328342835283628372838283928402841284228432844284528462847284828492850285128522853285428552856285728582859286028612862286328642865286628672868286928702871287228732874287528762877287828792880288128822883288428852886288728882889289028912892289328942895289628972898289929002901290229032904290529062907290829092910291129122913291429152916291729182919292029212922292329242925292629272928292929302931293229332934293529362937293829392940294129422943294429452946294729482949295029512952295329542955295629572958295929602961296229632964296529662967296829692970297129722973297429752976297729782979298029812982298329842985298629872988298929902991299229932994299529962997299829993000300130023003300430053006300730083009301030113012301330143015301630173018301930203021302230233024302530263027302830293030303130323033303430353036303730383039304030413042304330443045304630473048304930503051305230533054305530563057305830593060306130623063306430653066306730683069307030713072307330743075307630773078307930803081308230833084308530863087308830893090309130923093309430953096309730983099310031013102310331043105310631073108310931103111311231133114311531163117311831193120312131223123312431253126312731283129313031313132313331343135313631373138313931403141314231433144314531463147314831493150315131523153315431553156315731583159316031613162316331643165316631673168316931703171317231733174317531763177317831793180318131823183318431853186318731883189319031913192319331943195319631973198319932003201320232033204320532063207320832093210321132123213321432153216321732183219322032213222322332243225322632273228322932303231323232333234323532363237323832393240324132423243324432453246324732483249325032513252325332543255325632573258325932603261326232633264326532663267326832693270327132723273327432753276327
73278327932803281328232833284328532863287328832893290329132923293329432953296329732983299330033013302330333043305330633073308330933103311331233133314331533163317331833193320332133223323332433253326332733283329333033313332333333343335333633373338333933403341334233433344334533463347334833493350335133523353335433553356335733583359336033613362336333643365336633673368336933703371337233733374337533763377337833793380338133823383338433853386338733883389339033913392339333943395339633973398339934003401340234033404340534063407340834093410341134123413341434153416341734183419342034213422342334243425342634273428342934303431343234333434343534363437343834393440344134423443344434453446344734483449345034513452345334543455345634573458345934603461346234633464346534663467346834693470347134723473347434753476347734783479348034813482348334843485348634873488348934903491349234933494349534963497349834993500350135023503350435053506350735083509351035113512351335143515351635173518351935203521352235233524352535263527352835293530353135323533353435353536353735383539354035413542354335443545354635473548354935503551355235533554355535563557355835593560356135623563356435653566356735683569357035713572357335743575357635773578357935803581358235833584358535863587358835893590359135923593359435953596359735983599360036013602360336043605360636073608360936103611361236133614361536163617361836193620362136223623362436253626362736283629363036313632363336343635363636373638363936403641364236433644364536463647364836493650365136523653365436553656365736583659366036613662366336643665366636673668366936703671367236733674367536763677367836793680368136823683368436853686368736883689369036913692369336943695369636973698369937003701370237033704370537063707370837093710371137123713371437153716371737183719372037213722372337243725372637273728372937303731373237333734373537363737373837393740374137423743374437453746374737483749375037513752375337543755375637573758375937603761376237633764376537663767376837693770377137723773377437753776377
73778377937803781378237833784378537863787378837893790379137923793379437953796379737983799380038013802380338043805380638073808380938103811381238133814381538163817381838193820382138223823382438253826382738283829383038313832383338343835383638373838383938403841384238433844384538463847384838493850385138523853385438553856385738583859386038613862386338643865386638673868386938703871387238733874387538763877387838793880388138823883388438853886388738883889389038913892389338943895389638973898389939003901390239033904390539063907390839093910391139123913391439153916391739183919392039213922392339243925392639273928392939303931393239333934393539363937393839393940394139423943394439453946394739483949395039513952395339543955395639573958395939603961396239633964396539663967396839693970397139723973397439753976397739783979398039813982398339843985398639873988398939903991399239933994399539963997399839994000400140024003400440054006400740084009401040114012401340144015401640174018401940204021402240234024402540264027402840294030403140324033403440354036403740384039404040414042404340444045404640474048404940504051405240534054405540564057405840594060406140624063406440654066406740684069407040714072407340744075407640774078407940804081408240834084408540864087408840894090409140924093409440954096409740984099410041014102410341044105410641074108410941104111411241134114411541164117411841194120412141224123412441254126412741284129413041314132413341344135413641374138413941404141414241434144414541464147414841494150415141524153415441554156415741584159416041614162416341644165416641674168416941704171417241734174417541764177417841794180418141824183418441854186418741884189419041914192419341944195419641974198419942004201420242034204420542064207420842094210421142124213421442154216421742184219422042214222422342244225422642274228422942304231423242334234423542364237423842394240424142424243424442454246424742484249425042514252425342544255425642574258425942604261426242634264426542664267426842694270427142724273427442754276427
74278427942804281428242834284428542864287428842894290429142924293429442954296429742984299430043014302430343044305430643074308430943104311431243134314431543164317431843194320432143224323432443254326432743284329433043314332433343344335433643374338433943404341434243434344434543464347434843494350435143524353435443554356435743584359436043614362436343644365436643674368436943704371437243734374437543764377437843794380438143824383438443854386438743884389439043914392439343944395439643974398439944004401440244034404440544064407440844094410441144124413441444154416441744184419442044214422442344244425442644274428442944304431443244334434443544364437443844394440444144424443444444454446444744484449445044514452445344544455445644574458445944604461446244634464446544664467446844694470447144724473447444754476447744784479448044814482448344844485448644874488448944904491449244934494449544964497449844994500450145024503450445054506450745084509451045114512451345144515451645174518451945204521452245234524452545264527452845294530453145324533453445354536453745384539454045414542454345444545454645474548454945504551455245534554455545564557455845594560456145624563456445654566456745684569457045714572457345744575457645774578457945804581458245834584458545864587458845894590459145924593459445954596459745984599460046014602460346044605460646074608460946104611461246134614461546164617461846194620462146224623462446254626462746284629463046314632463346344635463646374638463946404641464246434644464546464647464846494650465146524653465446554656465746584659466046614662466346644665466646674668466946704671467246734674467546764677467846794680468146824683468446854686468746884689469046914692469346944695469646974698469947004701470247034704470547064707470847094710471147124713471447154716471747184719472047214722472347244725472647274728472947304731473247334734473547364737473847394740474147424743474447454746474747484749475047514752475347544755475647574758475947604761476247634764476547664767476847694770477147724773477447754776477
74778477947804781478247834784478547864787478847894790479147924793479447954796479747984799480048014802480348044805480648074808480948104811481248134814481548164817481848194820482148224823482448254826482748284829483048314832483348344835483648374838483948404841484248434844484548464847484848494850485148524853485448554856485748584859486048614862486348644865486648674868486948704871487248734874487548764877487848794880488148824883488448854886488748884889489048914892489348944895489648974898489949004901490249034904490549064907490849094910491149124913491449154916491749184919492049214922492349244925492649274928492949304931493249334934493549364937493849394940494149424943494449454946494749484949495049514952495349544955495649574958495949604961496249634964496549664967496849694970497149724973497449754976497749784979498049814982498349844985498649874988498949904991499249934994499549964997499849995000500150025003500450055006500750085009501050115012501350145015501650175018501950205021502250235024502550265027502850295030503150325033503450355036503750385039504050415042504350445045504650475048504950505051505250535054505550565057505850595060506150625063506450655066506750685069507050715072507350745075507650775078507950805081508250835084508550865087508850895090509150925093509450955096509750985099510051015102510351045105510651075108510951105111511251135114511551165117511851195120512151225123512451255126512751285129513051315132513351345135513651375138513951405141514251435144514551465147514851495150515151525153515451555156515751585159516051615162516351645165516651675168516951705171517251735174517551765177517851795180518151825183518451855186518751885189519051915192519351945195519651975198519952005201520252035204520552065207520852095210521152125213521452155216521752185219522052215222522352245225522652275228522952305231523252335234523552365237523852395240524152425243524452455246524752485249525052515252525352545255525652575258525952605261526252635264526552665267526852695270527152725273527452755276527
75278527952805281528252835284528552865287528852895290529152925293529452955296529752985299530053015302530353045305530653075308530953105311531253135314531553165317531853195320532153225323532453255326532753285329533053315332533353345335533653375338533953405341534253435344534553465347534853495350535153525353535453555356535753585359536053615362536353645365536653675368536953705371537253735374537553765377537853795380538153825383538453855386538753885389539053915392539353945395539653975398539954005401540254035404540554065407540854095410541154125413541454155416541754185419542054215422542354245425542654275428542954305431543254335434543554365437543854395440544154425443544454455446544754485449545054515452545354545455545654575458545954605461546254635464546554665467546854695470547154725473547454755476547754785479548054815482548354845485548654875488548954905491549254935494549554965497549854995500550155025503550455055506550755085509551055115512551355145515551655175518551955205521552255235524552555265527552855295530553155325533553455355536553755385539554055415542554355445545554655475548554955505551555255535554555555565557555855595560556155625563556455655566556755685569557055715572557355745575557655775578557955805581558255835584558555865587558855895590559155925593559455955596559755985599560056015602560356045605560656075608560956105611561256135614561556165617561856195620562156225623562456255626562756285629563056315632563356345635563656375638563956405641564256435644564556465647564856495650565156525653565456555656565756585659566056615662566356645665566656675668566956705671567256735674567556765677567856795680568156825683568456855686568756885689569056915692569356945695569656975698569957005701570257035704570557065707570857095710571157125713571457155716571757185719572057215722572357245725572657275728572957305731573257335734573557365737573857395740574157425743574457455746574757485749575057515752575357545755575657575758575957605761576257635764576557665767576857695770577157725773577457755776577
75778577957805781578257835784578557865787578857895790579157925793579457955796579757985799580058015802580358045805580658075808580958105811581258135814581558165817581858195820582158225823582458255826582758285829583058315832583358345835583658375838583958405841584258435844584558465847584858495850585158525853585458555856585758585859586058615862586358645865586658675868586958705871587258735874587558765877587858795880588158825883588458855886588758885889589058915892589358945895589658975898589959005901590259035904590559065907590859095910591159125913591459155916591759185919592059215922592359245925592659275928592959305931593259335934593559365937593859395940594159425943594459455946594759485949595059515952595359545955595659575958595959605961596259635964596559665967596859695970597159725973597459755976597759785979598059815982598359845985598659875988598959905991599259935994599559965997599859996000600160026003600460056006600760086009601060116012601360146015601660176018601960206021602260236024602560266027602860296030603160326033603460356036603760386039604060416042604360446045604660476048604960506051605260536054605560566057605860596060606160626063606460656066606760686069607060716072607360746075607660776078607960806081608260836084608560866087608860896090609160926093609460956096609760986099610061016102610361046105610661076108610961106111611261136114611561166117611861196120612161226123612461256126612761286129613061316132613361346135613661376138613961406141614261436144614561466147614861496150615161526153615461556156615761586159616061616162616361646165616661676168616961706171617261736174617561766177617861796180618161826183618461856186618761886189619061916192619361946195619661976198619962006201620262036204620562066207620862096210621162126213621462156216621762186219622062216222622362246225622662276228622962306231623262336234623562366237623862396240624162426243624462456246624762486249625062516252625362546255625662576258625962606261626262636264626562666267626862696270627162726273627462756276627
76278627962806281628262836284628562866287628862896290629162926293629462956296629762986299630063016302630363046305630663076308630963106311631263136314631563166317631863196320632163226323632463256326632763286329633063316332633363346335633663376338633963406341634263436344634563466347634863496350635163526353635463556356635763586359636063616362636363646365636663676368636963706371637263736374637563766377637863796380638163826383638463856386638763886389639063916392639363946395639663976398639964006401640264036404640564066407640864096410641164126413641464156416641764186419642064216422642364246425642664276428642964306431643264336434643564366437643864396440644164426443644464456446644764486449645064516452645364546455645664576458645964606461646264636464646564666467646864696470647164726473647464756476647764786479648064816482648364846485648664876488648964906491649264936494649564966497649864996500650165026503650465056506650765086509651065116512651365146515651665176518651965206521652265236524652565266527652865296530653165326533653465356536653765386539654065416542654365446545654665476548654965506551655265536554655565566557655865596560656165626563656465656566656765686569657065716572657365746575657665776578657965806581658265836584658565866587658865896590659165926593659465956596659765986599660066016602660366046605660666076608660966106611661266136614661566166617661866196620662166226623662466256626662766286629663066316632663366346635663666376638663966406641664266436644664566466647664866496650665166526653665466556656665766586659666066616662666366646665666666676668666966706671667266736674667566766677667866796680668166826683668466856686668766886689669066916692669366946695669666976698669967006701670267036704670567066707670867096710671167126713671467156716671767186719672067216722672367246725672667276728672967306731673267336734673567366737673867396740674167426743674467456746674767486749675067516752675367546755675667576758675967606761676267636764676567666767676867696770677167726773677467756776677
76778677967806781678267836784678567866787678867896790679167926793679467956796679767986799680068016802680368046805680668076808680968106811681268136814681568166817681868196820682168226823682468256826682768286829683068316832683368346835683668376838683968406841684268436844684568466847684868496850685168526853685468556856685768586859686068616862686368646865686668676868686968706871687268736874687568766877687868796880688168826883688468856886688768886889689068916892689368946895689668976898689969006901690269036904690569066907690869096910691169126913691469156916691769186919692069216922692369246925692669276928692969306931693269336934693569366937693869396940694169426943694469456946694769486949695069516952695369546955695669576958695969606961696269636964696569666967696869696970697169726973697469756976697769786979698069816982698369846985698669876988698969906991699269936994699569966997699869997000700170027003700470057006700770087009701070117012701370147015701670177018701970207021702270237024702570267027702870297030703170327033703470357036703770387039704070417042704370447045704670477048704970507051705270537054705570567057705870597060706170627063706470657066706770687069707070717072707370747075707670777078707970807081708270837084708570867087708870897090709170927093709470957096709770987099710071017102710371047105710671077108710971107111711271137114711571167117711871197120712171227123712471257126712771287129713071317132713371347135713671377138713971407141714271437144714571467147714871497150715171527153715471557156715771587159716071617162716371647165716671677168716971707171717271737174717571767177717871797180718171827183718471857186718771887189719071917192719371947195719671977198719972007201720272037204720572067207720872097210721172127213721472157216721772187219722072217222722372247225722672277228722972307231723272337234723572367237723872397240724172427243724472457246724772487249725072517252725372547255725672577258725972607261726272637264726572667267726872697270727172727273727472757276727
77278727972807281728272837284728572867287728872897290729172927293729472957296729772987299730073017302730373047305730673077308730973107311731273137314731573167317731873197320732173227323732473257326732773287329733073317332733373347335733673377338733973407341734273437344734573467347734873497350735173527353735473557356735773587359736073617362736373647365736673677368736973707371737273737374737573767377737873797380738173827383738473857386738773887389739073917392739373947395739673977398739974007401740274037404740574067407740874097410741174127413741474157416741774187419742074217422742374247425742674277428742974307431743274337434743574367437743874397440744174427443744474457446744774487449745074517452745374547455745674577458745974607461746274637464746574667467746874697470747174727473747474757476747774787479748074817482748374847485748674877488748974907491749274937494749574967497749874997500750175027503750475057506750775087509751075117512751375147515751675177518751975207521752275237524752575267527752875297530753175327533753475357536753775387539754075417542754375447545754675477548754975507551755275537554755575567557755875597560756175627563756475657566756775687569757075717572757375747575757675777578757975807581758275837584758575867587758875897590759175927593759475957596759775987599760076017602760376047605760676077608760976107611761276137614761576167617761876197620762176227623762476257626762776287629763076317632763376347635763676377638763976407641764276437644764576467647764876497650765176527653765476557656765776587659766076617662766376647665766676677668766976707671767276737674767576767677767876797680768176827683768476857686768776887689769076917692769376947695769676977698769977007701770277037704770577067707770877097710771177127713771477157716771777187719772077217722772377247725772677277728772977307731773277337734773577367737773877397740774177427743774477457746774777487749775077517752775377547755775677577758775977607761776277637764776577667767776877697770777177727773777477757776777
777787779778077817782778377847785778677877788778977907791779277937794779577967797779877997800780178027803780478057806780778087809781078117812781378147815781678177818781978207821782278237824782578267827782878297830783178327833783478357836783778387839784078417842784378447845784678477848784978507851785278537854785578567857785878597860786178627863786478657866786778687869787078717872787378747875787678777878787978807881788278837884788578867887788878897890789178927893789478957896789778987899790079017902790379047905790679077908790979107911791279137914791579167917791879197920792179227923792479257926792779287929793079317932793379347935793679377938793979407941794279437944794579467947794879497950795179527953795479557956795779587959796079617962796379647965796679677968796979707971797279737974797579767977797879797980798179827983798479857986798779887989799079917992799379947995799679977998799980008001800280038004800580068007800880098010801180128013801480158016801780188019802080218022802380248025802680278028802980308031803280338034803580368037803880398040804180428043804480458046804780488049805080518052805380548055805680578058805980608061806280638064806580668067806880698070807180728073807480758076807780788079808080818082808380848085808680878088808980908091809280938094809580968097809880998100810181028103810481058106810781088109811081118112811381148115811681178118811981208121812281238124812581268127812881298130813181328133813481358136813781388139814081418142814381448145814681478148814981508151815281538154815581568157815881598160816181628163816481658166816781688169817081718172817381748175817681778178 |
- #LyX 2.3 created this file. For more info see http://www.lyx.org/
- \lyxformat 544
- \begin_document
- \begin_header
- \save_transient_properties true
- \origin unavailable
- \textclass extbook
- \begin_preamble
- % List all used files in log output
- \listfiles
- % Add a DRAFT watermark
- \usepackage{draftwatermark}
- \SetWatermarkLightness{0.97}
- \SetWatermarkScale{1}
- % Header/footer layout: no running heads, no rule under the header,
- % and only the page number, centred, in the footer.
- \usepackage{fancyhdr}
- \pagestyle{fancy}
- % Start from a blank slate: empty every header and footer field at once.
- \fancyhf{}
- \renewcommand{\headrulewidth}{0pt}
- % Page number bottom center
- \cfoot{\thepage}
- % https://tex.stackexchange.com/questions/65680/automatically-bold-first-sentence-of-a-floats-caption
- \usepackage{xstring}
- \usepackage{etoolbox}
- \usepackage{caption}
- \captionsetup{labelfont=bf,tableposition=top}
- \makeatletter
- \newcommand\formatlabel[1]{%
- \noexpandarg
- \IfSubStr{#1}{.}{%
- \StrBefore{#1}{.}[\firstcaption]%
- \StrBehind{#1}{.}[\secondcaption]%
- \textbf{\firstcaption.} \secondcaption}{%
- #1}%
- }
- \patchcmd{\@caption}{#3}{\formatlabel{#3}}
- \makeatother
- % Allow FloatBarrier command
- \usepackage{placeins}
- \end_preamble
- \use_default_options true
- \begin_modules
- todonotes
- \end_modules
- \maintain_unincluded_children false
- \language english
- \language_package default
- \inputencoding utf8
- \fontencoding default
- \font_roman "default" "default"
- \font_sans "default" "default"
- \font_typewriter "default" "default"
- \font_math "auto" "auto"
- \font_default_family default
- \use_non_tex_fonts false
- \font_sc false
- \font_osf false
- \font_sf_scale 100 100
- \font_tt_scale 100 100
- \use_microtype false
- \use_dash_ligatures true
- \graphics default
- \default_output_format pdf4
- \output_sync 0
- \bibtex_command default
- \index_command default
- \paperfontsize 12
- \spacing double
- \use_hyperref true
- \pdf_bookmarks true
- \pdf_bookmarksnumbered false
- \pdf_bookmarksopen false
- \pdf_bookmarksopenlevel 1
- \pdf_breaklinks false
- \pdf_pdfborder false
- \pdf_colorlinks false
- \pdf_backref false
- \pdf_pdfusetitle true
- \papersize letterpaper
- \use_geometry true
- \use_package amsmath 1
- \use_package amssymb 1
- \use_package cancel 1
- \use_package esint 1
- \use_package mathdots 1
- \use_package mathtools 1
- \use_package mhchem 1
- \use_package stackrel 1
- \use_package stmaryrd 1
- \use_package undertilde 1
- \cite_engine basic
- \cite_engine_type default
- \biblio_style plain
- \use_bibtopic false
- \use_indices false
- \paperorientation portrait
- \suppress_date false
- \justification true
- \use_refstyle 1
- \use_minted 0
- \index Index
- \shortcut idx
- \color #008000
- \end_index
- \leftmargin 1.5in
- \topmargin 1in
- \rightmargin 1in
- \bottommargin 1in
- \secnumdepth 3
- \tocdepth 3
- \paragraph_separation indent
- \paragraph_indentation default
- \is_math_indent 0
- \math_numbering_side default
- \quotes_style english
- \dynamic_quotes 0
- \papercolumns 1
- \papersides 2
- \paperpagestyle default
- \tracking_changes false
- \output_changes false
- \html_math_output 0
- \html_css_as_file 0
- \html_be_strict false
- \end_header
- \begin_body
- \begin_layout Title
- Bioinformatic analysis of complex, high-throughput genomic and epigenomic
- data in the context of immunology and transplant rejection
- \end_layout
- \begin_layout Author
- A thesis presented
- \begin_inset Newline newline
- \end_inset
- by
- \begin_inset Newline newline
- \end_inset
- Ryan C.
- Thompson
- \begin_inset Newline newline
- \end_inset
- to
- \begin_inset Newline newline
- \end_inset
- The Scripps Research Institute Graduate Program
- \begin_inset Newline newline
- \end_inset
- in partial fulfillment of the requirements for the degree of
- \begin_inset Newline newline
- \end_inset
- Doctor of Philosophy in the subject of Biology
- \begin_inset Newline newline
- \end_inset
- for
- \begin_inset Newline newline
- \end_inset
- The Scripps Research Institute
- \begin_inset Newline newline
- \end_inset
- La Jolla, California
- \end_layout
- \begin_layout Date
- May 2019
- \end_layout
- \begin_layout Standard
- [Copyright notice]
- \end_layout
- \begin_layout Standard
- [Thesis acceptance form]
- \end_layout
- \begin_layout Standard
- [Dedication]
- \end_layout
- \begin_layout Standard
- [Acknowledgements]
- \end_layout
- \begin_layout Standard
- \begin_inset CommandInset toc
- LatexCommand tableofcontents
- \end_inset
- \end_layout
- \begin_layout Standard
- \begin_inset FloatList table
- \end_inset
- \end_layout
- \begin_layout Standard
- \begin_inset FloatList figure
- \end_inset
- \end_layout
- \begin_layout Standard
- [List of Abbreviations]
- \end_layout
- \begin_layout Standard
- \begin_inset Flex TODO Note (inline)
- status open
- \begin_layout Plain Layout
- Look into auto-generated nomenclature list: https://wiki.lyx.org/Tips/Nomenclature
- \end_layout
- \end_inset
- \end_layout
- \begin_layout List of TODOs
- \end_layout
- \begin_layout Standard
- [Abstract]
- \end_layout
- \begin_layout Chapter*
- Abstract
- \end_layout
- \begin_layout Chapter
- Introduction
- \end_layout
- \begin_layout Section
- Background & Significance
- \end_layout
- \begin_layout Subsection
- Biological motivation
- \end_layout
- \begin_layout Itemize
- Rejection is the major long-term threat to organ and tissue grafts
- \end_layout
- \begin_deeper
- \begin_layout Itemize
- Common mechanisms of rejection
- \end_layout
- \begin_layout Itemize
- Effective immune suppression requires monitoring for rejection and tuning
-
- \end_layout
- \begin_layout Itemize
- Current tests for rejection (tissue biopsy) are invasive and biased
- \end_layout
- \begin_layout Itemize
- A blood test based on microarrays would be less biased and invasive
- \end_layout
- \end_deeper
- \begin_layout Itemize
- Memory cells are resistant to immune suppression
- \end_layout
- \begin_deeper
- \begin_layout Itemize
- Mechanisms of resistance in memory cells are poorly understood
- \end_layout
- \begin_layout Itemize
- A better understanding of immune memory formation is needed
- \end_layout
- \end_deeper
- \begin_layout Itemize
- Mesenchymal stem cell infusion is a promising new treatment to prevent/delay
- rejection
- \end_layout
- \begin_deeper
- \begin_layout Itemize
- Demonstrated in mice, but not yet in primates
- \end_layout
- \begin_layout Itemize
- Mechanism currently unknown, but MSC are known to be immune modulatory
- \end_layout
- \end_deeper
- \begin_layout Subsection
- Overview of bioinformatic analysis methods
- \end_layout
- \begin_layout Standard
- An overview of all the methods used, including what problem they solve,
- what assumptions they make, and a basic description of how they work.
- \end_layout
- \begin_layout Itemize
- ChIP-seq Peak calling
- \end_layout
- \begin_deeper
- \begin_layout Itemize
- Cross-correlation analysis to determine fragment size
- \end_layout
- \begin_layout Itemize
- Broad vs narrow peaks
- \end_layout
- \begin_layout Itemize
- SICER for broad peaks
- \end_layout
- \begin_layout Itemize
- IDR for biologically reproducible peaks
- \end_layout
- \begin_layout Itemize
- csaw peak filtering guidelines for unbiased downstream analysis
- \end_layout
- \end_deeper
- \begin_layout Itemize
- Normalization is non-trivial and application-dependent
- \end_layout
- \begin_deeper
- \begin_layout Itemize
- Expression arrays: RMA & fRMA; why fRMA is needed
- \end_layout
- \begin_layout Itemize
- Methylation arrays: M-value transformation approximates normal data but
- induces heteroskedasticity
- \end_layout
- \begin_layout Itemize
- RNA-seq: normalize based on assumption that the average gene is not changing
- \end_layout
- \begin_layout Itemize
- ChIP-seq: complex with many considerations, dependent on experimental methods,
- biological system, and analysis goals
- \end_layout
- \end_deeper
- \begin_layout Itemize
- Limma: The standard linear modeling framework for genomics
- \end_layout
- \begin_deeper
- \begin_layout Itemize
- empirical Bayes variance modeling: limma's core feature
- \end_layout
- \begin_layout Itemize
- edgeR & DESeq2: Extend with negative binomial GLM for RNA-seq and other
- count data
- \end_layout
- \begin_layout Itemize
- voom: Extend with precision weights to model mean-variance trend
- \end_layout
- \begin_layout Itemize
- arrayWeights and duplicateCorrelation to handle complex variance structures
- \end_layout
- \end_deeper
- \begin_layout Itemize
- sva and ComBat for batch correction
- \end_layout
- \begin_layout Itemize
- Factor analysis: PCA, MDS, MOFA
- \end_layout
- \begin_deeper
- \begin_layout Itemize
- Batch-corrected PCA is informative, but careful application is required
- to avoid bias
- \end_layout
- \end_deeper
- \begin_layout Itemize
- Gene set analysis: camera and SPIA
- \end_layout
- \begin_layout Section
- Innovation
- \end_layout
- \begin_layout Itemize
- MSC infusion to improve transplant outcomes (prevent/delay rejection)
- \end_layout
- \begin_deeper
- \begin_layout Itemize
- Characterize MSC response to interferon gamma
- \end_layout
- \begin_layout Itemize
- IFN-g is thought to stimulate their function
- \end_layout
- \begin_layout Itemize
- Test IFN-g treated MSC infusion as a therapy to delay graft rejection in
- cynomolgus monkeys
- \end_layout
- \begin_layout Itemize
- Monitor animals post-transplant using blood RNA-seq at serial time points
- \end_layout
- \end_deeper
- \begin_layout Itemize
- Investigate dynamics of histone marks in CD4 T-cell activation and memory
- \end_layout
- \begin_deeper
- \begin_layout Itemize
- Previous studies have looked at single snapshots of histone marks
- \end_layout
- \begin_layout Itemize
- Instead, look at changes in histone marks across activation and memory
- \end_layout
- \end_deeper
- \begin_layout Itemize
- High-throughput sequencing and microarray technologies
- \end_layout
- \begin_deeper
- \begin_layout Itemize
- Powerful methods for assaying gene expression and epigenetics across entire
- genomes
- \end_layout
- \begin_layout Itemize
- Proper analysis requires finding and exploiting systematic genome-wide trends
- \end_layout
- \end_deeper
- \begin_layout Chapter
- Reproducible genome-wide epigenetic analysis of H3K4 and H3K27 methylation
- in naive and memory CD4 T-cell activation
- \end_layout
- \begin_layout Standard
- \begin_inset Flex TODO Note (inline)
- status open
- \begin_layout Plain Layout
- Author list: Me, Sarah, Dan
- \end_layout
- \end_inset
- \end_layout
- \begin_layout Section
- Approach
- \end_layout
- \begin_layout Itemize
- CD4 T-cells are central to all adaptive immune responses and memory
- \end_layout
- \begin_layout Itemize
- H3K4 and H3K27 methylation are major epigenetic regulators of gene expression
- \end_layout
- \begin_layout Itemize
- Canonically, H3K4 is activating and H3K27 is inhibitory, but the reality
- is complex
- \end_layout
- \begin_layout Itemize
- Looking at these marks during CD4 activation and memory should reveal new
- mechanistic details
- \end_layout
- \begin_layout Itemize
- Test
- \begin_inset Quotes eld
- \end_inset
- poised promoter
- \begin_inset Quotes erd
- \end_inset
- hypothesis in which H3K4 and H3K27 are both methylated
- \end_layout
- \begin_layout Itemize
- Expand scope of analysis beyond simple promoter counts
- \end_layout
- \begin_deeper
- \begin_layout Itemize
- Analyze peaks genome-wide, including in intergenic regions
- \end_layout
- \begin_layout Itemize
- Analysis of coverage distribution shape within promoters, e.g.
- upstream vs downstream coverage
- \end_layout
- \end_deeper
- \begin_layout Section
- Methods
- \end_layout
- \begin_layout Standard
- \begin_inset Float figure
- wide false
- sideways true
- status open
- \begin_layout Plain Layout
- \align center
- \begin_inset Graphics
- filename graphics/CD4-csaw/rulegraphs/rulegraph-all.pdf
- width 100theight%
- \end_inset
- \end_layout
- \begin_layout Plain Layout
- \begin_inset Caption Standard
- \begin_layout Plain Layout
- \begin_inset CommandInset label
- LatexCommand label
- name "fig:rulegraph"
- \end_inset
- \series bold
- Dependency graph of steps in reproducible workflow
- \end_layout
- \end_inset
- \end_layout
- \end_inset
- \end_layout
- \begin_layout Standard
- A reproducible workflow
- \begin_inset CommandInset citation
- LatexCommand cite
- key "gh-cd4-csaw"
- literal "false"
- \end_inset
- was written to analyze the raw ChIP-seq and RNA-seq data from previous
- studies
- \begin_inset CommandInset citation
- LatexCommand cite
- key "LaMere2016,LaMere2017"
- literal "true"
- \end_inset
- .
- Briefly, this data consists of RNA-seq and ChIP-seq from CD4 T-cells cultured
- from 4 donors.
- From each donor, naive and memory CD4 T-cells were isolated separately.
- Then cultures of both cells were activated [how?], and samples were taken
- at 4 time points: Day 0 (pre-activation), Day 1 (early activation), Day
- 5 (peak activation), and Day 14 (post-activation).
- For each combination of cell type and time point, RNA was isolated, and
- ChIP-seq was performed for each of 3 histone marks: H3K4me2, H3K4me3, and
- H3K27me3.
- The ChIP-seq input was also sequenced for each sample.
- The result was 32 samples for each assay.
- \end_layout
- \begin_layout Standard
- Sequence reads were retrieved from the Sequence Read Archive (SRA)
- \begin_inset CommandInset citation
- LatexCommand cite
- key "Leinonen2011"
- literal "false"
- \end_inset
- .
- ChIP-seq (and input) reads were aligned to the GRCh38 genome assembly using
- Bowtie 2
- \begin_inset CommandInset citation
- LatexCommand cite
- key "Langmead2012,Schneider2017,gh-hg38-ref"
- literal "false"
- \end_inset
- .
- Artifact regions were annotated using a custom implementation of the GreyListCh
- IP algorithm, and these
- \begin_inset Quotes eld
- \end_inset
- greylists
- \begin_inset Quotes erd
- \end_inset
- were merged with the ENCODE blacklist
- \begin_inset CommandInset citation
- LatexCommand cite
- key "greylistchip,Amemiya2019,Dunham2012"
- literal "false"
- \end_inset
- .
- Any read or peak overlapping one of these regions was regarded as artifactual
- and excluded from downstream analyses.
-
- \end_layout
- \begin_layout Standard
- Peaks were called using epic, an implementation of the SICER algorithm
- \begin_inset CommandInset citation
- LatexCommand cite
- key "Zang2009,gh-epic"
- literal "false"
- \end_inset
- .
- Peaks were also called separately using MACS, but MACS was determined to
- be a poor fit for the data, and these peak calls were not used further
- \begin_inset CommandInset citation
- LatexCommand cite
- key "Zhang2008"
- literal "false"
- \end_inset
- .
- \end_layout
- \begin_layout Itemize
- Re-analyze previously published CD4 ChIP-seq & RNA-seq data
- \end_layout
- \begin_deeper
- \begin_layout Itemize
- Completely reimplement analysis from scratch as a reproducible workflow
- \end_layout
- \begin_layout Itemize
- Use newly published methods & algorithms not available during the original
- analysis: SICER, csaw, MOFA, ComBat, sva, GREAT, and more
- \end_layout
- \end_deeper
- \begin_layout Itemize
- SICER, IDR, csaw, & GREAT to call ChIP-seq peaks genome-wide, perform differenti
- al abundance analysis, and relate those peaks to gene expression
- \end_layout
- \begin_layout Itemize
- Promoter counts in sliding windows around each gene's highest-expressed
- TSS to investigate coverage distribution within promoters
- \end_layout
- \begin_layout Section
- Results
- \end_layout
- \begin_layout Standard
- \begin_inset Note Note
- status open
- \begin_layout Plain Layout
- Focus on what hypotheses were tested, then select figures that show how
- those hypotheses were tested, even if the result is a negative.
- \end_layout
- \end_inset
- \end_layout
- \begin_layout Subsection
- H3K4 and H3K27 methylation occur in broad regions and are enriched near
- promoters
- \end_layout
- \begin_layout Standard
- \begin_inset Float figure
- wide false
- sideways false
- status open
- \begin_layout Plain Layout
- \begin_inset Flex TODO Note (inline)
- status open
- \begin_layout Plain Layout
- Re-generate IDR rank consistency plots for SICER and MACS side-by-side
- \end_layout
- \end_inset
- \end_layout
- \begin_layout Plain Layout
- \begin_inset Caption Standard
- \begin_layout Plain Layout
- \series bold
- \begin_inset CommandInset label
- LatexCommand label
- name "fig:IDR-RC-H3K4me2"
- \end_inset
- Irreproducible Discovery Rate consistency plots for H3K4me2
- \end_layout
- \end_inset
- \end_layout
- \end_inset
- \end_layout
- \begin_layout Standard
- \begin_inset Float figure
- wide false
- sideways false
- status open
- \begin_layout Plain Layout
- \begin_inset Flex TODO Note (inline)
- status open
- \begin_layout Plain Layout
- Re-generate IDR rank consistency plots for SICER and MACS side-by-side
- \end_layout
- \end_inset
- \end_layout
- \begin_layout Plain Layout
- \begin_inset Caption Standard
- \begin_layout Plain Layout
- \series bold
- \begin_inset CommandInset label
- LatexCommand label
- name "fig:IDR-RC-H3K4me3"
- \end_inset
- Irreproducible Discovery Rate consistency plots for H3K4me3
- \end_layout
- \end_inset
- \end_layout
- \end_inset
- \end_layout
- \begin_layout Standard
- \begin_inset Float figure
- wide false
- sideways false
- status open
- \begin_layout Plain Layout
- \begin_inset Flex TODO Note (inline)
- status open
- \begin_layout Plain Layout
- Re-generate IDR rank consistency plots for SICER and MACS side-by-side
- \end_layout
- \end_inset
- \end_layout
- \begin_layout Plain Layout
- \begin_inset Caption Standard
- \begin_layout Plain Layout
- \series bold
- \begin_inset CommandInset label
- LatexCommand label
- name "fig:IDR-RC-H3K27me3"
- \end_inset
- Irreproducible Discovery Rate consistency plots for H3K27me3
- \end_layout
- \end_inset
- \end_layout
- \end_inset
- \end_layout
- \begin_layout Standard
- \begin_inset Float table
- wide false
- sideways false
- status open
- \begin_layout Plain Layout
- \align center
- \begin_inset Flex TODO Note (inline)
- status open
- \begin_layout Plain Layout
- Need
- \emph on
- median
- \emph default
- peak width, not mean
- \end_layout
- \end_inset
- \end_layout
- \begin_layout Plain Layout
- \align center
- \begin_inset Tabular
- <lyxtabular version="3" rows="4" columns="5">
- <features tabularvalignment="middle">
- <column alignment="center" valignment="top">
- <column alignment="center" valignment="top">
- <column alignment="center" valignment="top">
- <column alignment="center" valignment="top">
- <column alignment="center" valignment="top">
- <row>
- <cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" usebox="none">
- \begin_inset Text
- \begin_layout Plain Layout
- Histone Mark
- \end_layout
- \end_inset
- </cell>
- <cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" usebox="none">
- \begin_inset Text
- \begin_layout Plain Layout
- # Peaks
- \end_layout
- \end_inset
- </cell>
- <cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" usebox="none">
- \begin_inset Text
- \begin_layout Plain Layout
- Mean peak width
- \end_layout
- \end_inset
- </cell>
- <cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" usebox="none">
- \begin_inset Text
- \begin_layout Plain Layout
- genome coverage
- \end_layout
- \end_inset
- </cell>
- <cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" rightline="true" usebox="none">
- \begin_inset Text
- \begin_layout Plain Layout
- read coverage
- \end_layout
- \end_inset
- </cell>
- </row>
- <row>
- <cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
- \begin_inset Text
- \begin_layout Plain Layout
- H3K4me2
- \end_layout
- \end_inset
- </cell>
- <cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
- \begin_inset Text
- \begin_layout Plain Layout
- 14965
- \end_layout
- \end_inset
- </cell>
- <cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
- \begin_inset Text
- \begin_layout Plain Layout
- 3970
- \end_layout
- \end_inset
- </cell>
- <cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
- \begin_inset Text
- \begin_layout Plain Layout
- 1.92%
- \end_layout
- \end_inset
- </cell>
- <cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
- \begin_inset Text
- \begin_layout Plain Layout
- 14.2%
- \end_layout
- \end_inset
- </cell>
- </row>
- <row>
- <cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
- \begin_inset Text
- \begin_layout Plain Layout
- H3K4me3
- \end_layout
- \end_inset
- </cell>
- <cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
- \begin_inset Text
- \begin_layout Plain Layout
- 6163
- \end_layout
- \end_inset
- </cell>
- <cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
- \begin_inset Text
- \begin_layout Plain Layout
- 2946
- \end_layout
- \end_inset
- </cell>
- <cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
- \begin_inset Text
- \begin_layout Plain Layout
- 0.588%
- \end_layout
- \end_inset
- </cell>
- <cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
- \begin_inset Text
- \begin_layout Plain Layout
- 6.57%
- \end_layout
- \end_inset
- </cell>
- </row>
- <row>
- <cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" usebox="none">
- \begin_inset Text
- \begin_layout Plain Layout
- H3K27me3
- \end_layout
- \end_inset
- </cell>
- <cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" usebox="none">
- \begin_inset Text
- \begin_layout Plain Layout
- 18139
- \end_layout
- \end_inset
- </cell>
- <cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" usebox="none">
- \begin_inset Text
- \begin_layout Plain Layout
- 18967
- \end_layout
- \end_inset
- </cell>
- <cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" usebox="none">
- \begin_inset Text
- \begin_layout Plain Layout
- 11.1%
- \end_layout
- \end_inset
- </cell>
- <cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" rightline="true" usebox="none">
- \begin_inset Text
- \begin_layout Plain Layout
- 22.5%
- \end_layout
- \end_inset
- </cell>
- </row>
- </lyxtabular>
- \end_inset
- \end_layout
- \begin_layout Plain Layout
- \begin_inset Caption Standard
- \begin_layout Plain Layout
- \series bold
- \begin_inset CommandInset label
- LatexCommand label
- name "tab:peak-calling-summary"
- \end_inset
- SICER+IDR peak-calling summary
- \end_layout
- \end_inset
- \end_layout
- \end_inset
- \end_layout
- \begin_layout Standard
- Figures
- \begin_inset CommandInset ref
- LatexCommand ref
- reference "fig:IDR-RC-H3K4me2"
- plural "false"
- caps "false"
- noprefix "false"
- \end_inset
- ,
- \begin_inset CommandInset ref
- LatexCommand ref
- reference "fig:IDR-RC-H3K4me3"
- plural "false"
- caps "false"
- noprefix "false"
- \end_inset
- , and
- \begin_inset CommandInset ref
- LatexCommand ref
- reference "fig:IDR-RC-H3K27me3"
- plural "false"
- caps "false"
- noprefix "false"
- \end_inset
- show the IDR rank-consistency plots for peaks called in an arbitrarily-chosen
- pair of donors.
- For all 3 histone marks, when the peaks for each donor are ranked according
- to their scores, SICER produces much more reproducible results between
- donors.
- This is consistent with SICER's stated goal of identifying broad peaks,
- in contrast to MACS, which is designed for identifying sharp peaks.
- Based on this observation, the SICER peak calls were used for all downstream
- analyses that involved ChIP-seq peaks.
- Table
- \begin_inset CommandInset ref
- LatexCommand ref
- reference "tab:peak-calling-summary"
- plural "false"
- caps "false"
- noprefix "false"
- \end_inset
- gives a summary of the peak calling statistics for each histone mark.
- \end_layout
- \begin_layout Standard
- \begin_inset Float figure
- wide false
- sideways false
- status open
- \begin_layout Plain Layout
- \align center
- \begin_inset Graphics
- filename graphics/CD4-csaw/Promoter Peak Distance Profile-PAGE1-CROP.pdf
- width 100col%
- groupId colwidth
- \end_inset
- \end_layout
- \begin_layout Plain Layout
- \begin_inset Caption Standard
- \begin_layout Plain Layout
- \series bold
- \begin_inset CommandInset label
- LatexCommand label
- name "fig:effective-promoter-radius"
- \end_inset
- Enrichment of peaks in promoter neighborhoods.
- \end_layout
- \end_inset
- \end_layout
- \begin_layout Plain Layout
- \end_layout
- \end_inset
- \end_layout
- \begin_layout Itemize
- Each histone mark is enriched within a certain radius of gene TSS positions,
- but that radius is different for each mark (figure
- \begin_inset CommandInset ref
- LatexCommand ref
- reference "fig:effective-promoter-radius"
- plural "false"
- caps "false"
- noprefix "false"
- \end_inset
- , previously in
- \begin_inset CommandInset citation
- LatexCommand cite
- key "LaMere2016"
- literal "false"
- \end_inset
- Fig.
- S2)
- \end_layout
- \begin_layout Subsection
- RNA-seq align+quant method selection
- \end_layout
- \begin_layout Standard
- \begin_inset Flex TODO Note (inline)
- status open
- \begin_layout Plain Layout
- Maybe fix up the axis ranges for these plots?
- \end_layout
- \end_inset
- \end_layout
- \begin_layout Standard
- \begin_inset Float figure
- wide false
- sideways false
- status collapsed
- \begin_layout Plain Layout
- \align center
- \begin_inset Graphics
- filename graphics/CD4-csaw/rnaseq-compare/ensmebl-vs-entrez-star-CROP.png
- lyxscale 25
- width 100col%
- groupId colwidth-raster
- \end_inset
- \end_layout
- \begin_layout Plain Layout
- \begin_inset Caption Standard
- \begin_layout Plain Layout
- Comparison of STAR quantification between Ensembl and Entrez gene identifiers
- \end_layout
- \end_inset
- \end_layout
- \begin_layout Plain Layout
- \end_layout
- \end_inset
- \end_layout
- \begin_layout Standard
- \begin_inset Float figure
- wide false
- sideways false
- status collapsed
- \begin_layout Plain Layout
- \align center
- \begin_inset Graphics
- filename graphics/CD4-csaw/rnaseq-compare/ensmebl-vs-entrez-shoal-CROP.png
- lyxscale 25
- width 100col%
- groupId colwidth-raster
- \end_inset
- \end_layout
- \begin_layout Plain Layout
- \begin_inset Caption Standard
- \begin_layout Plain Layout
- Comparison of Salmon+Shoal quantification between Ensembl and Entrez gene
- identifiers
- \end_layout
- \end_inset
- \end_layout
- \end_inset
- \end_layout
- \begin_layout Standard
- \begin_inset Float figure
- wide false
- sideways false
- status collapsed
- \begin_layout Plain Layout
- \align center
- \begin_inset Graphics
- filename graphics/CD4-csaw/rnaseq-compare/star-vs-hisat2-CROP.png
- lyxscale 25
- width 100col%
- groupId colwidth-raster
- \end_inset
- \end_layout
- \begin_layout Plain Layout
- \begin_inset Caption Standard
- \begin_layout Plain Layout
- Comparison of quantification between STAR and HISAT2 for identical annotation
- \end_layout
- \end_inset
- \end_layout
- \end_inset
- \end_layout
- \begin_layout Standard
- \begin_inset Float figure
- wide false
- sideways false
- status collapsed
- \begin_layout Plain Layout
- \align center
- \begin_inset Graphics
- filename graphics/CD4-csaw/rnaseq-compare/star-vs-salmon-CROP.png
- lyxscale 25
- width 100col%
- groupId colwidth-raster
- \end_inset
- \end_layout
- \begin_layout Plain Layout
- \begin_inset Caption Standard
- \begin_layout Plain Layout
- Comparison of quantification between STAR and Salmon for identical annotation
- \end_layout
- \end_inset
- \end_layout
- \end_inset
- \end_layout
- \begin_layout Standard
- \begin_inset Float figure
- wide false
- sideways false
- status open
- \begin_layout Plain Layout
- \align center
- \begin_inset Graphics
- filename graphics/CD4-csaw/rnaseq-compare/salmon-vs-kallisto-CROP.png
- lyxscale 25
- width 100col%
- groupId colwidth-raster
- \end_inset
- \end_layout
- \begin_layout Plain Layout
- \begin_inset Caption Standard
- \begin_layout Plain Layout
- Comparison of quantification between Salmon and Kallisto for identical annotatio
- n
- \end_layout
- \end_inset
- \end_layout
- \end_inset
- \end_layout
- \begin_layout Standard
- \begin_inset Float figure
- wide false
- sideways false
- status open
- \begin_layout Plain Layout
- \align center
- \begin_inset Graphics
- filename graphics/CD4-csaw/rnaseq-compare/salmon-vs-shoal-CROP.png
- lyxscale 25
- width 100col%
- groupId colwidth-raster
- \end_inset
- \end_layout
- \begin_layout Plain Layout
- \begin_inset Caption Standard
- \begin_layout Plain Layout
- Comparison of quantification between Salmon with and without Shoal for identical
- annotation
- \end_layout
- \end_inset
- \end_layout
- \end_inset
- \end_layout
- \begin_layout Standard
- \end_layout
- \begin_layout Subsection
- RNA-seq has a large confounding batch effect
- \end_layout
- \begin_layout Itemize
- RNA-seq batch effect can be partially corrected, but still induces uncorrectable
- biases in downstream analysis
- \end_layout
- \begin_deeper
- \begin_layout Itemize
- Figure showing MDS plot before & after ComBat
- \end_layout
- \begin_layout Itemize
- Figure relating sample weights to batches, cell types, time points, etc.,
- showing that one batch is significantly worse quality
- \end_layout
- \begin_layout Itemize
- Figures showing p-value histograms for within-batch and cross-batch contrasts,
- showing that cross-batch contrasts have attenuated signal, as do comparisons
- within the bad batch
- \end_layout
- \end_deeper
- \begin_layout Subsection
- ChIP-seq must be corrected for hidden confounding factors
- \end_layout
- \begin_layout Itemize
- Figures showing pre- and post-SVA MDS plots for each histone mark
- \end_layout
- \begin_layout Itemize
- Figures showing BCV plots with and without SVA for each histone mark
- \end_layout
- \begin_layout Subsection
- H3K4 and H3K27 promoter methylation has broadly the expected correlation
- with gene expression
- \end_layout
- \begin_layout Itemize
- H3K4 is correlated with higher expression, and H3K27 is correlated with
- lower expression genome-wide
- \end_layout
- \begin_layout Itemize
- Figures showing these correlations: box/violin plots of expression distributions
- with every combination of peak presence/absence in promoter
- \end_layout
- \begin_layout Itemize
- Appropriate statistical tests showing significant differences in expected
- directions
- \end_layout
- \begin_layout Subsection
- MOFA recovers biologically relevant variation from blind analysis by correlating
- across datasets
- \end_layout
- \begin_layout Itemize
- MOFA
- \begin_inset CommandInset citation
- LatexCommand cite
- key "Argelaguet2018"
- literal "false"
- \end_inset
- successfully separates biologically relevant patterns of variation from
- technical confounding factors without knowing the sample labels, by finding
- latent factors that explain variation across multiple data sets.
- \end_layout
- \begin_deeper
- \begin_layout Itemize
- Figure: show percent-variance-explained plot from MOFA and PCA-like plots
- for the relevant latent factors
- \end_layout
- \begin_layout Itemize
- MOFA analysis also shows that batch effect correction can't get much better
- than it already is (Figure comparing blind MOFA batch correction to ComBat
- correction)
- \end_layout
- \end_deeper
- \begin_layout Subsection
- Naive-to-memory convergence observed in H3K4 and RNA-seq data, not in H3K27me3
- \end_layout
- \begin_layout Itemize
- H3K4 and RNA-seq data show clear evidence of naive convergence with memory
- between days 1 and 5 (MDS plot figure, also compare with last figure from
-
- \begin_inset CommandInset citation
- LatexCommand cite
- key "LaMere2016"
- literal "false"
- \end_inset
- )
- \end_layout
- \begin_layout Standard
- \begin_inset Flex TODO Note (inline)
- status open
- \begin_layout Plain Layout
- Note that Sarah has granted permission to use her figures
- \end_layout
- \end_inset
- \end_layout
- \begin_layout Itemize
- Table of numbers of genes different between N & M at each time point, showing
- dwindling differences at later time points, consistent with convergence
- \end_layout
- \begin_layout Itemize
- Similar figure for H3K27me3 showing lack of convergence
- \end_layout
- \begin_layout Subsection
- Effect of promoter coverage upstream vs downstream of TSS
- \end_layout
- \begin_layout Itemize
- H3K4me peaks seem to correlate with increased expression as long as they
- are anywhere near the TSS
- \end_layout
- \begin_layout Itemize
- H3K27me3 peaks can have different correlations to gene expression depending
- on their position relative to TSS (e.g.
- upstream vs downstream). Results consistent with
- \begin_inset CommandInset citation
- LatexCommand cite
- key "Young2011"
- literal "false"
- \end_inset
- \end_layout
- \begin_layout Section
- Discussion
- \end_layout
- \begin_layout Itemize
- "Promoter radius" is not constant and must be defined empirically for a
- given data set
- \end_layout
- \begin_layout Itemize
- MOFA shows great promise for accelerating discovery of major biological
- effects in multi-omics datasets
- \end_layout
- \begin_deeper
- \begin_layout Itemize
- MOFA was added to this analysis late and played primarily a confirmatory
- role, but it was able to confirm earlier conclusions with much less prior
- information (no sample labels) and much less analyst effort
- \end_layout
- \begin_layout Itemize
- MOFA confirmed that the already-implemented batch correction in the RNA-seq
- data was already performing as well as possible given the limitations of
- the data
- \end_layout
- \end_deeper
- \begin_layout Itemize
- Naive-to-memory convergence implies that naive cells are differentiating
- into memory cells, and that gene expression and H3K4 methylation are involved
- in this differentiation while H3K27me3 is less involved
- \end_layout
- \begin_layout Itemize
- H3K27me3, canonically regarded as a deactivating mark, seems to have a more
- complex, position-dependent relationship with gene expression
- \end_layout
- \begin_layout Itemize
- Discuss advantages of developing using a reproducible workflow
- \end_layout
- \begin_layout Chapter
- Improving array-based analyses of transplant rejection by optimizing data
- preprocessing
- \end_layout
- \begin_layout Standard
- \begin_inset Note Note
- status open
- \begin_layout Plain Layout
- Author list: Me, Sunil, Tom, Padma, Dan
- \end_layout
- \end_inset
- \end_layout
- \begin_layout Section
- Approach
- \end_layout
- \begin_layout Subsection
- Proper pre-processing is essential for array data
- \end_layout
- \begin_layout Standard
- \begin_inset Flex TODO Note (inline)
- status open
- \begin_layout Plain Layout
- This section could probably use some citations
- \end_layout
- \end_inset
- \end_layout
- \begin_layout Standard
- Microarrays, bead arrays, and similar assays produce raw data in the form
- of fluorescence intensity measurements, with each intensity measurement
- proportional to the abundance of some fluorescently-labelled target DNA
- or RNA sequence that base pairs to a specific probe sequence.
- However, these measurements for each probe are also affected by many technical
- confounding factors, such as the concentration of target material, strength
- of off-target binding, and the sensitivity of the imaging sensor.
- Some array designs also use multiple probe sequences for each target.
- Hence, extensive pre-processing of array data is necessary to normalize
- out the effects of these technical factors and summarize the information
- from multiple probes to arrive at a single usable estimate of abundance
- or other relevant quantity, such as a ratio of two abundances, for each
- target.
- \end_layout
- \begin_layout Standard
- The choice of pre-processing algorithms used in the analysis of an array
- data set can have a large effect on the results of that analysis.
- However, despite their importance, these steps are often neglected or rushed
- in order to get to the more scientifically interesting analysis steps involving
- the actual biology of the system under study.
- Hence, it is often possible to achieve substantial gains in statistical
- power, model goodness-of-fit, or other relevant performance measures, by
- checking the assumptions made by each preprocessing step and choosing specific
- normalization methods tailored to the specific goals of the current analysis.
- \end_layout
- \begin_layout Subsection
- Normalization for clinical microarray classifiers must be single-channel
- \end_layout
- \begin_layout Subsubsection
- Standard normalization methods are unsuitable for clinical application
- \end_layout
- \begin_layout Standard
- As the cost of performing microarray assays falls, there is increasing interest
- in using genomic assays for diagnostic purposes, such as distinguishing
- healthy transplants (TX) from transplants undergoing acute rejection (AR)
- or acute dysfunction with no rejection (ADNR).
- However, the standard normalization algorithm used for microarray data,
- Robust Multi-chip Average (RMA)
- \begin_inset CommandInset citation
- LatexCommand cite
- key "Irizarry2003a"
- literal "false"
- \end_inset
- , is not applicable in a clinical setting.
- Two of the steps in RMA, quantile normalization and probe summarization
- by median polish, depend on every array in the data set being normalized.
- This means that adding or removing any arrays from a data set changes the
- normalized values for all arrays, and data sets that have been normalized
- separately cannot be compared to each other.
- Hence, when using RMA, any arrays to be analyzed together must also be
- normalized together, and the set of arrays included in the data set must
- be held constant throughout an analysis.
- \end_layout
- \begin_layout Standard
- These limitations present serious impediments to the use of arrays as a
- diagnostic tool.
- When training a classifier, the samples to be classified must not be involved
- in any step of the training process, lest their inclusion bias the training
- process.
- Once a classifier is deployed in a clinical setting, the samples to be
- classified will not even
- \emph on
- exist
- \emph default
- at the time of training, so including them would be impossible even if
- it were statistically justifiable.
- Therefore, any machine learning application for microarrays demands that
- the normalized expression values computed for an array must depend only
- on information contained within that array.
- This would ensure that each array's normalization is independent of every
- other array, and that arrays normalized separately can still be compared
- to each other without bias.
- Such a normalization is commonly referred to as
- \begin_inset Quotes eld
- \end_inset
- single-channel normalization
- \begin_inset Quotes erd
- \end_inset
- .
- \end_layout
- \begin_layout Subsubsection
- Several strategies are available to meet clinical normalization requirements
- \end_layout
- \begin_layout Standard
- Frozen RMA (fRMA) addresses these concerns by replacing the quantile normalizati
- on and median polish with alternatives that do not introduce inter-array
- dependence, allowing each array to be normalized independently of all others
-
- \begin_inset CommandInset citation
- LatexCommand cite
- key "McCall2010"
- literal "false"
- \end_inset
- .
- Quantile normalization is performed against a pre-generated set of quantiles
- learned from a collection of 850 publicly available arrays sampled from
- a wide variety of tissues in the Gene Expression Omnibus (GEO).
- Each array's probe intensity distribution is normalized against these pre-gener
- ated quantiles.
- The median polish step is replaced with a robust weighted average of probe
- intensities, using inverse variance weights learned from the same public
- GEO data.
- The result is a normalization that satisfies the requirements mentioned
- above: each array is normalized independently of all others, and any two
- normalized arrays can be compared directly to each other.
- \end_layout
- \begin_layout Standard
- One important limitation of fRMA is that it requires a separate reference
- data set from which to learn the parameters (reference quantiles and probe
- weights) that will be used to normalize each array.
- These parameters are specific to a given array platform, and pre-generated
- parameters are only provided for the most common platforms, such as Affymetrix
- hgu133plus2.
- For a less common platform, such as hthgu133pluspm, it is necessary to
- learn custom parameters from in-house data before fRMA can be used to normalize
- samples on that platform
- \begin_inset CommandInset citation
- LatexCommand cite
- key "McCall2011"
- literal "false"
- \end_inset
- .
- \end_layout
- \begin_layout Standard
- One other option is the aptly-named Single Channel Array Normalization (SCAN),
- which adapts a normalization method originally designed for tiling arrays
-
- \begin_inset CommandInset citation
- LatexCommand cite
- key "Piccolo2012"
- literal "false"
- \end_inset
- .
- SCAN is truly single-channel in that it does not require a set of normalization
- parameters estimated from an external set of reference samples like fRMA
- does.
- \end_layout
- \begin_layout Subsection
- Heteroskedasticity must be accounted for in methylation array data
- \end_layout
- \begin_layout Subsubsection
- Methylation array preprocessing induces heteroskedasticity
- \end_layout
- \begin_layout Standard
- DNA methylation arrays are a relatively new kind of assay that uses microarrays
- to measure the degree of methylation on cytosines in specific regions arrayed
- across the genome.
- First, bisulfite treatment converts all unmethylated cytosines to uracil
- (which then become thymine after amplification) while leaving methylated
- cytosines unaffected.
- Then, each target region is interrogated with two probes: one binds to
- the original genomic sequence and interrogates the level of methylated
- DNA, and the other binds to the same sequence with all cytosines replaced
- by thymines and interrogates the level of unmethylated DNA.
- \end_layout
- \begin_layout Standard
- \begin_inset Float figure
- wide false
- sideways false
- status collapsed
- \begin_layout Plain Layout
- \align center
- \begin_inset Graphics
- filename graphics/methylvoom/sigmoid.pdf
- \end_inset
- \end_layout
- \begin_layout Plain Layout
- \begin_inset Caption Standard
- \begin_layout Plain Layout
- \begin_inset CommandInset label
- LatexCommand label
- name "fig:Sigmoid-beta-m-mapping"
- \end_inset
- \series bold
- Sigmoid shape of the mapping between β and M values
- \end_layout
- \end_inset
- \end_layout
- \end_inset
- \end_layout
- \begin_layout Standard
- After normalization, these two probe intensities are summarized in one of
- two ways, each with advantages and disadvantages.
- β
- \series bold
-
- \series default
- values, interpreted as fraction of DNA copies methylated, range from 0 to
- 1.
- β
- \series bold
-
- \series default
- values are conceptually easy to interpret, but the constrained range makes
- them unsuitable for linear modeling, and their error distributions are
- highly non-normal, which also frustrates linear modeling.
- M-values, interpreted as the log ratio of methylated to unmethylated copies,
- are computed by mapping the beta values from
- \begin_inset Formula $[0,1]$
- \end_inset
- onto
- \begin_inset Formula $(-\infty,+\infty)$
- \end_inset
- using a sigmoid curve (Figure
- \begin_inset CommandInset ref
- LatexCommand ref
- reference "fig:Sigmoid-beta-m-mapping"
- plural "false"
- caps "false"
- noprefix "false"
- \end_inset
- ).
- This transformation results in values with better statistical properties:
- the unconstrained range is suitable for linear modeling, and the error
- distributions are more normal.
- Hence, most linear modeling and other statistical testing on methylation
- arrays is performed using M-values.
- \end_layout
- \begin_layout Standard
- However, the steep slope of the sigmoid transformation near 0 and 1 tends
- to exaggerate small differences in β values near those extremes, which
- in turn amplifies the error in those values, leading to a U-shaped trend
- in the mean-variance curve: extreme values have higher variances than values
- near the middle.
- This mean-variance dependency must be accounted for when fitting the linear
- model for differential methylation, or else the variance will be systematically
- overestimated for probes with moderate M-values and underestimated for
- probes with extreme M-values.
- \end_layout
- \begin_layout Subsubsection
- The voom method for RNA-seq data can model M-value heteroskedasticity
- \end_layout
- \begin_layout Standard
- RNA-seq read count data are also known to show heteroskedasticity, and the
- voom method was developed for modeling this heteroskedasticity by estimating
- the mean-variance trend in the data and using this trend to assign precision
- weights to each observation
- \begin_inset CommandInset citation
- LatexCommand cite
- key "Law2013"
- literal "false"
- \end_inset
- .
- While methylation array data are not derived from counts and have a very
- different mean-variance relationship from that of typical RNA-seq data,
- the voom method makes no specific assumptions on the shape of the mean-variance
- relationship - it only assumes that the relationship is smooth enough to
- model using a lowess curve.
- Hence, the method is sufficiently general to model the mean-variance relationsh
- ip in methylation array data.
- However, the standard implementation of voom assumes that the input is
- given in raw read counts, and it must be adapted to run on methylation
- M-values.
- \end_layout
- \begin_layout Section
- Methods
- \end_layout
- \begin_layout Subsection
- Evaluation of classifier performance with different normalization methods
- \end_layout
- \begin_layout Standard
- For testing different expression microarray normalizations, a data set of
- 157 hgu133plus2 arrays was used, consisting of blood samples from kidney
- transplant patients whose grafts had been graded as TX, AR, or ADNR via
- biopsy and histology (46 TX, 69 AR, 42 ADNR)
- \begin_inset CommandInset citation
- LatexCommand cite
- key "Kurian2014"
- literal "true"
- \end_inset
- .
- Additionally, an external validation set of 75 samples was gathered from
- public GEO data (37 TX, 38 AR, no ADNR).
-
- \end_layout
- \begin_layout Standard
- \begin_inset Flex TODO Note (inline)
- status collapsed
- \begin_layout Plain Layout
- Find appropriate GEO identifiers if possible.
- Kurian 2014 says GSE15296, but this seems to be different data.
- I also need to look up the GEO accession for the external validation set.
- \end_layout
- \end_inset
- \end_layout
- \begin_layout Standard
- To evaluate the effect of each normalization on classifier performance,
- the same classifier training and validation procedure was used after each
- normalization method.
- The PAM package was used to train a nearest shrunken centroid classifier
- on the training set and select the appropriate threshold for centroid shrinking.
- Then the trained classifier was used to predict the class probabilities
- of each validation sample.
- From these class probabilities, ROC curves and area-under-curve (AUC) values
- were generated
- \begin_inset CommandInset citation
- LatexCommand cite
- key "Turck2011"
- literal "false"
- \end_inset
- .
- Each normalization was tested on two different sets of training and validation
- samples.
- For internal validation, the 115 TX and AR arrays in the internal set were
- split at random into two equal sized sets, one for training and one for
- validation, each containing the same numbers of TX and AR samples as the
- other set.
- For external validation, the full set of 115 TX and AR samples was used
- as a training set, and the 75 external TX and AR samples were used as the
- validation set.
- Thus, 2 ROC curves and AUC values were generated for each normalization
- method: one internal and one external.
- Because the external validation set contains no ADNR samples, only classificati
- on of TX and AR samples was considered.
- The ADNR samples were included during normalization but excluded from all
- classifier training and validation.
- This ensures that the performance on internal and external validation sets
- is directly comparable, since both are performing the same task: distinguishing
- TX from AR.
- \end_layout
- \begin_layout Standard
- \begin_inset Flex TODO Note (inline)
- status collapsed
- \begin_layout Plain Layout
- Summarize the get.best.threshold algorithm for PAM threshold selection
- \end_layout
- \end_inset
- \end_layout
- \begin_layout Standard
- Six different normalization strategies were evaluated.
- First, 2 well-known non-single-channel normalization methods were considered:
- RMA and dChip
- \begin_inset CommandInset citation
- LatexCommand cite
- key "Li2001,Irizarry2003a"
- literal "false"
- \end_inset
- .
- Since RMA produces expression values on a log2 scale and dChip does not,
- the values from dChip were log2 transformed after normalization.
- Next, RMA and dChip followed by Global Rank-invariant Set Normalization
- (GRSN) were tested
- \begin_inset CommandInset citation
- LatexCommand cite
- key "Pelz2008"
- literal "false"
- \end_inset
- .
- Post-processing with GRSN does not turn RMA or dChip into single-channel
- methods, but it may help mitigate batch effects and is therefore useful
- as a benchmark.
- Lastly, the two single-channel normalization methods, fRMA and SCAN, were
- tested
- \begin_inset CommandInset citation
- LatexCommand cite
- key "McCall2010,Piccolo2012"
- literal "false"
- \end_inset
- .
- When evaluating internal validation performance, only the 157 internal samples
- were normalized; when evaluating external validation performance, all 157
- internal samples and 75 external samples were normalized together.
- \end_layout
- \begin_layout Standard
- For demonstrating the problem with separate normalization of training and
- validation data, one additional normalization was performed: the internal
- and external sets were each normalized separately using RMA, and the normalized
- data for each set were combined into a single set with no further attempts
- at normalizing between the two sets.
- This represents approximately how RMA would have to be used in a clinical
- setting, where the samples to be classified are not available at the time
- the classifier is trained.
- \end_layout
- \begin_layout Subsection
- Generating custom fRMA vectors for hthgu133pluspm array platform
- \end_layout
- \begin_layout Standard
- In order to enable fRMA normalization for the hthgu133pluspm array platform,
- custom fRMA normalization vectors were trained using the frmaTools package
-
- \begin_inset CommandInset citation
- LatexCommand cite
- key "McCall2011"
- literal "false"
- \end_inset
- .
- Separate vectors were created for two types of samples: kidney graft biopsy
- samples and blood samples from graft recipients.
- For training, 341 kidney biopsy samples from 2 data sets and 965 blood
- samples from 5 data sets were used as the reference set.
- Arrays were grouped into batches based on unique combinations of sample
- type (blood or biopsy), diagnosis (TX, AR, etc.), data set, and scan date.
- Thus, each batch represents arrays of the same kind that were run together
- on the same day.
- For estimating the probe inverse variance weights, frmaTools requires equal-siz
- ed batches, which means a batch size must be chosen, and then batches smaller
- than that size must be ignored, while batches larger than the chosen size
- must be downsampled.
- This downsampling is performed randomly, so the sampling process is repeated
- 5 times and the resulting normalizations are compared to each other.
- \end_layout
- \begin_layout Standard
- To evaluate the consistency of the generated normalization vectors, the
- 5 fRMA vector sets generated from 5 random batch samplings were each used
- to normalize the same 20 randomly selected samples from each tissue.
- Then the normalized expression values for each probe on each array were
- compared across all normalizations.
- Each fRMA normalization was also compared against the normalized expression
- values obtained by normalizing the same 20 samples with ordinary RMA.
- \end_layout
- \begin_layout Subsection
- Modeling methylation array M-value heteroskedasticity in linear models with
- modified voom implementation
- \end_layout
- \begin_layout Standard
- \begin_inset Flex TODO Note (inline)
- status open
- \begin_layout Plain Layout
- Put code on Github and reference it.
- \end_layout
- \end_inset
- \end_layout
- \begin_layout Standard
- To investigate whether DNA methylation could be used to distinguish
- between healthy and dysfunctional transplants, a data set of 78 Illumina
- 450k methylation arrays from human kidney graft biopsies was analyzed for
- differential methylation between 4 transplant statuses: healthy transplant
- (TX), transplants undergoing acute rejection (AR), acute dysfunction with
- no rejection (ADNR), and chronic allograft nephropathy (CAN).
- The data consisted of 33 TX, 9 AR, 8 ADNR, and 28 CAN samples.
- The uneven group sizes are a result of taking the biopsy samples before
- the eventual fate of the transplant was known.
- Each sample was additionally annotated with a donor ID (anonymized), Sex,
- Age, Ethnicity, Creatinine Level, and Diabetes diagnosis (all samples
- in this data set came from patients with either Type 1 or Type 2 diabetes).
-
- \end_layout
- \begin_layout Standard
- The intensity data were first normalized using subset-quantile within array
- normalization (SWAN)
- \begin_inset CommandInset citation
- LatexCommand cite
- key "Maksimovic2012"
- literal "false"
- \end_inset
- , then converted to intensity ratios (beta values)
- \begin_inset CommandInset citation
- LatexCommand cite
- key "Aryee2014"
- literal "false"
- \end_inset
- .
- Any probes binding to loci that overlapped annotated SNPs were dropped,
- and the annotated sex of each sample was verified against the sex inferred
- from the ratio of median probe intensities for the X and Y chromosomes.
- Then, the ratios were transformed to M-values.
- \end_layout
- \begin_layout Standard
- \begin_inset Float table
- wide false
- sideways false
- status collapsed
- \begin_layout Plain Layout
- \begin_inset Tabular
- <lyxtabular version="3" rows="4" columns="6">
- <features tabularvalignment="middle">
- <column alignment="center" valignment="top">
- <column alignment="center" valignment="top">
- <column alignment="center" valignment="top">
- <column alignment="center" valignment="top">
- <column alignment="center" valignment="top">
- <column alignment="center" valignment="top">
- <row>
- <cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" usebox="none">
- \begin_inset Text
- \begin_layout Plain Layout
- Analysis
- \end_layout
- \end_inset
- </cell>
- <cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" usebox="none">
- \begin_inset Text
- \begin_layout Plain Layout
- patient random effect
- \end_layout
- \end_inset
- </cell>
- <cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" usebox="none">
- \begin_inset Text
- \begin_layout Plain Layout
- empirical Bayes
- \end_layout
- \end_inset
- </cell>
- <cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" usebox="none">
- \begin_inset Text
- \begin_layout Plain Layout
- SVA
- \end_layout
- \end_inset
- </cell>
- <cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" usebox="none">
- \begin_inset Text
- \begin_layout Plain Layout
- sample weights
- \end_layout
- \end_inset
- </cell>
- <cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" rightline="true" usebox="none">
- \begin_inset Text
- \begin_layout Plain Layout
- voom
- \end_layout
- \end_inset
- </cell>
- </row>
- <row>
- <cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
- \begin_inset Text
- \begin_layout Plain Layout
- A
- \end_layout
- \end_inset
- </cell>
- <cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
- \begin_inset Text
- \begin_layout Plain Layout
- Yes
- \end_layout
- \end_inset
- </cell>
- <cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
- \begin_inset Text
- \begin_layout Plain Layout
- Yes
- \end_layout
- \end_inset
- </cell>
- <cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
- \begin_inset Text
- \begin_layout Plain Layout
- No
- \end_layout
- \end_inset
- </cell>
- <cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
- \begin_inset Text
- \begin_layout Plain Layout
- No
- \end_layout
- \end_inset
- </cell>
- <cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
- \begin_inset Text
- \begin_layout Plain Layout
- No
- \end_layout
- \end_inset
- </cell>
- </row>
- <row>
- <cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
- \begin_inset Text
- \begin_layout Plain Layout
- B
- \end_layout
- \end_inset
- </cell>
- <cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
- \begin_inset Text
- \begin_layout Plain Layout
- Yes
- \end_layout
- \end_inset
- </cell>
- <cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
- \begin_inset Text
- \begin_layout Plain Layout
- Yes
- \end_layout
- \end_inset
- </cell>
- <cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
- \begin_inset Text
- \begin_layout Plain Layout
- Yes
- \end_layout
- \end_inset
- </cell>
- <cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
- \begin_inset Text
- \begin_layout Plain Layout
- Yes
- \end_layout
- \end_inset
- </cell>
- <cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
- \begin_inset Text
- \begin_layout Plain Layout
- No
- \end_layout
- \end_inset
- </cell>
- </row>
- <row>
- <cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" usebox="none">
- \begin_inset Text
- \begin_layout Plain Layout
- C
- \end_layout
- \end_inset
- </cell>
- <cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" usebox="none">
- \begin_inset Text
- \begin_layout Plain Layout
- Yes
- \end_layout
- \end_inset
- </cell>
- <cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" usebox="none">
- \begin_inset Text
- \begin_layout Plain Layout
- Yes
- \end_layout
- \end_inset
- </cell>
- <cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" usebox="none">
- \begin_inset Text
- \begin_layout Plain Layout
- Yes
- \end_layout
- \end_inset
- </cell>
- <cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" usebox="none">
- \begin_inset Text
- \begin_layout Plain Layout
- Yes
- \end_layout
- \end_inset
- </cell>
- <cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" rightline="true" usebox="none">
- \begin_inset Text
- \begin_layout Plain Layout
- Yes
- \end_layout
- \end_inset
- </cell>
- </row>
- </lyxtabular>
- \end_inset
- \end_layout
- \begin_layout Plain Layout
- \begin_inset Caption Standard
- \begin_layout Plain Layout
- \series bold
- \begin_inset CommandInset label
- LatexCommand label
- name "tab:Summary-of-meth-analysis"
- \end_inset
- Summary of analysis variants for methylation array data.
-
- \series default
- Each analysis included a different set of steps to adjust or account for
- various systematic features of the data.
- See the text for a more detailed explanation of each step.
- \end_layout
- \end_inset
- \end_layout
- \end_inset
- \end_layout
- \begin_layout Standard
- From the M-values, a series of parallel analyses was performed, each adding
- additional steps into the model fit to accommodate a feature of the data
- (see Table
- \begin_inset CommandInset ref
- LatexCommand ref
- reference "tab:Summary-of-meth-analysis"
- plural "false"
- caps "false"
- noprefix "false"
- \end_inset
- ).
- For analysis A, a
- \begin_inset Quotes eld
- \end_inset
- basic
- \begin_inset Quotes erd
- \end_inset
- linear modeling analysis was performed, compensating for known confounders
- by including terms for the factor of interest (transplant status) as well
- as the known biological confounders: sex, age, ethnicity, and diabetes.
- Since some samples came from the same patients at different times, the
- intra-patient correlation was modeled as a random effect, estimating a
- shared correlation value across all probes
- \begin_inset CommandInset citation
- LatexCommand cite
- key "Smyth2005a"
- literal "false"
- \end_inset
- .
- Then the linear model was fit, and the variance was modeled using empirical
- Bayes squeezing toward the mean-variance trend
- \begin_inset CommandInset citation
- LatexCommand cite
- key "Ritchie2015"
- literal "false"
- \end_inset
- .
- Finally, t-tests or F-tests were performed as appropriate for each test:
- t-tests for single contrasts, and F-tests for multiple contrasts.
- P-values were corrected for multiple testing using the Benjamini-Hochberg
- procedure for FDR control
- \begin_inset CommandInset citation
- LatexCommand cite
- key "Benjamini1995"
- literal "false"
- \end_inset
- .
- \end_layout
- \begin_layout Standard
- For analysis B, surrogate variable analysis (SVA) was used to infer
- additional unobserved sources of heterogeneity in the data
- \begin_inset CommandInset citation
- LatexCommand cite
- key "Leek2007"
- literal "false"
- \end_inset
- .
- These surrogate variables were added to the design matrix before fitting
- the linear model.
- In addition, sample quality weights were estimated from the data and used
- during linear modeling to down-weight the contribution of highly variable
- arrays while increasing the weight to arrays with lower variability
- \begin_inset CommandInset citation
- LatexCommand cite
- key "Ritchie2006"
- literal "false"
- \end_inset
- .
- The remainder of the analysis proceeded as in analysis A.
- For analysis C, the voom method was adapted to run on methylation array
- data and used to model and correct for the mean-variance trend using individual
- observation weights
- \begin_inset CommandInset citation
- LatexCommand cite
- key "Law2013"
- literal "false"
- \end_inset
- , which were combined with the sample weights
- \begin_inset CommandInset citation
- LatexCommand cite
- key "Liu2015"
- literal "false"
- \end_inset
- .
- Each time weights were used, they were estimated once before estimating
- the random effect correlation value, and then the weights were re-estimated
- taking the random effect into account.
- The remainder of the analysis proceeded as in analysis B.
- \end_layout
- \begin_layout Section
- Results
- \end_layout
- \begin_layout Standard
- \begin_inset Flex TODO Note (inline)
- status open
- \begin_layout Plain Layout
- Improve subsection titles in this section
- \end_layout
- \end_inset
- \end_layout
- \begin_layout Subsection
- fRMA eliminates unwanted dependence of classifier training on normalization
- strategy caused by RMA
- \end_layout
- \begin_layout Standard
- \begin_inset Flex TODO Note (inline)
- status open
- \begin_layout Plain Layout
- Write figure legends
- \end_layout
- \end_inset
- \end_layout
- \begin_layout Subsubsection
- Separate normalization with RMA introduces unwanted biases in classification
- \end_layout
- \begin_layout Standard
- \begin_inset Float figure
- wide false
- sideways false
- status collapsed
- \begin_layout Plain Layout
- \align center
- \begin_inset Graphics
- filename graphics/PAM/predplot.pdf
- width 100col%
- groupId colwidth
- \end_inset
- \end_layout
- \begin_layout Plain Layout
- \begin_inset Caption Standard
- \begin_layout Plain Layout
- \begin_inset CommandInset label
- LatexCommand label
- name "fig:Classifier-probabilities-RMA"
- \end_inset
- \series bold
- Classifier probabilities on validation samples when normalized with RMA
- together vs.
- separately.
- \end_layout
- \end_inset
- \end_layout
- \end_inset
- \end_layout
- \begin_layout Standard
- To demonstrate the problem with non-single-channel normalization methods,
- we considered the problem of training a classifier to distinguish TX from
- AR using the samples from the internal set as training data, evaluating
- performance on the external set.
- First, training and evaluation were performed after normalizing all array
- samples together as a single set using RMA, and second, the internal samples
- were normalized separately from the external samples and the training and
- evaluation were repeated.
- For each sample in the validation set, the classifier probabilities from
- both classifiers were plotted against each other (Fig.
-
- \begin_inset CommandInset ref
- LatexCommand ref
- reference "fig:Classifier-probabilities-RMA"
- plural "false"
- caps "false"
- noprefix "false"
- \end_inset
- ).
- As expected, separate normalization biases the classifier probabilities,
- resulting in several misclassifications.
- In this case, the bias from separate normalization causes the classifier
- to assign a lower probability of AR to every sample.
-
- \end_layout
- \begin_layout Subsubsection
- fRMA and SCAN maintain classification performance while eliminating
- dependence on normalization strategy
- \end_layout
- \begin_layout Standard
- \begin_inset Float figure
- placement tb
- wide false
- sideways false
- status collapsed
- \begin_layout Plain Layout
- \align center
- \begin_inset Graphics
- filename graphics/PAM/ROC-TXvsAR-internal.pdf
- width 100col%
- groupId colwidth
- \end_inset
- \end_layout
- \begin_layout Plain Layout
- \begin_inset Caption Standard
- \begin_layout Plain Layout
- \series bold
- \begin_inset CommandInset label
- LatexCommand label
- name "fig:ROC-PAM-int"
- \end_inset
- ROC curves for PAM on internal validation data using different normalization
- strategies
- \end_layout
- \end_inset
- \end_layout
- \end_inset
- \end_layout
- \begin_layout Standard
- \begin_inset Float table
- wide false
- sideways false
- status collapsed
- \begin_layout Plain Layout
- \align center
- \begin_inset Tabular
- <lyxtabular version="3" rows="7" columns="4">
- <features tabularvalignment="middle">
- <column alignment="center" valignment="top">
- <column alignment="center" valignment="top">
- <column alignment="center" valignment="top">
- <column alignment="center" valignment="top">
- <row>
- <cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" usebox="none">
- \begin_inset Text
- \begin_layout Plain Layout
- \family roman
- \series medium
- \shape up
- \size normal
- \emph off
- \bar no
- \strikeout off
- \xout off
- \uuline off
- \uwave off
- \noun off
- \color none
- Normalization
- \end_layout
- \end_inset
- </cell>
- <cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" usebox="none">
- \begin_inset Text
- \begin_layout Plain Layout
- Single-channel?
- \end_layout
- \end_inset
- </cell>
- <cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" usebox="none">
- \begin_inset Text
- \begin_layout Plain Layout
- \family roman
- \series medium
- \shape up
- \size normal
- \emph off
- \bar no
- \strikeout off
- \xout off
- \uuline off
- \uwave off
- \noun off
- \color none
- Internal Val.
- AUC
- \end_layout
- \end_inset
- </cell>
- <cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" rightline="true" usebox="none">
- \begin_inset Text
- \begin_layout Plain Layout
- External Val.
- AUC
- \end_layout
- \end_inset
- </cell>
- </row>
- <row>
- <cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
- \begin_inset Text
- \begin_layout Plain Layout
- \family roman
- \series medium
- \shape up
- \size normal
- \emph off
- \bar no
- \strikeout off
- \xout off
- \uuline off
- \uwave off
- \noun off
- \color none
- RMA
- \end_layout
- \end_inset
- </cell>
- <cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
- \begin_inset Text
- \begin_layout Plain Layout
- No
- \end_layout
- \end_inset
- </cell>
- <cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
- \begin_inset Text
- \begin_layout Plain Layout
- \family roman
- \series medium
- \shape up
- \size normal
- \emph off
- \bar no
- \strikeout off
- \xout off
- \uuline off
- \uwave off
- \noun off
- \color none
- 0.852
- \end_layout
- \end_inset
- </cell>
- <cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
- \begin_inset Text
- \begin_layout Plain Layout
- \family roman
- \series medium
- \shape up
- \size normal
- \emph off
- \bar no
- \strikeout off
- \xout off
- \uuline off
- \uwave off
- \noun off
- \color none
- 0.713
- \end_layout
- \end_inset
- </cell>
- </row>
- <row>
- <cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
- \begin_inset Text
- \begin_layout Plain Layout
- \family roman
- \series medium
- \shape up
- \size normal
- \emph off
- \bar no
- \strikeout off
- \xout off
- \uuline off
- \uwave off
- \noun off
- \color none
- dChip
- \end_layout
- \end_inset
- </cell>
- <cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
- \begin_inset Text
- \begin_layout Plain Layout
- No
- \end_layout
- \end_inset
- </cell>
- <cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
- \begin_inset Text
- \begin_layout Plain Layout
- \family roman
- \series medium
- \shape up
- \size normal
- \emph off
- \bar no
- \strikeout off
- \xout off
- \uuline off
- \uwave off
- \noun off
- \color none
- 0.891
- \end_layout
- \end_inset
- </cell>
- <cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
- \begin_inset Text
- \begin_layout Plain Layout
- \family roman
- \series medium
- \shape up
- \size normal
- \emph off
- \bar no
- \strikeout off
- \xout off
- \uuline off
- \uwave off
- \noun off
- \color none
- 0.657
- \end_layout
- \end_inset
- </cell>
- </row>
- <row>
- <cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
- \begin_inset Text
- \begin_layout Plain Layout
- \family roman
- \series medium
- \shape up
- \size normal
- \emph off
- \bar no
- \strikeout off
- \xout off
- \uuline off
- \uwave off
- \noun off
- \color none
- RMA + GRSN
- \end_layout
- \end_inset
- </cell>
- <cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
- \begin_inset Text
- \begin_layout Plain Layout
- No
- \end_layout
- \end_inset
- </cell>
- <cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
- \begin_inset Text
- \begin_layout Plain Layout
- \family roman
- \series medium
- \shape up
- \size normal
- \emph off
- \bar no
- \strikeout off
- \xout off
- \uuline off
- \uwave off
- \noun off
- \color none
- 0.816
- \end_layout
- \end_inset
- </cell>
- <cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
- \begin_inset Text
- \begin_layout Plain Layout
- \family roman
- \series medium
- \shape up
- \size normal
- \emph off
- \bar no
- \strikeout off
- \xout off
- \uuline off
- \uwave off
- \noun off
- \color none
- 0.750
- \end_layout
- \end_inset
- </cell>
- </row>
- <row>
- <cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
- \begin_inset Text
- \begin_layout Plain Layout
- \family roman
- \series medium
- \shape up
- \size normal
- \emph off
- \bar no
- \strikeout off
- \xout off
- \uuline off
- \uwave off
- \noun off
- \color none
- dChip + GRSN
- \end_layout
- \end_inset
- </cell>
- <cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
- \begin_inset Text
- \begin_layout Plain Layout
- No
- \end_layout
- \end_inset
- </cell>
- <cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
- \begin_inset Text
- \begin_layout Plain Layout
- \family roman
- \series medium
- \shape up
- \size normal
- \emph off
- \bar no
- \strikeout off
- \xout off
- \uuline off
- \uwave off
- \noun off
- \color none
- 0.875
- \end_layout
- \end_inset
- </cell>
- <cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
- \begin_inset Text
- \begin_layout Plain Layout
- \family roman
- \series medium
- \shape up
- \size normal
- \emph off
- \bar no
- \strikeout off
- \xout off
- \uuline off
- \uwave off
- \noun off
- \color none
- 0.642
- \end_layout
- \end_inset
- </cell>
- </row>
- <row>
- <cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
- \begin_inset Text
- \begin_layout Plain Layout
- \family roman
- \series medium
- \shape up
- \size normal
- \emph off
- \bar no
- \strikeout off
- \xout off
- \uuline off
- \uwave off
- \noun off
- \color none
- fRMA
- \end_layout
- \end_inset
- </cell>
- <cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
- \begin_inset Text
- \begin_layout Plain Layout
- Yes
- \end_layout
- \end_inset
- </cell>
- <cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
- \begin_inset Text
- \begin_layout Plain Layout
- \family roman
- \series medium
- \shape up
- \size normal
- \emph off
- \bar no
- \strikeout off
- \xout off
- \uuline off
- \uwave off
- \noun off
- \color none
- 0.863
- \end_layout
- \end_inset
- </cell>
- <cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
- \begin_inset Text
- \begin_layout Plain Layout
- \family roman
- \series medium
- \shape up
- \size normal
- \emph off
- \bar no
- \strikeout off
- \xout off
- \uuline off
- \uwave off
- \noun off
- \color none
- 0.718
- \end_layout
- \end_inset
- </cell>
- </row>
- <row>
- <cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" usebox="none">
- \begin_inset Text
- \begin_layout Plain Layout
- \family roman
- \series medium
- \shape up
- \size normal
- \emph off
- \bar no
- \strikeout off
- \xout off
- \uuline off
- \uwave off
- \noun off
- \color none
- SCAN
- \end_layout
- \end_inset
- </cell>
- <cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" usebox="none">
- \begin_inset Text
- \begin_layout Plain Layout
- Yes
- \end_layout
- \end_inset
- </cell>
- <cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" usebox="none">
- \begin_inset Text
- \begin_layout Plain Layout
- \family roman
- \series medium
- \shape up
- \size normal
- \emph off
- \bar no
- \strikeout off
- \xout off
- \uuline off
- \uwave off
- \noun off
- \color none
- 0.853
- \end_layout
- \end_inset
- </cell>
- <cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" rightline="true" usebox="none">
- \begin_inset Text
- \begin_layout Plain Layout
- \family roman
- \series medium
- \shape up
- \size normal
- \emph off
- \bar no
- \strikeout off
- \xout off
- \uuline off
- \uwave off
- \noun off
- \color none
- 0.689
- \end_layout
- \end_inset
- </cell>
- </row>
- </lyxtabular>
- \end_inset
- \end_layout
- \begin_layout Plain Layout
- \begin_inset Caption Standard
- \begin_layout Plain Layout
- \begin_inset CommandInset label
- LatexCommand label
- name "tab:AUC-PAM"
- \end_inset
- \series bold
- AUC values for internal and external validation with 6 different normalization
- strategies.
- \series default
- Only fRMA and SCAN are single-channel normalizations.
- The other 4 normalizations are for comparison.
- \end_layout
- \end_inset
- \end_layout
- \end_inset
- \end_layout
- \begin_layout Standard
- For internal validation, the 6 methods' AUC values ranged from 0.816 to 0.891,
- as shown in Table
- \begin_inset CommandInset ref
- LatexCommand ref
- reference "tab:AUC-PAM"
- plural "false"
- caps "false"
- noprefix "false"
- \end_inset
- .
- Among the non-single-channel normalizations, dChip outperformed RMA, while
- GRSN reduced the AUC values for both dChip and RMA.
- Both single-channel methods, fRMA and SCAN, slightly outperformed RMA,
- with fRMA ahead of SCAN.
- However, the difference between RMA and fRMA is still quite small.
- Figure
- \begin_inset CommandInset ref
- LatexCommand ref
- reference "fig:ROC-PAM-int"
- plural "false"
- caps "false"
- noprefix "false"
- \end_inset
- shows that the ROC curves for RMA, dChip, and fRMA look very similar and
- relatively smooth, while both GRSN curves and the curve for SCAN have a
- more jagged appearance.
- \end_layout
- \begin_layout Standard
- \begin_inset Float figure
- placement tb
- wide false
- sideways false
- status collapsed
- \begin_layout Plain Layout
- \align center
- \begin_inset Graphics
- filename graphics/PAM/ROC-TXvsAR-external.pdf
- width 100col%
- groupId colwidth
- \end_inset
- \end_layout
- \begin_layout Plain Layout
- \begin_inset Caption Standard
- \begin_layout Plain Layout
- \series bold
- \begin_inset CommandInset label
- LatexCommand label
- name "fig:ROC-PAM-ext"
- \end_inset
- ROC curves for PAM on external validation data using different normalization
- strategies
- \end_layout
- \end_inset
- \end_layout
- \end_inset
- \end_layout
- \begin_layout Standard
- For external validation, as expected, all the AUC values are lower than
- the internal validations, ranging from 0.642 to 0.750 (Table
- \begin_inset CommandInset ref
- LatexCommand ref
- reference "tab:AUC-PAM"
- plural "false"
- caps "false"
- noprefix "false"
- \end_inset
- ).
- With or without GRSN, RMA shows its dominance over dChip in this more challengi
- ng test.
- Unlike in the internal validation, GRSN actually improves the classifier
- performance for RMA, although it does not for dChip.
- Once again, both single-channel methods perform about on par with RMA,
- with fRMA performing slightly better and SCAN performing a bit worse.
- Figure
- \begin_inset CommandInset ref
- LatexCommand ref
- reference "fig:ROC-PAM-ext"
- plural "false"
- caps "false"
- noprefix "false"
- \end_inset
- shows the ROC curves for the external validation test.
- As expected, none of them are as clean-looking as the internal validation
- ROC curves.
- The curves for RMA, RMA+GRSN, and fRMA all look similar, while the other
- curves look more divergent.
- \end_layout
- \begin_layout Standard
- \begin_inset ERT
- status collapsed
- \begin_layout Plain Layout
- \backslash
- FloatBarrier
- \end_layout
- \end_inset
- \end_layout
- \begin_layout Subsection
- fRMA with custom-generated vectors enables normalization on hthgu133pluspm
- \end_layout
- \begin_layout Standard
- \begin_inset Float figure
- placement tb
- wide false
- sideways false
- status collapsed
- \begin_layout Plain Layout
- \align center
- \begin_inset Graphics
- filename graphics/frma-pax-bx/batchsize_batches.pdf
- \end_inset
- \end_layout
- \begin_layout Plain Layout
- \begin_inset Caption Standard
- \begin_layout Plain Layout
- \begin_inset CommandInset label
- LatexCommand label
- name "fig:batch-size-batches"
- \end_inset
- \series bold
- Effect of batch size selection on number of batches included in fRMA probe
- weight learning.
-
- \series default
- For batch sizes ranging from 3 to 15, the number of batches with at least
- that many samples was plotted for biopsy (BX) and blood (PAX) samples.
- The selected batch size, 5, is marked with a dotted vertical line.
- \end_layout
- \end_inset
- \end_layout
- \end_inset
- \end_layout
- \begin_layout Standard
- \begin_inset Float figure
- placement tb
- wide false
- sideways false
- status collapsed
- \begin_layout Plain Layout
- \align center
- \begin_inset Graphics
- filename graphics/frma-pax-bx/batchsize_samples.pdf
- \end_inset
- \end_layout
- \begin_layout Plain Layout
- \begin_inset Caption Standard
- \begin_layout Plain Layout
- \begin_inset CommandInset label
- LatexCommand label
- name "fig:batch-size-samples"
- \end_inset
- \series bold
- Effect of batch size selection on number of samples included in fRMA probe
- weight learning.
-
- \series default
- For batch sizes ranging from 3 to 15, the number of samples included in
- probe weight training was plotted for biopsy (BX) and blood (PAX) samples.
- The selected batch size, 5, is marked with a dotted vertical line.
- \end_layout
- \end_inset
- \end_layout
- \end_inset
- \end_layout
- \begin_layout Standard
- In order to enable use of fRMA to normalize hthgu133pluspm, a custom set
- of fRMA vectors was created.
- First, an appropriate batch size was chosen by looking at the number of
- batches and number of samples included as a function of batch size (Figures
-
- \begin_inset CommandInset ref
- LatexCommand ref
- reference "fig:batch-size-batches"
- plural "false"
- caps "false"
- noprefix "false"
- \end_inset
- and
- \begin_inset CommandInset ref
- LatexCommand ref
- reference "fig:batch-size-samples"
- plural "false"
- caps "false"
- noprefix "false"
- \end_inset
- , respectively).
- For a given batch size, all batches with fewer samples than the chosen
- size must be ignored during training, while larger batches must be randomly
- downsampled to the chosen size.
- Hence, the number of samples included for a given batch size equals the
- batch size times the number of batches with at least that many samples.
- From Figure
- \begin_inset CommandInset ref
- LatexCommand ref
- reference "fig:batch-size-samples"
- plural "false"
- caps "false"
- noprefix "false"
- \end_inset
- , it is apparent that a batch size of 8 maximizes the number of samples
- included in training.
- Increasing the batch size beyond this causes too many smaller batches to
- be excluded, reducing the total number of samples for both tissue types.
- However, a batch size of 8 is not necessarily optimal.
- The article introducing frmaTools concluded that it was highly advantageous
- to use a smaller batch size in order to include more batches, even at the
- expense of including fewer total samples in training
- \begin_inset CommandInset citation
- LatexCommand cite
- key "McCall2011"
- literal "false"
- \end_inset
- .
- To strike an appropriate balance between more batches and more samples,
- a batch size of 5 was chosen.
- For both blood and biopsy samples, this increased the number of batches
- included by 10, with only a modest reduction in the number of samples compared
- to a batch size of 8.
- With a batch size of 5, 26 batches of biopsy samples and 46 batches of
- blood samples were available.
- \end_layout
- \begin_layout Standard
- \begin_inset Float figure
- wide false
- sideways false
- status collapsed
- \begin_layout Plain Layout
- \align center
- \begin_inset Graphics
- filename graphics/frma-pax-bx/M-BX-violin.pdf
- lyxscale 40
- height 80theight%
- groupId m-violin
- \end_inset
- \end_layout
- \begin_layout Plain Layout
- \begin_inset Caption Standard
- \begin_layout Plain Layout
- \begin_inset CommandInset label
- LatexCommand label
- name "fig:m-bx-violin"
- \end_inset
- \series bold
- Violin plot of log ratios between normalizations for 20 biopsy samples.
-
- \series default
- Each of 20 randomly selected biopsy samples was normalized with RMA and
- with 5 different sets of fRMA vectors.
- This shows the distribution of log ratios between normalized expression
- values, aggregated across all 20 arrays.
- \end_layout
- \end_inset
- \end_layout
- \end_inset
- \end_layout
- \begin_layout Standard
- Since fRMA training requires equal-size batches, larger batches are downsampled
- randomly.
- This introduces a nondeterministic step in the generation of normalization
- vectors.
- To show that this randomness does not substantially change the outcome,
- the random downsampling and subsequent vector learning were repeated 5 times,
- with a different random seed each time.
- 20 samples were selected at random as a test set and normalized with each
- of the 5 sets of fRMA normalization vectors as well as ordinary RMA, and
- the normalized expression values were compared across normalizations.
- Figure
- \begin_inset CommandInset ref
- LatexCommand ref
- reference "fig:m-bx-violin"
- plural "false"
- caps "false"
- noprefix "false"
- \end_inset
- shows a summary of these comparisons for biopsy samples.
- Comparing RMA to each of the 5 fRMA normalizations, the distribution of
- log ratios is somewhat wide, indicating that the normalizations disagree
- on the expression values of a fair number of probe sets.
- In contrast, in comparisons of fRMA against fRMA, the vast majority of probe
- sets have very small log ratios, indicating a very high agreement between
- the normalized values generated by the two normalizations.
- This shows that the fRMA normalization's behavior is not very sensitive
- to the random downsampling of larger batches during training.
- \end_layout
- \begin_layout Standard
- \begin_inset Float figure
- wide false
- sideways false
- status collapsed
- \begin_layout Plain Layout
- \align center
- \begin_inset Graphics
- filename graphics/frma-pax-bx/MA-BX-RMA.fRMA.pdf
- lyxscale 50
- width 100text%
- groupId ma-frma
- \end_inset
- \end_layout
- \begin_layout Plain Layout
- \begin_inset Caption Standard
- \begin_layout Plain Layout
- \begin_inset CommandInset label
- LatexCommand label
- name "fig:ma-bx-rma-frma"
- \end_inset
- \series bold
- Representative MA plot comparing RMA against fRMA for 20 biopsy samples.
-
- \series default
- Averages and log ratios were computed for every probe in each of 20 biopsy
- samples between RMA normalization and fRMA.
- Density of points is represented by darkness of shading, and individual
- outlier points are plotted.
- \end_layout
- \end_inset
- \end_layout
- \end_inset
- \end_layout
- \begin_layout Standard
- \begin_inset Float figure
- wide false
- sideways false
- status collapsed
- \begin_layout Plain Layout
- \align center
- \begin_inset Graphics
- filename graphics/frma-pax-bx/MA-BX-fRMA.fRMA.pdf
- lyxscale 50
- width 100text%
- groupId ma-frma
- \end_inset
- \end_layout
- \begin_layout Plain Layout
- \begin_inset Caption Standard
- \begin_layout Plain Layout
- \begin_inset CommandInset label
- LatexCommand label
- name "fig:ma-bx-frma-frma"
- \end_inset
- \series bold
- Representative MA plot comparing different fRMA vectors for 20 biopsy samples.
-
- \series default
- Averages and log ratios were computed for every probe in each of 20 biopsy
- samples between fRMA normalizations using vectors from two different batch
- samplings.
- Density of points is represented by darkness of shading, and individual
- outlier points are plotted.
- \end_layout
- \end_inset
- \end_layout
- \end_inset
- \end_layout
- \begin_layout Standard
- Figure
- \begin_inset CommandInset ref
- LatexCommand ref
- reference "fig:ma-bx-rma-frma"
- plural "false"
- caps "false"
- noprefix "false"
- \end_inset
- shows an MA plot of the RMA-normalized values against the fRMA-normalized
- values for the same probe sets and arrays, corresponding to the first row
- of Figure
- \begin_inset CommandInset ref
- LatexCommand ref
- reference "fig:m-bx-violin"
- plural "false"
- caps "false"
- noprefix "false"
- \end_inset
- .
- This MA plot shows that not only is there a wide distribution of M-values,
- but the trend of M-values is dependent on the average normalized intensity.
- This is expected, since the overall trend represents the differences in
- the quantile normalization step.
- When running RMA, only the quantiles for these specific 20 arrays are used,
- while for fRMA the quantile distribution is taken from all arrays used
- in training.
- Figure
- \begin_inset CommandInset ref
- LatexCommand ref
- reference "fig:ma-bx-frma-frma"
- plural "false"
- caps "false"
- noprefix "false"
- \end_inset
- shows a similar MA plot comparing 2 different fRMA normalizations, correspondin
- g to the 6th row of Figure
- \begin_inset CommandInset ref
- LatexCommand ref
- reference "fig:m-bx-violin"
- plural "false"
- caps "false"
- noprefix "false"
- \end_inset
- .
- The MA plot is very tightly centered around zero with no visible trend.
- Figures
- \begin_inset CommandInset ref
- LatexCommand ref
- reference "fig:m-pax-violin"
- plural "false"
- caps "false"
- noprefix "false"
- \end_inset
- ,
- \begin_inset CommandInset ref
- LatexCommand ref
- reference "fig:MA-PAX-rma-frma"
- plural "false"
- caps "false"
- noprefix "false"
- \end_inset
- , and
- \begin_inset CommandInset ref
- LatexCommand ref
- reference "fig:MA-PAX-frma-frma"
- plural "false"
- caps "false"
- noprefix "false"
- \end_inset
- show exactly the same information for the blood samples, once again comparing
- the normalized expression values between normalizations for all probe sets
- across 20 randomly selected test arrays.
- Once again, there is a wider distribution of log ratios between RMA-normalized
- values and fRMA-normalized values, and a much tighter distribution when comparing
- different fRMA normalizations to each other, indicating that the fRMA training
- process is robust to random batch downsampling for the blood samples as
- well.
- \end_layout
- \begin_layout Standard
- \begin_inset Float figure
- wide false
- sideways false
- status collapsed
- \begin_layout Plain Layout
- \align center
- \begin_inset Graphics
- filename graphics/frma-pax-bx/M-PAX-violin.pdf
- lyxscale 40
- height 80theight%
- groupId m-violin
- \end_inset
- \end_layout
- \begin_layout Plain Layout
- \begin_inset Caption Standard
- \begin_layout Plain Layout
- \begin_inset CommandInset label
- LatexCommand label
- name "fig:m-pax-violin"
- \end_inset
- \series bold
- Violin plot of log ratios between normalizations for 20 blood samples.
-
- \series default
- Each of 20 randomly selected blood samples was normalized with RMA and with
- 5 different sets of fRMA vectors.
- This shows the distribution of log ratios between normalized expression
- values, aggregated across all 20 arrays.
- \end_layout
- \end_inset
- \end_layout
- \end_inset
- \end_layout
- \begin_layout Standard
- \begin_inset Float figure
- wide false
- sideways false
- status collapsed
- \begin_layout Plain Layout
- \align center
- \begin_inset Graphics
- filename graphics/frma-pax-bx/MA-PAX-RMA.fRMA.pdf
- lyxscale 50
- width 100text%
- groupId ma-frma
- \end_inset
- \end_layout
- \begin_layout Plain Layout
- \begin_inset Caption Standard
- \begin_layout Plain Layout
- \begin_inset CommandInset label
- LatexCommand label
- name "fig:MA-PAX-rma-frma"
- \end_inset
- \series bold
- Representative MA plot comparing RMA against fRMA for 20 blood samples.
-
- \series default
- Averages and log ratios were computed for every probe in each of 20 blood
- samples between RMA normalization and fRMA.
- Density of points is represented by darkness of shading, and individual
- outlier points are plotted.
- \end_layout
- \end_inset
- \end_layout
- \begin_layout Plain Layout
- \end_layout
- \end_inset
- \end_layout
- \begin_layout Standard
- \begin_inset Float figure
- wide false
- sideways false
- status collapsed
- \begin_layout Plain Layout
- \align center
- \begin_inset Graphics
- filename graphics/frma-pax-bx/MA-PAX-fRMA.fRMA.pdf
- lyxscale 50
- width 100text%
- groupId ma-frma
- \end_inset
- \end_layout
- \begin_layout Plain Layout
- \begin_inset Caption Standard
- \begin_layout Plain Layout
- \begin_inset CommandInset label
- LatexCommand label
- name "fig:MA-PAX-frma-frma"
- \end_inset
- \series bold
- Representative MA plot comparing different fRMA vectors for 20 blood samples.
-
- \series default
- Averages and log ratios were computed for every probe in each of 20 blood
- samples between fRMA normalizations using vectors from two different batch
- samplings.
- Density of points is represented by darkness of shading, and individual
- outlier points are plotted.
- \end_layout
- \end_inset
- \end_layout
- \end_inset
- \end_layout
- \begin_layout Standard
- \begin_inset ERT
- status collapsed
- \begin_layout Plain Layout
- \backslash
- FloatBarrier
- \end_layout
- \end_inset
- \end_layout
- \begin_layout Subsection
- SVA, voom, and array weights improve model fit for methylation array data
- \end_layout
- \begin_layout Standard
- \begin_inset Float figure
- wide false
- sideways false
- status collapsed
- \begin_layout Plain Layout
- \align center
- \begin_inset Flex TODO Note (inline)
- status open
- \begin_layout Plain Layout
- Fix axis labels:
- \begin_inset Quotes eld
- \end_inset
- log2 M-value
- \begin_inset Quotes erd
- \end_inset
- is redundant because M-values are already log scale
- \end_layout
- \end_inset
- \end_layout
- \begin_layout Plain Layout
- \align center
- \begin_inset Graphics
- filename graphics/methylvoom/unadj.dupcor/meanvar-trends-PAGE1-CROP-RASTER.png
- lyxscale 15
- width 100col%
- groupId raster-600ppi
- \end_inset
- \end_layout
- \begin_layout Plain Layout
- \begin_inset Caption Standard
- \begin_layout Plain Layout
- \series bold
- \begin_inset CommandInset label
- LatexCommand label
- name "fig:meanvar-basic"
- \end_inset
- Mean-variance trend for analysis A.
-
- \series default
- The log2(standard deviation) for each probe is plotted against the probe's
- average M-value across all samples as a black point, with some transparency
- to make overplotting more visible, since there are about 450,000 points.
- Density of points is also indicated by the dark blue contour lines.
- The prior variance trend estimated by eBayes is shown in light blue, while
- the lowess trend of the points is shown in red.
- \end_layout
- \end_inset
- \end_layout
- \end_inset
- \end_layout
- \begin_layout Standard
- Figure
- \begin_inset CommandInset ref
- LatexCommand ref
- reference "fig:meanvar-basic"
- plural "false"
- caps "false"
- noprefix "false"
- \end_inset
- shows the relationship between the mean M-value and the standard deviation
- calculated for each probe in the methylation array data set.
- A few features of the data are apparent.
- First, the data are very strongly bimodal, with peaks in the density around
- M-values of +4 and -4.
- These modes correspond to methylation sites that are nearly 100% methylated
- and nearly 100% unmethylated, respectively.
- The strong bimodality indicates that a majority of probes interrogate sites
- that fall into one of these two categories.
- The points in between these modes represent sites that are either partially
- methylated in many samples, or are fully methylated in some samples and
- fully unmethylated in other samples, or some combination.
- The next visible feature of the data is the W-shaped variance trend.
- The upticks in the variance trend on either side are expected, based on
- the sigmoid transformation exaggerating small differences at extreme M-values
- (Figure
- \begin_inset CommandInset ref
- LatexCommand ref
- reference "fig:Sigmoid-beta-m-mapping"
- plural "false"
- caps "false"
- noprefix "false"
- \end_inset
- ).
- However, the uptick in the center is interesting: it indicates that sites
- that are not constitutively methylated or unmethylated have a higher
- variance.
- This could be a genuine biological effect, or it could be spurious noise
- that is only observable at sites with varying methylation.
- \end_layout
- \begin_layout Standard
- \begin_inset Float figure
- wide false
- sideways false
- status open
- \begin_layout Plain Layout
- \begin_inset Graphics
- filename graphics/methylvoom/unadj.dupcor.sva.aw/meanvar-trends-PAGE1-CROP-RASTER.png
- lyxscale 15
- width 100col%
- groupId raster-600ppi
- \end_inset
- \end_layout
- \begin_layout Plain Layout
- \begin_inset Caption Standard
- \begin_layout Plain Layout
- \series bold
- \begin_inset CommandInset label
- LatexCommand label
- name "fig:meanvar-sva-aw"
- \end_inset
- Mean-variance trend for analysis B.
-
- \series default
- Interpretation is as in Figure
- \begin_inset CommandInset ref
- LatexCommand ref
- reference "fig:meanvar-basic"
- plural "false"
- caps "false"
- noprefix "false"
- \end_inset
- .
- \end_layout
- \end_inset
- \end_layout
- \end_inset
- \end_layout
- \begin_layout Standard
- In Figure
- \begin_inset CommandInset ref
- LatexCommand ref
- reference "fig:meanvar-sva-aw"
- plural "false"
- caps "false"
- noprefix "false"
- \end_inset
- , we see the mean-variance trend for the same methylation array data, this
- time with surrogate variables and sample quality weights estimated from
- the data and included in the model.
- As expected, the overall average variance is smaller, since the surrogate
- variables account for some of the variance.
- In addition, the uptick in variance in the middle of the M-value range
- has disappeared, turning the W shape into a wide U shape.
- This indicates that the excess variance in the probes with intermediate
- M-values was explained by systematic variations not correlated with known
- covariates, and these variations were modeled by the surrogate variables.
- The result is a nearly flat variance trend for the entire intermediate
- M-value range from about -3 to +3.
- In contrast, the excess variance at the extremes was not
- \begin_inset Quotes eld
- \end_inset
- absorbed
- \begin_inset Quotes erd
- \end_inset
- by the surrogate variables and remains in the plot, indicating that this
- variation has no systematic component: probes with extreme M-values are
- uniformly more variable across all samples, as expected.
-
- \end_layout
- \begin_layout Standard
- \begin_inset Float figure
- wide false
- sideways false
- status collapsed
- \begin_layout Plain Layout
- \begin_inset Graphics
- filename graphics/methylvoom/unadj.dupcor.sva.voomaw/meanvar-trends-PAGE2-CROP-RASTER.png
- lyxscale 15
- width 100col%
- groupId raster-600ppi
- \end_inset
- \end_layout
- \begin_layout Plain Layout
- \begin_inset Caption Standard
- \begin_layout Plain Layout
- \series bold
- \begin_inset CommandInset label
- LatexCommand label
- name "fig:meanvar-sva-voomaw"
- \end_inset
- Mean-variance trend after voom modeling in analysis C.
-
- \series default
- Interpretation is as in Figure
- \begin_inset CommandInset ref
- LatexCommand ref
- reference "fig:meanvar-basic"
- plural "false"
- caps "false"
- noprefix "false"
- \end_inset
- .
- \end_layout
- \end_inset
- \end_layout
- \end_inset
- \end_layout
- \begin_layout Standard
- Figure
- \begin_inset CommandInset ref
- LatexCommand ref
- reference "fig:meanvar-sva-voomaw"
- plural "false"
- caps "false"
- noprefix "false"
- \end_inset
- shows the mean-variance trend after fitting the model with the observation
- weights assigned by voom based on the mean-variance trend shown in Figure
-
- \begin_inset CommandInset ref
- LatexCommand ref
- reference "fig:meanvar-sva-aw"
- plural "false"
- caps "false"
- noprefix "false"
- \end_inset
- .
- As expected, the weights exactly counteract the trend in the data, resulting
- in a nearly flat trend centered vertically at 1 (i.e.
- 0 on the log scale).
- This shows that the observations with extreme M-values have been appropriately
- down-weighted to account for the fact that the noise in those observations
- has been amplified by the non-linear M-value transformation.
- In turn, this gives relatively more weight to observations in the middle
- region, which are more likely to correspond to probes measuring interesting
- biology (not constitutively methylated or unmethylated).
- \end_layout
- \begin_layout Standard
- \begin_inset Float table
- wide false
- sideways false
- status collapsed
- \begin_layout Plain Layout
- \align center
- \begin_inset Tabular
- <lyxtabular version="3" rows="5" columns="3">
- <features tabularvalignment="middle">
- <column alignment="center" valignment="top">
- <column alignment="center" valignment="top">
- <column alignment="center" valignment="top">
- <row>
- <cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" usebox="none">
- \begin_inset Text
- \begin_layout Plain Layout
- Covariate
- \end_layout
- \end_inset
- </cell>
- <cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" usebox="none">
- \begin_inset Text
- \begin_layout Plain Layout
- Test used
- \end_layout
- \end_inset
- </cell>
- <cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" rightline="true" usebox="none">
- \begin_inset Text
- \begin_layout Plain Layout
- p-value
- \end_layout
- \end_inset
- </cell>
- </row>
- <row>
- <cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
- \begin_inset Text
- \begin_layout Plain Layout
- Transplant Status
- \end_layout
- \end_inset
- </cell>
- <cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
- \begin_inset Text
- \begin_layout Plain Layout
- F-test
- \end_layout
- \end_inset
- </cell>
- <cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
- \begin_inset Text
- \begin_layout Plain Layout
- 0.404
- \end_layout
- \end_inset
- </cell>
- </row>
- <row>
- <cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
- \begin_inset Text
- \begin_layout Plain Layout
- Diabetes Diagnosis
- \end_layout
- \end_inset
- </cell>
- <cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
- \begin_inset Text
- \begin_layout Plain Layout
- t-test
- \end_layout
- \end_inset
- </cell>
- <cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
- \begin_inset Text
- \begin_layout Plain Layout
- 0.00106
- \end_layout
- \end_inset
- </cell>
- </row>
- <row>
- <cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
- \begin_inset Text
- \begin_layout Plain Layout
- Sex
- \end_layout
- \end_inset
- </cell>
- <cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
- \begin_inset Text
- \begin_layout Plain Layout
- t-test
- \end_layout
- \end_inset
- </cell>
- <cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
- \begin_inset Text
- \begin_layout Plain Layout
- 0.148
- \end_layout
- \end_inset
- </cell>
- </row>
- <row>
- <cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" usebox="none">
- \begin_inset Text
- \begin_layout Plain Layout
- Age
- \end_layout
- \end_inset
- </cell>
- <cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" usebox="none">
- \begin_inset Text
- \begin_layout Plain Layout
- linear regression
- \end_layout
- \end_inset
- </cell>
- <cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" rightline="true" usebox="none">
- \begin_inset Text
- \begin_layout Plain Layout
- 0.212
- \end_layout
- \end_inset
- </cell>
- </row>
- </lyxtabular>
- \end_inset
- \end_layout
- \begin_layout Plain Layout
- \begin_inset Caption Standard
- \begin_layout Plain Layout
- \series bold
- \begin_inset CommandInset label
- LatexCommand label
- name "tab:weight-covariate-tests"
- \end_inset
- Association of sample weights with clinical covariates in methylation array
- data.
-
- \series default
- Computed sample quality log weights were tested for significant association
- with each of the variables in the model (1st column).
- An appropriate test was selected for each variable (2nd column).
- P-values for significant association are shown in the 3rd column.
- \end_layout
- \end_inset
- \end_layout
- \end_inset
- \end_layout
- \begin_layout Standard
- \begin_inset Flex TODO Note (inline)
- status open
- \begin_layout Plain Layout
- Redo the sample weight boxplot with notches and without fill colors (and
- update the legend)
- \end_layout
- \end_inset
- \end_layout
- \begin_layout Standard
- \begin_inset Float figure
- wide false
- sideways false
- status collapsed
- \begin_layout Plain Layout
- \begin_inset Graphics
- filename graphics/methylvoom/unadj.dupcor.sva.voomaw/sample-weights-PAGE3-CROP.pdf
- \end_inset
- \end_layout
- \begin_layout Plain Layout
- \begin_inset Caption Standard
- \begin_layout Plain Layout
- \begin_inset CommandInset label
- LatexCommand label
- name "fig:diabetes-sample-weights"
- \end_inset
- \series bold
- Boxplot of sample quality weights grouped by diabetes diagnosis.
-
- \series default
- Samples were grouped based on diabetes diagnosis, and the distribution of
- sample quality weights for each diagnosis was plotted.
- \end_layout
- \end_inset
- \end_layout
- \begin_layout Plain Layout
- \end_layout
- \end_inset
- \end_layout
- \begin_layout Standard
- To determine whether any of the known experimental factors had an impact
- on data quality, the sample quality weights estimated from the data were
- tested for association with each of the experimental factors (Table
- \begin_inset CommandInset ref
- LatexCommand ref
- reference "tab:weight-covariate-tests"
- plural "false"
- caps "false"
- noprefix "false"
- \end_inset
- ).
- Diabetes diagnosis was found to have a potentially significant association
- with the sample weights, with a t-test p-value of
- \begin_inset Formula $1.06\times10^{-3}$
- \end_inset
- .
- Figure
- \begin_inset CommandInset ref
- LatexCommand ref
- reference "fig:diabetes-sample-weights"
- plural "false"
- caps "false"
- noprefix "false"
- \end_inset
- shows the distribution of sample weights grouped by diabetes diagnosis.
- The samples from patients with Type 2 diabetes were assigned significantly
- lower weights than those from patients with Type 1 diabetes.
- This indicates that the Type 2 diabetes samples had an overall higher variance
- on average across all probes.
-
- \end_layout
- \begin_layout Standard
- \begin_inset Float table
- wide false
- sideways false
- status collapsed
- \begin_layout Plain Layout
- \align center
- \begin_inset Flex TODO Note (inline)
- status open
- \begin_layout Plain Layout
- Consider transposing this table and the next one
- \end_layout
- \end_inset
- \end_layout
- \begin_layout Plain Layout
- \align center
- \begin_inset Tabular
- <lyxtabular version="3" rows="5" columns="4">
- <features tabularvalignment="middle">
- <column alignment="center" valignment="top">
- <column alignment="center" valignment="top">
- <column alignment="center" valignment="top">
- <column alignment="center" valignment="top">
- <row>
- <cell alignment="center" valignment="top" usebox="none">
- \begin_inset Text
- \begin_layout Plain Layout
- \end_layout
- \end_inset
- </cell>
- <cell multicolumn="1" alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
- \begin_inset Text
- \begin_layout Plain Layout
- Analysis
- \end_layout
- \end_inset
- </cell>
- <cell multicolumn="2" alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
- \begin_inset Text
- \begin_layout Plain Layout
- \end_layout
- \end_inset
- </cell>
- <cell multicolumn="2" alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
- \begin_inset Text
- \begin_layout Plain Layout
- \end_layout
- \end_inset
- </cell>
- </row>
- <row>
- <cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" usebox="none">
- \begin_inset Text
- \begin_layout Plain Layout
- Contrast
- \end_layout
- \end_inset
- </cell>
- <cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" usebox="none">
- \begin_inset Text
- \begin_layout Plain Layout
- A
- \end_layout
- \end_inset
- </cell>
- <cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" usebox="none">
- \begin_inset Text
- \begin_layout Plain Layout
- B
- \end_layout
- \end_inset
- </cell>
- <cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" rightline="true" usebox="none">
- \begin_inset Text
- \begin_layout Plain Layout
- C
- \end_layout
- \end_inset
- </cell>
- </row>
- <row>
- <cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
- \begin_inset Text
- \begin_layout Plain Layout
- TX vs AR
- \end_layout
- \end_inset
- </cell>
- <cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
- \begin_inset Text
- \begin_layout Plain Layout
- 0
- \end_layout
- \end_inset
- </cell>
- <cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
- \begin_inset Text
- \begin_layout Plain Layout
- 25
- \end_layout
- \end_inset
- </cell>
- <cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
- \begin_inset Text
- \begin_layout Plain Layout
- 22
- \end_layout
- \end_inset
- </cell>
- </row>
- <row>
- <cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
- \begin_inset Text
- \begin_layout Plain Layout
- TX vs ADNR
- \end_layout
- \end_inset
- </cell>
- <cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
- \begin_inset Text
- \begin_layout Plain Layout
- 7
- \end_layout
- \end_inset
- </cell>
- <cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
- \begin_inset Text
- \begin_layout Plain Layout
- 338
- \end_layout
- \end_inset
- </cell>
- <cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
- \begin_inset Text
- \begin_layout Plain Layout
- 369
- \end_layout
- \end_inset
- </cell>
- </row>
- <row>
- <cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" usebox="none">
- \begin_inset Text
- \begin_layout Plain Layout
- TX vs CAN
- \end_layout
- \end_inset
- </cell>
- <cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" usebox="none">
- \begin_inset Text
- \begin_layout Plain Layout
- 0
- \end_layout
- \end_inset
- </cell>
- <cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" usebox="none">
- \begin_inset Text
- \begin_layout Plain Layout
- 231
- \end_layout
- \end_inset
- </cell>
- <cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" rightline="true" usebox="none">
- \begin_inset Text
- \begin_layout Plain Layout
- 278
- \end_layout
- \end_inset
- </cell>
- </row>
- </lyxtabular>
- \end_inset
- \end_layout
- \begin_layout Plain Layout
- \begin_inset Caption Standard
- \begin_layout Plain Layout
- \begin_inset CommandInset label
- LatexCommand label
- name "tab:methyl-num-signif"
- \end_inset
- \series bold
- Number of probes significant at 10% FDR for each contrast in each analysis.
-
- \series default
- For each of the analyses in Table
- \begin_inset CommandInset ref
- LatexCommand ref
- reference "tab:Summary-of-meth-analysis"
- plural "false"
- caps "false"
- noprefix "false"
- \end_inset
- , the table shows the number of probes called significantly differentially
- methylated at a threshold of 10% FDR for each comparison between TX and
- the other 3 transplant statuses.
- \end_layout
- \end_inset
- \end_layout
- \end_inset
- \end_layout
- \begin_layout Standard
- \begin_inset Float table
- wide false
- sideways false
- status collapsed
- \begin_layout Plain Layout
- \align center
- \begin_inset Tabular
- <lyxtabular version="3" rows="5" columns="4">
- <features tabularvalignment="middle">
- <column alignment="center" valignment="top">
- <column alignment="center" valignment="top">
- <column alignment="center" valignment="top">
- <column alignment="center" valignment="top">
- <row>
- <cell alignment="center" valignment="top" usebox="none">
- \begin_inset Text
- \begin_layout Plain Layout
- \end_layout
- \end_inset
- </cell>
- <cell multicolumn="1" alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
- \begin_inset Text
- \begin_layout Plain Layout
- Analysis
- \end_layout
- \end_inset
- </cell>
- <cell multicolumn="2" alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
- \begin_inset Text
- \begin_layout Plain Layout
- \end_layout
- \end_inset
- </cell>
- <cell multicolumn="2" alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
- \begin_inset Text
- \begin_layout Plain Layout
- \end_layout
- \end_inset
- </cell>
- </row>
- <row>
- <cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" usebox="none">
- \begin_inset Text
- \begin_layout Plain Layout
- Contrast
- \end_layout
- \end_inset
- </cell>
- <cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" usebox="none">
- \begin_inset Text
- \begin_layout Plain Layout
- A
- \end_layout
- \end_inset
- </cell>
- <cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" usebox="none">
- \begin_inset Text
- \begin_layout Plain Layout
- B
- \end_layout
- \end_inset
- </cell>
- <cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" rightline="true" usebox="none">
- \begin_inset Text
- \begin_layout Plain Layout
- C
- \end_layout
- \end_inset
- </cell>
- </row>
- <row>
- <cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
- \begin_inset Text
- \begin_layout Plain Layout
- TX vs AR
- \end_layout
- \end_inset
- </cell>
- <cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
- \begin_inset Text
- \begin_layout Plain Layout
- 0
- \end_layout
- \end_inset
- </cell>
- <cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
- \begin_inset Text
- \begin_layout Plain Layout
- 10,063
- \end_layout
- \end_inset
- </cell>
- <cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
- \begin_inset Text
- \begin_layout Plain Layout
- 11,225
- \end_layout
- \end_inset
- </cell>
- </row>
- <row>
- <cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
- \begin_inset Text
- \begin_layout Plain Layout
- TX vs ADNR
- \end_layout
- \end_inset
- </cell>
- <cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
- \begin_inset Text
- \begin_layout Plain Layout
- 27
- \end_layout
- \end_inset
- </cell>
- <cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
- \begin_inset Text
- \begin_layout Plain Layout
- 12,674
- \end_layout
- \end_inset
- </cell>
- <cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
- \begin_inset Text
- \begin_layout Plain Layout
- 13,086
- \end_layout
- \end_inset
- </cell>
- </row>
- <row>
- <cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" usebox="none">
- \begin_inset Text
- \begin_layout Plain Layout
- TX vs CAN
- \end_layout
- \end_inset
- </cell>
- <cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" usebox="none">
- \begin_inset Text
- \begin_layout Plain Layout
- 966
- \end_layout
- \end_inset
- </cell>
- <cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" usebox="none">
- \begin_inset Text
- \begin_layout Plain Layout
- 20,039
- \end_layout
- \end_inset
- </cell>
- <cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" rightline="true" usebox="none">
- \begin_inset Text
- \begin_layout Plain Layout
- 20,955
- \end_layout
- \end_inset
- </cell>
- </row>
- </lyxtabular>
- \end_inset
- \end_layout
- \begin_layout Plain Layout
- \begin_inset Caption Standard
- \begin_layout Plain Layout
- \begin_inset CommandInset label
- LatexCommand label
- name "tab:methyl-est-nonnull"
- \end_inset
- \series bold
- Estimated number of non-null tests for each contrast in each analysis.
-
- \series default
- For each of the analyses in Table
- \begin_inset CommandInset ref
- LatexCommand ref
- reference "tab:Summary-of-meth-analysis"
- plural "false"
- caps "false"
- noprefix "false"
- \end_inset
- , the table shows the number of probes estimated to be differentially methylated
- between TX and the other 3 transplant statuses.
- \end_layout
- \end_inset
- \end_layout
- \end_inset
- \end_layout
- \begin_layout Standard
- \begin_inset Float figure
- wide false
- sideways false
- status collapsed
- \begin_layout Plain Layout
- \begin_inset Flex TODO Note (inline)
- status open
- \begin_layout Plain Layout
- Re-generate p-value histograms for all relevant contrasts in a single page,
- then write an appropriate legend.
- \end_layout
- \end_inset
- \end_layout
- \begin_layout Plain Layout
- \align center
- \series bold
- [Figure goes here]
- \end_layout
- \begin_layout Plain Layout
- \begin_inset Caption Standard
- \begin_layout Plain Layout
- \series bold
- \begin_inset CommandInset label
- LatexCommand label
- name "fig:meth-p-value-histograms"
- \end_inset
- Probe p-value histograms for each contrast in each analysis.
- \end_layout
- \end_inset
- \end_layout
- \begin_layout Plain Layout
- \end_layout
- \end_inset
- \end_layout
- \begin_layout Standard
- Table
- \begin_inset CommandInset ref
- LatexCommand ref
- reference "tab:methyl-num-signif"
- plural "false"
- caps "false"
- noprefix "false"
- \end_inset
- shows the number of significantly differentially methylated probes reported
- by each analysis for each comparison of interest at an FDR of 10%.
- As expected, the more elaborate analyses, B and C, report more significant
- probes than the more basic analysis A, consistent with the conclusions
- above that the data contain hidden systematic variations that must be modeled.
- Table
- \begin_inset CommandInset ref
- LatexCommand ref
- reference "tab:methyl-est-nonnull"
- plural "false"
- caps "false"
- noprefix "false"
- \end_inset
- shows the estimated number of differentially methylated probes for each test
- from each analysis.
- This was computed by estimating the proportion of null hypotheses that
- were true using the method of
- \begin_inset CommandInset citation
- LatexCommand cite
- key "Phipson2013"
- literal "false"
- \end_inset
- and subtracting the corresponding number of probes from the total number of probes, yielding
- an estimate of the number of null hypotheses that are false based on the
- distribution of p-values across the entire dataset.
- Note that this does not identify which null hypotheses should be rejected
- (i.e.
- which probes are significant); it only estimates the true number of such
- probes.
- Once again, analyses B and C result in much larger estimates for the number
- of differentially methylated probes.
- In this case, analysis C, the only analysis that includes voom, estimates
- the largest number of differentially methylated probes for all 3 contrasts.
- If the assumptions of all the methods employed hold, then this represents
- a gain in statistical power over the simpler analysis A.
- Figure
- \begin_inset CommandInset ref
- LatexCommand ref
- reference "fig:meth-p-value-histograms"
- plural "false"
- caps "false"
- noprefix "false"
- \end_inset
- shows the p-value distributions for each test, from which the numbers in
- Table
- \begin_inset CommandInset ref
- LatexCommand ref
- reference "tab:methyl-est-nonnull"
- plural "false"
- caps "false"
- noprefix "false"
- \end_inset
- were generated.
- The distributions for analysis A all have a dip in density near zero, which
- is a strong sign of a poor model fit.
- The histograms for analyses B and C are more well-behaved, with a uniform
- component stretching all the way from 0 to 1 representing the probes for
- which the null hypothesis is true (no differential methylation), and a
- zero-biased component representing the probes for which the null hypothesis
- is false (differentially methylated).
- These histograms do not indicate any major issues with the model fit.
- \end_layout
- \begin_layout Standard
- \begin_inset Flex TODO Note (inline)
- status open
- \begin_layout Plain Layout
- Maybe include the PCA plots before/after SVA effect subtraction?
- \end_layout
- \end_inset
- \end_layout
- \begin_layout Standard
- \begin_inset ERT
- status collapsed
- \begin_layout Plain Layout
- \backslash
- FloatBarrier
- \end_layout
- \end_inset
- \end_layout
- \begin_layout Section
- Discussion
- \end_layout
- \begin_layout Subsection
- fRMA achieves clinically applicable normalization without sacrificing classifica
- tion performance
- \end_layout
- \begin_layout Standard
- As shown in Figure
- \begin_inset CommandInset ref
- LatexCommand ref
- reference "fig:Classifier-probabilities-RMA"
- plural "false"
- caps "false"
- noprefix "false"
- \end_inset
- , improper normalization, particularly separate normalization of training
- and test samples, leads to unwanted biases in classification.
- In a controlled experimental context, it is always possible to correct
- this issue by normalizing all experimental samples together.
- However, because it is not feasible to normalize all samples together in
- a clinical context, a single-channel normalization is required.
-
- \end_layout
- \begin_layout Standard
- The major concern in using a single-channel normalization is that non-single-cha
- nnel methods can share information between arrays to improve the normalization,
- and single-channel methods risk sacrificing the gains in normalization
- accuracy that come from this information sharing.
- In the case of RMA, this information sharing is accomplished through quantile
- normalization and median polish steps.
- The need for information sharing in quantile normalization can easily be
- removed by learning a fixed set of quantiles from external data and normalizing
- each array to these fixed quantiles, instead of the quantiles of the data
- itself.
- As long as the fixed quantiles are reasonable, the result will be similar
- to standard RMA.
- However, there is no analogous way to eliminate cross-array information
- sharing in the median polish step, so fRMA replaces this with a weighted
- average of probes on each array, with the weights learned from external
- data.
- This step of fRMA has the greatest potential to diverge from RMA in undesirable
- ways.
- \end_layout
- \begin_layout Standard
- However, when run on real data, fRMA performed at least as well as RMA in
- both the internal validation and external validation tests.
- This shows that fRMA can be used to normalize individual clinical samples
- in a class prediction context without sacrificing the classifier performance
- that would be obtained by using the more well-established RMA for normalization.
- The other single-channel normalization method considered, SCAN, showed
- some loss of AUC in the external validation test.
- Based on these results, fRMA is the preferred normalization for clinical
- samples in a class prediction context.
- \end_layout
- \begin_layout Subsection
- Robust fRMA vectors can be generated for new array platforms
- \end_layout
- \begin_layout Standard
- \begin_inset Flex TODO Note (inline)
- status open
- \begin_layout Plain Layout
- Look up the exact numbers, do a find & replace for
- \begin_inset Quotes eld
- \end_inset
- 850
- \begin_inset Quotes erd
- \end_inset
- \end_layout
- \end_inset
- \end_layout
- \begin_layout Standard
- The published fRMA normalization vectors for the hgu133plus2 platform were
- generated from a set of about 850 samples chosen from a wide range of tissues,
- which the authors determined was sufficient to generate a robust set of
- normalization vectors that could be applied across all tissues
- \begin_inset CommandInset citation
- LatexCommand cite
- key "McCall2010"
- literal "false"
- \end_inset
- .
- Since we only had hthgu133pluspm data for 2 tissues of interest, our needs were
- more modest.
- Even using only 130 samples in 26 batches of 5 samples each for kidney
- biopsies, we were able to train a robust set of fRMA normalization vectors
- that were not meaningfully affected by the random selection of 5 samples
- from each batch.
- As expected, the training process was just as robust for the blood samples
- with 230 samples in 46 batches of 5 samples each.
- Because these vectors were each generated using training samples from a
- single tissue, they are not suitable for general use, unlike the vectors
- provided with fRMA itself.
- They are purpose-built for normalizing a specific type of sample on a specific
- platform.
- This is a mostly acceptable limitation in the context of developing a machine
- learning classifier for diagnosing a disease based on samples of a specific
- tissue.
- \end_layout
- \begin_layout Standard
- \begin_inset Flex TODO Note (inline)
- status open
- \begin_layout Plain Layout
- How to bring up that these custom vectors were used in another project by
- someone else that was never published?
- \end_layout
- \end_inset
- \end_layout
- \begin_layout Subsection
- Methylation array data can be successfully analyzed using existing techniques,
- but machine learning poses additional challenges
- \end_layout
- \begin_layout Standard
- Analysis strategies B and C both yield a reasonable analysis, with
- a mean-variance trend that matches the expected behavior for the non-linear
- M-value transformation (Figure
- \begin_inset CommandInset ref
- LatexCommand ref
- reference "fig:meanvar-sva-aw"
- plural "false"
- caps "false"
- noprefix "false"
- \end_inset
- ) and well-behaved p-value distributions (Figure
- \begin_inset CommandInset ref
- LatexCommand ref
- reference "fig:meth-p-value-histograms"
- plural "false"
- caps "false"
- noprefix "false"
- \end_inset
- ).
- These two analyses also yield similar numbers of significant probes (Table
-
- \begin_inset CommandInset ref
- LatexCommand ref
- reference "tab:methyl-num-signif"
- plural "false"
- caps "false"
- noprefix "false"
- \end_inset
- ) and similar estimates of the number of differentially methylated probes
- (Table
- \begin_inset CommandInset ref
- LatexCommand ref
- reference "tab:methyl-est-nonnull"
- plural "false"
- caps "false"
- noprefix "false"
- \end_inset
- ).
- The main difference between these two analyses is the method used to account
- for the mean-variance trend.
- In analysis B, the trend is estimated and applied at the probe level: each
- probe's estimated variance is squeezed toward the trend using an empirical
- Bayes procedure (Figure
- \begin_inset CommandInset ref
- LatexCommand ref
- reference "fig:meanvar-sva-aw"
- plural "false"
- caps "false"
- noprefix "false"
- \end_inset
- ).
- In analysis C, the trend is still estimated at the probe level, but instead
- of estimating a single variance value shared across all observations for
- a given probe, the voom method computes an initial estimate of the variance
- for each observation individually based on where its model-fitted M-value
- falls on the trend line and then assigns inverse-variance weights to model
- the difference in variance between observations.
- An overall variance is still estimated for each probe using the same empirical
- Bayes method, but now the residual trend is flat (Figure
- \begin_inset CommandInset ref
- LatexCommand ref
- reference "fig:meanvar-sva-voomaw"
- plural "false"
- caps "false"
- noprefix "false"
- \end_inset
- ), and the mean-variance trend is modeled by scaling the probe's estimated
- variance for each observation using the weights computed by voom.
- The difference between these two methods is analogous to the difference
- between a t-test with equal variance and a t-test with unequal variance,
- except that the unequal group variances used in the latter test are estimated
- based on the mean-variance trend from all the probes rather than the data
- for the specific probe being tested, thus stabilizing the group variance
- estimates by sharing information between probes.
- In practice, allowing voom to model the variance using observation weights
- in this manner allows the linear model fit to concentrate statistical power
- where it will do the most good.
- For example, if a particular probe's M-values are always at the extreme
- of the M-value range (e.g.
- less than -4) for ADNR samples, but the M-values for that probe in TX and
- CAN samples are within the flat region of the mean-variance trend (between
- -3 and +3), voom is able to down-weight the contribution of the high-variance
- M-values from the ADNR samples in order to gain more statistical power
- while testing for differential methylation between TX and CAN.
- In contrast, modeling the mean-variance trend only at the probe level would
- combine the high-variance ADNR samples and lower-variance samples from
- other conditions and estimate an intermediate variance for this probe.
- In practice, analysis B shows that this approach is adequate, but the voom
- approach in analysis C is at least as good on all model fit criteria and
- yields a larger estimate for the number of differentially methylated probes.
- \end_layout
- \begin_layout Standard
- The significant association of diabetes diagnosis with sample quality is
- interesting.
- The samples with Type 2 diabetes tended to have more variation, averaged
- across all probes, than those with Type 1 diabetes.
- This is consistent with the consensus that Type 2 diabetes and the associated
- metabolic syndrome represent a broad dysregulation of the body's endocrine
- signalling related to metabolism [citation needed].
- This dysregulation could easily manifest as a greater degree of variation
- in the DNA methylation patterns of affected tissues.
- In contrast, Type 1 diabetes has a more specific cause and effect, so a
- less variable methylation signature is expected.
- \end_layout
- \begin_layout Standard
- This preliminary analysis suggests that some degree of differential methylation
- exists between TX and each of the three types of transplant dysfunction
- studied.
- Hence, it may be feasible to train a classifier to diagnose transplant
- dysfunction from DNA methylation array data.
- However, the major importance of both SVA and sample quality weighting
- for proper modeling of this data poses significant challenges for any attempt
- at machine learning on data of similar quality.
- While these are easily used in a modeling context with full sample information,
- neither of these methods is directly applicable in a machine learning context,
- where the diagnosis is not known ahead of time.
- If a machine learning approach for methylation-based diagnosis is to be
- pursued, it will either require machine-learning-friendly methods to address
- the same systematic trends in the data that SVA and sample quality weighting
- address, or it will require higher quality data with substantially less
- systematic perturbation of the data.
- \end_layout
- \begin_layout Chapter
- Globin-blocking for more effective blood RNA-seq analysis in primate animal
- model
- \end_layout
- \begin_layout Standard
- \begin_inset Flex TODO Note (inline)
- status open
- \begin_layout Plain Layout
- Choose between above and the paper title: Optimizing yield of deep RNA sequencin
- g for gene expression profiling by globin reduction of peripheral blood
- samples from cynomolgus monkeys (Macaca fascicularis).
- \end_layout
- \end_inset
- \end_layout
- \begin_layout Standard
- \begin_inset Flex TODO Note (inline)
- status open
- \begin_layout Plain Layout
- Chapter author list: https://tex.stackexchange.com/questions/156862/displaying-aut
- hor-for-each-chapter-in-book Every chapter gets an author list, which may
- or may not be part of a citation to a published/preprinted paper.
- \end_layout
- \end_inset
- \end_layout
- \begin_layout Standard
- \begin_inset Flex TODO Note (inline)
- status open
- \begin_layout Plain Layout
- Preprint then cite the paper
- \end_layout
- \end_inset
- \end_layout
- \begin_layout Section*
- Abstract
- \end_layout
- \begin_layout Paragraph
- Background
- \end_layout
- \begin_layout Standard
- Primate blood contains high concentrations of globin messenger RNA.
- Globin reduction is a standard technique used to improve the expression
- results obtained by DNA microarrays on RNA from blood samples.
- However, with whole transcriptome RNA-sequencing (RNA-seq) quickly replacing
- microarrays for many applications, the impact of globin reduction for RNA-seq
- has not been previously studied.
- Moreover, no off-the-shelf kits are available for globin reduction in nonhuman
- primates.
-
- \end_layout
- \begin_layout Paragraph
- Results
- \end_layout
- \begin_layout Standard
- Here we report a protocol for RNA-seq in primate blood samples that uses
- complementary oligonucleotides to block reverse transcription of the alpha
- and beta globin genes.
- In test samples from cynomolgus monkeys (Macaca fascicularis), this globin
- blocking protocol approximately doubles the yield of informative (non-globin)
- reads by greatly reducing the fraction of globin reads, while also improving
- the consistency in sequencing depth between samples.
- The increased yield enables detection of about 2000 more genes, significantly
- increases the correlation in measured gene expression levels between samples,
- and increases the sensitivity of differential gene expression tests.
- \end_layout
- \begin_layout Paragraph
- Conclusions
- \end_layout
- \begin_layout Standard
- These results show that globin blocking significantly improves the cost-effectiv
- eness of mRNA sequencing in primate blood samples by doubling the yield
- of useful reads, allowing detection of more genes, and improving the precision
- of gene expression measurements.
- Based on these results, a globin reducing or blocking protocol is recommended
- for all RNA-seq studies of primate blood samples.
- \end_layout
- \begin_layout Section
- Approach
- \end_layout
- \begin_layout Standard
- \begin_inset Note Note
- status open
- \begin_layout Plain Layout
- Consider putting some of this in the Intro chapter
- \end_layout
- \begin_layout Itemize
- Cynomolgus monkeys as a model organism
- \end_layout
- \begin_deeper
- \begin_layout Itemize
- Highly related to humans
- \end_layout
- \begin_layout Itemize
- Small size and short life cycle - good research animal
- \end_layout
- \begin_layout Itemize
- Genomics resources still in development
- \end_layout
- \end_deeper
- \begin_layout Itemize
- Inadequacy of existing blood RNA-seq protocols
- \end_layout
- \begin_deeper
- \begin_layout Itemize
- Existing protocols use a separate globin pulldown step, slowing down processing
- \end_layout
- \end_deeper
- \end_inset
- \end_layout
- \begin_layout Standard
- Increasingly, researchers are turning to high-throughput mRNA sequencing
- technologies (RNA-seq) in preference to expression microarrays for analysis
- of gene expression
- \begin_inset CommandInset citation
- LatexCommand cite
- key "Mutz2012"
- literal "false"
- \end_inset
- .
- The advantages are even greater for study of model organisms with no well-estab
- lished array platforms available, such as the cynomolgus monkey (Macaca
- fascicularis).
- High fractions of globin mRNA are naturally present in mammalian peripheral
- blood samples (up to 70% of total mRNA) and these are known to interfere
- with the results of array-based expression profiling
- \begin_inset CommandInset citation
- LatexCommand cite
- key "Winn2010"
- literal "false"
- \end_inset
- .
- The importance of globin reduction for RNA-seq of blood has only been evaluated
- for a deepSAGE protocol on human samples
- \begin_inset CommandInset citation
- LatexCommand cite
- key "Mastrokolias2012"
- literal "false"
- \end_inset
- .
- In the present report, we evaluated globin reduction using custom blocking
- oligonucleotides for deep RNA-seq of peripheral blood samples from a nonhuman
- primate, cynomolgus monkey, using the Illumina technology platform.
- We demonstrate that globin reduction significantly improves the cost-effectiven
- ess of RNA-seq in blood samples.
- Thus, our protocol offers a significant advantage to any investigator planning
- to use RNA-seq for gene expression profiling of nonhuman primate blood
- samples.
- Our method can be generally applied to any species by designing complementary
- oligonucleotide blocking probes to the globin gene sequences of that species.
- Indeed, any highly expressed but biologically uninformative transcripts
- can also be blocked to further increase sequencing efficiency and value
-
- \begin_inset CommandInset citation
- LatexCommand cite
- key "Arnaud2016"
- literal "false"
- \end_inset
- .
- \end_layout
- \begin_layout Section
- Methods
- \end_layout
- \begin_layout Subsection
- Sample collection
- \end_layout
- \begin_layout Standard
- All research reported here was done under IACUC-approved protocols at the
- University of Miami and complied with all applicable federal and state
- regulations and ethical principles for nonhuman primate research.
- Blood draws occurred between 16 April 2012 and 18 June 2015.
- The experimental system involved intrahepatic pancreatic islet transplantation
- into Cynomolgus monkeys with induced diabetes mellitus with or without
- concomitant infusion of mesenchymal stem cells.
- Blood was collected at serial time points before and after transplantation
- into PAXgene Blood RNA tubes (PreAnalytiX/Qiagen, Valencia, CA) at the
- precise volume:volume ratio of 2.5 ml whole blood into 6.9 ml of PAXgene
- additive.
- \end_layout
- \begin_layout Subsection
- Globin Blocking
- \end_layout
- \begin_layout Standard
- Four oligonucleotides were designed to hybridize to the 3’ end of the transcript
- s for Cynomolgus HBA1, HBA2 and HBB, with two hybridization sites for HBB
- and two sites for HBA (the chosen sites were identical in both HBA genes).
- All oligos were purchased from Sigma and were entirely composed of 2’O-Me
- bases with a C3 spacer positioned at the 3’ ends to prevent any polymerase
- mediated primer extension.
- \end_layout
- \begin_layout Quote
- HBA1/2 site 1: GCCCACUCAGACUUUAUUCAAAG-C3spacer
- \end_layout
- \begin_layout Quote
- HBA1/2 site 2: GGUGCAAGGAGGGGAGGAG-C3spacer
- \end_layout
- \begin_layout Quote
- HBB site 1: AAUGAAAAUAAAUGUUUUUUAUUAG-C3spacer
- \end_layout
- \begin_layout Quote
- HBB site 2: CUCAAGGCCCUUCAUAAUAUCCC-C3spacer
- \end_layout
- \begin_layout Subsection
- RNA-seq Library Preparation
- \end_layout
- \begin_layout Standard
- Sequencing libraries were prepared with 200ng total RNA from each sample.
- Polyadenylated mRNA was selected from 200 ng aliquots of cynomolgus blood-deri
- ved total RNA using Ambion Dynabeads Oligo(dT)25 beads (Invitrogen) following
- manufacturer’s recommended protocol.
- PolyA selected RNA was then combined with 8 pmol of HBA1/2 (site 1), 8
- pmol of HBA1/2 (site 2), 12 pmol of HBB (site 1) and 12 pmol of HBB (site
- 2) oligonucleotides.
- In addition, 20 pmol of RT primer containing a portion of the Illumina
- adapter sequence (B-oligo-dTV: GAGTTCCTTGGCACCCGAGAATTCCATTTTTTTTTTTTTTTTTTTV)
- and 4 µL of 5X First Strand buffer (250 mM Tris-HCl pH 8.3, 375 mM KCl,
- 15mM MgCl2) were added in a total volume of 15 µL.
- The RNA was fragmented by heating this cocktail for 3 minutes at 95°C and
- then placed on ice.
- This was followed by the addition of 2 µL 0.1 M DTT, 1 µL RNaseOUT, 1 µL
- 10mM dNTPs 10% biotin-16 aminoallyl-2’- dUTP and 10% biotin-16 aminoallyl-2’-
- dCTP (TriLink Biotech, San Diego, CA), 1 µL Superscript II (200U/ µL, Thermo-Fi
- sher).
- A second “unblocked” library was prepared in the same way for each sample
- but replacing the blocking oligos with an equivalent volume of water.
- The reaction was carried out at 25°C for 15 minutes and 42°C for 40 minutes,
- followed by incubation at 75°C for 10 minutes to inactivate the reverse
- transcriptase.
- \end_layout
- \begin_layout Standard
- The cDNA/RNA hybrid molecules were purified using 1.8X Ampure XP beads (Agencourt
- ) following supplier’s recommended protocol.
- The cDNA/RNA hybrid was eluted in 25 µL of 10 mM Tris-HCl pH 8.0, and then
- bound to 25 µL of M280 Magnetic Streptavidin beads washed per recommended
- protocol (Thermo-Fisher).
- After 30 minutes of binding, beads were washed one time in 100 µL 0.1N NaOH
- to denature and remove the bound RNA, followed by two 100 µL washes with
- 1X TE buffer.
- \end_layout
- \begin_layout Standard
- Subsequent attachment of the 5-prime Illumina A adapter was performed by
- on-bead random primer extension of the following sequence (A-N8 primer:
- TTCAGAGTTCTACAGTCCGACGATCNNNNNNNN).
- Briefly, beads were resuspended in a 20 µL reaction containing 5 µM A-N8
- primer, 40mM Tris-HCl pH 7.5, 20mM MgCl2, 50mM NaCl, 0.325U/µL Sequenase
- 2.0 (Affymetrix, Santa Clara, CA), 0.0025U/µL inorganic pyrophosphatase (Affymetr
- ix) and 300 µM each dNTP.
- Reaction was incubated at 22°C for 30 minutes, then beads were washed 2
- times with 1X TE buffer (200µL).
- \end_layout
- \begin_layout Standard
- The magnetic streptavidin beads were resuspended in 34 µL nuclease-free
- water and added directly to a PCR tube.
- The two Illumina protocol-specified PCR primers were added at 0.53 µM (Illumina
- TruSeq Universal Primer 1 and Illumina TruSeq barcoded PCR primer 2), along
- with 40 µL 2X KAPA HiFi Hotstart ReadyMix (KAPA, Wilmington, MA) and thermocycl
- ed as follows: starting with 98°C (2 min-hold); 15 cycles of 98°C, 20sec;
- 60°C, 30sec; 72°C, 30sec; and finished with a 72°C (2 min-hold).
- \end_layout
- \begin_layout Standard
- PCR products were purified with 1X Ampure Beads following manufacturer’s
- recommended protocol.
- Libraries were then analyzed using the Agilent TapeStation and quantitation
- of desired size range was performed by “smear analysis”.
- Samples were pooled in equimolar batches of 16 samples.
- Pooled libraries were size selected on 2% agarose gels (E-Gel EX Agarose
- Gels; Thermo-Fisher).
- Products were cut between 250 and 350 bp (corresponding to insert sizes
- of 130 to 230 bps).
- Finished library pools were then sequenced on the Illumina NextSeq500 instrumen
- t with 75 base read lengths.
-
- \end_layout
- \begin_layout Subsection
- Read alignment and counting
- \end_layout
- \begin_layout Standard
- Reads were aligned to the cynomolgus genome using STAR
- \begin_inset CommandInset citation
- LatexCommand cite
- key "Dobin2013,Wilson2013"
- literal "false"
- \end_inset
- .
- Counts of uniquely mapped reads were obtained for every gene in each sample
- with the “featureCounts” function from the Rsubread package, using each
- of the three possibilities for the “strandSpecific” option: sense, antisense,
- and unstranded
- \begin_inset CommandInset citation
- LatexCommand cite
- key "Liao2014"
- literal "false"
- \end_inset
- .
- A few artifacts in the cynomolgus genome annotation complicated read counting.
- First, no ortholog is annotated for alpha globin in the cynomolgus genome,
- presumably because the human genome has two alpha globin genes with nearly
- identical sequences, making the orthology relationship ambiguous.
- However, two loci in the cynomolgus genome are annotated as “hemoglobin subunit alpha-lik
- e” (LOC102136192 and LOC102136846).
- LOC102136192 is annotated as a pseudogene while LOC102136846 is annotated
- as protein-coding.
- Our globin reduction protocol was designed to include blocking of these
- two genes.
- Indeed, these two genes have almost the same read counts in each library
- as the properly-annotated HBB gene and much larger counts than any other
- gene in the unblocked libraries, giving confidence that reads derived from
- the real alpha globin are mapping to both genes.
- Thus, reads from both of these loci were counted as alpha globin reads
- in all further analyses.
- The second artifact is a small, uncharacterized non-coding RNA gene (LOC1021365
- 91), which overlaps the HBA-like gene (LOC102136192) on the opposite strand.
- If counting is not performed in stranded mode (or if a non-strand-specific
- sequencing protocol is used), many reads mapping to the globin gene will
- be discarded as ambiguous due to their overlap with this ncRNA gene, resulting
- in significant undercounting of globin reads.
- Therefore, stranded sense counts were used for all further analysis in
- the present study to ensure that we accurately accounted for globin transcript
- reduction.
- However, we note that stranded reads are not necessary for RNA-seq using
- our protocol in standard practice.
-
- \end_layout
- \begin_layout Subsection
- Normalization and Exploratory Data Analysis
- \end_layout
- \begin_layout Standard
- Libraries were normalized by computing scaling factors using the edgeR package’s
- Trimmed Mean of M-values method
- \begin_inset CommandInset citation
- LatexCommand cite
- key "Robinson2010"
- literal "false"
- \end_inset
- .
- Log2 counts per million values (logCPM) were calculated using the cpm function
- in edgeR for individual samples and aveLogCPM function for averages across
- groups of samples, using those functions’ default prior count values to
- avoid taking the logarithm of 0.
- Genes were considered “present” if their average normalized logCPM values
- across all libraries were at least -1.
- Normalizing for gene length was unnecessary because the sequencing protocol
- is 3’-biased and hence the expected read count for each gene is related
- to the transcript’s copy number but not its length.
- \end_layout
- \begin_layout Standard
- In order to assess the effect of blocking on reproducibility, Pearson and
- Spearman correlation coefficients were computed between the logCPM values
- for every pair of libraries within the globin-blocked (GB) and unblocked
- (non-GB) groups, and edgeR's “estimateDisp” function was used to compute
- negative binomial dispersions separately for the two groups
- \begin_inset CommandInset citation
- LatexCommand cite
- key "Chen2014"
- literal "false"
- \end_inset
- .
- \end_layout
- \begin_layout Subsection
- Differential Expression Analysis
- \end_layout
- \begin_layout Standard
- All tests for differential gene expression were performed using edgeR, by
- first fitting a negative binomial generalized linear model to the counts
- and normalization factors and then performing a quasi-likelihood F-test
- with robust estimation of outlier gene dispersions
- \begin_inset CommandInset citation
- LatexCommand cite
- key "Lund2012,Phipson2016"
- literal "false"
- \end_inset
- .
- To investigate the effects of globin blocking on each gene, an additive
- model was fit to the full data with coefficients for globin blocking and
- SampleID.
- To test the effect of globin blocking on detection of differentially expressed
- genes, the GB samples and non-GB samples were each analyzed independently
- as follows: for each animal with both a pre-transplant and a post-transplant
- time point in the data set, the pre-transplant sample and the earliest
- post-transplant sample were selected, and all others were excluded, yielding
- a pre-/post-transplant pair of samples for each animal (N=7 animals with
- paired samples).
- These samples were analyzed for pre-transplant vs.
- post-transplant differential gene expression while controlling for inter-animal
- variation using an additive model with coefficients for transplant and
- animal ID.
- In all analyses, p-values were adjusted using the Benjamini-Hochberg procedure
- for FDR control
- \begin_inset CommandInset citation
- LatexCommand cite
- key "Benjamini1995"
- literal "false"
- \end_inset
- .
- \end_layout
- \begin_layout Standard
- \begin_inset Note Note
- status open
- \begin_layout Itemize
- New blood RNA-seq protocol to block reverse transcription of globin genes
- \end_layout
- \begin_layout Itemize
- Blood RNA-seq time course after transplants with/without MSC infusion
- \end_layout
- \end_inset
- \end_layout
- \begin_layout Section
- Results
- \end_layout
- \begin_layout Subsection
- Globin blocking yields a larger and more consistent fraction of useful reads
- \end_layout
- \begin_layout Standard
- The objective of the present study was to validate a new protocol for deep
- RNA-seq of whole blood drawn into PAXgene tubes from cynomolgus monkeys
- undergoing islet transplantation, with particular focus on minimizing the
- loss of useful sequencing space to uninformative globin reads.
- The details of the analysis with respect to transplant outcomes and the
- impact of mesenchymal stem cell treatment will be reported in a separate
- manuscript (in preparation).
- To focus on the efficacy of our globin blocking protocol, 37 blood samples,
- 16 from pre-transplant and 21 from post-transplant time points, were each
- prepped once with and once without globin blocking oligos, and were then
- sequenced on an Illumina NextSeq500 instrument.
- The number of reads aligning to each gene in the cynomolgus genome was
- counted.
- Table 1 summarizes the distribution of read fractions among the GB and
- non-GB libraries.
- In the libraries with no globin blocking, globin reads made up an average
- of 44.6% of total input reads, while reads assigned to all other genes made
- up an average of 26.3%.
- The remaining reads either aligned to intergenic regions (that include
- long non-coding RNAs) or did not align with any annotated transcripts in
- the current build of the cynomolgus genome.
- In the GB libraries, globin reads made up only 3.48% and reads assigned
- to all other genes increased to 50.4%.
- Thus, globin blocking resulted in a 92.2% reduction in globin reads and
- a 91.6% increase in yield of useful non-globin reads.
- \end_layout
- \begin_layout Standard
- This reduction is not quite as efficient as the previous analysis showed
- for human samples by DeepSAGE (<0.4% globin reads after globin reduction)
-
- \begin_inset CommandInset citation
- LatexCommand cite
- key "Mastrokolias2012"
- literal "false"
- \end_inset
- .
- Nonetheless, this degree of globin reduction is sufficient to nearly double
- the yield of useful reads.
- Thus, globin blocking cuts the required sequencing effort (and costs) to
- achieve a target coverage depth by almost 50%.
- Consistent with this near doubling of yield, the average difference in
- un-normalized logCPM across all genes between the GB libraries and non-GB
- libraries is approximately 1 (mean = 1.01, median = 1.08), an overall 2-fold
- increase.
- Un-normalized values are used here because the TMM normalization correctly
- identifies this 2-fold difference as biologically irrelevant and removes
- it.
- \end_layout
- \begin_layout Standard
- \begin_inset Float figure
- wide false
- sideways false
- status open
- \begin_layout Plain Layout
- \align center
- \begin_inset Graphics
- filename graphics/Globin Paper/figure1 - globin-fractions.pdf
- \end_inset
- \end_layout
- \begin_layout Plain Layout
- \begin_inset Caption Standard
- \begin_layout Plain Layout
- \series bold
- \begin_inset Argument 1
- status collapsed
- \begin_layout Plain Layout
- Fraction of genic reads in each sample aligned to non-globin genes, with
- and without globin blocking (GB).
-
- \end_layout
- \end_inset
- \begin_inset CommandInset label
- LatexCommand label
- name "fig:Fraction-of-genic-reads"
- \end_inset
- Fraction of genic reads in each sample aligned to non-globin genes, with
- and without globin blocking (GB).
- \series default
- All reads in each sequencing library were aligned to the cyno genome, and
- the number of reads uniquely aligning to each gene was counted.
- For each sample, counts were summed separately for all globin genes and
- for the remainder of the genes (non-globin genes), and the fraction of
- genic reads aligned to non-globin genes was computed.
- Each point represents an individual sample.
- Gray + signs indicate the means for globin-blocked libraries and unblocked
- libraries.
- The overall distribution for each group is represented as a notched box
- plot.
- Points are randomly spread vertically to avoid excessive overlapping.
- \end_layout
- \end_inset
- \end_layout
- \begin_layout Plain Layout
- \end_layout
- \end_inset
- \end_layout
- \begin_layout Standard
- \begin_inset Float table
- placement p
- wide false
- sideways true
- status open
- \begin_layout Plain Layout
- \align center
- \begin_inset Tabular
- <lyxtabular version="3" rows="4" columns="7">
- <features tabularvalignment="middle">
- <column alignment="center" valignment="top">
- <column alignment="center" valignment="top">
- <column alignment="center" valignment="top">
- <column alignment="center" valignment="top">
- <column alignment="center" valignment="top">
- <column alignment="center" valignment="top">
- <column alignment="center" valignment="top">
- <row>
- <cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
- \begin_inset Text
- \begin_layout Plain Layout
- \end_layout
- \end_inset
- </cell>
- <cell multicolumn="1" alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
- \begin_inset Text
- \begin_layout Plain Layout
- \family roman
- \series medium
- \shape up
- \size normal
- \emph off
- \bar no
- \strikeout off
- \xout off
- \uuline off
- \uwave off
- \noun off
- \color none
- Percent of Total Reads
- \end_layout
- \end_inset
- </cell>
- <cell multicolumn="2" alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
- \begin_inset Text
- \begin_layout Plain Layout
- \end_layout
- \end_inset
- </cell>
- <cell multicolumn="2" alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
- \begin_inset Text
- \begin_layout Plain Layout
- \end_layout
- \end_inset
- </cell>
- <cell multicolumn="2" alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
- \begin_inset Text
- \begin_layout Plain Layout
- \end_layout
- \end_inset
- </cell>
- <cell multicolumn="1" alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
- \begin_inset Text
- \begin_layout Plain Layout
- \family roman
- \series medium
- \shape up
- \size normal
- \emph off
- \bar no
- \strikeout off
- \xout off
- \uuline off
- \uwave off
- \noun off
- \color none
- Percent of Genic Reads
- \end_layout
- \end_inset
- </cell>
- <cell multicolumn="2" alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
- \begin_inset Text
- \begin_layout Plain Layout
- \end_layout
- \end_inset
- </cell>
- </row>
- <row>
- <cell alignment="center" valignment="top" bottomline="true" leftline="true" usebox="none">
- \begin_inset Text
- \begin_layout Plain Layout
- GB
- \end_layout
- \end_inset
- </cell>
- <cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" usebox="none">
- \begin_inset Text
- \begin_layout Plain Layout
- \family roman
- \series medium
- \shape up
- \size normal
- \emph off
- \bar no
- \strikeout off
- \xout off
- \uuline off
- \uwave off
- \noun off
- \color none
- Non-globin Reads
- \end_layout
- \end_inset
- </cell>
- <cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" usebox="none">
- \begin_inset Text
- \begin_layout Plain Layout
- \family roman
- \series medium
- \shape up
- \size normal
- \emph off
- \bar no
- \strikeout off
- \xout off
- \uuline off
- \uwave off
- \noun off
- \color none
- Globin Reads
- \end_layout
- \end_inset
- </cell>
- <cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" usebox="none">
- \begin_inset Text
- \begin_layout Plain Layout
- \family roman
- \series medium
- \shape up
- \size normal
- \emph off
- \bar no
- \strikeout off
- \xout off
- \uuline off
- \uwave off
- \noun off
- \color none
- All Genic Reads
- \end_layout
- \end_inset
- </cell>
- <cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" usebox="none">
- \begin_inset Text
- \begin_layout Plain Layout
- \family roman
- \series medium
- \shape up
- \size normal
- \emph off
- \bar no
- \strikeout off
- \xout off
- \uuline off
- \uwave off
- \noun off
- \color none
- All Aligned Reads
- \end_layout
- \end_inset
- </cell>
- <cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" usebox="none">
- \begin_inset Text
- \begin_layout Plain Layout
- \family roman
- \series medium
- \shape up
- \size normal
- \emph off
- \bar no
- \strikeout off
- \xout off
- \uuline off
- \uwave off
- \noun off
- \color none
- Non-globin Reads
- \end_layout
- \end_inset
- </cell>
- <cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" rightline="true" usebox="none">
- \begin_inset Text
- \begin_layout Plain Layout
- \family roman
- \series medium
- \shape up
- \size normal
- \emph off
- \bar no
- \strikeout off
- \xout off
- \uuline off
- \uwave off
- \noun off
- \color none
- Globin Reads
- \end_layout
- \end_inset
- </cell>
- </row>
- <row>
- <cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
- \begin_inset Text
- \begin_layout Plain Layout
- \family roman
- \series medium
- \shape up
- \size normal
- \emph off
- \bar no
- \strikeout off
- \xout off
- \uuline off
- \uwave off
- \noun off
- \color none
- Yes
- \end_layout
- \end_inset
- </cell>
- <cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
- \begin_inset Text
- \begin_layout Plain Layout
- \family roman
- \series medium
- \shape up
- \size normal
- \emph off
- \bar no
- \strikeout off
- \xout off
- \uuline off
- \uwave off
- \noun off
- \color none
- 50.4% ± 6.82
- \end_layout
- \end_inset
- </cell>
- <cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
- \begin_inset Text
- \begin_layout Plain Layout
- \family roman
- \series medium
- \shape up
- \size normal
- \emph off
- \bar no
- \strikeout off
- \xout off
- \uuline off
- \uwave off
- \noun off
- \color none
- 3.48% ± 2.94
- \end_layout
- \end_inset
- </cell>
- <cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
- \begin_inset Text
- \begin_layout Plain Layout
- \family roman
- \series medium
- \shape up
- \size normal
- \emph off
- \bar no
- \strikeout off
- \xout off
- \uuline off
- \uwave off
- \noun off
- \color none
- 53.9% ± 6.81
- \end_layout
- \end_inset
- </cell>
- <cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
- \begin_inset Text
- \begin_layout Plain Layout
- \family roman
- \series medium
- \shape up
- \size normal
- \emph off
- \bar no
- \strikeout off
- \xout off
- \uuline off
- \uwave off
- \noun off
- \color none
- 89.7% ± 2.40
- \end_layout
- \end_inset
- </cell>
- <cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
- \begin_inset Text
- \begin_layout Plain Layout
- \family roman
- \series medium
- \shape up
- \size normal
- \emph off
- \bar no
- \strikeout off
- \xout off
- \uuline off
- \uwave off
- \noun off
- \color none
- 93.5% ± 5.25
- \end_layout
- \end_inset
- </cell>
- <cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
- \begin_inset Text
- \begin_layout Plain Layout
- \family roman
- \series medium
- \shape up
- \size normal
- \emph off
- \bar no
- \strikeout off
- \xout off
- \uuline off
- \uwave off
- \noun off
- \color none
- 6.49% ± 5.25
- \end_layout
- \end_inset
- </cell>
- </row>
- <row>
- <cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" usebox="none">
- \begin_inset Text
- \begin_layout Plain Layout
- \family roman
- \series medium
- \shape up
- \size normal
- \emph off
- \bar no
- \strikeout off
- \xout off
- \uuline off
- \uwave off
- \noun off
- \color none
- No
- \end_layout
- \end_inset
- </cell>
- <cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" usebox="none">
- \begin_inset Text
- \begin_layout Plain Layout
- \family roman
- \series medium
- \shape up
- \size normal
- \emph off
- \bar no
- \strikeout off
- \xout off
- \uuline off
- \uwave off
- \noun off
- \color none
- 26.3% ± 8.95
- \end_layout
- \end_inset
- </cell>
- <cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" usebox="none">
- \begin_inset Text
- \begin_layout Plain Layout
- \family roman
- \series medium
- \shape up
- \size normal
- \emph off
- \bar no
- \strikeout off
- \xout off
- \uuline off
- \uwave off
- \noun off
- \color none
- 44.6% ± 16.6
- \end_layout
- \end_inset
- </cell>
- <cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" usebox="none">
- \begin_inset Text
- \begin_layout Plain Layout
- \family roman
- \series medium
- \shape up
- \size normal
- \emph off
- \bar no
- \strikeout off
- \xout off
- \uuline off
- \uwave off
- \noun off
- \color none
- 70.1% ± 9.38
- \end_layout
- \end_inset
- </cell>
- <cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" usebox="none">
- \begin_inset Text
- \begin_layout Plain Layout
- \family roman
- \series medium
- \shape up
- \size normal
- \emph off
- \bar no
- \strikeout off
- \xout off
- \uuline off
- \uwave off
- \noun off
- \color none
- 90.7% ± 5.16
- \end_layout
- \end_inset
- </cell>
- <cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" usebox="none">
- \begin_inset Text
- \begin_layout Plain Layout
- \family roman
- \series medium
- \shape up
- \size normal
- \emph off
- \bar no
- \strikeout off
- \xout off
- \uuline off
- \uwave off
- \noun off
- \color none
- 38.8% ± 17.1
- \end_layout
- \end_inset
- </cell>
- <cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" rightline="true" usebox="none">
- \begin_inset Text
- \begin_layout Plain Layout
- \family roman
- \series medium
- \shape up
- \size normal
- \emph off
- \bar no
- \strikeout off
- \xout off
- \uuline off
- \uwave off
- \noun off
- \color none
- 61.2% ± 17.1
- \end_layout
- \end_inset
- </cell>
- </row>
- </lyxtabular>
- \end_inset
- \end_layout
- \begin_layout Plain Layout
- \begin_inset Caption Standard
- \begin_layout Plain Layout
- \series bold
- \begin_inset Argument 1
- status collapsed
- \begin_layout Plain Layout
- Fractions of reads mapping to genomic features in GB and non-GB samples.
- \end_layout
- \end_inset
- \begin_inset CommandInset label
- LatexCommand label
- name "tab:Fractions-of-reads"
- \end_inset
- Fractions of reads mapping to genomic features in GB and non-GB samples.
-
- \series default
- All values are given as mean ± standard deviation.
- \end_layout
- \end_inset
- \end_layout
- \begin_layout Plain Layout
- \end_layout
- \end_inset
- \end_layout
- \begin_layout Standard
- Another important aspect is that the standard deviations in Table
- \begin_inset CommandInset ref
- LatexCommand ref
- reference "tab:Fractions-of-reads"
- plural "false"
- caps "false"
- noprefix "false"
- \end_inset
- are uniformly smaller in the GB samples than the non-GB ones, indicating
- much greater consistency of yield.
- This is best seen in the percentage of non-globin reads as a fraction of
- total reads aligned to annotated genes (genic reads).
- For the non-GB samples, this measure ranges from 10.9% to 80.9%, while for
- the GB samples it ranges from 81.9% to 99.9% (Figure
- \begin_inset CommandInset ref
- LatexCommand ref
- reference "fig:Fraction-of-genic-reads"
- plural "false"
- caps "false"
- noprefix "false"
- \end_inset
- ).
- This means that for applications where it is critical that each sample
- achieve a specified minimum coverage in order to provide useful information,
- it would be necessary to budget up to 10 times the sequencing depth per
- sample without globin blocking, even though the average yield improvement
- for globin blocking is only 2-fold, because every sample has a chance of
- being 90% globin and 10% useful reads.
- Hence, the more consistent behavior of GB samples makes planning an experiment
- easier and more efficient because it eliminates the need to over-sequence
- every sample in order to guard against the worst case of a high-globin
- fraction.
- \end_layout
- \begin_layout Subsection
- Globin blocking lowers the noise floor and allows detection of about 2000
- more genes
- \end_layout
- \begin_layout Standard
- \begin_inset Flex TODO Note (inline)
- status open
- \begin_layout Plain Layout
- Remove redundant titles from figures
- \end_layout
- \end_inset
- \end_layout
- \begin_layout Standard
- \begin_inset Float figure
- wide false
- sideways false
- status open
- \begin_layout Plain Layout
- \align center
- \begin_inset Graphics
- filename graphics/Globin Paper/figure2 - aveLogCPM-colored.pdf
- \end_inset
- \end_layout
- \begin_layout Plain Layout
- \begin_inset Caption Standard
- \begin_layout Plain Layout
- \series bold
- \begin_inset Argument 1
- status collapsed
- \begin_layout Plain Layout
- Distributions of average group gene abundances when normalized separately
- or together.
- \end_layout
- \end_inset
- \begin_inset CommandInset label
- LatexCommand label
- name "fig:logcpm-dists"
- \end_inset
- Distributions of average group gene abundances when normalized separately
- or together.
- \series default
- All reads in each sequencing library were aligned to the cyno genome, and
- the number of reads uniquely aligning to each gene was counted.
- Genes with zero counts in all libraries were discarded.
- Libraries were normalized using the TMM method.
- Libraries were split into globin-blocked (GB) and non-GB groups and the
- average abundance for each gene in both groups, measured in log2 counts
- per million reads counted, was computed using the aveLogCPM function.
- The distribution of average gene logCPM values was plotted for both groups
- using a kernel density plot to approximate a continuous distribution.
- The GB logCPM distributions are marked in red, non-GB in blue.
- The black vertical line denotes the chosen detection threshold of -1.
- Top panel: Libraries were split into GB and non-GB groups first and normalized
- separately.
- Bottom panel: Libraries were all normalized together first and then split
- into groups.
- \end_layout
- \end_inset
- \end_layout
- \begin_layout Plain Layout
- \end_layout
- \end_inset
- \end_layout
- \begin_layout Standard
- Since globin blocking yields more usable sequencing depth, it should also
- allow detection of more genes at any given threshold.
- When we looked at the distribution of average normalized logCPM values
- across all libraries for genes with at least one read assigned to them,
- we observed the expected bimodal distribution, with a high-abundance "signal"
- peak representing detected genes and a low-abundance "noise" peak representing
- genes whose read count did not rise above the noise floor (Figure
- \begin_inset CommandInset ref
- LatexCommand ref
- reference "fig:logcpm-dists"
- plural "false"
- caps "false"
- noprefix "false"
- \end_inset
- ).
- Consistent with the 2-fold increase in raw counts assigned to non-globin
- genes, the signal peak for GB samples is shifted to the right relative
- to the non-GB signal peak.
- When all the samples are normalized together, this difference is normalized
- out, lining up the signal peaks, and this reveals that, as expected, the
- noise floor for the GB samples is about 2-fold lower.
- This greater separation between signal and noise peaks in the GB samples
- means that low-expression genes should be more easily detected and more
- precisely quantified than in the non-GB samples.
- \end_layout
- \begin_layout Standard
- \begin_inset Float figure
- wide false
- sideways false
- status open
- \begin_layout Plain Layout
- \align center
- \begin_inset Graphics
- filename graphics/Globin Paper/figure3 - detection.pdf
- \end_inset
- \end_layout
- \begin_layout Plain Layout
- \begin_inset Caption Standard
- \begin_layout Plain Layout
- \series bold
- \begin_inset Argument 1
- status collapsed
- \begin_layout Plain Layout
- Gene detections as a function of abundance thresholds in globin-blocked
- (GB) and non-GB samples.
- \end_layout
- \end_inset
- \begin_inset CommandInset label
- LatexCommand label
- name "fig:Gene-detections"
- \end_inset
- Gene detections as a function of abundance thresholds in globin-blocked
- (GB) and non-GB samples.
- \series default
- Average abundance (logCPM,
- \begin_inset Formula $\log_{2}$
- \end_inset
- counts per million reads counted) was computed by separate group normalization
- as described in Figure
- \begin_inset CommandInset ref
- LatexCommand ref
- reference "fig:logcpm-dists"
- plural "false"
- caps "false"
- noprefix "false"
- \end_inset
- for both the GB and non-GB groups, as well as for all samples considered
- as one large group.
- For every integer threshold from -2 to 3, the number of genes detected
- at or above that logCPM threshold was plotted for each group.
- \end_layout
- \end_inset
- \end_layout
- \begin_layout Plain Layout
- \end_layout
- \end_inset
- \end_layout
- \begin_layout Standard
- Based on these distributions, we selected a detection threshold of -1, which
- is approximately the leftmost edge of the trough between the signal and
- noise peaks.
- This represents the most liberal possible detection threshold that doesn't
- call substantial numbers of noise genes as detected.
- Among the full dataset, 13429 genes were detected at this threshold, and
- 22276 were not.
- When considering the GB libraries and non-GB libraries separately and re-comput
- ing normalization factors independently within each group, 14535 genes were
- detected in the GB libraries while only 12460 were detected in the non-GB
- libraries.
- Thus, GB allowed the detection of about 2000 extra genes that were buried under
- the noise floor without GB.
- This pattern of at least 2000 additional genes detected with GB was also
- consistent across a wide range of possible detection thresholds, from -2
- to 3 (see Figure
- \begin_inset CommandInset ref
- LatexCommand ref
- reference "fig:Gene-detections"
- plural "false"
- caps "false"
- noprefix "false"
- \end_inset
- ).
- \end_layout
- \begin_layout Subsection
- Globin blocking does not add significant additional noise or decrease sample
- quality
- \end_layout
- \begin_layout Standard
- One potential worry is that the globin blocking protocol could perturb the
- levels of non-globin genes.
- There are two kinds of possible perturbations: systematic and random.
- The former is not a major concern for detection of differential expression,
- since a 2-fold change in every sample has no effect on the relative fold
- change between samples.
- In contrast, random perturbations would increase the noise and obscure
- the signal in the dataset, reducing the capacity to detect differential
- expression.
- \end_layout
- \begin_layout Standard
- \begin_inset Float figure
- wide false
- sideways false
- status open
- \begin_layout Plain Layout
- \align center
- \begin_inset Graphics
- filename graphics/Globin Paper/figure4 - maplot-colored.pdf
- \end_inset
- \end_layout
- \begin_layout Plain Layout
- \begin_inset Caption Standard
- \begin_layout Plain Layout
- \begin_inset Argument 1
- status collapsed
- \begin_layout Plain Layout
- MA plot showing effects of globin blocking on each gene's abundance.
- \end_layout
- \end_inset
- \begin_inset CommandInset label
- LatexCommand label
- name "fig:MA-plot"
- \end_inset
- \series bold
- MA plot showing effects of globin blocking on each gene's abundance.
-
- \series default
- All libraries were normalized together as described in Figure
- \begin_inset CommandInset ref
- LatexCommand ref
- reference "fig:logcpm-dists"
- plural "false"
- caps "false"
- noprefix "false"
- \end_inset
- , and genes with an average logCPM below -1 were filtered out.
- Each remaining gene was tested for differential abundance with respect
- to globin blocking (GB) using edgeR’s quasi-likelihood F-test, fitting a
- negative binomial generalized linear model to the table of read counts in each
- library.
- For each gene, edgeR reported average abundance (logCPM),
- \begin_inset Formula $\log_{2}$
- \end_inset
- fold change (logFC), p-value, and Benjamini-Hochberg adjusted false discovery
- rate (FDR).
- Each gene's logFC was plotted against its logCPM, colored by FDR.
- Red points are significant at ≤10% FDR, and blue are not significant at
- that threshold.
- The alpha and beta globin genes targeted for blocking are marked with large
- triangles, while all other genes are represented as small points.
- \end_layout
- \end_inset
- \end_layout
- \begin_layout Plain Layout
- \end_layout
- \end_inset
- \end_layout
- \begin_layout Standard
- \begin_inset Flex TODO Note (inline)
- status open
- \begin_layout Plain Layout
- Standardize on
- \begin_inset Quotes eld
- \end_inset
- log2
- \begin_inset Quotes erd
- \end_inset
- notation
- \end_layout
- \end_inset
- \end_layout
- \begin_layout Standard
- The data do indeed show small systematic perturbations in gene levels (Figure
-
- \begin_inset CommandInset ref
- LatexCommand ref
- reference "fig:MA-plot"
- plural "false"
- caps "false"
- noprefix "false"
- \end_inset
- ).
- Other than the 3 designated alpha and beta globin genes, two other genes
- stand out as having especially large negative log fold changes: HBD and
- LOC1021365.
- HBD, delta globin, is most likely targeted by the blocking oligos due to
- high sequence homology with the other globin genes.
- LOC1021365 is the aforementioned ncRNA that is reverse-complementary to
- one of the alpha-like genes and that would be expected to be removed during
- the globin blocking step.
- All other genes appear in a cluster centered vertically at 0, and the vast
- majority of genes in this cluster show an absolute log2(FC) of 0.5 or less.
- Nevertheless, many of these small perturbations are still statistically
- significant, indicating that the globin blocking oligos likely cause very
- small but non-zero systematic perturbations in measured gene expression
- levels.
- \end_layout
- \begin_layout Standard
- \begin_inset Float figure
- wide false
- sideways false
- status open
- \begin_layout Plain Layout
- \align center
- \begin_inset Graphics
- filename graphics/Globin Paper/figure5 - corrplot.pdf
- \end_inset
- \end_layout
- \begin_layout Plain Layout
- \begin_inset Caption Standard
- \begin_layout Plain Layout
- \series bold
- \begin_inset Argument 1
- status collapsed
- \begin_layout Plain Layout
- Comparison of inter-sample gene abundance correlations with and without
- globin blocking.
- \end_layout
- \end_inset
- \begin_inset CommandInset label
- LatexCommand label
- name "fig:gene-abundance-correlations"
- \end_inset
- Comparison of inter-sample gene abundance correlations with and without
- globin blocking (GB).
- \series default
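- All libraries were normalized together as described in Figure
- \begin_inset CommandInset ref
- LatexCommand ref
- reference "fig:logcpm-dists"
- plural "false"
- caps "false"
- noprefix "false"
- \end_inset
- , and genes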
- with an average abundance (logCPM, log2 counts per million reads counted)
- less than -1 were filtered out.
- Each gene’s logCPM was computed in each library using the edgeR cpm function.
- For each pair of biological samples, the Pearson correlation between those
- samples' GB libraries was plotted against the correlation between the same
- samples’ non-GB libraries.
- Each point represents a unique pair of samples.
- The solid gray line shows a quantile-quantile plot of the distribution of GB
- correlations vs.
- that of non-GB correlations.
- The thin dashed line is the identity line, provided for reference.
- \end_layout
- \end_inset
- \end_layout
- \begin_layout Plain Layout
- \end_layout
- \end_inset
- \end_layout
- \begin_layout Standard
- To evaluate the possibility of globin blocking causing random perturbations
- and reducing sample quality, we computed the Pearson correlation between
- logCPM values for every pair of samples with and without GB and plotted
- them against each other (Figure
- \begin_inset CommandInset ref
- LatexCommand ref
- reference "fig:gene-abundance-correlations"
- plural "false"
- caps "false"
- noprefix "false"
- \end_inset
- ).
- The plot indicated that the GB libraries have higher sample-to-sample correlati
- ons than the non-GB libraries.
- Parametric and nonparametric tests for differences between the correlations
- with and without GB both confirmed that this difference was highly significant
- (2-sided paired t-test: t = 37.2, df = 665, P ≪ 2.2e-16; 2-sided Wilcoxon
- sign-rank test: V = 2195, P ≪ 2.2e-16).
- Performing the same tests on the Spearman correlations gave the same conclusion
- (t-test: t = 26.8, df = 665, P ≪ 2.2e-16; sign-rank test: V = 8781, P ≪ 2.2e-16).
- We used the edgeR package to compute the overall biological coefficient
- of variation (BCV) for GB and non-GB libraries, and found that globin blocking
- resulted in a negligible increase in the BCV (0.417 with GB vs.
- 0.400 without).
- The near equality of the BCVs for both sets indicates that the higher correlati
- ons in the GB libraries are most likely a result of the increased yield
- of useful reads, which reduces the contribution of Poisson counting uncertainty
- to the overall variance of the logCPM values
- \begin_inset CommandInset citation
- LatexCommand cite
- key "McCarthy2012"
- literal "false"
- \end_inset
- .
- This improves the precision of expression measurements and more than offsets
- the negligible increase in BCV.
- \end_layout
- \begin_layout Subsection
- More differentially expressed genes are detected with globin blocking
- \end_layout
- \begin_layout Standard
- \begin_inset Float table
- wide false
- sideways false
- status open
- \begin_layout Plain Layout
- \align center
- \begin_inset Tabular
- <lyxtabular version="3" rows="5" columns="5">
- <features tabularvalignment="middle">
- <column alignment="center" valignment="top">
- <column alignment="center" valignment="top">
- <column alignment="center" valignment="top">
- <column alignment="center" valignment="top">
- <column alignment="center" valignment="top">
- <row>
- <cell alignment="center" valignment="top" usebox="none">
- \begin_inset Text
- \begin_layout Plain Layout
- \end_layout
- \end_inset
- </cell>
- <cell alignment="center" valignment="top" usebox="none">
- \begin_inset Text
- \begin_layout Plain Layout
- \end_layout
- \end_inset
- </cell>
- <cell multicolumn="1" alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
- \begin_inset Text
- \begin_layout Plain Layout
- \series bold
- No Globin Blocking
- \end_layout
- \end_inset
- </cell>
- <cell multicolumn="2" alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" usebox="none">
- \begin_inset Text
- \begin_layout Plain Layout
- \end_layout
- \end_inset
- </cell>
- <cell multicolumn="2" alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" rightline="true" usebox="none">
- \begin_inset Text
- \begin_layout Plain Layout
- \end_layout
- \end_inset
- </cell>
- </row>
- <row>
- <cell alignment="center" valignment="top" usebox="none">
- \begin_inset Text
- \begin_layout Plain Layout
- \end_layout
- \end_inset
- </cell>
- <cell alignment="center" valignment="top" usebox="none">
- \begin_inset Text
- \begin_layout Plain Layout
- \end_layout
- \end_inset
- </cell>
- <cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
- \begin_inset Text
- \begin_layout Plain Layout
- \series bold
- Up
- \end_layout
- \end_inset
- </cell>
- <cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
- \begin_inset Text
- \begin_layout Plain Layout
- \series bold
- NS
- \end_layout
- \end_inset
- </cell>
- <cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
- \begin_inset Text
- \begin_layout Plain Layout
- \series bold
- Down
- \end_layout
- \end_inset
- </cell>
- </row>
- <row>
- <cell multirow="3" alignment="center" valignment="middle" topline="true" bottomline="true" leftline="true" usebox="none">
- \begin_inset Text
- \begin_layout Plain Layout
- \series bold
- Globin-Blocking
- \end_layout
- \end_inset
- </cell>
- <cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
- \begin_inset Text
- \begin_layout Plain Layout
- \series bold
- Up
- \end_layout
- \end_inset
- </cell>
- <cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
- \begin_inset Text
- \begin_layout Plain Layout
- \family roman
- \series medium
- \shape up
- \size normal
- \emph off
- \bar no
- \strikeout off
- \xout off
- \uuline off
- \uwave off
- \noun off
- \color none
- 231
- \end_layout
- \end_inset
- </cell>
- <cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
- \begin_inset Text
- \begin_layout Plain Layout
- \family roman
- \series medium
- \shape up
- \size normal
- \emph off
- \bar no
- \strikeout off
- \xout off
- \uuline off
- \uwave off
- \noun off
- \color none
- 515
- \end_layout
- \end_inset
- </cell>
- <cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
- \begin_inset Text
- \begin_layout Plain Layout
- \family roman
- \series medium
- \shape up
- \size normal
- \emph off
- \bar no
- \strikeout off
- \xout off
- \uuline off
- \uwave off
- \noun off
- \color none
- 2
- \end_layout
- \end_inset
- </cell>
- </row>
- <row>
- <cell multirow="4" alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
- \begin_inset Text
- \begin_layout Plain Layout
- \end_layout
- \end_inset
- </cell>
- <cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
- \begin_inset Text
- \begin_layout Plain Layout
- \series bold
- NS
- \end_layout
- \end_inset
- </cell>
- <cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
- \begin_inset Text
- \begin_layout Plain Layout
- \family roman
- \series medium
- \shape up
- \size normal
- \emph off
- \bar no
- \strikeout off
- \xout off
- \uuline off
- \uwave off
- \noun off
- \color none
- 160
- \end_layout
- \end_inset
- </cell>
- <cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
- \begin_inset Text
- \begin_layout Plain Layout
- \family roman
- \series medium
- \shape up
- \size normal
- \emph off
- \bar no
- \strikeout off
- \xout off
- \uuline off
- \uwave off
- \noun off
- \color none
- 11235
- \end_layout
- \end_inset
- </cell>
- <cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
- \begin_inset Text
- \begin_layout Plain Layout
- \family roman
- \series medium
- \shape up
- \size normal
- \emph off
- \bar no
- \strikeout off
- \xout off
- \uuline off
- \uwave off
- \noun off
- \color none
- 136
- \end_layout
- \end_inset
- </cell>
- </row>
- <row>
- <cell multirow="4" alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" usebox="none">
- \begin_inset Text
- \begin_layout Plain Layout
- \end_layout
- \end_inset
- </cell>
- <cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" usebox="none">
- \begin_inset Text
- \begin_layout Plain Layout
- \series bold
- Down
- \end_layout
- \end_inset
- </cell>
- <cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" usebox="none">
- \begin_inset Text
- \begin_layout Plain Layout
- \family roman
- \series medium
- \shape up
- \size normal
- \emph off
- \bar no
- \strikeout off
- \xout off
- \uuline off
- \uwave off
- \noun off
- \color none
- 0
- \end_layout
- \end_inset
- </cell>
- <cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" usebox="none">
- \begin_inset Text
- \begin_layout Plain Layout
- \family roman
- \series medium
- \shape up
- \size normal
- \emph off
- \bar no
- \strikeout off
- \xout off
- \uuline off
- \uwave off
- \noun off
- \color none
- 548
- \end_layout
- \end_inset
- </cell>
- <cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" rightline="true" usebox="none">
- \begin_inset Text
- \begin_layout Plain Layout
- \family roman
- \series medium
- \shape up
- \size normal
- \emph off
- \bar no
- \strikeout off
- \xout off
- \uuline off
- \uwave off
- \noun off
- \color none
- 127
- \end_layout
- \end_inset
- </cell>
- </row>
- </lyxtabular>
- \end_inset
- \end_layout
- \begin_layout Plain Layout
- \begin_inset Caption Standard
- \begin_layout Plain Layout
- \series bold
- \begin_inset Argument 1
- status open
- \begin_layout Plain Layout
- Comparison of significantly differentially expressed genes with and without
- globin blocking.
- \end_layout
- \end_inset
- \begin_inset CommandInset label
- LatexCommand label
- name "tab:Comparison-of-significant"
- \end_inset
- Comparison of significantly differentially expressed genes with and without
- globin blocking.
- \series default
- Up, Down: Genes significantly up/down-regulated in post-transplant samples
- relative to pre-transplant samples, with a false discovery rate of 10%
- or less.
- NS: Non-significant genes (false discovery rate greater than 10%).
- \end_layout
- \end_inset
- \end_layout
- \begin_layout Plain Layout
- \end_layout
- \end_inset
- \end_layout
- \begin_layout Standard
- To compare performance on differential gene expression tests, we took subsets
- of both the GB and non-GB libraries with exactly one pre-transplant and
- one post-transplant sample for each animal that had paired samples available
- for analysis (N=7 animals, N=14 samples in each subset).
- The same test for pre- vs.
- post-transplant differential gene expression was performed on the same
- 7 pairs of samples from GB libraries and non-GB libraries, in each case
- using an FDR of 10% as the threshold of significance.
- Out of 12954 genes that passed the detection threshold in both subsets,
- 358 were called significantly differentially expressed in the same direction
- in both sets; 1063 were differentially expressed in the GB set only; 296
- were differentially expressed in the non-GB set only; 2 genes were called
- significantly up in the GB set but significantly down in the non-GB set;
- and the remaining 11235 were not called differentially expressed in either
- set.
- These data are summarized in Table
- \begin_inset CommandInset ref
- LatexCommand ref
- reference "tab:Comparison-of-significant"
- plural "false"
- caps "false"
- noprefix "false"
- \end_inset
- .
- The differences in BCV calculated by edgeR for these subsets of samples
- were negligible (BCV = 0.302 for GB and 0.297 for non-GB).
- \end_layout
- \begin_layout Standard
- The key point is that the GB data result in substantially more differential
- expression calls than the non-GB data.
- Since there is no gold standard for this dataset, it is impossible to be
- certain whether this is due to under-calling of differential expression
- in the non-GB samples or over-calling in the GB samples.
- However, given that both datasets are derived from the same biological
- samples and have nearly equal BCVs, it is more likely that the larger number
- of DE calls in the GB samples are genuine detections that were enabled
- by the higher sequencing depth and measurement precision of the GB samples.
- Note that the same set of genes was considered in both subsets, so the
- larger number of differentially expressed gene calls in the GB data set
- reflects a greater sensitivity to detect significant differential gene
- expression and not simply the larger total number of detected genes in
- GB samples described earlier.
- \end_layout
- \begin_layout Section
- Discussion
- \end_layout
- \begin_layout Standard
- The original experience with whole blood gene expression profiling on DNA
- microarrays demonstrated that the high concentration of globin transcripts
- interfered with the detection of genes with relatively low expression
- levels, in effect, significantly reducing the sensitivity.
- To address this limitation, commercial protocols for globin reduction were
- developed based on strategies to block globin transcript amplification
- during labeling or physically removing globin transcripts by affinity bead
- methods
- \begin_inset CommandInset citation
- LatexCommand cite
- key "Winn2010"
- literal "false"
- \end_inset
- .
- More recently, using the latest generation of labeling protocols and arrays,
- it was determined that globin reduction was no longer necessary to obtain
- sufficient sensitivity to detect differential transcript expression
- \begin_inset CommandInset citation
- LatexCommand cite
- key "NuGEN2010"
- literal "false"
- \end_inset
- .
- However, we are not aware of any publications using these currently available
- protocols with the latest generation of microarrays that actually compare
- the detection sensitivity with and without globin reduction.
- Nevertheless, in practice, omitting globin reduction has now been generally
- adopted, driven primarily by concerns for cost control.
- The main objective of our work was to directly test the impact of globin
- gene transcripts and a new globin blocking protocol for application to
- the newest generation of differential gene expression profiling determined
- using next generation sequencing.
-
- \end_layout
- \begin_layout Standard
- The challenge of doing global gene expression profiling in cynomolgus monkeys
- is that the currently available arrays were never designed to comprehensively
- cover this genome and have not been updated since the first assemblies
- of the cynomolgus genome were published.
- Therefore, we determined that the best strategy for peripheral blood profiling
- was to do deep RNA-seq and inform the workflow using the latest available
- genome assembly and annotation
- \begin_inset CommandInset citation
- LatexCommand cite
- key "Wilson2013"
- literal "false"
- \end_inset
- .
- However, it was not immediately clear whether globin reduction was necessary
- for RNA-seq or how much improvement in efficiency or sensitivity to detect
- differential gene expression would be achieved for the added cost and work.
-
- \end_layout
- \begin_layout Standard
- We only found one report that demonstrated that globin reduction significantly
- improved the effective read yields for sequencing of human peripheral blood
- cell RNA using a DeepSAGE protocol
- \begin_inset CommandInset citation
- LatexCommand cite
- key "Mastrokolias2012"
- literal "false"
- \end_inset
- .
- The approach to DeepSAGE involves two different restriction enzymes that
- purify and then tag small fragments of transcripts at specific locations
- and thus, significantly reduces the complexity of the transcriptome.
- Therefore, we could not determine how DeepSAGE results would translate
- to the common strategy in the field for assaying the entire transcript
- population by whole-transcriptome 3’-end RNA-seq.
- Furthermore, if globin reduction is necessary, we also needed a globin
- reduction method specific to cynomolgus globin sequences that would work
- in an organism for which no kit is available off the shelf.
- \end_layout
- \begin_layout Standard
- As mentioned above, the addition of globin blocking oligos has a very small
- impact on measured gene expression levels.
- However, this is a non-issue for the purposes of differential expression
- testing, since a systematic change in a gene in all samples does not affect
- relative expression levels between samples.
- Nevertheless, we must acknowledge that direct comparisons of gene expression
- data obtained by GB and non-GB protocols are not possible without additional
- normalization.
-
- \end_layout
- \begin_layout Standard
- More importantly, globin blocking not only nearly doubles the yield of usable
- reads, it also increases inter-sample correlation and sensitivity to detect
- differential gene expression relative to the same set of samples profiled
- without blocking.
- In addition, globin blocking does not add a significant amount of random
- noise to the data.
- Globin blocking thus represents a cost-effective way to squeeze more data
- and statistical power out of the same blood samples and the same amount
- of sequencing.
- In conclusion, globin reduction greatly increases the yield of useful RNA-seq
- reads mapping to the rest of the genome, with minimal perturbations in
- the relative levels of non-globin genes.
- Based on these results, globin transcript reduction using sequence-specific,
- complementary blocking oligonucleotides is recommended for all deep RNA-seq
- of cynomolgus and other nonhuman primate blood samples.
- \end_layout
- \begin_layout Chapter
- Future Directions
- \end_layout
- \begin_layout Standard
- \begin_inset Flex TODO Note (inline)
- status open
- \begin_layout Plain Layout
- Consider per-chapter future directions.
- Check instructions.
- \end_layout
- \end_inset
- \end_layout
- \begin_layout Itemize
- Study other epigenetic marks in more contexts
- \end_layout
- \begin_deeper
- \begin_layout Itemize
- DNA methylation, histone marks, chromatin accessibility & conformation in
- CD4 T-cells
- \end_layout
- \begin_layout Itemize
- Also look at other types of lymphocytes: CD8 T-cells, B-cells, NK cells
- \end_layout
- \end_deeper
- \begin_layout Itemize
- Use cross-validation (CV) or bootstrap to better evaluate classifiers
- \end_layout
- \begin_layout Itemize
- fRMAtools could be adapted to not require equal-sized groups
- \end_layout
- \begin_layout Standard
- \begin_inset ERT
- status open
- \begin_layout Plain Layout
- % Call it "References" instead of "Bibliography"
- \end_layout
- \begin_layout Plain Layout
- \backslash
- renewcommand{
- \backslash
- bibname}{References}
- \end_layout
- \end_inset
- \end_layout
- \begin_layout Standard
- \begin_inset Flex TODO Note (inline)
- status open
- \begin_layout Plain Layout
- Check bib entry formatting & sort order
- \end_layout
- \end_inset
- \end_layout
- \begin_layout Standard
- \begin_inset CommandInset bibtex
- LatexCommand bibtex
- btprint "btPrintCited"
- bibfiles "refs,code-refs"
- options "bibtotoc,unsrt"
- \end_inset
- \end_layout
- \end_body
- \end_document
|