thesis.lyx 204 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177
71778177917801781178217831784178517861787178817891790179117921793179417951796179717981799180018011802180318041805180618071808180918101811181218131814181518161817181818191820182118221823182418251826182718281829183018311832183318341835183618371838183918401841184218431844184518461847184818491850185118521853185418551856185718581859186018611862186318641865186618671868186918701871187218731874187518761877187818791880188118821883188418851886188718881889189018911892189318941895189618971898189919001901190219031904190519061907190819091910191119121913191419151916191719181919192019211922192319241925192619271928192919301931193219331934193519361937193819391940194119421943194419451946194719481949195019511952195319541955195619571958195919601961196219631964196519661967196819691970197119721973197419751976197719781979198019811982198319841985198619871988198919901991199219931994199519961997199819992000200120022003200420052006200720082009201020112012201320142015201620172018201920202021202220232024202520262027202820292030203120322033203420352036203720382039204020412042204320442045204620472048204920502051205220532054205520562057205820592060206120622063206420652066206720682069207020712072207320742075207620772078207920802081208220832084208520862087208820892090209120922093209420952096209720982099210021012102210321042105210621072108210921102111211221132114211521162117211821192120212121222123212421252126212721282129213021312132213321342135213621372138213921402141214221432144214521462147214821492150215121522153215421552156215721582159216021612162216321642165216621672168216921702171217221732174217521762177217821792180218121822183218421852186218721882189219021912192219321942195219621972198219922002201220222032204220522062207220822092210221122122213221422152216221722182219222022212222222322242225222622272228222922302231223222332234223522362237223822392240224122422243224422452246224722482249225022512252225322542255225622572258225922602261226222632264226522662267226822692270227122722273227422752276227
72278227922802281228222832284228522862287228822892290229122922293229422952296229722982299230023012302230323042305230623072308230923102311231223132314231523162317231823192320232123222323232423252326232723282329233023312332233323342335233623372338233923402341234223432344234523462347234823492350235123522353235423552356235723582359236023612362236323642365236623672368236923702371237223732374237523762377237823792380238123822383238423852386238723882389239023912392239323942395239623972398239924002401240224032404240524062407240824092410241124122413241424152416241724182419242024212422242324242425242624272428242924302431243224332434243524362437243824392440244124422443244424452446244724482449245024512452245324542455245624572458245924602461246224632464246524662467246824692470247124722473247424752476247724782479248024812482248324842485248624872488248924902491249224932494249524962497249824992500250125022503250425052506250725082509251025112512251325142515251625172518251925202521252225232524252525262527252825292530253125322533253425352536253725382539254025412542254325442545254625472548254925502551255225532554255525562557255825592560256125622563256425652566256725682569257025712572257325742575257625772578257925802581258225832584258525862587258825892590259125922593259425952596259725982599260026012602260326042605260626072608260926102611261226132614261526162617261826192620262126222623262426252626262726282629263026312632263326342635263626372638263926402641264226432644264526462647264826492650265126522653265426552656265726582659266026612662266326642665266626672668266926702671267226732674267526762677267826792680268126822683268426852686268726882689269026912692269326942695269626972698269927002701270227032704270527062707270827092710271127122713271427152716271727182719272027212722272327242725272627272728272927302731273227332734273527362737273827392740274127422743274427452746274727482749275027512752275327542755275627572758275927602761276227632764276527662767276827692770277127722773277427752776277
72778277927802781278227832784278527862787278827892790279127922793279427952796279727982799280028012802280328042805280628072808280928102811281228132814281528162817281828192820282128222823282428252826282728282829283028312832283328342835283628372838283928402841284228432844284528462847284828492850285128522853285428552856285728582859286028612862286328642865286628672868286928702871287228732874287528762877287828792880288128822883288428852886288728882889289028912892289328942895289628972898289929002901290229032904290529062907290829092910291129122913291429152916291729182919292029212922292329242925292629272928292929302931293229332934293529362937293829392940294129422943294429452946294729482949295029512952295329542955295629572958295929602961296229632964296529662967296829692970297129722973297429752976297729782979298029812982298329842985298629872988298929902991299229932994299529962997299829993000300130023003300430053006300730083009301030113012301330143015301630173018301930203021302230233024302530263027302830293030303130323033303430353036303730383039304030413042304330443045304630473048304930503051305230533054305530563057305830593060306130623063306430653066306730683069307030713072307330743075307630773078307930803081308230833084308530863087308830893090309130923093309430953096309730983099310031013102310331043105310631073108310931103111311231133114311531163117311831193120312131223123312431253126312731283129313031313132313331343135313631373138313931403141314231433144314531463147314831493150315131523153315431553156315731583159316031613162316331643165316631673168316931703171317231733174317531763177317831793180318131823183318431853186318731883189319031913192319331943195319631973198319932003201320232033204320532063207320832093210321132123213321432153216321732183219322032213222322332243225322632273228322932303231323232333234323532363237323832393240324132423243324432453246324732483249325032513252325332543255325632573258325932603261326232633264326532663267326832693270327132723273327432753276327
73278327932803281328232833284328532863287328832893290329132923293329432953296329732983299330033013302330333043305330633073308330933103311331233133314331533163317331833193320332133223323332433253326332733283329333033313332333333343335333633373338333933403341334233433344334533463347334833493350335133523353335433553356335733583359336033613362336333643365336633673368336933703371337233733374337533763377337833793380338133823383338433853386338733883389339033913392339333943395339633973398339934003401340234033404340534063407340834093410341134123413341434153416341734183419342034213422342334243425342634273428342934303431343234333434343534363437343834393440344134423443344434453446344734483449345034513452345334543455345634573458345934603461346234633464346534663467346834693470347134723473347434753476347734783479348034813482348334843485348634873488348934903491349234933494349534963497349834993500350135023503350435053506350735083509351035113512351335143515351635173518351935203521352235233524352535263527352835293530353135323533353435353536353735383539354035413542354335443545354635473548354935503551355235533554355535563557355835593560356135623563356435653566356735683569357035713572357335743575357635773578357935803581358235833584358535863587358835893590359135923593359435953596359735983599360036013602360336043605360636073608360936103611361236133614361536163617361836193620362136223623362436253626362736283629363036313632363336343635363636373638363936403641364236433644364536463647364836493650365136523653365436553656365736583659366036613662366336643665366636673668366936703671367236733674367536763677367836793680368136823683368436853686368736883689369036913692369336943695369636973698369937003701370237033704370537063707370837093710371137123713371437153716371737183719372037213722372337243725372637273728372937303731373237333734373537363737373837393740374137423743374437453746374737483749375037513752375337543755375637573758375937603761376237633764376537663767376837693770377137723773377437753776377
73778377937803781378237833784378537863787378837893790379137923793379437953796379737983799380038013802380338043805380638073808380938103811381238133814381538163817381838193820382138223823382438253826382738283829383038313832383338343835383638373838383938403841384238433844384538463847384838493850385138523853385438553856385738583859386038613862386338643865386638673868386938703871387238733874387538763877387838793880388138823883388438853886388738883889389038913892389338943895389638973898389939003901390239033904390539063907390839093910391139123913391439153916391739183919392039213922392339243925392639273928392939303931393239333934393539363937393839393940394139423943394439453946394739483949395039513952395339543955395639573958395939603961396239633964396539663967396839693970397139723973397439753976397739783979398039813982398339843985398639873988398939903991399239933994399539963997399839994000400140024003400440054006400740084009401040114012401340144015401640174018401940204021402240234024402540264027402840294030403140324033403440354036403740384039404040414042404340444045404640474048404940504051405240534054405540564057405840594060406140624063406440654066406740684069407040714072407340744075407640774078407940804081408240834084408540864087408840894090409140924093409440954096409740984099410041014102410341044105410641074108410941104111411241134114411541164117411841194120412141224123412441254126412741284129413041314132413341344135413641374138413941404141414241434144414541464147414841494150415141524153415441554156415741584159416041614162416341644165416641674168416941704171417241734174417541764177417841794180418141824183418441854186418741884189419041914192419341944195419641974198419942004201420242034204420542064207420842094210421142124213421442154216421742184219422042214222422342244225422642274228422942304231423242334234423542364237423842394240424142424243424442454246424742484249425042514252425342544255425642574258425942604261426242634264426542664267426842694270427142724273427442754276427
74278427942804281428242834284428542864287428842894290429142924293429442954296429742984299430043014302430343044305430643074308430943104311431243134314431543164317431843194320432143224323432443254326432743284329433043314332433343344335433643374338433943404341434243434344434543464347434843494350435143524353435443554356435743584359436043614362436343644365436643674368436943704371437243734374437543764377437843794380438143824383438443854386438743884389439043914392439343944395439643974398439944004401440244034404440544064407440844094410441144124413441444154416441744184419442044214422442344244425442644274428442944304431443244334434443544364437443844394440444144424443444444454446444744484449445044514452445344544455445644574458445944604461446244634464446544664467446844694470447144724473447444754476447744784479448044814482448344844485448644874488448944904491449244934494449544964497449844994500450145024503450445054506450745084509451045114512451345144515451645174518451945204521452245234524452545264527452845294530453145324533453445354536453745384539454045414542454345444545454645474548454945504551455245534554455545564557455845594560456145624563456445654566456745684569457045714572457345744575457645774578457945804581458245834584458545864587458845894590459145924593459445954596459745984599460046014602460346044605460646074608460946104611461246134614461546164617461846194620462146224623462446254626462746284629463046314632463346344635463646374638463946404641464246434644464546464647464846494650465146524653465446554656465746584659466046614662466346644665466646674668466946704671467246734674467546764677467846794680468146824683468446854686468746884689469046914692469346944695469646974698469947004701470247034704470547064707470847094710471147124713471447154716471747184719472047214722472347244725472647274728472947304731473247334734473547364737473847394740474147424743474447454746474747484749475047514752475347544755475647574758475947604761476247634764476547664767476847694770477147724773477447754776477
74778477947804781478247834784478547864787478847894790479147924793479447954796479747984799480048014802480348044805480648074808480948104811481248134814481548164817481848194820482148224823482448254826482748284829483048314832483348344835483648374838483948404841484248434844484548464847484848494850485148524853485448554856485748584859486048614862486348644865486648674868486948704871487248734874487548764877487848794880488148824883488448854886488748884889489048914892489348944895489648974898489949004901490249034904490549064907490849094910491149124913491449154916491749184919492049214922492349244925492649274928492949304931493249334934493549364937493849394940494149424943494449454946494749484949495049514952495349544955495649574958495949604961496249634964496549664967496849694970497149724973497449754976497749784979498049814982498349844985498649874988498949904991499249934994499549964997499849995000500150025003500450055006500750085009501050115012501350145015501650175018501950205021502250235024502550265027502850295030503150325033503450355036503750385039504050415042504350445045504650475048504950505051505250535054505550565057505850595060506150625063506450655066506750685069507050715072507350745075507650775078507950805081508250835084508550865087508850895090509150925093509450955096509750985099510051015102510351045105510651075108510951105111511251135114511551165117511851195120512151225123512451255126512751285129513051315132513351345135513651375138513951405141514251435144514551465147514851495150515151525153515451555156515751585159516051615162516351645165516651675168516951705171517251735174517551765177517851795180518151825183518451855186518751885189519051915192519351945195519651975198519952005201520252035204520552065207520852095210521152125213521452155216521752185219522052215222522352245225522652275228522952305231523252335234523552365237523852395240524152425243524452455246524752485249525052515252525352545255525652575258525952605261526252635264526552665267526852695270527152725273527452755276527
75278527952805281528252835284528552865287528852895290529152925293529452955296529752985299530053015302530353045305530653075308530953105311531253135314531553165317531853195320532153225323532453255326532753285329533053315332533353345335533653375338533953405341534253435344534553465347534853495350535153525353535453555356535753585359536053615362536353645365536653675368536953705371537253735374537553765377537853795380538153825383538453855386538753885389539053915392539353945395539653975398539954005401540254035404540554065407540854095410541154125413541454155416541754185419542054215422542354245425542654275428542954305431543254335434543554365437543854395440544154425443544454455446544754485449545054515452545354545455545654575458545954605461546254635464546554665467546854695470547154725473547454755476547754785479548054815482548354845485548654875488548954905491549254935494549554965497549854995500550155025503550455055506550755085509551055115512551355145515551655175518551955205521552255235524552555265527552855295530553155325533553455355536553755385539554055415542554355445545554655475548554955505551555255535554555555565557555855595560556155625563556455655566556755685569557055715572557355745575557655775578557955805581558255835584558555865587558855895590559155925593559455955596559755985599560056015602560356045605560656075608560956105611561256135614561556165617561856195620562156225623562456255626562756285629563056315632563356345635563656375638563956405641564256435644564556465647564856495650565156525653565456555656565756585659566056615662566356645665566656675668566956705671567256735674567556765677567856795680568156825683568456855686568756885689569056915692569356945695569656975698569957005701570257035704570557065707570857095710571157125713571457155716571757185719572057215722572357245725572657275728572957305731573257335734573557365737573857395740574157425743574457455746574757485749575057515752575357545755575657575758575957605761576257635764576557665767576857695770577157725773577457755776577
75778577957805781578257835784578557865787578857895790579157925793579457955796579757985799580058015802580358045805580658075808580958105811581258135814581558165817581858195820582158225823582458255826582758285829583058315832583358345835583658375838583958405841584258435844584558465847584858495850585158525853585458555856585758585859586058615862586358645865586658675868586958705871587258735874587558765877587858795880588158825883588458855886588758885889589058915892589358945895589658975898589959005901590259035904590559065907590859095910591159125913591459155916591759185919592059215922592359245925592659275928592959305931593259335934593559365937593859395940594159425943594459455946594759485949595059515952595359545955595659575958595959605961596259635964596559665967596859695970597159725973597459755976597759785979598059815982598359845985598659875988598959905991599259935994599559965997599859996000600160026003600460056006600760086009601060116012601360146015601660176018601960206021602260236024602560266027602860296030603160326033603460356036603760386039604060416042604360446045604660476048604960506051605260536054605560566057605860596060606160626063606460656066606760686069607060716072607360746075607660776078607960806081608260836084608560866087608860896090609160926093609460956096609760986099610061016102610361046105610661076108610961106111611261136114611561166117611861196120612161226123612461256126612761286129613061316132613361346135613661376138613961406141614261436144614561466147614861496150615161526153615461556156615761586159616061616162616361646165616661676168616961706171617261736174617561766177617861796180618161826183618461856186618761886189619061916192619361946195619661976198619962006201620262036204620562066207620862096210621162126213621462156216621762186219622062216222622362246225622662276228622962306231623262336234623562366237623862396240624162426243624462456246624762486249625062516252625362546255625662576258625962606261626262636264626562666267626862696270627162726273627462756276627
76278627962806281628262836284628562866287628862896290629162926293629462956296629762986299630063016302630363046305630663076308630963106311631263136314631563166317631863196320632163226323632463256326632763286329633063316332633363346335633663376338633963406341634263436344634563466347634863496350635163526353635463556356635763586359636063616362636363646365636663676368636963706371637263736374637563766377637863796380638163826383638463856386638763886389639063916392639363946395639663976398639964006401640264036404640564066407640864096410641164126413641464156416641764186419642064216422642364246425642664276428642964306431643264336434643564366437643864396440644164426443644464456446644764486449645064516452645364546455645664576458645964606461646264636464646564666467646864696470647164726473647464756476647764786479648064816482648364846485648664876488648964906491649264936494649564966497649864996500650165026503650465056506650765086509651065116512651365146515651665176518651965206521652265236524652565266527652865296530653165326533653465356536653765386539654065416542654365446545654665476548654965506551655265536554655565566557655865596560656165626563656465656566656765686569657065716572657365746575657665776578657965806581658265836584658565866587658865896590659165926593659465956596659765986599660066016602660366046605660666076608660966106611661266136614661566166617661866196620662166226623662466256626662766286629663066316632663366346635663666376638663966406641664266436644664566466647664866496650665166526653665466556656665766586659666066616662666366646665666666676668666966706671667266736674667566766677667866796680668166826683668466856686668766886689669066916692669366946695669666976698669967006701670267036704670567066707670867096710671167126713671467156716671767186719672067216722672367246725672667276728672967306731673267336734673567366737673867396740674167426743674467456746674767486749675067516752675367546755675667576758675967606761676267636764676567666767676867696770677167726773677467756776677
76778677967806781678267836784678567866787678867896790679167926793679467956796679767986799680068016802680368046805680668076808680968106811681268136814681568166817681868196820682168226823682468256826682768286829683068316832683368346835683668376838683968406841684268436844684568466847684868496850685168526853685468556856685768586859686068616862686368646865686668676868686968706871687268736874687568766877687868796880688168826883688468856886688768886889689068916892689368946895689668976898689969006901690269036904690569066907690869096910691169126913691469156916691769186919692069216922692369246925692669276928692969306931693269336934693569366937693869396940694169426943694469456946694769486949695069516952695369546955695669576958695969606961696269636964696569666967696869696970697169726973697469756976697769786979698069816982698369846985698669876988698969906991699269936994699569966997699869997000700170027003700470057006700770087009701070117012701370147015701670177018701970207021702270237024702570267027702870297030703170327033703470357036703770387039704070417042704370447045704670477048704970507051705270537054705570567057705870597060706170627063706470657066706770687069707070717072707370747075707670777078707970807081708270837084708570867087708870897090709170927093709470957096709770987099710071017102710371047105710671077108710971107111711271137114711571167117711871197120712171227123712471257126712771287129713071317132713371347135713671377138713971407141714271437144714571467147714871497150715171527153715471557156715771587159716071617162716371647165716671677168716971707171717271737174717571767177717871797180718171827183718471857186718771887189719071917192719371947195719671977198719972007201720272037204720572067207720872097210721172127213721472157216721772187219722072217222722372247225722672277228722972307231723272337234723572367237723872397240724172427243724472457246724772487249725072517252725372547255725672577258725972607261726272637264726572667267726872697270727172727273727472757276727
77278727972807281728272837284728572867287728872897290729172927293729472957296729772987299730073017302730373047305730673077308730973107311731273137314731573167317731873197320732173227323732473257326732773287329733073317332733373347335733673377338733973407341734273437344734573467347734873497350735173527353735473557356735773587359736073617362736373647365736673677368736973707371737273737374737573767377737873797380738173827383738473857386738773887389739073917392739373947395739673977398739974007401740274037404740574067407740874097410741174127413741474157416741774187419742074217422742374247425742674277428742974307431743274337434743574367437743874397440744174427443744474457446744774487449745074517452745374547455745674577458745974607461746274637464746574667467746874697470747174727473747474757476747774787479748074817482748374847485748674877488748974907491749274937494749574967497749874997500750175027503750475057506750775087509751075117512751375147515751675177518751975207521752275237524752575267527752875297530753175327533753475357536753775387539754075417542754375447545754675477548754975507551755275537554755575567557755875597560756175627563756475657566756775687569757075717572757375747575757675777578757975807581758275837584758575867587758875897590759175927593759475957596759775987599760076017602760376047605760676077608760976107611761276137614761576167617761876197620762176227623762476257626762776287629763076317632763376347635763676377638763976407641764276437644764576467647764876497650765176527653765476557656765776587659766076617662766376647665766676677668766976707671767276737674767576767677767876797680768176827683768476857686768776887689769076917692769376947695769676977698769977007701770277037704770577067707770877097710771177127713771477157716771777187719772077217722772377247725772677277728772977307731773277337734773577367737773877397740774177427743774477457746774777487749775077517752775377547755775677577758775977607761776277637764776577667767776877697770777177727773777477757776777
77778777977807781778277837784778577867787778877897790779177927793779477957796779777987799780078017802780378047805780678077808780978107811781278137814781578167817781878197820782178227823782478257826782778287829783078317832783378347835783678377838783978407841784278437844784578467847784878497850785178527853785478557856785778587859786078617862786378647865786678677868786978707871787278737874787578767877787878797880788178827883788478857886788778887889789078917892789378947895789678977898789979007901790279037904790579067907790879097910791179127913791479157916791779187919792079217922792379247925792679277928792979307931793279337934793579367937793879397940794179427943794479457946794779487949795079517952795379547955795679577958795979607961796279637964796579667967796879697970797179727973797479757976797779787979798079817982798379847985798679877988798979907991799279937994799579967997799879998000800180028003800480058006800780088009801080118012801380148015801680178018801980208021802280238024802580268027802880298030803180328033803480358036803780388039804080418042804380448045804680478048804980508051805280538054805580568057805880598060806180628063806480658066806780688069807080718072807380748075807680778078807980808081808280838084808580868087808880898090809180928093809480958096809780988099810081018102810381048105810681078108810981108111811281138114811581168117811881198120812181228123812481258126812781288129813081318132813381348135813681378138813981408141814281438144814581468147814881498150815181528153815481558156815781588159816081618162816381648165816681678168816981708171817281738174817581768177817881798180818181828183818481858186818781888189819081918192819381948195819681978198819982008201820282038204820582068207820882098210821182128213821482158216821782188219822082218222822382248225822682278228822982308231823282338234823582368237823882398240824182428243824482458246824782488249825082518252825382548255825682578258825982608261826282638264826582668267826882698270827182728273827482758276827
78278827982808281828282838284828582868287828882898290829182928293829482958296829782988299830083018302830383048305830683078308830983108311831283138314831583168317831883198320832183228323832483258326832783288329833083318332833383348335833683378338833983408341834283438344834583468347834883498350835183528353835483558356835783588359836083618362836383648365836683678368836983708371837283738374837583768377837883798380838183828383838483858386838783888389839083918392839383948395839683978398839984008401840284038404840584068407840884098410841184128413841484158416841784188419842084218422842384248425842684278428842984308431843284338434843584368437843884398440844184428443844484458446844784488449845084518452845384548455845684578458845984608461846284638464846584668467846884698470847184728473847484758476847784788479848084818482848384848485848684878488848984908491849284938494849584968497849884998500850185028503850485058506850785088509851085118512851385148515851685178518851985208521852285238524852585268527852885298530853185328533853485358536853785388539854085418542854385448545854685478548854985508551855285538554855585568557855885598560856185628563856485658566856785688569857085718572857385748575857685778578857985808581858285838584858585868587858885898590859185928593859485958596859785988599860086018602860386048605860686078608860986108611861286138614861586168617861886198620862186228623862486258626862786288629863086318632863386348635863686378638863986408641864286438644864586468647864886498650865186528653865486558656865786588659866086618662866386648665866686678668866986708671867286738674867586768677867886798680868186828683868486858686868786888689869086918692869386948695869686978698869987008701870287038704870587068707870887098710871187128713871487158716871787188719872087218722872387248725872687278728872987308731873287338734873587368737873887398740874187428743874487458746874787488749875087518752875387548755875687578758875987608761876287638764876587668767876887698770877187728773877487758776877
78778877987808781878287838784878587868787878887898790879187928793879487958796879787988799880088018802880388048805880688078808880988108811881288138814881588168817881888198820882188228823882488258826882788288829883088318832883388348835883688378838883988408841884288438844884588468847884888498850885188528853885488558856885788588859886088618862886388648865886688678868886988708871887288738874887588768877887888798880888188828883888488858886888788888889889088918892889388948895889688978898889989008901890289038904890589068907890889098910891189128913891489158916891789188919892089218922892389248925892689278928892989308931893289338934893589368937893889398940894189428943894489458946894789488949895089518952895389548955895689578958895989608961896289638964896589668967896889698970897189728973897489758976897789788979898089818982898389848985898689878988898989908991899289938994899589968997899889999000900190029003900490059006900790089009901090119012901390149015901690179018901990209021902290239024902590269027902890299030903190329033903490359036903790389039904090419042904390449045904690479048904990509051905290539054905590569057905890599060906190629063906490659066906790689069907090719072907390749075907690779078907990809081908290839084908590869087908890899090909190929093909490959096909790989099910091019102910391049105910691079108910991109111911291139114911591169117911891199120912191229123912491259126912791289129913091319132913391349135913691379138913991409141914291439144914591469147914891499150915191529153915491559156915791589159916091619162916391649165916691679168916991709171917291739174917591769177917891799180918191829183918491859186918791889189919091919192919391949195919691979198919992009201920292039204920592069207920892099210921192129213921492159216921792189219922092219222922392249225922692279228922992309231923292339234923592369237923892399240924192429243924492459246924792489249925092519252925392549255925692579258925992609261926292639264926592669267926892699270927192729273927492759276927
7927892799280928192829283928492859286928792889289929092919292929392949295929692979298929993009301930293039304930593069307930893099310931193129313931493159316931793189319932093219322932393249325932693279328932993309331933293339334933593369337933893399340934193429343934493459346934793489349935093519352935393549355935693579358935993609361936293639364936593669367936893699370937193729373937493759376937793789379938093819382938393849385938693879388938993909391939293939394939593969397939893999400940194029403940494059406940794089409941094119412941394149415941694179418941994209421942294239424942594269427942894299430943194329433943494359436943794389439944094419442944394449445944694479448944994509451945294539454945594569457945894599460946194629463946494659466946794689469947094719472947394749475947694779478947994809481948294839484948594869487948894899490949194929493949494959496949794989499950095019502950395049505950695079508950995109511951295139514951595169517951895199520952195229523952495259526952795289529953095319532953395349535953695379538953995409541954295439544954595469547954895499550955195529553955495559556955795589559956095619562956395649565956695679568956995709571957295739574957595769577957895799580958195829583958495859586958795889589959095919592959395949595959695979598959996009601960296039604960596069607960896099610961196129613961496159616961796189619962096219622962396249625962696279628962996309631963296339634963596369637963896399640964196429643964496459646964796489649965096519652965396549655965696579658965996609661966296639664966596669667966896699670967196729673967496759676967796789679968096819682968396849685968696879688968996909691969296939694969596969697969896999700970197029703970497059706970797089709971097119712971397149715971697179718971997209721972297239724972597269727972897299730973197329733
  1. #LyX 2.3 created this file. For more info see http://www.lyx.org/
  2. \lyxformat 544
  3. \begin_document
  4. \begin_header
  5. \save_transient_properties true
  6. \origin unavailable
  7. \textclass extbook
  8. \begin_preamble
  9. % List all used files in log output
  10. \listfiles
  11. % Add a DRAFT watermark
  12. \usepackage{draftwatermark}
  13. \SetWatermarkLightness{0.97}
  14. \SetWatermarkScale{1}
  15. % Set up required header format
  16. \usepackage{fancyhdr}
  17. \pagestyle{fancy}
  18. \renewcommand{\headrulewidth}{0pt}
  19. \rhead{}
  20. \lhead{}
  21. \rfoot{}
  22. \lfoot{}
  23. \cfoot{\thepage} % Page number bottom center
  24. % https://tex.stackexchange.com/questions/65680/automatically-bold-first-sentence-of-a-floats-caption
  25. \usepackage{xstring}
  26. \usepackage{etoolbox}
  27. \usepackage{caption}
  28. \captionsetup{labelfont=bf,tableposition=top}
  29. \makeatletter
  30. \newcommand\formatlabel[1]{%
  31. \noexpandarg
  32. \IfSubStr{#1}{.}{%
  33. \StrBefore{#1}{.}[\firstcaption]%
  34. \StrBehind{#1}{.}[\secondcaption]%
  35. \textbf{\firstcaption.} \secondcaption}{%
  36. #1}%
  37. }
  38. \patchcmd{\@caption}{#3}{\formatlabel{#3}}
  39. \makeatother
  40. % Allow FloatBarrier command
  41. \usepackage{placeins}
  42. \end_preamble
  43. \use_default_options true
  44. \begin_modules
  45. todonotes
  46. \end_modules
  47. \maintain_unincluded_children false
  48. \language english
  49. \language_package default
  50. \inputencoding utf8
  51. \fontencoding default
  52. \font_roman "default" "default"
  53. \font_sans "default" "default"
  54. \font_typewriter "default" "default"
  55. \font_math "auto" "auto"
  56. \font_default_family default
  57. \use_non_tex_fonts false
  58. \font_sc false
  59. \font_osf false
  60. \font_sf_scale 100 100
  61. \font_tt_scale 100 100
  62. \use_microtype false
  63. \use_dash_ligatures true
  64. \graphics default
  65. \default_output_format pdf4
  66. \output_sync 0
  67. \bibtex_command default
  68. \index_command default
  69. \paperfontsize 12
  70. \spacing double
  71. \use_hyperref true
  72. \pdf_bookmarks true
  73. \pdf_bookmarksnumbered false
  74. \pdf_bookmarksopen false
  75. \pdf_bookmarksopenlevel 1
  76. \pdf_breaklinks false
  77. \pdf_pdfborder false
  78. \pdf_colorlinks false
  79. \pdf_backref false
  80. \pdf_pdfusetitle true
  81. \papersize letterpaper
  82. \use_geometry true
  83. \use_package amsmath 1
  84. \use_package amssymb 1
  85. \use_package cancel 1
  86. \use_package esint 1
  87. \use_package mathdots 1
  88. \use_package mathtools 1
  89. \use_package mhchem 1
  90. \use_package stackrel 1
  91. \use_package stmaryrd 1
  92. \use_package undertilde 1
  93. \cite_engine basic
  94. \cite_engine_type default
  95. \biblio_style plain
  96. \use_bibtopic false
  97. \use_indices false
  98. \paperorientation portrait
  99. \suppress_date false
  100. \justification true
  101. \use_refstyle 1
  102. \use_minted 0
  103. \index Index
  104. \shortcut idx
  105. \color #008000
  106. \end_index
  107. \leftmargin 1.5in
  108. \topmargin 1in
  109. \rightmargin 1in
  110. \bottommargin 1in
  111. \secnumdepth 3
  112. \tocdepth 3
  113. \paragraph_separation indent
  114. \paragraph_indentation default
  115. \is_math_indent 0
  116. \math_numbering_side default
  117. \quotes_style english
  118. \dynamic_quotes 0
  119. \papercolumns 1
  120. \papersides 2
  121. \paperpagestyle default
  122. \tracking_changes false
  123. \output_changes false
  124. \html_math_output 0
  125. \html_css_as_file 0
  126. \html_be_strict false
  127. \end_header
  128. \begin_body
  129. \begin_layout Title
  130. Bioinformatic analysis of complex, high-throughput genomic and epigenomic
  131. data in the context of immunology and transplant rejection
  132. \end_layout
  133. \begin_layout Author
  134. A thesis presented
  135. \begin_inset Newline newline
  136. \end_inset
  137. by
  138. \begin_inset Newline newline
  139. \end_inset
  140. Ryan C.
  141. Thompson
  142. \begin_inset Newline newline
  143. \end_inset
  144. to
  145. \begin_inset Newline newline
  146. \end_inset
  147. The Scripps Research Institute Graduate Program
  148. \begin_inset Newline newline
  149. \end_inset
  150. in partial fulfillment of the requirements for the degree of
  151. \begin_inset Newline newline
  152. \end_inset
  153. Doctor of Philosophy in the subject of Biology
  154. \begin_inset Newline newline
  155. \end_inset
  156. for
  157. \begin_inset Newline newline
  158. \end_inset
  159. The Scripps Research Institute
  160. \begin_inset Newline newline
  161. \end_inset
  162. La Jolla, California
  163. \end_layout
  164. \begin_layout Date
  165. October 2019
  166. \end_layout
  167. \begin_layout Standard
  168. [Copyright notice]
  169. \end_layout
  170. \begin_layout Standard
  171. [Thesis acceptance form]
  172. \end_layout
  173. \begin_layout Standard
  174. [Dedication]
  175. \end_layout
  176. \begin_layout Standard
  177. [Acknowledgements]
  178. \end_layout
  179. \begin_layout Standard
  180. \begin_inset CommandInset toc
  181. LatexCommand tableofcontents
  182. \end_inset
  183. \end_layout
  184. \begin_layout Standard
  185. \begin_inset FloatList table
  186. \end_inset
  187. \end_layout
  188. \begin_layout Standard
  189. \begin_inset FloatList figure
  190. \end_inset
  191. \end_layout
  192. \begin_layout Standard
  193. [List of Abbreviations]
  194. \end_layout
  195. \begin_layout Standard
  196. \begin_inset Flex TODO Note (inline)
  197. status open
  198. \begin_layout Plain Layout
  199. Look into auto-generated nomenclature list: https://wiki.lyx.org/Tips/Nomenclature
  200. \end_layout
  201. \end_inset
  202. \end_layout
  203. \begin_layout List of TODOs
  204. \end_layout
  205. \begin_layout Standard
  206. [Abstract]
  207. \end_layout
  208. \begin_layout Chapter*
  209. Abstract
  210. \end_layout
  211. \begin_layout Chapter
  212. Introduction
  213. \end_layout
  214. \begin_layout Section
  215. Background & Significance
  216. \end_layout
  217. \begin_layout Subsection
  218. Biological motivation
  219. \end_layout
  220. \begin_layout Itemize
  221. Rejection is the major long-term threat to organ and tissue grafts
  222. \end_layout
  223. \begin_deeper
  224. \begin_layout Itemize
  225. Common mechanisms of rejection
  226. \end_layout
  227. \begin_layout Itemize
  228. Effective immune suppression requires monitoring for rejection and tuning
  229. \end_layout
  230. \begin_layout Itemize
  231. Current tests for rejection (tissue biopsy) are invasive and biased
  232. \end_layout
  233. \begin_layout Itemize
  234. A blood test based on microarrays would be less biased and invasive
  235. \end_layout
  236. \end_deeper
  237. \begin_layout Itemize
  238. Memory cells are resistant to immune suppression
  239. \end_layout
  240. \begin_deeper
  241. \begin_layout Itemize
  242. Mechanisms of resistance in memory cells are poorly understood
  243. \end_layout
  244. \begin_layout Itemize
  245. A better understanding of immune memory formation is needed
  246. \end_layout
  247. \end_deeper
  248. \begin_layout Itemize
  249. Mesenchymal stem cell infusion is a promising new treatment to prevent/delay
  250. rejection
  251. \end_layout
  252. \begin_deeper
  253. \begin_layout Itemize
  254. Demonstrated in mice, but not yet in primates
  255. \end_layout
  256. \begin_layout Itemize
  257. Mechanism currently unknown, but MSC are known to be immune modulatory
  258. \end_layout
  259. \end_deeper
  260. \begin_layout Subsection
  261. Overview of bioinformatic analysis methods
  262. \end_layout
  263. \begin_layout Standard
  264. An overview of all the methods used, including what problem they solve,
  265. what assumptions they make, and a basic description of how they work.
  266. \end_layout
  267. \begin_layout Itemize
  268. ChIP-seq Peak calling
  269. \end_layout
  270. \begin_deeper
  271. \begin_layout Itemize
  272. Cross-correlation analysis to determine fragment size
  273. \end_layout
  274. \begin_layout Itemize
  275. Broad vs narrow peaks
  276. \end_layout
  277. \begin_layout Itemize
  278. SICER for broad peaks
  279. \end_layout
  280. \begin_layout Itemize
  281. IDR for biologically reproducible peaks
  282. \end_layout
  283. \begin_layout Itemize
  284. csaw peak filtering guidelines for unbiased downstream analysis
  285. \end_layout
  286. \end_deeper
  287. \begin_layout Itemize
  288. Normalization is non-trivial and application-dependent
  289. \end_layout
  290. \begin_deeper
  291. \begin_layout Itemize
  292. Expression arrays: RMA & fRMA; why fRMA is needed
  293. \end_layout
  294. \begin_layout Itemize
  295. Methylation arrays: M-value transformation approximates normal data but
  296. induces heteroskedasticity
  297. \end_layout
  298. \begin_layout Itemize
  299. RNA-seq: normalize based on assumption that the average gene is not changing
  300. \end_layout
  301. \begin_layout Itemize
  302. ChIP-seq: complex with many considerations, dependent on experimental methods,
  303. biological system, and analysis goals
  304. \end_layout
  305. \end_deeper
  306. \begin_layout Itemize
  307. Limma: The standard linear modeling framework for genomics
  308. \end_layout
  309. \begin_deeper
  310. \begin_layout Itemize
  311. Empirical Bayes variance modeling: limma's core feature
  312. \end_layout
  313. \begin_layout Itemize
  314. edgeR & DESeq2: Extend with negative binomial GLM for RNA-seq and other
  315. count data
  316. \end_layout
  317. \begin_layout Itemize
  318. voom: Extend with precision weights to model mean-variance trend
  319. \end_layout
  320. \begin_layout Itemize
  321. arrayWeights and duplicateCorrelation to handle complex variance structures
  322. \end_layout
  323. \end_deeper
  324. \begin_layout Itemize
  325. sva and ComBat for batch correction
  326. \end_layout
  327. \begin_layout Itemize
  328. Factor analysis: PCA, MDS, MOFA
  329. \end_layout
  330. \begin_deeper
  331. \begin_layout Itemize
  332. Batch-corrected PCA is informative, but careful application is required
  333. to avoid bias
  334. \end_layout
  335. \end_deeper
  336. \begin_layout Itemize
  337. Gene set analysis: camera and SPIA
  338. \end_layout
  339. \begin_layout Section
  340. Innovation
  341. \end_layout
  342. \begin_layout Itemize
  343. MSC infusion to improve transplant outcomes (prevent/delay rejection)
  344. \end_layout
  345. \begin_deeper
  346. \begin_layout Itemize
  347. Characterize MSC response to interferon gamma
  348. \end_layout
  349. \begin_layout Itemize
  350. IFN-g is thought to stimulate their function
  351. \end_layout
  352. \begin_layout Itemize
  353. Test IFN-g treated MSC infusion as a therapy to delay graft rejection in
  354. cynomolgus monkeys
  355. \end_layout
  356. \begin_layout Itemize
  357. Monitor animals post-transplant using blood RNA-seq at serial time points
  358. \end_layout
  359. \end_deeper
  360. \begin_layout Itemize
  361. Investigate dynamics of histone marks in CD4 T-cell activation and memory
  362. \end_layout
  363. \begin_deeper
  364. \begin_layout Itemize
  365. Previous studies have looked at single snapshots of histone marks
  366. \end_layout
  367. \begin_layout Itemize
  368. Instead, look at changes in histone marks across activation and memory
  369. \end_layout
  370. \end_deeper
  371. \begin_layout Itemize
  372. High-throughput sequencing and microarray technologies
  373. \end_layout
  374. \begin_deeper
  375. \begin_layout Itemize
  376. Powerful methods for assaying gene expression and epigenetics across entire
  377. genomes
  378. \end_layout
  379. \begin_layout Itemize
  380. Proper analysis requires finding and exploiting systematic genome-wide trends
  381. \end_layout
  382. \end_deeper
  383. \begin_layout Chapter
  384. Reproducible genome-wide epigenetic analysis of H3K4 and H3K27 methylation
  385. in naive and memory CD4 T-cell activation
  386. \end_layout
  387. \begin_layout Standard
  388. \begin_inset Flex TODO Note (inline)
  389. status open
  390. \begin_layout Plain Layout
  391. Chapter author list: Me, Sarah, Dan
  392. \end_layout
  393. \end_inset
  394. \end_layout
  395. \begin_layout Standard
  396. \begin_inset Flex TODO Note (inline)
  397. status open
  398. \begin_layout Plain Layout
  399. Need better section titles throughout the chapter
  400. \end_layout
  401. \end_inset
  402. \end_layout
  403. \begin_layout Section
  404. Approach
  405. \end_layout
  406. \begin_layout Itemize
  407. CD4 T-cells are central to all adaptive immune responses and memory
  408. \end_layout
  409. \begin_layout Itemize
  410. H3K4 and H3K27 methylation are major epigenetic regulators of gene expression
  411. \end_layout
  412. \begin_layout Itemize
  413. Canonically, H3K4 is activating and H3K27 is inhibitory, but the reality
  414. is complex
  415. \end_layout
  416. \begin_layout Itemize
  417. Looking at these marks during CD4 activation and memory should reveal new
  418. mechanistic details
  419. \end_layout
  420. \begin_layout Itemize
  421. Test
  422. \begin_inset Quotes eld
  423. \end_inset
  424. poised promoter
  425. \begin_inset Quotes erd
  426. \end_inset
  427. hypothesis in which H3K4 and H3K27 are both methylated
  428. \end_layout
  429. \begin_layout Itemize
  430. Expand scope of analysis beyond simple promoter counts
  431. \end_layout
  432. \begin_deeper
  433. \begin_layout Itemize
  434. Analyze peaks genome-wide, including in intergenic regions
  435. \end_layout
  436. \begin_layout Itemize
  437. Analysis of coverage distribution shape within promoters, e.g.
  438. upstream vs downstream coverage
  439. \end_layout
  440. \end_deeper
  441. \begin_layout Section
  442. Methods
  443. \end_layout
  444. \begin_layout Standard
  445. \begin_inset Flex TODO Note (inline)
  446. status open
  447. \begin_layout Plain Layout
  448. Move figures that are only justifying methods into this section
  449. \end_layout
  450. \end_inset
  451. \end_layout
  452. \begin_layout Standard
  453. A reproducible workflow
  454. \begin_inset CommandInset citation
  455. LatexCommand cite
  456. key "gh-cd4-csaw"
  457. literal "false"
  458. \end_inset
  459. was written to analyze the raw ChIP-seq and RNA-seq data from previous
  460. studies
  461. \begin_inset CommandInset citation
  462. LatexCommand cite
  463. key "LaMere2016,LaMere2017"
  464. literal "true"
  465. \end_inset
  466. .
  467. Briefly, this data consists of RNA-seq and ChIP-seq from CD4 T-cells cultured
  468. from 4 donors.
  469. From each donor, naive and memory CD4 T-cells were isolated separately.
  470. Then cultures of both cells were activated [how?], and samples were taken
  471. at 4 time points: Day 0 (pre-activation), Day 1 (early activation), Day
  472. 5 (peak activation), and Day 14 (post-activation).
  473. For each combination of cell type and time point, RNA was isolated, and
  474. ChIP-seq was performed for each of 3 histone marks: H3K4me2, H3K4me3, and
  475. H3K27me3.
  476. The ChIP-seq input was also sequenced for each sample.
  477. The result was 32 samples for each assay.
  478. \end_layout
  479. \begin_layout Standard
  480. Sequence reads were retrieved from the Sequence Read Archive (SRA)
  481. \begin_inset CommandInset citation
  482. LatexCommand cite
  483. key "Leinonen2011"
  484. literal "false"
  485. \end_inset
  486. .
  487. ChIP-seq (and input) reads were aligned to the GRCh38 genome assembly using
  488. Bowtie 2
  489. \begin_inset CommandInset citation
  490. LatexCommand cite
  491. key "Langmead2012,Schneider2017,gh-hg38-ref"
  492. literal "false"
  493. \end_inset
  494. .
  495. Artifact regions were annotated using a custom implementation of the GreyListCh
  496. IP algorithm, and these
  497. \begin_inset Quotes eld
  498. \end_inset
  499. greylists
  500. \begin_inset Quotes erd
  501. \end_inset
  502. were merged with the ENCODE blacklist
  503. \begin_inset CommandInset citation
  504. LatexCommand cite
  505. key "greylistchip,Amemiya2019,Dunham2012"
  506. literal "false"
  507. \end_inset
  508. .
  509. Any read or peak overlapping one of these regions was regarded as artifactual
  510. and excluded from downstream analyses.
  511. \end_layout
  512. \begin_layout Standard
  513. Peaks were called using epic, an implementation of the SICER algorithm
  514. \begin_inset CommandInset citation
  515. LatexCommand cite
  516. key "Zang2009,gh-epic"
  517. literal "false"
  518. \end_inset
  519. .
  520. Peaks were also called separately using MACS, but MACS was determined to
  521. be a poor fit for the data, and these peak calls were not used further
  522. \begin_inset CommandInset citation
  523. LatexCommand cite
  524. key "Zhang2008"
  525. literal "false"
  526. \end_inset
  527. .
  528. \end_layout
  529. \begin_layout Itemize
  530. Re-analyze previously published CD4 ChIP-seq & RNA-seq data
  531. \end_layout
  532. \begin_deeper
  533. \begin_layout Itemize
  534. Completely reimplement analysis from scratch as a reproducible workflow
  535. \end_layout
  536. \begin_layout Itemize
  537. Use newly published methods & algorithms not available during the original
  538. analysis: SICER, csaw, MOFA
  539. \begin_inset CommandInset citation
  540. LatexCommand cite
  541. key "Argelaguet2018"
  542. literal "false"
  543. \end_inset
  544. , ComBat, sva, GREAT, and more
  545. \end_layout
  546. \end_deeper
  547. \begin_layout Itemize
  548. SICER, IDR, csaw, & GREAT to call ChIP-seq peaks genome-wide, perform differenti
  549. al abundance analysis, and relate those peaks to gene expression
  550. \end_layout
  551. \begin_layout Itemize
  552. Promoter counts in sliding windows around each gene's highest-expressed
  553. TSS to investigate coverage distribution within promoters
  554. \end_layout
  555. \begin_layout Subsection
  556. RNA-seq align+quant method comparison
  557. \end_layout
  558. \begin_layout Standard
  559. \begin_inset Flex TODO Note (inline)
  560. status open
  561. \begin_layout Plain Layout
  562. Maybe fix up the excessive axis ranges for these plots?
  563. \end_layout
  564. \end_inset
  565. \end_layout
  566. \begin_layout Standard
  567. \begin_inset Float figure
  568. wide false
  569. sideways false
  570. status collapsed
  571. \begin_layout Plain Layout
  572. \align center
  573. \begin_inset Graphics
  574. filename graphics/CD4-csaw/rnaseq-compare/ensmebl-vs-entrez-star-CROP.png
  575. lyxscale 25
  576. width 100col%
  577. groupId colwidth-raster
  578. \end_inset
  579. \end_layout
  580. \begin_layout Plain Layout
  581. \begin_inset Caption Standard
  582. \begin_layout Plain Layout
  583. Comparison of STAR quantification between Ensembl and Entrez gene identifiers
  584. \end_layout
  585. \end_inset
  586. \end_layout
  587. \end_inset
  588. \end_layout
  589. \begin_layout Standard
  590. \begin_inset Float figure
  591. wide false
  592. sideways false
  593. status collapsed
  594. \begin_layout Plain Layout
  595. \align center
  596. \begin_inset Graphics
  597. filename graphics/CD4-csaw/rnaseq-compare/ensmebl-vs-entrez-shoal-CROP.png
  598. lyxscale 25
  599. width 100col%
  600. groupId colwidth-raster
  601. \end_inset
  602. \end_layout
  603. \begin_layout Plain Layout
  604. \begin_inset Caption Standard
  605. \begin_layout Plain Layout
  606. Comparison of Salmon+Shoal quantification between Ensembl and Entrez gene
  607. identifiers
  608. \end_layout
  609. \end_inset
  610. \end_layout
  611. \end_inset
  612. \end_layout
  613. \begin_layout Standard
  614. \begin_inset Float figure
  615. wide false
  616. sideways false
  617. status collapsed
  618. \begin_layout Plain Layout
  619. \align center
  620. \begin_inset Graphics
  621. filename graphics/CD4-csaw/rnaseq-compare/star-vs-hisat2-CROP.png
  622. lyxscale 25
  623. width 100col%
  624. groupId colwidth-raster
  625. \end_inset
  626. \end_layout
  627. \begin_layout Plain Layout
  628. \begin_inset Caption Standard
  629. \begin_layout Plain Layout
  630. Comparison of quantification between STAR and HISAT2 for identical annotation
  631. \end_layout
  632. \end_inset
  633. \end_layout
  634. \end_inset
  635. \end_layout
  636. \begin_layout Standard
  637. \begin_inset Float figure
  638. wide false
  639. sideways false
  640. status collapsed
  641. \begin_layout Plain Layout
  642. \align center
  643. \begin_inset Graphics
  644. filename graphics/CD4-csaw/rnaseq-compare/star-vs-salmon-CROP.png
  645. lyxscale 25
  646. width 100col%
  647. groupId colwidth-raster
  648. \end_inset
  649. \end_layout
  650. \begin_layout Plain Layout
  651. \begin_inset Caption Standard
  652. \begin_layout Plain Layout
  653. Comparison of quantification between STAR and Salmon for identical annotation
  654. \end_layout
  655. \end_inset
  656. \end_layout
  657. \end_inset
  658. \end_layout
  659. \begin_layout Standard
  660. \begin_inset Float figure
  661. wide false
  662. sideways false
  663. status collapsed
  664. \begin_layout Plain Layout
  665. \align center
  666. \begin_inset Graphics
  667. filename graphics/CD4-csaw/rnaseq-compare/salmon-vs-kallisto-CROP.png
  668. lyxscale 25
  669. width 100col%
  670. groupId colwidth-raster
  671. \end_inset
  672. \end_layout
  673. \begin_layout Plain Layout
  674. \begin_inset Caption Standard
  675. \begin_layout Plain Layout
  676. Comparison of quantification between Salmon and Kallisto for identical annotatio
  677. n
  678. \end_layout
  679. \end_inset
  680. \end_layout
  681. \end_inset
  682. \end_layout
  683. \begin_layout Standard
  684. \begin_inset Float figure
  685. wide false
  686. sideways false
  687. status collapsed
  688. \begin_layout Plain Layout
  689. \align center
  690. \begin_inset Graphics
  691. filename graphics/CD4-csaw/rnaseq-compare/salmon-vs-shoal-CROP.png
  692. lyxscale 25
  693. width 100col%
  694. groupId colwidth-raster
  695. \end_inset
  696. \end_layout
  697. \begin_layout Plain Layout
  698. \begin_inset Caption Standard
  699. \begin_layout Plain Layout
  700. Comparison of quantification between Salmon with and without Shoal for identical
  701. annotation
  702. \end_layout
  703. \end_inset
  704. \end_layout
  705. \end_inset
  706. \end_layout
  707. \begin_layout Itemize
  708. Ultimately selected Shoal as quantification, Ensembl as annotation.
  709. Why? Running downstream analyses with all quant methods and both annotations
  710. showed very little practical difference, so choice was not terribly important.
  711. Prefer Shoal due to theoretical advantages.
  712. To note in discussion: reproducible workflow made it easy to do this, enabling
  713. an informed decision.
  714. \end_layout
  715. \begin_layout Standard
  716. \begin_inset ERT
  717. status collapsed
  718. \begin_layout Plain Layout
  719. \backslash
  720. FloatBarrier
  721. \end_layout
  722. \end_inset
  723. \end_layout
  724. \begin_layout Subsection
  725. RNA-seq has a large confounding batch effect
  726. \end_layout
  727. \begin_layout Standard
  728. \begin_inset Float figure
  729. wide false
  730. sideways false
  731. status collapsed
  732. \begin_layout Plain Layout
  733. \begin_inset Flex TODO Note (inline)
  734. status open
  735. \begin_layout Plain Layout
  736. Just take the top row
  737. \end_layout
  738. \end_inset
  739. \end_layout
  740. \begin_layout Plain Layout
  741. \align center
  742. \begin_inset Graphics
  743. filename graphics/CD4-csaw/RNA-seq/weights-vs-covars-CROP.png
  744. lyxscale 25
  745. width 100col%
  746. groupId colwidth-raster
  747. \end_inset
  748. \end_layout
  749. \begin_layout Plain Layout
  750. \begin_inset Caption Standard
  751. \begin_layout Plain Layout
  752. \series bold
  753. \begin_inset CommandInset label
  754. LatexCommand label
  755. name "fig:RNA-seq-weights-vs-covars"
  756. \end_inset
  757. RNA-seq sample weights, grouped by experimental and technical covariates
  758. \end_layout
  759. \end_inset
  760. \end_layout
  761. \end_inset
  762. \end_layout
  763. \begin_layout Standard
  764. \begin_inset Float figure
  765. wide false
  766. sideways false
  767. status collapsed
  768. \begin_layout Plain Layout
  769. \align center
  770. \begin_inset Graphics
  771. filename graphics/CD4-csaw/RNA-seq/PCA-no-batchsub-CROP.png
  772. lyxscale 25
  773. width 100col%
  774. groupId colwidth-raster
  775. \end_inset
  776. \end_layout
  777. \begin_layout Plain Layout
  778. \begin_inset Caption Standard
  779. \begin_layout Plain Layout
  780. \series bold
  781. \begin_inset CommandInset label
  782. LatexCommand label
  783. name "fig:RNA-PCA-no-batchsub"
  784. \end_inset
  785. RNA-seq PCoA plot showing clear batch effect
  786. \end_layout
  787. \end_inset
  788. \end_layout
  789. \end_inset
  790. \end_layout
  791. \begin_layout Standard
  792. \begin_inset Float figure
  793. wide false
  794. sideways false
  795. status collapsed
  796. \begin_layout Plain Layout
  797. \begin_inset Flex TODO Note (inline)
  798. status open
  799. \begin_layout Plain Layout
  800. Probably don't need this
  801. \end_layout
  802. \end_inset
  803. \end_layout
  804. \begin_layout Plain Layout
  805. \align center
  806. \begin_inset Graphics
  807. filename graphics/CD4-csaw/RNA-seq/PCA-naive-batchsub-CROP.png
  808. lyxscale 25
  809. width 100col%
  810. groupId colwidth-raster
  811. \end_inset
  812. \end_layout
  813. \begin_layout Plain Layout
  814. \begin_inset Caption Standard
  815. \begin_layout Plain Layout
  816. \series bold
  817. \begin_inset CommandInset label
  818. LatexCommand label
  819. name "fig:RNA-PCA-limma-batchsub"
  820. \end_inset
  821. RNA-seq PCoA plot after naive (limma) batch subtraction
  822. \end_layout
  823. \end_inset
  824. \end_layout
  825. \end_inset
  826. \end_layout
  827. \begin_layout Standard
  828. \begin_inset Float figure
  829. wide false
  830. sideways false
  831. status collapsed
  832. \begin_layout Plain Layout
  833. \align center
  834. \begin_inset Graphics
  835. filename graphics/CD4-csaw/RNA-seq/PCA-combat-batchsub-CROP.png
  836. lyxscale 25
  837. width 100col%
  838. groupId colwidth-raster
  839. \end_inset
  840. \end_layout
  841. \begin_layout Plain Layout
  842. \begin_inset Caption Standard
  843. \begin_layout Plain Layout
  844. \series bold
  845. \begin_inset CommandInset label
  846. LatexCommand label
  847. name "fig:RNA-PCA-ComBat-batchsub"
  848. \end_inset
  849. RNA-seq PCoA plot after batch correction using ComBat
  850. \end_layout
  851. \end_inset
  852. \end_layout
  853. \end_inset
  854. \end_layout
  855. \begin_layout Itemize
  856. RNA-seq batch effect can be partially corrected, but still induces uncorrectable
  857. biases in downstream analysis
  858. \end_layout
  859. \begin_layout Standard
  860. \begin_inset Flex TODO Note (inline)
  861. status open
  862. \begin_layout Plain Layout
  863. Figures showing p-value histograms for within-batch and cross-batch contrasts,
  864. showing that cross-batch contrasts have attenuated signal, as do comparisons
  865. within the bad batch
  866. \end_layout
  867. \end_inset
  868. \end_layout
  869. \begin_layout Standard
  870. \begin_inset ERT
  871. status collapsed
  872. \begin_layout Plain Layout
  873. \backslash
  874. FloatBarrier
  875. \end_layout
  876. \end_inset
  877. \end_layout
  878. \begin_layout Subsection
  879. ChIP-seq blacklisting is important
  880. \end_layout
  881. \begin_layout Standard
  882. \begin_inset Float figure
  883. wide false
  884. sideways false
  885. status collapsed
  886. \begin_layout Plain Layout
  887. \align center
  888. \begin_inset Graphics
  889. filename graphics/CD4-csaw/csaw/CCF-plots-PAGE2-CROP.pdf
  890. lyxscale 50
  891. width 100col%
  892. groupId colwidth
  893. \end_inset
  894. \end_layout
  895. \begin_layout Plain Layout
  896. \begin_inset Caption Standard
  897. \begin_layout Plain Layout
  898. Cross-correlation plots with blacklisted reads removed
  899. \end_layout
  900. \end_inset
  901. \end_layout
  902. \end_inset
  903. \end_layout
  904. \begin_layout Standard
  905. \begin_inset Float figure
  906. wide false
  907. sideways false
  908. status collapsed
  909. \begin_layout Plain Layout
  910. \align center
  911. \begin_inset Graphics
  912. filename graphics/CD4-csaw/csaw/CCF-plots-noBL-PAGE2-CROP.pdf
  913. lyxscale 50
  914. width 100col%
  915. groupId colwidth
  916. \end_inset
  917. \end_layout
  918. \begin_layout Plain Layout
  919. \begin_inset Caption Standard
  920. \begin_layout Plain Layout
  921. Cross-correlation plots without removing blacklisted reads
  922. \end_layout
  923. \end_inset
  924. \end_layout
  925. \end_inset
  926. \end_layout
  927. \begin_layout Subsection
  928. ChIP-seq normalization
  929. \end_layout
  930. \begin_layout Standard
  931. \begin_inset Flex TODO Note (inline)
  932. status open
  933. \begin_layout Plain Layout
  934. Maybe just one of these figures and then say the other 2 were similar
  935. \end_layout
  936. \end_inset
  937. \end_layout
  938. \begin_layout Standard
  939. \begin_inset Float figure
  940. wide false
  941. sideways false
  942. status open
  943. \begin_layout Plain Layout
  944. \align center
  945. \begin_inset Graphics
  946. filename graphics/CD4-csaw/ChIP-seq/H3K4me2-sample-MAplot-bins-CROP.png
  947. lyxscale 25
  948. width 100col%
  949. groupId colwidth-raster
  950. \end_inset
  951. \end_layout
  952. \begin_layout Plain Layout
  953. \begin_inset Caption Standard
  954. \begin_layout Plain Layout
  955. \series bold
  956. MA plot of H3K4me2 read counts in 10kb bins for two arbitrary samples
  957. \end_layout
  958. \end_inset
  959. \end_layout
  960. \end_inset
  961. \end_layout
  962. \begin_layout Standard
  963. \begin_inset Float figure
  964. wide false
  965. sideways false
  966. status open
  967. \begin_layout Plain Layout
  968. \align center
  969. \begin_inset Graphics
  970. filename graphics/CD4-csaw/ChIP-seq/H3K4me3-sample-MAplot-bins-CROP.png
  971. lyxscale 25
  972. width 100col%
  973. groupId colwidth-raster
  974. \end_inset
  975. \end_layout
  976. \begin_layout Plain Layout
  977. \begin_inset Caption Standard
  978. \begin_layout Plain Layout
  979. \series bold
  980. MA plot of H3K4me3 read counts in 10kb bins for two arbitrary samples
  981. \end_layout
  982. \end_inset
  983. \end_layout
  984. \end_inset
  985. \end_layout
  986. \begin_layout Standard
  987. \begin_inset Float figure
  988. wide false
  989. sideways false
  990. status open
  991. \begin_layout Plain Layout
  992. \align center
  993. \begin_inset Graphics
  994. filename graphics/CD4-csaw/ChIP-seq/H3K27me3-sample-MAplot-bins-CROP.png
  995. lyxscale 25
  996. width 100col%
  997. groupId colwidth-raster
  998. \end_inset
  999. \end_layout
  1000. \begin_layout Plain Layout
  1001. \begin_inset Caption Standard
  1002. \begin_layout Plain Layout
  1003. \series bold
  1004. MA plot of H3K27me3 read counts in 10kb bins for two arbitrary samples
  1005. \end_layout
  1006. \end_inset
  1007. \end_layout
  1008. \end_inset
  1009. \end_layout
  1010. \begin_layout Subsection
  1011. ChIP-seq must be corrected for hidden confounding factors
  1012. \end_layout
  1013. \begin_layout Standard
  1014. \begin_inset Flex TODO Note (inline)
  1015. status open
  1016. \begin_layout Plain Layout
  1017. Consolidate these into 1 2x3 grid.
  1018. For now, just refer to them as if they were a single figure.
  1019. \end_layout
  1020. \end_inset
  1021. \end_layout
  1022. \begin_layout Standard
  1023. \begin_inset Float figure
  1024. wide false
  1025. sideways false
  1026. status collapsed
  1027. \begin_layout Plain Layout
  1028. \align center
  1029. \begin_inset Graphics
  1030. filename graphics/CD4-csaw/ChIP-seq/H3K4me2-PCA-raw-CROP.png
  1031. lyxscale 25
  1032. width 100col%
  1033. groupId colwidth-raster
  1034. \end_inset
  1035. \end_layout
  1036. \begin_layout Plain Layout
  1037. \begin_inset Caption Standard
  1038. \begin_layout Plain Layout
  1039. \series bold
  1040. \begin_inset CommandInset label
  1041. LatexCommand label
  1042. name "fig:PCoA-H3K4me2-bad"
  1043. \end_inset
  1044. PCoA plot of H3K4me2 windows, before subtracting surrogate variables
  1045. \end_layout
  1046. \end_inset
  1047. \end_layout
  1048. \end_inset
  1049. \end_layout
  1050. \begin_layout Standard
  1051. \begin_inset Float figure
  1052. wide false
  1053. sideways false
  1054. status collapsed
  1055. \begin_layout Plain Layout
  1056. \align center
  1057. \begin_inset Graphics
  1058. filename graphics/CD4-csaw/ChIP-seq/H3K4me2-PCA-SVsub-CROP.png
  1059. lyxscale 25
  1060. width 100col%
  1061. groupId colwidth-raster
  1062. \end_inset
  1063. \end_layout
  1064. \begin_layout Plain Layout
  1065. \begin_inset Caption Standard
  1066. \begin_layout Plain Layout
  1067. \series bold
  1068. \begin_inset CommandInset label
  1069. LatexCommand label
  1070. name "fig:PCoA-H3K4me2-good"
  1071. \end_inset
  1072. PCoA plot of H3K4me2 windows, after subtracting surrogate variables
  1073. \end_layout
  1074. \end_inset
  1075. \end_layout
  1076. \end_inset
  1077. \end_layout
  1078. \begin_layout Standard
  1079. \begin_inset Float figure
  1080. wide false
  1081. sideways false
  1082. status collapsed
  1083. \begin_layout Plain Layout
  1084. \align center
  1085. \begin_inset Graphics
  1086. filename graphics/CD4-csaw/ChIP-seq/H3K4me3-PCA-raw-CROP.png
  1087. lyxscale 25
  1088. width 100col%
  1089. groupId colwidth-raster
  1090. \end_inset
  1091. \end_layout
  1092. \begin_layout Plain Layout
  1093. \begin_inset Caption Standard
  1094. \begin_layout Plain Layout
  1095. \series bold
  1096. \begin_inset CommandInset label
  1097. LatexCommand label
  1098. name "fig:PCoA-H3K4me3-bad"
  1099. \end_inset
  1100. PCoA plot of H3K4me3 windows, before subtracting surrogate variables
  1101. \end_layout
  1102. \end_inset
  1103. \end_layout
  1104. \end_inset
  1105. \end_layout
  1106. \begin_layout Standard
  1107. \begin_inset Float figure
  1108. wide false
  1109. sideways false
  1110. status collapsed
  1111. \begin_layout Plain Layout
  1112. \align center
  1113. \begin_inset Graphics
  1114. filename graphics/CD4-csaw/ChIP-seq/H3K4me3-PCA-SVsub-CROP.png
  1115. lyxscale 25
  1116. width 100col%
  1117. groupId colwidth-raster
  1118. \end_inset
  1119. \end_layout
  1120. \begin_layout Plain Layout
  1121. \begin_inset Caption Standard
  1122. \begin_layout Plain Layout
  1123. \series bold
  1124. \begin_inset CommandInset label
  1125. LatexCommand label
  1126. name "fig:PCoA-H3K4me3-good"
  1127. \end_inset
  1128. PCoA plot of H3K4me3 windows, after subtracting surrogate variables
  1129. \end_layout
  1130. \end_inset
  1131. \end_layout
  1132. \end_inset
  1133. \end_layout
  1134. \begin_layout Standard
  1135. \begin_inset Float figure
  1136. wide false
  1137. sideways false
  1138. status collapsed
  1139. \begin_layout Plain Layout
  1140. \align center
  1141. \begin_inset Graphics
  1142. filename graphics/CD4-csaw/ChIP-seq/H3K27me3-PCA-raw-CROP.png
  1143. lyxscale 25
  1144. width 100col%
  1145. groupId colwidth-raster
  1146. \end_inset
  1147. \end_layout
  1148. \begin_layout Plain Layout
  1149. \begin_inset Caption Standard
  1150. \begin_layout Plain Layout
  1151. \series bold
  1152. \begin_inset CommandInset label
  1153. LatexCommand label
  1154. name "fig:PCoA-H3K27me3-bad"
  1155. \end_inset
  1156. PCoA plot of H3K27me3 windows, before subtracting surrogate variables
  1157. \end_layout
  1158. \end_inset
  1159. \end_layout
  1160. \end_inset
  1161. \end_layout
  1162. \begin_layout Standard
  1163. \begin_inset Float figure
  1164. wide false
  1165. sideways false
  1166. status collapsed
  1167. \begin_layout Plain Layout
  1168. \align center
  1169. \begin_inset Graphics
  1170. filename graphics/CD4-csaw/ChIP-seq/H3K27me3-PCA-SVsub-CROP.png
  1171. lyxscale 25
  1172. width 100col%
  1173. groupId colwidth-raster
  1174. \end_inset
  1175. \end_layout
  1176. \begin_layout Plain Layout
  1177. \begin_inset Caption Standard
  1178. \begin_layout Plain Layout
  1179. \series bold
  1180. \begin_inset CommandInset label
  1181. LatexCommand label
  1182. name "fig:PCoA-H3K27me3-good"
  1183. \end_inset
  1184. PCoA plot of H3K27me3 windows, after subtracting surrogate variables
  1185. \end_layout
  1186. \end_inset
  1187. \end_layout
  1188. \end_inset
  1189. \end_layout
  1190. \begin_layout Itemize
  1191. Figures showing BCV plots with and without SVA for each histone mark.
  1192. \end_layout
  1193. \begin_layout Standard
  1194. \begin_inset ERT
  1195. status collapsed
  1196. \begin_layout Plain Layout
  1197. \backslash
  1198. FloatBarrier
  1199. \end_layout
  1200. \end_inset
  1201. \end_layout
  1202. \begin_layout Subsection
  1203. MOFA recovers biologically relevant variation from blind analysis by correlating
  1204. across datasets
  1205. \end_layout
  1206. \begin_layout Standard
  1207. \begin_inset Float figure
  1208. wide false
  1209. sideways false
  1210. status open
  1211. \begin_layout Plain Layout
  1212. \align center
  1213. \begin_inset Graphics
  1214. filename graphics/CD4-csaw/MOFA-varExplaiend-matrix-CROP.png
  1215. lyxscale 25
  1216. width 100col%
  1217. groupId colwidth-raster
  1218. \end_inset
  1219. \end_layout
  1220. \begin_layout Plain Layout
  1221. \begin_inset Caption Standard
  1222. \begin_layout Plain Layout
  1223. \series bold
  1224. \begin_inset CommandInset label
  1225. LatexCommand label
  1226. name "fig:mofa-varexplained"
  1227. \end_inset
  1228. Variance explained in each data set by each latent factor estimated by MOFA.
  1229. \end_layout
  1230. \end_inset
  1231. \end_layout
  1232. \end_inset
  1233. \end_layout
  1234. \begin_layout Itemize
  1235. Figure
  1236. \begin_inset CommandInset ref
  1237. LatexCommand ref
  1238. reference "fig:mofa-varexplained"
  1239. plural "false"
  1240. caps "false"
  1241. noprefix "false"
  1242. \end_inset
  1243. shows that LFs 1, 4, and 5 explain substantial variance in all data sets
  1244. \end_layout
  1245. \begin_layout Standard
  1246. \begin_inset Float figure
  1247. wide false
  1248. sideways false
  1249. status open
  1250. \begin_layout Plain Layout
  1251. \begin_inset Flex TODO Note (inline)
  1252. status open
  1253. \begin_layout Plain Layout
  1254. Maybe drop this one
  1255. \end_layout
  1256. \end_inset
  1257. \end_layout
  1258. \begin_layout Plain Layout
  1259. \align center
  1260. \begin_inset Graphics
  1261. filename graphics/CD4-csaw/MOFA-LF-distributions-CROP.png
  1262. lyxscale 25
  1263. width 100col%
  1264. groupId colwidth-raster
  1265. \end_inset
  1266. \end_layout
  1267. \begin_layout Plain Layout
  1268. \begin_inset Caption Standard
  1269. \begin_layout Plain Layout
  1270. \series bold
  1271. \begin_inset CommandInset label
  1272. LatexCommand label
  1273. name "fig:mofa-lf-dist"
  1274. \end_inset
  1275. Sample distribution for each latent factor estimated by MOFA.
  1276. \end_layout
  1277. \end_inset
  1278. \end_layout
  1279. \end_inset
  1280. \end_layout
  1281. \begin_layout Standard
  1282. \begin_inset Float figure
  1283. wide false
  1284. sideways false
  1285. status open
  1286. \begin_layout Plain Layout
  1287. \begin_inset Flex TODO Note (inline)
  1288. status open
  1289. \begin_layout Plain Layout
  1290. Talk about how this supports the convergence hypothesis
  1291. \end_layout
  1292. \end_inset
  1293. \end_layout
  1294. \begin_layout Plain Layout
  1295. \align center
  1296. \begin_inset Graphics
  1297. filename graphics/CD4-csaw/MOFA-LF-scatter-CROP.png
  1298. lyxscale 25
  1299. width 100col%
  1300. groupId colwidth-raster
  1301. \end_inset
  1302. \end_layout
  1303. \begin_layout Plain Layout
  1304. \begin_inset Caption Standard
  1305. \begin_layout Plain Layout
  1306. \series bold
  1307. \begin_inset CommandInset label
  1308. LatexCommand label
  1309. name "fig:mofa-lf-scatter"
  1310. \end_inset
  1311. Scatter plots of specific pairs of MOFA latent factors.
  1312. \end_layout
  1313. \end_inset
  1314. \end_layout
  1315. \end_inset
  1316. \end_layout
  1317. \begin_layout Itemize
  1318. Figures
  1319. \begin_inset CommandInset ref
  1320. LatexCommand ref
  1321. reference "fig:mofa-lf-dist"
  1322. plural "false"
  1323. caps "false"
  1324. noprefix "false"
  1325. \end_inset
  1326. and
  1327. \begin_inset CommandInset ref
  1328. LatexCommand ref
  1329. reference "fig:mofa-lf-scatter"
  1330. plural "false"
  1331. caps "false"
  1332. noprefix "false"
  1333. \end_inset
  1334. show that those same 3 LFs (1, 4, & 5) also correlate best with the
  1335. experimental factors (cell type & time point)
  1336. \end_layout
  1337. \begin_layout Itemize
  1338. LF2 is clearly the RNA-seq batch effect
  1339. \end_layout
  1340. \begin_layout Standard
  1341. \begin_inset Float figure
  1342. wide false
  1343. sideways false
  1344. status open
  1345. \begin_layout Plain Layout
  1346. \align center
  1347. \begin_inset Graphics
  1348. filename graphics/CD4-csaw/MOFA-batch-correct-CROP.png
  1349. lyxscale 25
  1350. width 100col%
  1351. groupId colwidth-raster
  1352. \end_inset
  1353. \end_layout
  1354. \begin_layout Plain Layout
  1355. \begin_inset Caption Standard
  1356. \begin_layout Plain Layout
  1357. \series bold
  1358. \begin_inset CommandInset label
  1359. LatexCommand label
  1360. name "fig:mofa-batchsub"
  1361. \end_inset
  1362. Result of RNA-seq batch-correction using MOFA latent factors
  1363. \end_layout
  1364. \end_inset
  1365. \end_layout
  1366. \end_inset
  1367. \end_layout
  1368. \begin_layout Itemize
  1369. Attempting to remove the effect of LF2 (Figure
  1370. \begin_inset CommandInset ref
  1371. LatexCommand ref
  1372. reference "fig:mofa-batchsub"
  1373. plural "false"
  1374. caps "false"
  1375. noprefix "false"
  1376. \end_inset
  1377. ) results in batch correction comparable to ComBat (Figure
  1378. \begin_inset CommandInset ref
  1379. LatexCommand ref
  1380. reference "fig:RNA-PCA-ComBat-batchsub"
  1381. plural "false"
  1382. caps "false"
  1383. noprefix "false"
  1384. \end_inset
  1385. )
  1386. \end_layout
  1387. \begin_layout Itemize
  1388. MOFA was able to do this batch subtraction without directly using the sample
  1389. labels (sample labels were used implicitly to select which factor to subtract)
  1390. \end_layout
  1391. \begin_layout Itemize
  1392. Similarity of results shows that batch correction can't get much better
  1393. than ComBat (despite ComBat ignoring time point)
  1394. \end_layout
  1395. \begin_layout Subsection
  1396. MOFA does some interesting stuff but is mostly confirmatory in this context
  1397. \end_layout
  1398. \begin_layout Standard
  1399. \begin_inset Flex TODO Note (inline)
  1400. status open
  1401. \begin_layout Plain Layout
  1402. MOFA should be a footnote to something else, not its own point
  1403. \end_layout
  1404. \end_inset
  1405. \end_layout
  1406. \begin_layout Standard
  1407. \begin_inset Flex TODO Note (inline)
  1408. status open
  1409. \begin_layout Plain Layout
  1410. Combine with previous subsection
  1411. \end_layout
  1412. \end_inset
  1413. \end_layout
  1414. \begin_layout Itemize
  1415. MOFA shows great promise for accelerating discovery of major biological
  1416. effects in multi-omics datasets
  1417. \end_layout
  1418. \begin_deeper
  1419. \begin_layout Itemize
  1420. MOFA successfully separates biologically relevant patterns of variation
  1421. from technical confounding factors without knowing the sample labels, by
  1422. finding latent factors that explain variation across multiple data sets.
  1423. \end_layout
  1424. \begin_layout Itemize
  1425. MOFA was added to this analysis late and played primarily a confirmatory
  1426. role, but it was able to confirm earlier conclusions with much less prior
  1427. information (no sample labels) and much less analyst effort/input
  1428. \end_layout
  1429. \begin_layout Itemize
  1430. Less input from analyst means less opportunity to introduce unwanted bias
  1431. into results
  1432. \end_layout
  1433. \begin_layout Itemize
  1434. MOFA confirmed that the already-implemented batch correction in the RNA-seq
  1435. data was already performing as well as possible given the limitations of
  1436. the data
  1437. \end_layout
  1438. \end_deeper
  1439. \begin_layout Section
  1440. Results
  1441. \end_layout
  1442. \begin_layout Standard
  1443. \begin_inset Note Note
  1444. status open
  1445. \begin_layout Plain Layout
  1446. Focus on what hypotheses were tested, then select figures that show how
  1447. those hypotheses were tested, even if the result is a negative.
  1448. \end_layout
  1449. \begin_layout Plain Layout
  1450. Not every interesting result needs to be in here.
  1451. Chapter should tell a story.
  1452. \end_layout
  1453. \end_inset
  1454. \end_layout
  1455. \begin_layout Standard
  1456. \begin_inset Flex TODO Note (inline)
  1457. status open
  1458. \begin_layout Plain Layout
  1459. Maybe reorder these sections to do RNA-seq, then ChIP-seq, then combined
  1460. analyses?
  1461. \end_layout
  1462. \end_inset
  1463. \end_layout
  1464. \begin_layout Subsection
  1465. H3K4 and H3K27 methylation occur in broad regions and are enriched near
  1466. promoters
  1467. \end_layout
  1468. \begin_layout Standard
  1469. \begin_inset Flex TODO Note (inline)
  1470. status open
  1471. \begin_layout Plain Layout
  1472. Replace these figures with a single table of # of peaks called at chosen
  1473. IDR threshold, showing that SICER has more
  1474. \end_layout
  1475. \end_inset
  1476. \end_layout
  1477. \begin_layout Standard
  1478. \begin_inset Float figure
  1479. wide false
  1480. sideways false
  1481. status open
  1482. \begin_layout Plain Layout
  1483. \begin_inset Flex TODO Note (inline)
  1484. status open
  1485. \begin_layout Plain Layout
  1486. Re-generate IDR rank consistency plots for SICER and MACS side-by-side
  1487. \end_layout
  1488. \end_inset
  1489. \end_layout
  1490. \begin_layout Plain Layout
  1491. \begin_inset Caption Standard
  1492. \begin_layout Plain Layout
  1493. \series bold
  1494. \begin_inset CommandInset label
  1495. LatexCommand label
  1496. name "fig:IDR-RC-H3K4me2"
  1497. \end_inset
  1498. Irreproducible Discovery Rate consistency plots for H3K4me2
  1499. \end_layout
  1500. \end_inset
  1501. \end_layout
  1502. \end_inset
  1503. \end_layout
  1504. \begin_layout Standard
  1505. \begin_inset Float figure
  1506. wide false
  1507. sideways false
  1508. status open
  1509. \begin_layout Plain Layout
  1510. \begin_inset Flex TODO Note (inline)
  1511. status open
  1512. \begin_layout Plain Layout
  1513. Re-generate IDR rank consistency plots for SICER and MACS side-by-side
  1514. \end_layout
  1515. \end_inset
  1516. \end_layout
  1517. \begin_layout Plain Layout
  1518. \begin_inset Caption Standard
  1519. \begin_layout Plain Layout
  1520. \series bold
  1521. \begin_inset CommandInset label
  1522. LatexCommand label
  1523. name "fig:IDR-RC-H3K4me3"
  1524. \end_inset
  1525. Irreproducible Discovery Rate consistency plots for H3K4me3
  1526. \end_layout
  1527. \end_inset
  1528. \end_layout
  1529. \end_inset
  1530. \end_layout
  1531. \begin_layout Standard
  1532. \begin_inset Float figure
  1533. wide false
  1534. sideways false
  1535. status open
  1536. \begin_layout Plain Layout
  1537. \begin_inset Flex TODO Note (inline)
  1538. status open
  1539. \begin_layout Plain Layout
  1540. Re-generate IDR rank consistency plots for SICER and MACS side-by-side
  1541. \end_layout
  1542. \end_inset
  1543. \end_layout
  1544. \begin_layout Plain Layout
  1545. \begin_inset Caption Standard
  1546. \begin_layout Plain Layout
  1547. \series bold
  1548. \begin_inset CommandInset label
  1549. LatexCommand label
  1550. name "fig:IDR-RC-H3K27me3"
  1551. \end_inset
  1552. Irreproducible Discovery Rate consistency plots for H3K27me3
  1553. \end_layout
  1554. \end_inset
  1555. \end_layout
  1556. \end_inset
  1557. \end_layout
  1558. \begin_layout Standard
  1559. \begin_inset Float table
  1560. wide false
  1561. sideways false
  1562. status open
  1563. \begin_layout Plain Layout
  1564. \align center
  1565. \begin_inset Flex TODO Note (inline)
  1566. status open
  1567. \begin_layout Plain Layout
  1568. Need
  1569. \emph on
  1570. median
  1571. \emph default
  1572. peak width, not mean
  1573. \end_layout
  1574. \end_inset
  1575. \end_layout
  1576. \begin_layout Plain Layout
  1577. \align center
  1578. \begin_inset Tabular
  1579. <lyxtabular version="3" rows="4" columns="5">
  1580. <features tabularvalignment="middle">
  1581. <column alignment="center" valignment="top">
  1582. <column alignment="center" valignment="top">
  1583. <column alignment="center" valignment="top">
  1584. <column alignment="center" valignment="top">
  1585. <column alignment="center" valignment="top">
  1586. <row>
  1587. <cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" usebox="none">
  1588. \begin_inset Text
  1589. \begin_layout Plain Layout
  1590. Histone Mark
  1591. \end_layout
  1592. \end_inset
  1593. </cell>
  1594. <cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" usebox="none">
  1595. \begin_inset Text
  1596. \begin_layout Plain Layout
  1597. # Peaks
  1598. \end_layout
  1599. \end_inset
  1600. </cell>
  1601. <cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" usebox="none">
  1602. \begin_inset Text
  1603. \begin_layout Plain Layout
  1604. Mean peak width
  1605. \end_layout
  1606. \end_inset
  1607. </cell>
  1608. <cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" usebox="none">
  1609. \begin_inset Text
  1610. \begin_layout Plain Layout
  1611. Genome coverage
  1612. \end_layout
  1613. \end_inset
  1614. </cell>
  1615. <cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" rightline="true" usebox="none">
  1616. \begin_inset Text
  1617. \begin_layout Plain Layout
  1618. Read coverage
  1619. \end_layout
  1620. \end_inset
  1621. </cell>
  1622. </row>
  1623. <row>
  1624. <cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
  1625. \begin_inset Text
  1626. \begin_layout Plain Layout
  1627. H3K4me2
  1628. \end_layout
  1629. \end_inset
  1630. </cell>
  1631. <cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
  1632. \begin_inset Text
  1633. \begin_layout Plain Layout
  1634. 14965
  1635. \end_layout
  1636. \end_inset
  1637. </cell>
  1638. <cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
  1639. \begin_inset Text
  1640. \begin_layout Plain Layout
  1641. 3970
  1642. \end_layout
  1643. \end_inset
  1644. </cell>
  1645. <cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
  1646. \begin_inset Text
  1647. \begin_layout Plain Layout
  1648. 1.92%
  1649. \end_layout
  1650. \end_inset
  1651. </cell>
  1652. <cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
  1653. \begin_inset Text
  1654. \begin_layout Plain Layout
  1655. 14.2%
  1656. \end_layout
  1657. \end_inset
  1658. </cell>
  1659. </row>
  1660. <row>
  1661. <cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
  1662. \begin_inset Text
  1663. \begin_layout Plain Layout
  1664. H3K4me3
  1665. \end_layout
  1666. \end_inset
  1667. </cell>
  1668. <cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
  1669. \begin_inset Text
  1670. \begin_layout Plain Layout
  1671. 6163
  1672. \end_layout
  1673. \end_inset
  1674. </cell>
  1675. <cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
  1676. \begin_inset Text
  1677. \begin_layout Plain Layout
  1678. 2946
  1679. \end_layout
  1680. \end_inset
  1681. </cell>
  1682. <cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
  1683. \begin_inset Text
  1684. \begin_layout Plain Layout
  1685. 0.588%
  1686. \end_layout
  1687. \end_inset
  1688. </cell>
  1689. <cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
  1690. \begin_inset Text
  1691. \begin_layout Plain Layout
  1692. 6.57%
  1693. \end_layout
  1694. \end_inset
  1695. </cell>
  1696. </row>
  1697. <row>
  1698. <cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" usebox="none">
  1699. \begin_inset Text
  1700. \begin_layout Plain Layout
  1701. H3K27me3
  1702. \end_layout
  1703. \end_inset
  1704. </cell>
  1705. <cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" usebox="none">
  1706. \begin_inset Text
  1707. \begin_layout Plain Layout
  1708. 18139
  1709. \end_layout
  1710. \end_inset
  1711. </cell>
  1712. <cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" usebox="none">
  1713. \begin_inset Text
  1714. \begin_layout Plain Layout
  1715. 18967
  1716. \end_layout
  1717. \end_inset
  1718. </cell>
  1719. <cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" usebox="none">
  1720. \begin_inset Text
  1721. \begin_layout Plain Layout
  1722. 11.1%
  1723. \end_layout
  1724. \end_inset
  1725. </cell>
  1726. <cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" rightline="true" usebox="none">
  1727. \begin_inset Text
  1728. \begin_layout Plain Layout
  1729. 22.5%
  1730. \end_layout
  1731. \end_inset
  1732. </cell>
  1733. </row>
  1734. </lyxtabular>
  1735. \end_inset
  1736. \end_layout
  1737. \begin_layout Plain Layout
  1738. \begin_inset Caption Standard
  1739. \begin_layout Plain Layout
  1740. \series bold
  1741. \begin_inset CommandInset label
  1742. LatexCommand label
  1743. name "tab:peak-calling-summary"
  1744. \end_inset
  1745. SICER+IDR peak-calling summary
  1746. \end_layout
  1747. \end_inset
  1748. \end_layout
  1749. \end_inset
  1750. \end_layout
  1751. \begin_layout Standard
  1752. Figures
  1753. \begin_inset CommandInset ref
  1754. LatexCommand ref
  1755. reference "fig:IDR-RC-H3K4me2"
  1756. plural "false"
  1757. caps "false"
  1758. noprefix "false"
  1759. \end_inset
  1760. ,
  1761. \begin_inset CommandInset ref
  1762. LatexCommand ref
  1763. reference "fig:IDR-RC-H3K4me3"
  1764. plural "false"
  1765. caps "false"
  1766. noprefix "false"
  1767. \end_inset
  1768. , and
  1769. \begin_inset CommandInset ref
  1770. LatexCommand ref
  1771. reference "fig:IDR-RC-H3K27me3"
  1772. plural "false"
  1773. caps "false"
  1774. noprefix "false"
  1775. \end_inset
  1776. show the IDR rank-consistency plots for peaks called in an arbitrarily-chosen
  1777. pair of donors.
  1778. For all 3 histone marks, when the peaks for each donor are ranked according
  1779. to their scores, SICER produces much more reproducible results between
  1780. donors.
  1781. This is consistent with SICER's stated goal of identifying broad peaks,
  1782. in contrast to MACS, which is designed for identifying sharp peaks.
  1783. Based on this observation, the SICER peak calls were used for all downstream
  1784. analyses that involved ChIP-seq peaks.
  1785. Table
  1786. \begin_inset CommandInset ref
  1787. LatexCommand ref
  1788. reference "tab:peak-calling-summary"
  1789. plural "false"
  1790. caps "false"
  1791. noprefix "false"
  1792. \end_inset
  1793. gives a summary of the peak calling statistics for each histone mark.
  1794. \end_layout
  1795. \begin_layout Standard
  1796. \begin_inset Float figure
  1797. wide false
  1798. sideways false
  1799. status open
  1800. \begin_layout Plain Layout
  1801. \align center
  1802. \begin_inset Graphics
  1803. filename graphics/CD4-csaw/Promoter Peak Distance Profile-PAGE1-CROP.pdf
  1804. lyxscale 50
  1805. width 100col%
  1806. groupId colwidth
  1807. \end_inset
  1808. \end_layout
  1809. \begin_layout Plain Layout
  1810. \begin_inset Caption Standard
  1811. \begin_layout Plain Layout
  1812. \series bold
  1813. \begin_inset CommandInset label
  1814. LatexCommand label
  1815. name "fig:effective-promoter-radius"
  1816. \end_inset
  1817. Enrichment of peaks in promoter neighborhoods.
  1818. \end_layout
  1819. \end_inset
  1820. \end_layout
  1821. \begin_layout Plain Layout
  1822. \end_layout
  1823. \end_inset
  1824. \end_layout
  1825. \begin_layout Itemize
  1826. Each histone mark is enriched within a certain radius of gene TSS positions,
  1827. but that radius is different for each mark (figure
  1828. \begin_inset CommandInset ref
  1829. LatexCommand ref
  1830. reference "fig:effective-promoter-radius"
  1831. plural "false"
  1832. caps "false"
  1833. noprefix "false"
  1834. \end_inset
  1835. , previously in
  1836. \begin_inset CommandInset citation
  1837. LatexCommand cite
  1838. key "LaMere2016"
  1839. literal "false"
  1840. \end_inset
  1841. Fig.
  1842. S2)
  1843. \end_layout
  1844. \begin_layout Subsection
  1845. H3K4 and H3K27 promoter methylation has broadly the expected correlation
  1846. with gene expression
  1847. \end_layout
  1848. \begin_layout Standard
  1849. \begin_inset Flex TODO Note (inline)
  1850. status open
  1851. \begin_layout Plain Layout
  1852. This section can easily be cut, especially if I can't find those plots.
  1853. \end_layout
  1854. \end_inset
  1855. \end_layout
  1856. \begin_layout Itemize
  1857. H3K4 is correlated with higher expression, and H3K27 is correlated with
  1858. lower expression genome-wide
  1859. \end_layout
  1860. \begin_layout Standard
  1861. \begin_inset Flex TODO Note (inline)
  1862. status open
  1863. \begin_layout Plain Layout
  1864. Grr, gotta find these figures.
  1865. Maybe in the old analysis?
  1866. \end_layout
  1867. \end_inset
  1868. \end_layout
  1869. \begin_layout Itemize
  1870. Figures showing these correlations: box/violin plots of expression distributions
  1871. with every combination of peak presence/absence in promoter
  1872. \end_layout
  1873. \begin_layout Itemize
  1874. Appropriate statistical tests showing significant differences in expected
  1875. directions
  1876. \end_layout
  1877. \begin_layout Subsection
  1878. Naive-to-memory convergence observed in H3K4 and RNA-seq data, not in H3K27me3
  1879. \end_layout
  1880. \begin_layout Standard
  1881. \begin_inset Float figure
  1882. wide false
  1883. sideways false
  1884. status open
  1885. \begin_layout Plain Layout
  1886. \align center
  1887. \begin_inset Graphics
  1888. filename graphics/CD4-csaw/RNA-seq/PCA-final-23-CROP.png
  1889. lyxscale 25
  1890. width 100col%
  1891. groupId colwidth-raster
  1892. \end_inset
  1893. \end_layout
  1894. \begin_layout Plain Layout
  1895. \begin_inset Caption Standard
  1896. \begin_layout Plain Layout
  1897. \series bold
  1898. \begin_inset CommandInset label
  1899. LatexCommand label
  1900. name "fig:RNA-PCA-group"
  1901. \end_inset
  1902. RNA-seq PCoA showing principal coordinates 2 and 3.
  1903. \end_layout
  1904. \end_inset
  1905. \end_layout
  1906. \end_inset
  1907. \end_layout
  1908. \begin_layout Itemize
  1909. H3K4 and RNA-seq data show clear evidence of naive convergence with memory
  1910. between days 1 and 5 (Figures
  1911. \begin_inset CommandInset ref
  1912. LatexCommand ref
  1913. reference "fig:PCoA-H3K4me2-good"
  1914. plural "false"
  1915. caps "false"
  1916. noprefix "false"
  1917. \end_inset
  1918. ,
  1919. \begin_inset CommandInset ref
  1920. LatexCommand ref
  1921. reference "fig:PCoA-H3K4me3-good"
  1922. plural "false"
  1923. caps "false"
  1924. noprefix "false"
  1925. \end_inset
  1926. , and
  1927. \begin_inset CommandInset ref
  1928. LatexCommand ref
  1929. reference "fig:RNA-PCA-group"
  1930. plural "false"
  1931. caps "false"
  1932. noprefix "false"
  1933. \end_inset
  1934. ).
  1935. \end_layout
  1936. \begin_layout Itemize
  1937. Table of numbers of genes different between N & M at each time point, showing
  1938. dwindling differences at later time points, consistent with convergence
  1939. \end_layout
  1940. \begin_layout Itemize
  1941. Similar figure for H3K27me3 showing lack of convergence (Figure
  1942. \begin_inset CommandInset ref
  1943. LatexCommand ref
  1944. reference "fig:PCoA-H3K27me3-good"
  1945. plural "false"
  1946. caps "false"
  1947. noprefix "false"
  1948. \end_inset
  1949. )
  1950. \end_layout
  1951. \begin_layout Subsection
  1952. Effect of promoter coverage upstream vs downstream of TSS
  1953. \end_layout
  1954. \begin_layout Standard
  1955. \begin_inset Float figure
  1956. wide false
  1957. sideways false
  1958. status open
  1959. \begin_layout Plain Layout
  1960. \align center
  1961. \begin_inset Graphics
  1962. filename graphics/CD4-csaw/ChIP-seq/H3K4me2-neighborhood-clusters-CROP.png
  1963. lyxscale 25
  1964. width 100col%
  1965. groupId colwidth-raster
  1966. \end_inset
  1967. \end_layout
  1968. \begin_layout Plain Layout
  1969. \begin_inset Caption Standard
  1970. \begin_layout Plain Layout
  1971. \series bold
  1972. \begin_inset CommandInset label
  1973. LatexCommand label
  1974. name "fig:H3K4me2-neighborhood-clusters"
  1975. \end_inset
  1976. RNA-seq PCoA showing principal coordinates 2 and 3.
  1977. \end_layout
  1978. \end_inset
  1979. \end_layout
  1980. \end_inset
  1981. \end_layout
  1982. \begin_layout Standard
  1983. \begin_inset Float figure
  1984. wide false
  1985. sideways false
  1986. status open
  1987. \begin_layout Plain Layout
  1988. \align center
  1989. \begin_inset Graphics
  1990. filename graphics/CD4-csaw/ChIP-seq/H3K4me2-neighborhood-PCA-CROP.png
  1991. lyxscale 25
  1992. width 100col%
  1993. groupId colwidth-raster
  1994. \end_inset
  1995. \end_layout
  1996. \begin_layout Plain Layout
  1997. \begin_inset Caption Standard
  1998. \begin_layout Plain Layout
  1999. \series bold
  2000. \begin_inset CommandInset label
  2001. LatexCommand label
  2002. name "fig:H3K4me2-neighborhood-pca"
  2003. \end_inset
  2004. RNA-seq PCoA showing principal coordinates 2 and 3.
  2005. \end_layout
  2006. \end_inset
  2007. \end_layout
  2008. \end_inset
  2009. \end_layout
  2010. \begin_layout Standard
  2011. \begin_inset Float figure
  2012. wide false
  2013. sideways false
  2014. status open
  2015. \begin_layout Plain Layout
  2016. \align center
  2017. \begin_inset Graphics
  2018. filename graphics/CD4-csaw/ChIP-seq/H3K4me2-neighborhood-expression-CROP.png
  2019. lyxscale 25
  2020. width 100col%
  2021. groupId colwidth-raster
  2022. \end_inset
  2023. \end_layout
  2024. \begin_layout Plain Layout
  2025. \begin_inset Caption Standard
  2026. \begin_layout Plain Layout
  2027. \series bold
  2028. \begin_inset CommandInset label
  2029. LatexCommand label
  2030. name "fig:H3K4me2-neighborhood-expression"
  2031. \end_inset
  2032. RNA-seq PCoA showing principal coordinates 2 and 3.
  2033. \end_layout
  2034. \end_inset
  2035. \end_layout
  2036. \end_inset
  2037. \end_layout
  2038. \begin_layout Standard
  2039. \begin_inset Float figure
  2040. wide false
  2041. sideways false
  2042. status open
  2043. \begin_layout Plain Layout
  2044. \align center
  2045. \begin_inset Graphics
  2046. filename graphics/CD4-csaw/ChIP-seq/H3K4me3-neighborhood-clusters-CROP.png
  2047. lyxscale 25
  2048. width 100col%
  2049. groupId colwidth-raster
  2050. \end_inset
  2051. \end_layout
  2052. \begin_layout Plain Layout
  2053. \begin_inset Caption Standard
  2054. \begin_layout Plain Layout
  2055. \series bold
  2056. \begin_inset CommandInset label
  2057. LatexCommand label
  2058. name "fig:H3K4me3-neighborhood-clusters"
  2059. \end_inset
  2060. RNA-seq PCoA showing principal coordinates 2 and 3.
  2061. \end_layout
  2062. \end_inset
  2063. \end_layout
  2064. \end_inset
  2065. \end_layout
  2066. \begin_layout Standard
  2067. \begin_inset Float figure
  2068. wide false
  2069. sideways false
  2070. status open
  2071. \begin_layout Plain Layout
  2072. \align center
  2073. \begin_inset Graphics
  2074. filename graphics/CD4-csaw/ChIP-seq/H3K4me3-neighborhood-PCA-CROP.png
  2075. lyxscale 25
  2076. width 100col%
  2077. groupId colwidth-raster
  2078. \end_inset
  2079. \end_layout
  2080. \begin_layout Plain Layout
  2081. \begin_inset Caption Standard
  2082. \begin_layout Plain Layout
  2083. \series bold
  2084. \begin_inset CommandInset label
  2085. LatexCommand label
  2086. name "fig:H3K4me3-neighborhood-pca-1"
  2087. \end_inset
  2088. RNA-seq PCoA showing principal coordinates 2 and 3.
  2089. \end_layout
  2090. \end_inset
  2091. \end_layout
  2092. \end_inset
  2093. \end_layout
  2094. \begin_layout Standard
  2095. \begin_inset Float figure
  2096. wide false
  2097. sideways false
  2098. status open
  2099. \begin_layout Plain Layout
  2100. \align center
  2101. \begin_inset Graphics
  2102. filename graphics/CD4-csaw/ChIP-seq/H3K4me3-neighborhood-expression-CROP.png
  2103. lyxscale 25
  2104. width 100col%
  2105. groupId colwidth-raster
  2106. \end_inset
  2107. \end_layout
  2108. \begin_layout Plain Layout
  2109. \begin_inset Caption Standard
  2110. \begin_layout Plain Layout
  2111. \series bold
  2112. \begin_inset CommandInset label
  2113. LatexCommand label
  2114. name "fig:H3K4me3-neighborhood-expression-1"
  2115. \end_inset
  2116. RNA-seq PCoA showing principal coordinates 2 and 3.
  2117. \end_layout
  2118. \end_inset
  2119. \end_layout
  2120. \end_inset
  2121. \end_layout
  2122. \begin_layout Itemize
  2123. H3K4me peaks seem to correlate with increased expression as long as they
  2124. are anywhere near the TSS
  2125. \end_layout
  2126. \begin_layout Itemize
  2127. H3K27me3 peaks can have different correlations to gene expression depending
  2128. on their position relative to TSS (e.g.
  2129. upstream vs downstream). Results consistent with
  2130. \begin_inset CommandInset citation
  2131. LatexCommand cite
  2132. key "Young2011"
  2133. literal "false"
  2134. \end_inset
  2135. \end_layout
  2136. \begin_layout Standard
  2137. \begin_inset Flex TODO Note (inline)
  2138. status open
  2139. \begin_layout Plain Layout
  2140. Show the figures where the negative result ended this line of inquiry
  2141. \end_layout
  2142. \end_inset
  2143. \end_layout
  2144. \begin_layout Section
  2145. Discussion
  2146. \end_layout
  2147. \begin_layout Standard
  2148. \begin_inset Flex TODO Note (inline)
  2149. status open
  2150. \begin_layout Plain Layout
  2151. Try to boil it down to 3 main messages to get across
  2152. \end_layout
  2153. \end_inset
  2154. \end_layout
  2155. \begin_layout Itemize
  2156. 3 Main points
  2157. \end_layout
  2158. \begin_deeper
  2159. \begin_layout Itemize
  2160. "Promoter radius" is not constant and must be defined empirically for a
  2161. given data set.
  2162. Coverage within promoter radius has an expression correlation as well
  2163. \end_layout
  2164. \begin_layout Itemize
  2165. Naive-to-memory convergence in certain data sets but not others, implies
  2166. which marks are involved in memory differentiation
  2167. \end_layout
  2168. \begin_layout Itemize
  2169. TSS positional coverage, hints of something interesting but no clear conclusions
  2170. \end_layout
  2171. \end_deeper
  2172. \begin_layout Standard
  2173. \begin_inset Float figure
  2174. wide false
  2175. sideways false
  2176. status collapsed
  2177. \begin_layout Plain Layout
  2178. \align center
  2179. \begin_inset Graphics
  2180. filename graphics/CD4-csaw/LaMere2016_fig8.pdf
  2181. lyxscale 50
  2182. width 100col%
  2183. groupId colwidth
  2184. \end_inset
  2185. \end_layout
  2186. \begin_layout Plain Layout
  2187. \begin_inset Caption Standard
  2188. \begin_layout Plain Layout
  2189. \series bold
  2190. LaMere 2016 Figure 8, reproduced with permission.
  2191. \end_layout
  2192. \end_inset
  2193. \end_layout
  2194. \end_inset
  2195. \end_layout
  2196. \begin_layout Itemize
  2197. Naive-to-memory convergence implies that naive cells are differentiating
  2198. into memory cells, and that gene expression and H3K4 methylation are involved
  2199. in this differentiation while H3K27me3 is less involved
  2200. \end_layout
  2201. \begin_deeper
  2202. \begin_layout Itemize
  2203. Convergence is consistent with Lamere2016 fig 8
  2204. \begin_inset CommandInset citation
  2205. LatexCommand cite
  2206. key "LaMere2016"
  2207. literal "false"
  2208. \end_inset
  2209. (which was created without the benefit of SVA)
  2210. \end_layout
  2211. \begin_layout Itemize
  2212. H3K27me3, canonically regarded as a deactivating mark, seems to have a more
  2213. complex effect
  2214. \end_layout
  2215. \end_deeper
  2216. \begin_layout Itemize
  2217. TSS positional coverage
  2218. \end_layout
  2219. \begin_layout Standard
  2220. \begin_inset Float figure
  2221. wide false
  2222. sideways true
  2223. status open
  2224. \begin_layout Plain Layout
  2225. \align center
  2226. \begin_inset Graphics
  2227. filename graphics/CD4-csaw/rulegraphs/rulegraph-all.pdf
  2228. lyxscale 50
  2229. width 100theight%
  2230. \end_inset
  2231. \end_layout
  2232. \begin_layout Plain Layout
  2233. \begin_inset Caption Standard
  2234. \begin_layout Plain Layout
  2235. \begin_inset CommandInset label
  2236. LatexCommand label
  2237. name "fig:rulegraph"
  2238. \end_inset
  2239. \series bold
  2240. Dependency graph of steps in reproducible workflow
  2241. \end_layout
  2242. \end_inset
  2243. \end_layout
  2244. \end_inset
  2245. \end_layout
  2246. \begin_layout Itemize
  2247. Discuss advantages of developing using a reproducible workflow
  2248. \end_layout
  2249. \begin_deeper
  2250. \begin_layout Itemize
  2251. Decision-making based on trying every option and running the workflow downstream
  2252. to see the effects
  2253. \end_layout
  2254. \end_deeper
  2255. \begin_layout Chapter
  2256. Improving array-based analyses of transplant rejection by optimizing data
  2257. preprocessing
  2258. \end_layout
  2259. \begin_layout Standard
  2260. \begin_inset Note Note
  2261. status open
  2262. \begin_layout Plain Layout
  2263. Chapter author list: Me, Sunil, Tom, Padma, Dan
  2264. \end_layout
  2265. \end_inset
  2266. \end_layout
  2267. \begin_layout Section
  2268. Approach
  2269. \end_layout
  2270. \begin_layout Subsection
  2271. Proper pre-processing is essential for array data
  2272. \end_layout
  2273. \begin_layout Standard
  2274. \begin_inset Flex TODO Note (inline)
  2275. status open
  2276. \begin_layout Plain Layout
  2277. This section could probably use some citations
  2278. \end_layout
  2279. \end_inset
  2280. \end_layout
  2281. \begin_layout Standard
  2282. Microarrays, bead arrays, and similar assays produce raw data in the form
  2283. of fluorescence intensity measurements, with each intensity measurement
  2284. proportional to the abundance of some fluorescently-labelled target DNA
  2285. or RNA sequence that base pairs to a specific probe sequence.
  2286. However, these measurements for each probe are also affected by many technical
  2287. confounding factors, such as the concentration of target material, strength
  2288. of off-target binding, and the sensitivity of the imaging sensor.
  2289. Some array designs also use multiple probe sequences for each target.
  2290. Hence, extensive pre-processing of array data is necessary to normalize
  2291. out the effects of these technical factors and summarize the information
  2292. from multiple probes to arrive at a single usable estimate of abundance
  2293. or other relevant quantity, such as a ratio of two abundances, for each
  2294. target.
  2295. \end_layout
  2296. \begin_layout Standard
  2297. The choice of pre-processing algorithms used in the analysis of an array
  2298. data set can have a large effect on the results of that analysis.
  2299. However, despite their importance, these steps are often neglected or rushed
  2300. in order to get to the more scientifically interesting analysis steps involving
  2301. the actual biology of the system under study.
  2302. Hence, it is often possible to achieve substantial gains in statistical
  2303. power, model goodness-of-fit, or other relevant performance measures, by
  2304. checking the assumptions made by each preprocessing step and choosing specific
  2305. normalization methods tailored to the specific goals of the current analysis.
  2306. \end_layout
  2307. \begin_layout Subsection
  2308. Normalization for clinical microarray classifiers must be single-channel
  2309. \end_layout
  2310. \begin_layout Subsubsection
  2311. Standard normalization methods are unsuitable for clinical application
  2312. \end_layout
  2313. \begin_layout Standard
  2314. As the cost of performing microarray assays falls, there is increasing interest
  2315. in using genomic assays for diagnostic purposes, such as distinguishing
  2316. healthy transplants (TX) from transplants undergoing acute rejection (AR)
  2317. or acute dysfunction with no rejection (ADNR).
  2318. However, the standard normalization algorithm used for microarray data,
  2319. Robust Multi-chip Average (RMA)
  2320. \begin_inset CommandInset citation
  2321. LatexCommand cite
  2322. key "Irizarry2003a"
  2323. literal "false"
  2324. \end_inset
  2325. , is not applicable in a clinical setting.
  2326. Two of the steps in RMA, quantile normalization and probe summarization
  2327. by median polish, depend on every array in the data set being normalized.
  2328. This means that adding or removing any arrays from a data set changes the
  2329. normalized values for all arrays, and data sets that have been normalized
  2330. separately cannot be compared to each other.
  2331. Hence, when using RMA, any arrays to be analyzed together must also be
  2332. normalized together, and the set of arrays included in the data set must
  2333. be held constant throughout an analysis.
  2334. \end_layout
  2335. \begin_layout Standard
  2336. These limitations present serious impediments to the use of arrays as a
  2337. diagnostic tool.
  2338. When training a classifier, the samples to be classified must not be involved
  2339. in any step of the training process, lest their inclusion bias the training
  2340. process.
  2341. Once a classifier is deployed in a clinical setting, the samples to be
  2342. classified will not even
  2343. \emph on
  2344. exist
  2345. \emph default
  2346. at the time of training, so including them would be impossible even if
  2347. it were statistically justifiable.
  2348. Therefore, any machine learning application for microarrays demands that
  2349. the normalized expression values computed for an array must depend only
  2350. on information contained within that array.
  2351. This would ensure that each array's normalization is independent of every
  2352. other array, and that arrays normalized separately can still be compared
  2353. to each other without bias.
  2354. Such a normalization is commonly referred to as
  2355. \begin_inset Quotes eld
  2356. \end_inset
  2357. single-channel normalization
  2358. \begin_inset Quotes erd
  2359. \end_inset
  2360. .
  2361. \end_layout
  2362. \begin_layout Subsubsection
  2363. Several strategies are available to meet clinical normalization requirements
  2364. \end_layout
  2365. \begin_layout Standard
  2366. Frozen RMA (fRMA) addresses these concerns by replacing the quantile normalizati
  2367. on and median polish with alternatives that do not introduce inter-array
  2368. dependence, allowing each array to be normalized independently of all others
  2369. \begin_inset CommandInset citation
  2370. LatexCommand cite
  2371. key "McCall2010"
  2372. literal "false"
  2373. \end_inset
  2374. .
  2375. Quantile normalization is performed against a pre-generated set of quantiles
  2376. learned from a collection of 850 publicly available arrays sampled from
  2377. a wide variety of tissues in the Gene Expression Omnibus (GEO).
  2378. Each array's probe intensity distribution is normalized against these pre-gener
  2379. ated quantiles.
  2380. The median polish step is replaced with a robust weighted average of probe
  2381. intensities, using inverse variance weights learned from the same public
  2382. GEO data.
  2383. The result is a normalization that satisfies the requirements mentioned
  2384. above: each array is normalized independently of all others, and any two
  2385. normalized arrays can be compared directly to each other.
  2386. \end_layout
  2387. \begin_layout Standard
  2388. One important limitation of fRMA is that it requires a separate reference
  2389. data set from which to learn the parameters (reference quantiles and probe
  2390. weights) that will be used to normalize each array.
  2391. These parameters are specific to a given array platform, and pre-generated
  2392. parameters are only provided for the most common platforms, such as Affymetrix
  2393. hgu133plus2.
  2394. For a less common platform, such as hthgu133pluspm, it is necessary to
  2395. learn custom parameters from in-house data before fRMA can be used to normalize
  2396. samples on that platform
  2397. \begin_inset CommandInset citation
  2398. LatexCommand cite
  2399. key "McCall2011"
  2400. literal "false"
  2401. \end_inset
  2402. .
  2403. \end_layout
  2404. \begin_layout Standard
  2405. One other option is the aptly-named Single Channel Array Normalization (SCAN),
  2406. which adapts a normalization method originally designed for tiling arrays
  2407. \begin_inset CommandInset citation
  2408. LatexCommand cite
  2409. key "Piccolo2012"
  2410. literal "false"
  2411. \end_inset
  2412. .
  2413. SCAN is truly single-channel in that it does not require a set of normalization
  2414. parameters estimated from an external set of reference samples like fRMA
  2415. does.
  2416. \end_layout
  2417. \begin_layout Subsection
  2418. Heteroskedasticity must be accounted for in methylation array data
  2419. \end_layout
  2420. \begin_layout Subsubsection
  2421. Methylation array preprocessing induces heteroskedasticity
  2422. \end_layout
  2423. \begin_layout Standard
  2424. DNA methylation arrays are a relatively new kind of assay that uses microarrays
  2425. to measure the degree of methylation on cytosines in specific regions arrayed
  2426. across the genome.
  2427. First, bisulfite treatment converts all unmethylated cytosines to uracil
  2428. (which then become thymine after amplification) while leaving methylated
  2429. cytosines unaffected.
  2430. Then, each target region is interrogated with two probes: one binds to
  2431. the original genomic sequence and interrogates the level of methylated
  2432. DNA, and the other binds to the same sequence with all cytosines replaced
  2433. by thymidines and interrogates the level of unmethylated DNA.
  2434. \end_layout
  2435. \begin_layout Standard
  2436. \begin_inset Float figure
  2437. wide false
  2438. sideways false
  2439. status collapsed
  2440. \begin_layout Plain Layout
  2441. \align center
  2442. \begin_inset Graphics
  2443. filename graphics/methylvoom/sigmoid.pdf
  2444. \end_inset
  2445. \end_layout
  2446. \begin_layout Plain Layout
  2447. \begin_inset Caption Standard
  2448. \begin_layout Plain Layout
  2449. \begin_inset CommandInset label
  2450. LatexCommand label
  2451. name "fig:Sigmoid-beta-m-mapping"
  2452. \end_inset
  2453. \series bold
  2454. Sigmoid shape of the mapping between β and M values
  2455. \end_layout
  2456. \end_inset
  2457. \end_layout
  2458. \end_inset
  2459. \end_layout
  2460. \begin_layout Standard
  2461. After normalization, these two probe intensities are summarized in one of
  2462. two ways, each with advantages and disadvantages.
  2463. β
  2464. \series bold
  2465. \series default
  2466. values, interpreted as fraction of DNA copies methylated, range from 0 to
  2467. 1.
  2468. β
  2469. \series bold
  2470. \series default
  2471. values are conceptually easy to interpret, but the constrained range makes
  2472. them unsuitable for linear modeling, and their error distributions are
  2473. highly non-normal, which also frustrates linear modeling.
  2474. M-values, interpreted as the log ratio of methylated to unmethylated copies,
  2475. are computed by mapping the beta values from
  2476. \begin_inset Formula $[0,1]$
  2477. \end_inset
  2478. onto
  2479. \begin_inset Formula $(-\infty,+\infty)$
  2480. \end_inset
  2481. using a sigmoid curve (Figure
  2482. \begin_inset CommandInset ref
  2483. LatexCommand ref
  2484. reference "fig:Sigmoid-beta-m-mapping"
  2485. plural "false"
  2486. caps "false"
  2487. noprefix "false"
  2488. \end_inset
  2489. ).
  2490. This transformation results in values with better statistical properties:
  2491. the unconstrained range is suitable for linear modeling, and the error
  2492. distributions are more normal.
  2493. Hence, most linear modeling and other statistical testing on methylation
  2494. arrays is performed using M-values.
  2495. \end_layout
  2496. \begin_layout Standard
  2497. However, the steep slope of the sigmoid transformation near 0 and 1 tends
  2498. to over-exaggerate small differences in β values near those extremes, which
  2499. in turn amplifies the error in those values, leading to a U-shaped trend
  2500. in the mean-variance curve: extreme values have higher variances than values
  2501. near the middle.
  2502. This mean-variance dependency must be accounted for when fitting the linear
  2503. model for differential methylation, or else the variance will be systematically
  2504. overestimated for probes with moderate M-values and underestimated for
  2505. probes with extreme M-values.
  2506. \end_layout
  2507. \begin_layout Subsubsection
  2508. The voom method for RNA-seq data can model M-value heteroskedasticity
  2509. \end_layout
  2510. \begin_layout Standard
  2511. RNA-seq read count data are also known to show heteroskedasticity, and the
  2512. voom method was developed for modeling this heteroskedasticity by estimating
  2513. the mean-variance trend in the data and using this trend to assign precision
  2514. weights to each observation
  2515. \begin_inset CommandInset citation
  2516. LatexCommand cite
  2517. key "Law2013"
  2518. literal "false"
  2519. \end_inset
  2520. .
  2521. While methylation array data are not derived from counts and have a very
  2522. different mean-variance relationship from that of typical RNA-seq data,
  2523. the voom method makes no specific assumptions on the shape of the mean-variance
  2524. relationship - it only assumes that the relationship is smooth enough to
  2525. model using a lowess curve.
  2526. Hence, the method is sufficiently general to model the mean-variance relationsh
  2527. ip in methylation array data.
  2528. However, the standard implementation of voom assumes that the input is
  2529. given in raw read counts, and it must be adapted to run on methylation
  2530. M-values.
  2531. \end_layout
  2532. \begin_layout Section
  2533. Methods
  2534. \end_layout
  2535. \begin_layout Subsection
  2536. Evaluation of classifier performance with different normalization methods
  2537. \end_layout
  2538. \begin_layout Standard
  2539. For testing different expression microarray normalizations, a data set of
  2540. 157 hgu133plus2 arrays was used, consisting of blood samples from kidney
  2541. transplant patients whose grafts had been graded as TX, AR, or ADNR via
  2542. biopsy and histology (46 TX, 69 AR, 42 ADNR)
  2543. \begin_inset CommandInset citation
  2544. LatexCommand cite
  2545. key "Kurian2014"
  2546. literal "true"
  2547. \end_inset
  2548. .
  2549. Additionally, an external validation set of 75 samples was gathered from
  2550. public GEO data (37 TX, 38 AR, no ADNR).
  2551. \end_layout
  2552. \begin_layout Standard
  2553. \begin_inset Flex TODO Note (inline)
  2554. status open
  2555. \begin_layout Plain Layout
  2556. Find appropriate GEO identifiers if possible.
  2557. Kurian 2014 says GSE15296, but this seems to be different data.
  2558. I also need to look up the GEO accession for the external validation set.
  2559. \end_layout
  2560. \end_inset
  2561. \end_layout
  2562. \begin_layout Standard
  2563. To evaluate the effect of each normalization on classifier performance,
  2564. the same classifier training and validation procedure was used after each
  2565. normalization method.
  2566. The PAM package was used to train a nearest shrunken centroid classifier
  2567. on the training set and select the appropriate threshold for centroid shrinking.
  2568. Then the trained classifier was used to predict the class probabilities
  2569. of each validation sample.
  2570. From these class probabilities, ROC curves and area-under-curve (AUC) values
  2571. were generated
  2572. \begin_inset CommandInset citation
  2573. LatexCommand cite
  2574. key "Turck2011"
  2575. literal "false"
  2576. \end_inset
  2577. .
  2578. Each normalization was tested on two different sets of training and validation
  2579. samples.
  2580. For internal validation, the 115 TX and AR arrays in the internal set were
  2581. split at random into two equal sized sets, one for training and one for
  2582. validation, each containing the same numbers of TX and AR samples as the
  2583. other set.
  2584. For external validation, the full set of 115 TX and AR samples were used
  2585. as a training set, and the 75 external TX and AR samples were used as the
  2586. validation set.
  2587. Thus, 2 ROC curves and AUC values were generated for each normalization
  2588. method: one internal and one external.
  2589. Because the external validation set contains no ADNR samples, only classificati
  2590. on of TX and AR samples was considered.
  2591. The ADNR samples were included during normalization but excluded from all
  2592. classifier training and validation.
  2593. This ensures that the performance on internal and external validation sets
  2594. is directly comparable, since both are performing the same task: distinguishing
  2595. TX from AR.
  2596. \end_layout
  2597. \begin_layout Standard
  2598. \begin_inset Flex TODO Note (inline)
  2599. status open
  2600. \begin_layout Plain Layout
  2601. Summarize the get.best.threshold algorithm for PAM threshold selection, or
  2602. just put the code online?
  2603. \end_layout
  2604. \end_inset
  2605. \end_layout
  2606. \begin_layout Standard
  2607. Six different normalization strategies were evaluated.
  2608. First, 2 well-known non-single-channel normalization methods were considered:
  2609. RMA and dChip
  2610. \begin_inset CommandInset citation
  2611. LatexCommand cite
  2612. key "Li2001,Irizarry2003a"
  2613. literal "false"
  2614. \end_inset
  2615. .
  2616. Since RMA produces expression values on a log2 scale and dChip does not,
  2617. the values from dChip were log2 transformed after normalization.
  2618. Next, RMA and dChip followed by Global Rank-invariant Set Normalization
  2619. (GRSN) were tested
  2620. \begin_inset CommandInset citation
  2621. LatexCommand cite
  2622. key "Pelz2008"
  2623. literal "false"
  2624. \end_inset
  2625. .
  2626. Post-processing with GRSN does not turn RMA or dChip into single-channel
  2627. methods, but it may help mitigate batch effects and is therefore useful
  2628. as a benchmark.
  2629. Lastly, the two single-channel normalization methods, fRMA and SCAN, were
  2630. tested
  2631. \begin_inset CommandInset citation
  2632. LatexCommand cite
  2633. key "McCall2010,Piccolo2012"
  2634. literal "false"
  2635. \end_inset
  2636. .
  2637. When evaluating internal validation performance, only the 157 internal samples
  2638. were normalized; when evaluating external validation performance, all 157
  2639. internal samples and 75 external samples were normalized together.
  2640. \end_layout
  2641. \begin_layout Standard
  2642. For demonstrating the problem with separate normalization of training and
  2643. validation data, one additional normalization was performed: the internal
  2644. and external sets were each normalized separately using RMA, and the normalized
  2645. data for each set were combined into a single set with no further attempts
  2646. at normalizing between the two sets.
  2647. This represents approximately how RMA would have to be used in a clinical
  2648. setting, where the samples to be classified are not available at the time
  2649. the classifier is trained.
  2650. \end_layout
  2651. \begin_layout Subsection
  2652. Generating custom fRMA vectors for hthgu133pluspm array platform
  2653. \end_layout
  2654. \begin_layout Standard
  2655. In order to enable fRMA normalization for the hthgu133pluspm array platform,
  2656. custom fRMA normalization vectors were trained using the frmaTools package
  2657. \begin_inset CommandInset citation
  2658. LatexCommand cite
  2659. key "McCall2011"
  2660. literal "false"
  2661. \end_inset
  2662. .
  2663. Separate vectors were created for two types of samples: kidney graft biopsy
  2664. samples and blood samples from graft recipients.
  2665. For training, 341 kidney biopsy samples from 2 data sets and 965 blood
  2666. samples from 5 data sets were used as the reference set.
  2667. Arrays were grouped into batches based on unique combinations of sample
  2668. type (blood or biopsy), diagnosis (TX, AR, etc.), data set, and scan date.
  2669. Thus, each batch represents arrays of the same kind that were run together
  2670. on the same day.
  2671. For estimating the probe inverse variance weights, frmaTools requires equal-siz
  2672. ed batches, which means a batch size must be chosen, and then batches smaller
  2673. than that size must be ignored, while batches larger than the chosen size
  2674. must be downsampled.
  2675. This downsampling is performed randomly, so the sampling process is repeated
  2676. 5 times and the resulting normalizations are compared to each other.
  2677. \end_layout
  2678. \begin_layout Standard
  2679. To evaluate the consistency of the generated normalization vectors, the
  2680. 5 fRMA vector sets generated from 5 random batch samplings were each used
  2681. to normalize the same 20 randomly selected samples from each tissue.
  2682. Then the normalized expression values for each probe on each array were
  2683. compared across all normalizations.
  2684. Each fRMA normalization was also compared against the normalized expression
  2685. values obtained by normalizing the same 20 samples with ordinary RMA.
  2686. \end_layout
  2687. \begin_layout Subsection
  2688. Modeling methylation array M-value heteroskedasticity in linear models with
  2689. modified voom implementation
  2690. \end_layout
  2691. \begin_layout Standard
  2692. \begin_inset Flex TODO Note (inline)
  2693. status open
  2694. \begin_layout Plain Layout
  2695. Put code on Github and reference it.
  2696. \end_layout
  2697. \end_inset
  2698. \end_layout
  2699. \begin_layout Standard
  2700. To investigate whether DNA methylation could be used to distinguish
  2701. between healthy and dysfunctional transplants, a data set of 78 Illumina
  2702. 450k methylation arrays from human kidney graft biopsies was analyzed for
  2703. differential methylation between 4 transplant statuses: healthy transplant
  2704. (TX), transplants undergoing acute rejection (AR), acute dysfunction with
  2705. no rejection (ADNR), and chronic allograft nephropathy (CAN).
  2706. The data consisted of 33 TX, 9 AR, 8 ADNR, and 28 CAN samples.
  2707. The uneven group sizes are a result of taking the biopsy samples before
  2708. the eventual fate of the transplant was known.
  2709. Each sample was additionally annotated with a donor ID (anonymized), Sex,
  2710. Age, Ethnicity, Creatinine Level, and Diabetes diagnosis (all samples
  2711. in this data set came from patients with either Type 1 or Type 2 diabetes).
  2712. \end_layout
  2713. \begin_layout Standard
  2714. The intensity data were first normalized using subset-quantile within array
  2715. normalization (SWAN)
  2716. \begin_inset CommandInset citation
  2717. LatexCommand cite
  2718. key "Maksimovic2012"
  2719. literal "false"
  2720. \end_inset
  2721. , then converted to intensity ratios (beta values)
  2722. \begin_inset CommandInset citation
  2723. LatexCommand cite
  2724. key "Aryee2014"
  2725. literal "false"
  2726. \end_inset
  2727. .
  2728. Any probes binding to loci that overlapped annotated SNPs were dropped,
  2729. and the annotated sex of each sample was verified against the sex inferred
  2730. from the ratio of median probe intensities for the X and Y chromosomes.
  2731. Then, the ratios were transformed to M-values.
  2732. \end_layout
  2733. \begin_layout Standard
  2734. \begin_inset Float table
  2735. wide false
  2736. sideways false
  2737. status collapsed
  2738. \begin_layout Plain Layout
  2739. \begin_inset Tabular
  2740. <lyxtabular version="3" rows="4" columns="6">
  2741. <features tabularvalignment="middle">
  2742. <column alignment="center" valignment="top">
  2743. <column alignment="center" valignment="top">
  2744. <column alignment="center" valignment="top">
  2745. <column alignment="center" valignment="top">
  2746. <column alignment="center" valignment="top">
  2747. <column alignment="center" valignment="top">
  2748. <row>
  2749. <cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" usebox="none">
  2750. \begin_inset Text
  2751. \begin_layout Plain Layout
  2752. Analysis
  2753. \end_layout
  2754. \end_inset
  2755. </cell>
  2756. <cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" usebox="none">
  2757. \begin_inset Text
  2758. \begin_layout Plain Layout
  2759. patient random effect
  2760. \end_layout
  2761. \end_inset
  2762. </cell>
  2763. <cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" usebox="none">
  2764. \begin_inset Text
  2765. \begin_layout Plain Layout
  2766. empirical Bayes
  2767. \end_layout
  2768. \end_inset
  2769. </cell>
  2770. <cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" usebox="none">
  2771. \begin_inset Text
  2772. \begin_layout Plain Layout
  2773. SVA
  2774. \end_layout
  2775. \end_inset
  2776. </cell>
  2777. <cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" usebox="none">
  2778. \begin_inset Text
  2779. \begin_layout Plain Layout
  2780. sample weights
  2781. \end_layout
  2782. \end_inset
  2783. </cell>
  2784. <cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" rightline="true" usebox="none">
  2785. \begin_inset Text
  2786. \begin_layout Plain Layout
  2787. voom
  2788. \end_layout
  2789. \end_inset
  2790. </cell>
  2791. </row>
  2792. <row>
  2793. <cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
  2794. \begin_inset Text
  2795. \begin_layout Plain Layout
  2796. A
  2797. \end_layout
  2798. \end_inset
  2799. </cell>
  2800. <cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
  2801. \begin_inset Text
  2802. \begin_layout Plain Layout
  2803. Yes
  2804. \end_layout
  2805. \end_inset
  2806. </cell>
  2807. <cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
  2808. \begin_inset Text
  2809. \begin_layout Plain Layout
  2810. Yes
  2811. \end_layout
  2812. \end_inset
  2813. </cell>
  2814. <cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
  2815. \begin_inset Text
  2816. \begin_layout Plain Layout
  2817. No
  2818. \end_layout
  2819. \end_inset
  2820. </cell>
  2821. <cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
  2822. \begin_inset Text
  2823. \begin_layout Plain Layout
  2824. No
  2825. \end_layout
  2826. \end_inset
  2827. </cell>
  2828. <cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
  2829. \begin_inset Text
  2830. \begin_layout Plain Layout
  2831. No
  2832. \end_layout
  2833. \end_inset
  2834. </cell>
  2835. </row>
  2836. <row>
  2837. <cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
  2838. \begin_inset Text
  2839. \begin_layout Plain Layout
  2840. B
  2841. \end_layout
  2842. \end_inset
  2843. </cell>
  2844. <cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
  2845. \begin_inset Text
  2846. \begin_layout Plain Layout
  2847. Yes
  2848. \end_layout
  2849. \end_inset
  2850. </cell>
  2851. <cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
  2852. \begin_inset Text
  2853. \begin_layout Plain Layout
  2854. Yes
  2855. \end_layout
  2856. \end_inset
  2857. </cell>
  2858. <cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
  2859. \begin_inset Text
  2860. \begin_layout Plain Layout
  2861. Yes
  2862. \end_layout
  2863. \end_inset
  2864. </cell>
  2865. <cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
  2866. \begin_inset Text
  2867. \begin_layout Plain Layout
  2868. Yes
  2869. \end_layout
  2870. \end_inset
  2871. </cell>
  2872. <cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
  2873. \begin_inset Text
  2874. \begin_layout Plain Layout
  2875. No
  2876. \end_layout
  2877. \end_inset
  2878. </cell>
  2879. </row>
  2880. <row>
  2881. <cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" usebox="none">
  2882. \begin_inset Text
  2883. \begin_layout Plain Layout
  2884. C
  2885. \end_layout
  2886. \end_inset
  2887. </cell>
  2888. <cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" usebox="none">
  2889. \begin_inset Text
  2890. \begin_layout Plain Layout
  2891. Yes
  2892. \end_layout
  2893. \end_inset
  2894. </cell>
  2895. <cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" usebox="none">
  2896. \begin_inset Text
  2897. \begin_layout Plain Layout
  2898. Yes
  2899. \end_layout
  2900. \end_inset
  2901. </cell>
  2902. <cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" usebox="none">
  2903. \begin_inset Text
  2904. \begin_layout Plain Layout
  2905. Yes
  2906. \end_layout
  2907. \end_inset
  2908. </cell>
  2909. <cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" usebox="none">
  2910. \begin_inset Text
  2911. \begin_layout Plain Layout
  2912. Yes
  2913. \end_layout
  2914. \end_inset
  2915. </cell>
  2916. <cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" rightline="true" usebox="none">
  2917. \begin_inset Text
  2918. \begin_layout Plain Layout
  2919. Yes
  2920. \end_layout
  2921. \end_inset
  2922. </cell>
  2923. </row>
  2924. </lyxtabular>
  2925. \end_inset
  2926. \end_layout
  2927. \begin_layout Plain Layout
  2928. \begin_inset Caption Standard
  2929. \begin_layout Plain Layout
  2930. \series bold
  2931. \begin_inset CommandInset label
  2932. LatexCommand label
  2933. name "tab:Summary-of-meth-analysis"
  2934. \end_inset
  2935. Summary of analysis variants for methylation array data.
  2936. \series default
  2937. Each analysis included a different set of steps to adjust or account for
  2938. various systematic features of the data.
  2939. See the text for a more detailed explanation of each step.
  2940. \end_layout
  2941. \end_inset
  2942. \end_layout
  2943. \end_inset
  2944. \end_layout
  2945. \begin_layout Standard
  2946. From the M-values, a series of parallel analyses was performed, each adding
  2947. additional steps into the model fit to accommodate a feature of the data
  2948. (see Table
  2949. \begin_inset CommandInset ref
  2950. LatexCommand ref
  2951. reference "tab:Summary-of-meth-analysis"
  2952. plural "false"
  2953. caps "false"
  2954. noprefix "false"
  2955. \end_inset
  2956. ).
  2957. For analysis A, a
  2958. \begin_inset Quotes eld
  2959. \end_inset
  2960. basic
  2961. \begin_inset Quotes erd
  2962. \end_inset
  2963. linear modeling analysis was performed, compensating for known confounders
  2964. by including terms for the factor of interest (transplant status) as well
  2965. as the known biological confounders: sex, age, ethnicity, and diabetes.
  2966. Since some samples came from the same patients at different times, the
  2967. intra-patient correlation was modeled as a random effect, estimating a
  2968. shared correlation value across all probes
  2969. \begin_inset CommandInset citation
  2970. LatexCommand cite
  2971. key "Smyth2005a"
  2972. literal "false"
  2973. \end_inset
  2974. .
  2975. Then the linear model was fit, and the variance was modeled using empirical
  2976. Bayes squeezing toward the mean-variance trend
  2977. \begin_inset CommandInset citation
  2978. LatexCommand cite
  2979. key "Ritchie2015"
  2980. literal "false"
  2981. \end_inset
  2982. .
  2983. Finally, t-tests or F-tests were performed as appropriate for each test:
  2984. t-tests for single contrasts, and F-tests for multiple contrasts.
  2985. P-values were corrected for multiple testing using the Benjamini-Hochberg
  2986. procedure for FDR control
  2987. \begin_inset CommandInset citation
  2988. LatexCommand cite
  2989. key "Benjamini1995"
  2990. literal "false"
  2991. \end_inset
  2992. .
  2993. \end_layout
  2994. \begin_layout Standard
  2995. For analysis B, surrogate variable analysis (SVA) was used to infer
  2996. additional unobserved sources of heterogeneity in the data
  2997. \begin_inset CommandInset citation
  2998. LatexCommand cite
  2999. key "Leek2007"
  3000. literal "false"
  3001. \end_inset
  3002. .
  3003. These surrogate variables were added to the design matrix before fitting
  3004. the linear model.
  3005. In addition, sample quality weights were estimated from the data and used
  3006. during linear modeling to down-weight the contribution of highly variable
  3007. arrays while increasing the weight to arrays with lower variability
  3008. \begin_inset CommandInset citation
  3009. LatexCommand cite
  3010. key "Ritchie2006"
  3011. literal "false"
  3012. \end_inset
  3013. .
  3014. The remainder of the analysis proceeded as in analysis A.
  3015. For analysis C, the voom method was adapted to run on methylation array
  3016. data and used to model and correct for the mean-variance trend using individual
  3017. observation weights
  3018. \begin_inset CommandInset citation
  3019. LatexCommand cite
  3020. key "Law2013"
  3021. literal "false"
  3022. \end_inset
  3023. , which were combined with the sample weights
  3024. \begin_inset CommandInset citation
  3025. LatexCommand cite
  3026. key "Liu2015"
  3027. literal "false"
  3028. \end_inset
  3029. .
  3030. Each time weights were used, they were estimated once before estimating
  3031. the random effect correlation value, and then the weights were re-estimated
  3032. taking the random effect into account.
  3033. The remainder of the analysis proceeded as in analysis B.
  3034. \end_layout
  3035. \begin_layout Section
  3036. Results
  3037. \end_layout
  3038. \begin_layout Standard
  3039. \begin_inset Flex TODO Note (inline)
  3040. status open
  3041. \begin_layout Plain Layout
  3042. Improve subsection titles in this section
  3043. \end_layout
  3044. \end_inset
  3045. \end_layout
  3046. \begin_layout Subsection
  3047. fRMA eliminates unwanted dependence of classifier training on normalization
  3048. strategy caused by RMA
  3049. \end_layout
  3050. \begin_layout Standard
  3051. \begin_inset Flex TODO Note (inline)
  3052. status open
  3053. \begin_layout Plain Layout
  3054. Write figure legends
  3055. \end_layout
  3056. \end_inset
  3057. \end_layout
  3058. \begin_layout Subsubsection
  3059. Separate normalization with RMA introduces unwanted biases in classification
  3060. \end_layout
  3061. \begin_layout Standard
  3062. \begin_inset Float figure
  3063. wide false
  3064. sideways false
  3065. status collapsed
  3066. \begin_layout Plain Layout
  3067. \align center
  3068. \begin_inset Graphics
  3069. filename graphics/PAM/predplot.pdf
  3070. lyxscale 50
  3071. width 100col%
  3072. groupId colwidth
  3073. \end_inset
  3074. \end_layout
  3075. \begin_layout Plain Layout
  3076. \begin_inset Caption Standard
  3077. \begin_layout Plain Layout
  3078. \begin_inset CommandInset label
  3079. LatexCommand label
  3080. name "fig:Classifier-probabilities-RMA"
  3081. \end_inset
  3082. \series bold
  3083. Classifier probabilities on validation samples when normalized with RMA
  3084. together vs.
  3085. separately.
  3086. \end_layout
  3087. \end_inset
  3088. \end_layout
  3089. \end_inset
  3090. \end_layout
  3091. \begin_layout Standard
  3092. To demonstrate the problem with non-single-channel normalization methods,
  3093. we considered the problem of training a classifier to distinguish TX from
  3094. AR using the samples from the internal set as training data, evaluating
  3095. performance on the external set.
  3096. First, training and evaluation were performed after normalizing all array
  3097. samples together as a single set using RMA, and second, the internal samples
  3098. were normalized separately from the external samples and the training and
  3099. evaluation were repeated.
  3100. For each sample in the validation set, the classifier probabilities from
  3101. both classifiers were plotted against each other (Fig.
  3102. \begin_inset CommandInset ref
  3103. LatexCommand ref
  3104. reference "fig:Classifier-probabilities-RMA"
  3105. plural "false"
  3106. caps "false"
  3107. noprefix "false"
  3108. \end_inset
  3109. ).
  3110. As expected, separate normalization biases the classifier probabilities,
  3111. resulting in several misclassifications.
  3112. In this case, the bias from separate normalization causes the classifier
  3113. to assign a lower probability of AR to every sample.
  3114. \end_layout
  3115. \begin_layout Subsubsection
  3116. fRMA and SCAN maintain classification performance while eliminating
  3117. dependence on normalization strategy
  3118. \end_layout
  3119. \begin_layout Standard
  3120. \begin_inset Float figure
  3121. placement tb
  3122. wide false
  3123. sideways false
  3124. status collapsed
  3125. \begin_layout Plain Layout
  3126. \align center
  3127. \begin_inset Graphics
  3128. filename graphics/PAM/ROC-TXvsAR-internal.pdf
  3129. lyxscale 50
  3130. width 100col%
  3131. groupId colwidth
  3132. \end_inset
  3133. \end_layout
  3134. \begin_layout Plain Layout
  3135. \begin_inset Caption Standard
  3136. \begin_layout Plain Layout
  3137. \series bold
  3138. \begin_inset CommandInset label
  3139. LatexCommand label
  3140. name "fig:ROC-PAM-int"
  3141. \end_inset
  3142. ROC curves for PAM on internal validation data using different normalization
  3143. strategies
  3144. \end_layout
  3145. \end_inset
  3146. \end_layout
  3147. \end_inset
  3148. \end_layout
  3149. \begin_layout Standard
  3150. \begin_inset Float table
  3151. wide false
  3152. sideways false
  3153. status collapsed
  3154. \begin_layout Plain Layout
  3155. \align center
  3156. \begin_inset Tabular
  3157. <lyxtabular version="3" rows="7" columns="4">
  3158. <features tabularvalignment="middle">
  3159. <column alignment="center" valignment="top">
  3160. <column alignment="center" valignment="top">
  3161. <column alignment="center" valignment="top">
  3162. <column alignment="center" valignment="top">
  3163. <row>
  3164. <cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" usebox="none">
  3165. \begin_inset Text
  3166. \begin_layout Plain Layout
  3167. \family roman
  3168. \series medium
  3169. \shape up
  3170. \size normal
  3171. \emph off
  3172. \bar no
  3173. \strikeout off
  3174. \xout off
  3175. \uuline off
  3176. \uwave off
  3177. \noun off
  3178. \color none
  3179. Normalization
  3180. \end_layout
  3181. \end_inset
  3182. </cell>
  3183. <cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" usebox="none">
  3184. \begin_inset Text
  3185. \begin_layout Plain Layout
  3186. Single-channel?
  3187. \end_layout
  3188. \end_inset
  3189. </cell>
  3190. <cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" usebox="none">
  3191. \begin_inset Text
  3192. \begin_layout Plain Layout
  3193. \family roman
  3194. \series medium
  3195. \shape up
  3196. \size normal
  3197. \emph off
  3198. \bar no
  3199. \strikeout off
  3200. \xout off
  3201. \uuline off
  3202. \uwave off
  3203. \noun off
  3204. \color none
  3205. Internal Val.
  3206. AUC
  3207. \end_layout
  3208. \end_inset
  3209. </cell>
  3210. <cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" rightline="true" usebox="none">
  3211. \begin_inset Text
  3212. \begin_layout Plain Layout
  3213. External Val.
  3214. AUC
  3215. \end_layout
  3216. \end_inset
  3217. </cell>
  3218. </row>
  3219. <row>
  3220. <cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
  3221. \begin_inset Text
  3222. \begin_layout Plain Layout
  3223. \family roman
  3224. \series medium
  3225. \shape up
  3226. \size normal
  3227. \emph off
  3228. \bar no
  3229. \strikeout off
  3230. \xout off
  3231. \uuline off
  3232. \uwave off
  3233. \noun off
  3234. \color none
  3235. RMA
  3236. \end_layout
  3237. \end_inset
  3238. </cell>
  3239. <cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
  3240. \begin_inset Text
  3241. \begin_layout Plain Layout
  3242. No
  3243. \end_layout
  3244. \end_inset
  3245. </cell>
  3246. <cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
  3247. \begin_inset Text
  3248. \begin_layout Plain Layout
  3249. \family roman
  3250. \series medium
  3251. \shape up
  3252. \size normal
  3253. \emph off
  3254. \bar no
  3255. \strikeout off
  3256. \xout off
  3257. \uuline off
  3258. \uwave off
  3259. \noun off
  3260. \color none
  3261. 0.852
  3262. \end_layout
  3263. \end_inset
  3264. </cell>
  3265. <cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
  3266. \begin_inset Text
  3267. \begin_layout Plain Layout
  3268. \family roman
  3269. \series medium
  3270. \shape up
  3271. \size normal
  3272. \emph off
  3273. \bar no
  3274. \strikeout off
  3275. \xout off
  3276. \uuline off
  3277. \uwave off
  3278. \noun off
  3279. \color none
  3280. 0.713
  3281. \end_layout
  3282. \end_inset
  3283. </cell>
  3284. </row>
  3285. <row>
  3286. <cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
  3287. \begin_inset Text
  3288. \begin_layout Plain Layout
  3289. \family roman
  3290. \series medium
  3291. \shape up
  3292. \size normal
  3293. \emph off
  3294. \bar no
  3295. \strikeout off
  3296. \xout off
  3297. \uuline off
  3298. \uwave off
  3299. \noun off
  3300. \color none
  3301. dChip
  3302. \end_layout
  3303. \end_inset
  3304. </cell>
  3305. <cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
  3306. \begin_inset Text
  3307. \begin_layout Plain Layout
  3308. No
  3309. \end_layout
  3310. \end_inset
  3311. </cell>
  3312. <cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
  3313. \begin_inset Text
  3314. \begin_layout Plain Layout
  3315. \family roman
  3316. \series medium
  3317. \shape up
  3318. \size normal
  3319. \emph off
  3320. \bar no
  3321. \strikeout off
  3322. \xout off
  3323. \uuline off
  3324. \uwave off
  3325. \noun off
  3326. \color none
  3327. 0.891
  3328. \end_layout
  3329. \end_inset
  3330. </cell>
  3331. <cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
  3332. \begin_inset Text
  3333. \begin_layout Plain Layout
  3334. \family roman
  3335. \series medium
  3336. \shape up
  3337. \size normal
  3338. \emph off
  3339. \bar no
  3340. \strikeout off
  3341. \xout off
  3342. \uuline off
  3343. \uwave off
  3344. \noun off
  3345. \color none
  3346. 0.657
  3347. \end_layout
  3348. \end_inset
  3349. </cell>
  3350. </row>
  3351. <row>
  3352. <cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
  3353. \begin_inset Text
  3354. \begin_layout Plain Layout
  3355. \family roman
  3356. \series medium
  3357. \shape up
  3358. \size normal
  3359. \emph off
  3360. \bar no
  3361. \strikeout off
  3362. \xout off
  3363. \uuline off
  3364. \uwave off
  3365. \noun off
  3366. \color none
  3367. RMA + GRSN
  3368. \end_layout
  3369. \end_inset
  3370. </cell>
  3371. <cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
  3372. \begin_inset Text
  3373. \begin_layout Plain Layout
  3374. No
  3375. \end_layout
  3376. \end_inset
  3377. </cell>
  3378. <cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
  3379. \begin_inset Text
  3380. \begin_layout Plain Layout
  3381. \family roman
  3382. \series medium
  3383. \shape up
  3384. \size normal
  3385. \emph off
  3386. \bar no
  3387. \strikeout off
  3388. \xout off
  3389. \uuline off
  3390. \uwave off
  3391. \noun off
  3392. \color none
  3393. 0.816
  3394. \end_layout
  3395. \end_inset
  3396. </cell>
  3397. <cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
  3398. \begin_inset Text
  3399. \begin_layout Plain Layout
  3400. \family roman
  3401. \series medium
  3402. \shape up
  3403. \size normal
  3404. \emph off
  3405. \bar no
  3406. \strikeout off
  3407. \xout off
  3408. \uuline off
  3409. \uwave off
  3410. \noun off
  3411. \color none
  3412. 0.750
  3413. \end_layout
  3414. \end_inset
  3415. </cell>
  3416. </row>
  3417. <row>
  3418. <cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
  3419. \begin_inset Text
  3420. \begin_layout Plain Layout
  3421. \family roman
  3422. \series medium
  3423. \shape up
  3424. \size normal
  3425. \emph off
  3426. \bar no
  3427. \strikeout off
  3428. \xout off
  3429. \uuline off
  3430. \uwave off
  3431. \noun off
  3432. \color none
  3433. dChip + GRSN
  3434. \end_layout
  3435. \end_inset
  3436. </cell>
  3437. <cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
  3438. \begin_inset Text
  3439. \begin_layout Plain Layout
  3440. No
  3441. \end_layout
  3442. \end_inset
  3443. </cell>
  3444. <cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
  3445. \begin_inset Text
  3446. \begin_layout Plain Layout
  3447. \family roman
  3448. \series medium
  3449. \shape up
  3450. \size normal
  3451. \emph off
  3452. \bar no
  3453. \strikeout off
  3454. \xout off
  3455. \uuline off
  3456. \uwave off
  3457. \noun off
  3458. \color none
  3459. 0.875
  3460. \end_layout
  3461. \end_inset
  3462. </cell>
  3463. <cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
  3464. \begin_inset Text
  3465. \begin_layout Plain Layout
  3466. \family roman
  3467. \series medium
  3468. \shape up
  3469. \size normal
  3470. \emph off
  3471. \bar no
  3472. \strikeout off
  3473. \xout off
  3474. \uuline off
  3475. \uwave off
  3476. \noun off
  3477. \color none
  3478. 0.642
  3479. \end_layout
  3480. \end_inset
  3481. </cell>
  3482. </row>
  3483. <row>
  3484. <cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
  3485. \begin_inset Text
  3486. \begin_layout Plain Layout
  3487. \family roman
  3488. \series medium
  3489. \shape up
  3490. \size normal
  3491. \emph off
  3492. \bar no
  3493. \strikeout off
  3494. \xout off
  3495. \uuline off
  3496. \uwave off
  3497. \noun off
  3498. \color none
  3499. fRMA
  3500. \end_layout
  3501. \end_inset
  3502. </cell>
  3503. <cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
  3504. \begin_inset Text
  3505. \begin_layout Plain Layout
  3506. Yes
  3507. \end_layout
  3508. \end_inset
  3509. </cell>
  3510. <cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
  3511. \begin_inset Text
  3512. \begin_layout Plain Layout
  3513. \family roman
  3514. \series medium
  3515. \shape up
  3516. \size normal
  3517. \emph off
  3518. \bar no
  3519. \strikeout off
  3520. \xout off
  3521. \uuline off
  3522. \uwave off
  3523. \noun off
  3524. \color none
  3525. 0.863
  3526. \end_layout
  3527. \end_inset
  3528. </cell>
  3529. <cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
  3530. \begin_inset Text
  3531. \begin_layout Plain Layout
  3532. \family roman
  3533. \series medium
  3534. \shape up
  3535. \size normal
  3536. \emph off
  3537. \bar no
  3538. \strikeout off
  3539. \xout off
  3540. \uuline off
  3541. \uwave off
  3542. \noun off
  3543. \color none
  3544. 0.718
  3545. \end_layout
  3546. \end_inset
  3547. </cell>
  3548. </row>
  3549. <row>
  3550. <cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" usebox="none">
  3551. \begin_inset Text
  3552. \begin_layout Plain Layout
  3553. \family roman
  3554. \series medium
  3555. \shape up
  3556. \size normal
  3557. \emph off
  3558. \bar no
  3559. \strikeout off
  3560. \xout off
  3561. \uuline off
  3562. \uwave off
  3563. \noun off
  3564. \color none
  3565. SCAN
  3566. \end_layout
  3567. \end_inset
  3568. </cell>
  3569. <cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" usebox="none">
  3570. \begin_inset Text
  3571. \begin_layout Plain Layout
  3572. Yes
  3573. \end_layout
  3574. \end_inset
  3575. </cell>
  3576. <cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" usebox="none">
  3577. \begin_inset Text
  3578. \begin_layout Plain Layout
  3579. \family roman
  3580. \series medium
  3581. \shape up
  3582. \size normal
  3583. \emph off
  3584. \bar no
  3585. \strikeout off
  3586. \xout off
  3587. \uuline off
  3588. \uwave off
  3589. \noun off
  3590. \color none
  3591. 0.853
  3592. \end_layout
  3593. \end_inset
  3594. </cell>
  3595. <cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" rightline="true" usebox="none">
  3596. \begin_inset Text
  3597. \begin_layout Plain Layout
  3598. \family roman
  3599. \series medium
  3600. \shape up
  3601. \size normal
  3602. \emph off
  3603. \bar no
  3604. \strikeout off
  3605. \xout off
  3606. \uuline off
  3607. \uwave off
  3608. \noun off
  3609. \color none
  3610. 0.689
  3611. \end_layout
  3612. \end_inset
  3613. </cell>
  3614. </row>
  3615. </lyxtabular>
  3616. \end_inset
  3617. \end_layout
  3618. \begin_layout Plain Layout
  3619. \begin_inset Caption Standard
  3620. \begin_layout Plain Layout
  3621. \begin_inset CommandInset label
  3622. LatexCommand label
  3623. name "tab:AUC-PAM"
  3624. \end_inset
  3625. \series bold
  3626. AUC values for internal and external validation with 6 different normalization
  3627. strategies.
  3628. \series default
  3629. Only fRMA and SCAN are single-channel normalizations.
  3630. The other 4 normalizations are for comparison.
  3631. \end_layout
  3632. \end_inset
  3633. \end_layout
  3634. \end_inset
  3635. \end_layout
  3636. \begin_layout Standard
  3637. For internal validation, the 6 methods' AUC values ranged from 0.816 to 0.891,
  3638. as shown in Table
  3639. \begin_inset CommandInset ref
  3640. LatexCommand ref
  3641. reference "tab:AUC-PAM"
  3642. plural "false"
  3643. caps "false"
  3644. noprefix "false"
  3645. \end_inset
  3646. .
  3647. Among the non-single-channel normalizations, dChip outperformed RMA, while
  3648. GRSN reduced the AUC values for both dChip and RMA.
  3649. Both single-channel methods, fRMA and SCAN, slightly outperformed RMA,
  3650. with fRMA ahead of SCAN.
  3651. However, the difference between RMA and fRMA is still quite small.
  3652. Figure
  3653. \begin_inset CommandInset ref
  3654. LatexCommand ref
  3655. reference "fig:ROC-PAM-int"
  3656. plural "false"
  3657. caps "false"
  3658. noprefix "false"
  3659. \end_inset
  3660. shows that the ROC curves for RMA, dChip, and fRMA look very similar and
  3661. relatively smooth, while both GRSN curves and the curve for SCAN have a
  3662. more jagged appearance.
  3663. \end_layout
  3664. \begin_layout Standard
  3665. \begin_inset Float figure
  3666. placement tb
  3667. wide false
  3668. sideways false
  3669. status collapsed
  3670. \begin_layout Plain Layout
  3671. \align center
  3672. \begin_inset Graphics
  3673. filename graphics/PAM/ROC-TXvsAR-external.pdf
  3674. lyxscale 50
  3675. width 100col%
  3676. groupId colwidth
  3677. \end_inset
  3678. \end_layout
  3679. \begin_layout Plain Layout
  3680. \begin_inset Caption Standard
  3681. \begin_layout Plain Layout
  3682. \series bold
  3683. \begin_inset CommandInset label
  3684. LatexCommand label
  3685. name "fig:ROC-PAM-ext"
  3686. \end_inset
  3687. ROC curve for PAM on external validation data using different normalization
  3688. strategies
  3689. \end_layout
  3690. \end_inset
  3691. \end_layout
  3692. \end_inset
  3693. \end_layout
  3694. \begin_layout Standard
  3695. For external validation, as expected, all the AUC values are lower than
  3696. those for internal validation, ranging from 0.642 to 0.750 (Table
  3697. \begin_inset CommandInset ref
  3698. LatexCommand ref
  3699. reference "tab:AUC-PAM"
  3700. plural "false"
  3701. caps "false"
  3702. noprefix "false"
  3703. \end_inset
  3704. ).
  3705. With or without GRSN, RMA shows its dominance over dChip in this more challengi
  3706. ng test.
  3707. Unlike in the internal validation, GRSN actually improves the classifier
  3708. performance for RMA, although it does not for dChip.
  3709. Once again, both single-channel methods perform about on par with RMA,
  3710. with fRMA performing slightly better and SCAN performing a bit worse.
  3711. Figure
  3712. \begin_inset CommandInset ref
  3713. LatexCommand ref
  3714. reference "fig:ROC-PAM-ext"
  3715. plural "false"
  3716. caps "false"
  3717. noprefix "false"
  3718. \end_inset
  3719. shows the ROC curves for the external validation test.
  3720. As expected, none of them are as clean-looking as the internal validation
  3721. ROC curves.
  3722. The curves for RMA, RMA+GRSN, and fRMA all look similar, while the other
  3723. curves look more divergent.
  3724. \end_layout
  3725. \begin_layout Standard
  3726. \begin_inset ERT
  3727. status collapsed
  3728. \begin_layout Plain Layout
  3729. \backslash
  3730. FloatBarrier
  3731. \end_layout
  3732. \end_inset
  3733. \end_layout
  3734. \begin_layout Subsection
  3735. fRMA with custom-generated vectors enables normalization on hthgu133pluspm
  3736. \end_layout
  3737. \begin_layout Standard
  3738. \begin_inset Float figure
  3739. placement tb
  3740. wide false
  3741. sideways false
  3742. status collapsed
  3743. \begin_layout Plain Layout
  3744. \align center
  3745. \begin_inset Graphics
  3746. filename graphics/frma-pax-bx/batchsize_batches.pdf
  3747. \end_inset
  3748. \end_layout
  3749. \begin_layout Plain Layout
  3750. \begin_inset Caption Standard
  3751. \begin_layout Plain Layout
  3752. \begin_inset CommandInset label
  3753. LatexCommand label
  3754. name "fig:batch-size-batches"
  3755. \end_inset
  3756. \series bold
  3757. Effect of batch size selection on number of batches included in fRMA probe
  3758. weight learning.
  3759. \series default
  3760. For batch sizes ranging from 3 to 15, the number of batches with at least
  3761. that many samples was plotted for biopsy (BX) and blood (PAX) samples.
  3762. The selected batch size, 5, is marked with a dotted vertical line.
  3763. \end_layout
  3764. \end_inset
  3765. \end_layout
  3766. \end_inset
  3767. \end_layout
  3768. \begin_layout Standard
  3769. \begin_inset Float figure
  3770. placement tb
  3771. wide false
  3772. sideways false
  3773. status collapsed
  3774. \begin_layout Plain Layout
  3775. \align center
  3776. \begin_inset Graphics
  3777. filename graphics/frma-pax-bx/batchsize_samples.pdf
  3778. \end_inset
  3779. \end_layout
  3780. \begin_layout Plain Layout
  3781. \begin_inset Caption Standard
  3782. \begin_layout Plain Layout
  3783. \begin_inset CommandInset label
  3784. LatexCommand label
  3785. name "fig:batch-size-samples"
  3786. \end_inset
  3787. \series bold
  3788. Effect of batch size selection on number of samples included in fRMA probe
  3789. weight learning.
  3790. \series default
  3791. For batch sizes ranging from 3 to 15, the number of samples included in
  3792. probe weight training was plotted for biopsy (BX) and blood (PAX) samples.
  3793. The selected batch size, 5, is marked with a dotted vertical line.
  3794. \end_layout
  3795. \end_inset
  3796. \end_layout
  3797. \end_inset
  3798. \end_layout
  3799. \begin_layout Standard
  3800. In order to enable use of fRMA to normalize hthgu133pluspm, a custom set
  3801. of fRMA vectors was created.
  3802. First, an appropriate batch size was chosen by looking at the number of
  3803. batches and number of samples included as a function of batch size (Figures
  3804. \begin_inset CommandInset ref
  3805. LatexCommand ref
  3806. reference "fig:batch-size-batches"
  3807. plural "false"
  3808. caps "false"
  3809. noprefix "false"
  3810. \end_inset
  3811. and
  3812. \begin_inset CommandInset ref
  3813. LatexCommand ref
  3814. reference "fig:batch-size-samples"
  3815. plural "false"
  3816. caps "false"
  3817. noprefix "false"
  3818. \end_inset
  3819. , respectively).
  3820. For a given batch size, all batches with fewer samples than the chosen
  3821. size must be ignored during training, while larger batches must be randomly
  3822. downsampled to the chosen size.
  3823. Hence, the number of samples included for a given batch size equals the
  3824. batch size times the number of batches with at least that many samples.
  3825. From Figure
  3826. \begin_inset CommandInset ref
  3827. LatexCommand ref
  3828. reference "fig:batch-size-samples"
  3829. plural "false"
  3830. caps "false"
  3831. noprefix "false"
  3832. \end_inset
  3833. , it is apparent that a batch size of 8 maximizes the number of samples
  3834. included in training.
  3835. Increasing the batch size beyond this causes too many smaller batches to
  3836. be excluded, reducing the total number of samples for both tissue types.
  3837. However, a batch size of 8 is not necessarily optimal.
  3838. The article introducing frmaTools concluded that it was highly advantageous
  3839. to use a smaller batch size in order to include more batches, even at the
  3840. expense of including fewer total samples in training
  3841. \begin_inset CommandInset citation
  3842. LatexCommand cite
  3843. key "McCall2011"
  3844. literal "false"
  3845. \end_inset
  3846. .
  3847. To strike an appropriate balance between more batches and more samples,
  3848. a batch size of 5 was chosen.
  3849. For both blood and biopsy samples, this increased the number of batches
  3850. included by 10, with only a modest reduction in the number of samples compared
  3851. to a batch size of 8.
  3852. With a batch size of 5, 26 batches of biopsy samples and 46 batches of
  3853. blood samples were available.
  3854. \end_layout
  3855. \begin_layout Standard
  3856. \begin_inset Float figure
  3857. wide false
  3858. sideways false
  3859. status collapsed
  3860. \begin_layout Plain Layout
  3861. \align center
  3862. \begin_inset Graphics
  3863. filename graphics/frma-pax-bx/M-BX-violin.pdf
  3864. lyxscale 40
  3865. height 80theight%
  3866. groupId m-violin
  3867. \end_inset
  3868. \end_layout
  3869. \begin_layout Plain Layout
  3870. \begin_inset Caption Standard
  3871. \begin_layout Plain Layout
  3872. \begin_inset CommandInset label
  3873. LatexCommand label
  3874. name "fig:m-bx-violin"
  3875. \end_inset
  3876. \series bold
  3877. Violin plot of log ratios between normalizations for 20 biopsy samples.
  3878. \series default
  3879. Each of 20 randomly selected biopsy samples was normalized with RMA and
  3880. with 5 different sets of fRMA vectors.
  3881. This shows the distribution of log ratios between normalized expression
  3882. values, aggregated across all 20 arrays.
  3883. \end_layout
  3884. \end_inset
  3885. \end_layout
  3886. \end_inset
  3887. \end_layout
  3888. \begin_layout Standard
  3889. Since fRMA training requires equal-size batches, larger batches are downsampled
  3890. randomly.
  3891. This introduces a nondeterministic step in the generation of normalization
  3892. vectors.
  3893. To show that this randomness does not substantially change the outcome,
  3894. the random downsampling and subsequent vector learning was repeated 5 times,
  3895. with a different random seed each time.
  3896. 20 samples were selected at random as a test set and normalized with each
  3897. of the 5 sets of fRMA normalization vectors as well as ordinary RMA, and
  3898. the normalized expression values were compared across normalizations.
  3899. Figure
  3900. \begin_inset CommandInset ref
  3901. LatexCommand ref
  3902. reference "fig:m-bx-violin"
  3903. plural "false"
  3904. caps "false"
  3905. noprefix "false"
  3906. \end_inset
  3907. shows a summary of these comparisons for biopsy samples.
  3908. Comparing RMA to each of the 5 fRMA normalizations, the distribution of
  3909. log ratios is somewhat wide, indicating that the normalizations disagree
  3910. on the expression values of a fair number of probe sets.
  3911. In contrast, in comparisons of fRMA against fRMA, the vast majority of probe
  3912. sets have very small log ratios, indicating a very high agreement between
  3913. the normalized values generated by the two normalizations.
  3914. This shows that the fRMA normalization's behavior is not very sensitive
  3915. to the random downsampling of larger batches during training.
  3916. \end_layout
  3917. \begin_layout Standard
  3918. \begin_inset Float figure
  3919. wide false
  3920. sideways false
  3921. status collapsed
  3922. \begin_layout Plain Layout
  3923. \align center
  3924. \begin_inset Graphics
  3925. filename graphics/frma-pax-bx/MA-BX-RMA.fRMA.pdf
  3926. lyxscale 50
  3927. width 100text%
  3928. groupId ma-frma
  3929. \end_inset
  3930. \end_layout
  3931. \begin_layout Plain Layout
  3932. \begin_inset Caption Standard
  3933. \begin_layout Plain Layout
  3934. \begin_inset CommandInset label
  3935. LatexCommand label
  3936. name "fig:ma-bx-rma-frma"
  3937. \end_inset
  3938. \series bold
  3939. Representative MA plot comparing RMA against fRMA for 20 biopsy samples.
  3940. \series default
  3941. Averages and log ratios were computed for every probe in each of 20 biopsy
  3942. samples between RMA normalization and fRMA.
  3943. Density of points is represented by darkness of shading, and individual
  3944. outlier points are plotted.
  3945. \end_layout
  3946. \end_inset
  3947. \end_layout
  3948. \end_inset
  3949. \end_layout
  3950. \begin_layout Standard
  3951. \begin_inset Float figure
  3952. wide false
  3953. sideways false
  3954. status collapsed
  3955. \begin_layout Plain Layout
  3956. \align center
  3957. \begin_inset Graphics
  3958. filename graphics/frma-pax-bx/MA-BX-fRMA.fRMA.pdf
  3959. lyxscale 50
  3960. width 100text%
  3961. groupId ma-frma
  3962. \end_inset
  3963. \end_layout
  3964. \begin_layout Plain Layout
  3965. \begin_inset Caption Standard
  3966. \begin_layout Plain Layout
  3967. \begin_inset CommandInset label
  3968. LatexCommand label
  3969. name "fig:ma-bx-frma-frma"
  3970. \end_inset
  3971. \series bold
  3972. Representative MA plot comparing different fRMA vectors for 20 biopsy samples.
  3973. \series default
  3974. Averages and log ratios were computed for every probe in each of 20 biopsy
  3975. samples between fRMA normalizations using vectors from two different batch
  3976. samplings.
  3977. Density of points is represented by darkness of shading, and individual
  3978. outlier points are plotted.
  3979. \end_layout
  3980. \end_inset
  3981. \end_layout
  3982. \end_inset
  3983. \end_layout
  3984. \begin_layout Standard
  3985. Figure
  3986. \begin_inset CommandInset ref
  3987. LatexCommand ref
  3988. reference "fig:ma-bx-rma-frma"
  3989. plural "false"
  3990. caps "false"
  3991. noprefix "false"
  3992. \end_inset
  3993. shows an MA plot of the RMA-normalized values against the fRMA-normalized
  3994. values for the same probe sets and arrays, corresponding to the first row
  3995. of Figure
  3996. \begin_inset CommandInset ref
  3997. LatexCommand ref
  3998. reference "fig:m-bx-violin"
  3999. plural "false"
  4000. caps "false"
  4001. noprefix "false"
  4002. \end_inset
  4003. .
  4004. This MA plot shows that not only is there a wide distribution of M-values,
  4005. but the trend of M-values is dependent on the average normalized intensity.
  4006. This is expected, since the overall trend represents the differences in
  4007. the quantile normalization step.
  4008. When running RMA, only the quantiles for these specific 20 arrays are used,
  4009. while for fRMA the quantile distribution is taken from all arrays used
  4010. in training.
  4011. Figure
  4012. \begin_inset CommandInset ref
  4013. LatexCommand ref
  4014. reference "fig:ma-bx-frma-frma"
  4015. plural "false"
  4016. caps "false"
  4017. noprefix "false"
  4018. \end_inset
  4019. shows a similar MA plot comparing 2 different fRMA normalizations, correspondin
  4020. g to the 6th row of Figure
  4021. \begin_inset CommandInset ref
  4022. LatexCommand ref
  4023. reference "fig:m-bx-violin"
  4024. plural "false"
  4025. caps "false"
  4026. noprefix "false"
  4027. \end_inset
  4028. .
  4029. The MA plot is very tightly centered around zero with no visible trend.
  4030. Figures
  4031. \begin_inset CommandInset ref
  4032. LatexCommand ref
  4033. reference "fig:m-pax-violin"
  4034. plural "false"
  4035. caps "false"
  4036. noprefix "false"
  4037. \end_inset
  4038. ,
  4039. \begin_inset CommandInset ref
  4040. LatexCommand ref
  4041. reference "fig:MA-PAX-rma-frma"
  4042. plural "false"
  4043. caps "false"
  4044. noprefix "false"
  4045. \end_inset
  4046. , and
  4047. \begin_inset CommandInset ref
  4048. LatexCommand ref
  4049. reference "fig:MA-PAX-frma-frma"
  4050. plural "false"
  4051. caps "false"
  4052. noprefix "false"
  4053. \end_inset
  4054. show exactly the same information for the blood samples, once again comparing
  4055. the normalized expression values between normalizations for all probe sets
  4056. across 20 randomly selected test arrays.
  4057. Once again, there is a wider distribution of log ratios between RMA-normalized
  4058. values and fRMA-normalized values, and a much tighter distribution when comparing
  4059. different fRMA normalizations to each other, indicating that the fRMA training
  4060. process is robust to random batch downsampling for the blood samples as
  4061. well.
  4062. \end_layout
  4063. \begin_layout Standard
  4064. \begin_inset Float figure
  4065. wide false
  4066. sideways false
  4067. status collapsed
  4068. \begin_layout Plain Layout
  4069. \align center
  4070. \begin_inset Graphics
  4071. filename graphics/frma-pax-bx/M-PAX-violin.pdf
  4072. lyxscale 40
  4073. height 80theight%
  4074. groupId m-violin
  4075. \end_inset
  4076. \end_layout
  4077. \begin_layout Plain Layout
  4078. \begin_inset Caption Standard
  4079. \begin_layout Plain Layout
  4080. \begin_inset CommandInset label
  4081. LatexCommand label
  4082. name "fig:m-pax-violin"
  4083. \end_inset
  4084. \series bold
  4085. Violin plot of log ratios between normalizations for 20 blood samples.
  4086. \series default
  4087. Each of 20 randomly selected blood samples was normalized with RMA and with
  4088. 5 different sets of fRMA vectors.
  4089. This shows the distribution of log ratios between normalized expression
  4090. values, aggregated across all 20 arrays.
  4091. \end_layout
  4092. \end_inset
  4093. \end_layout
  4094. \end_inset
  4095. \end_layout
  4096. \begin_layout Standard
  4097. \begin_inset Float figure
  4098. wide false
  4099. sideways false
  4100. status collapsed
  4101. \begin_layout Plain Layout
  4102. \align center
  4103. \begin_inset Graphics
  4104. filename graphics/frma-pax-bx/MA-PAX-RMA.fRMA.pdf
  4105. lyxscale 50
  4106. width 100text%
  4107. groupId ma-frma
  4108. \end_inset
  4109. \end_layout
  4110. \begin_layout Plain Layout
  4111. \begin_inset Caption Standard
  4112. \begin_layout Plain Layout
  4113. \begin_inset CommandInset label
  4114. LatexCommand label
  4115. name "fig:MA-PAX-rma-frma"
  4116. \end_inset
  4117. \series bold
  4118. Representative MA plot comparing RMA against fRMA for 20 blood samples.
  4119. \series default
  4120. Averages and log ratios were computed for every probe in each of 20 blood
  4121. samples between RMA normalization and fRMA.
  4122. Density of points is represented by darkness of shading, and individual
  4123. outlier points are plotted.
  4124. \end_layout
  4125. \end_inset
  4126. \end_layout
  4127. \begin_layout Plain Layout
  4128. \end_layout
  4129. \end_inset
  4130. \end_layout
  4131. \begin_layout Standard
  4132. \begin_inset Float figure
  4133. wide false
  4134. sideways false
  4135. status collapsed
  4136. \begin_layout Plain Layout
  4137. \align center
  4138. \begin_inset Graphics
  4139. filename graphics/frma-pax-bx/MA-PAX-fRMA.fRMA.pdf
  4140. lyxscale 50
  4141. width 100text%
  4142. groupId ma-frma
  4143. \end_inset
  4144. \end_layout
  4145. \begin_layout Plain Layout
  4146. \begin_inset Caption Standard
  4147. \begin_layout Plain Layout
  4148. \begin_inset CommandInset label
  4149. LatexCommand label
  4150. name "fig:MA-PAX-frma-frma"
  4151. \end_inset
  4152. \series bold
  4153. Representative MA plot comparing different fRMA vectors for 20 blood samples.
  4154. \series default
  4155. Averages and log ratios were computed for every probe in each of 20 blood
  4156. samples between fRMA normalizations using vectors from two different batch
  4157. samplings.
  4158. Density of points is represented by darkness of shading, and individual
  4159. outlier points are plotted.
  4160. \end_layout
  4161. \end_inset
  4162. \end_layout
  4163. \end_inset
  4164. \end_layout
  4165. \begin_layout Standard
  4166. \begin_inset ERT
  4167. status collapsed
  4168. \begin_layout Plain Layout
  4169. \backslash
  4170. FloatBarrier
  4171. \end_layout
  4172. \end_inset
  4173. \end_layout
  4174. \begin_layout Subsection
  4175. SVA, voom, and array weights improve model fit for methylation array data
  4176. \end_layout
  4177. \begin_layout Standard
  4178. \begin_inset Float figure
  4179. wide false
  4180. sideways false
  4181. status collapsed
  4182. \begin_layout Plain Layout
  4183. \align center
  4184. \begin_inset Flex TODO Note (inline)
  4185. status open
  4186. \begin_layout Plain Layout
  4187. Fix axis labels:
  4188. \begin_inset Quotes eld
  4189. \end_inset
  4190. log2 M-value
  4191. \begin_inset Quotes erd
  4192. \end_inset
  4193. is redundant because M-values are already log scale
  4194. \end_layout
  4195. \end_inset
  4196. \end_layout
  4197. \begin_layout Plain Layout
  4198. \align center
  4199. \begin_inset Graphics
  4200. filename graphics/methylvoom/unadj.dupcor/meanvar-trends-PAGE1-CROP-RASTER.png
  4201. lyxscale 15
  4202. width 100col%
  4203. groupId raster-600ppi
  4204. \end_inset
  4205. \end_layout
  4206. \begin_layout Plain Layout
  4207. \begin_inset Caption Standard
  4208. \begin_layout Plain Layout
  4209. \series bold
  4210. \begin_inset CommandInset label
  4211. LatexCommand label
  4212. name "fig:meanvar-basic"
  4213. \end_inset
  4214. Mean-variance trend for analysis A.
  4215. \series default
  4216. The log2(standard deviation) for each probe is plotted against the probe's
  4217. average M-value across all samples as a black point, with some transparency
  4218. to make overplotting more visible, since there are about 450,000 points.
  4219. Density of points is also indicated by the dark blue contour lines.
  4220. The prior variance trend estimated by eBayes is shown in light blue, while
  4221. the lowess trend of the points is shown in red.
  4222. \end_layout
  4223. \end_inset
  4224. \end_layout
  4225. \end_inset
  4226. \end_layout
  4227. \begin_layout Standard
  4228. Figure
  4229. \begin_inset CommandInset ref
  4230. LatexCommand ref
  4231. reference "fig:meanvar-basic"
  4232. plural "false"
  4233. caps "false"
  4234. noprefix "false"
  4235. \end_inset
  4236. shows the relationship between the mean M-value and the standard deviation
  4237. calculated for each probe in the methylation array data set.
  4238. A few features of the data are apparent.
  4239. First, the data are very strongly bimodal, with peaks in the density around
  4240. M-values of +4 and -4.
  4241. These modes correspond to methylation sites that are nearly 100% methylated
  4242. and nearly 100% unmethylated, respectively.
  4243. The strong bimodality indicates that a majority of probes interrogate sites
  4244. that fall into one of these two categories.
  4245. The points in between these modes represent sites that are either partially
  4246. methylated in many samples, or are fully methylated in some samples and
  4247. fully unmethylated in other samples, or some combination.
  4248. The next visible feature of the data is the W-shaped variance trend.
  4249. The upticks in the variance trend on either side are expected, based on
  4250. the sigmoid transformation exaggerating small differences at extreme M-values
  4251. (Figure
  4252. \begin_inset CommandInset ref
  4253. LatexCommand ref
  4254. reference "fig:Sigmoid-beta-m-mapping"
  4255. plural "false"
  4256. caps "false"
  4257. noprefix "false"
  4258. \end_inset
  4259. ).
  4260. However, the uptick in the center is interesting: it indicates that sites
  4261. that are not constitutively methylated or unmethylated have a higher
  4262. variance.
  4263. This could be a genuine biological effect, or it could be spurious noise
  4264. that is only observable at sites with varying methylation.
  4265. \end_layout
  4266. \begin_layout Standard
  4267. \begin_inset Float figure
  4268. wide false
  4269. sideways false
  4270. status open
  4271. \begin_layout Plain Layout
  4272. \begin_inset Graphics
  4273. filename graphics/methylvoom/unadj.dupcor.sva.aw/meanvar-trends-PAGE1-CROP-RASTER.png
  4274. lyxscale 15
  4275. width 100col%
  4276. groupId raster-600ppi
  4277. \end_inset
  4278. \end_layout
  4279. \begin_layout Plain Layout
  4280. \begin_inset Caption Standard
  4281. \begin_layout Plain Layout
  4282. \series bold
  4283. \begin_inset CommandInset label
  4284. LatexCommand label
  4285. name "fig:meanvar-sva-aw"
  4286. \end_inset
  4287. Mean-variance trend for analysis B.
  4288. \series default
  4289. Interpretation is as in Figure
  4290. \begin_inset CommandInset ref
  4291. LatexCommand ref
  4292. reference "fig:meanvar-basic"
  4293. plural "false"
  4294. caps "false"
  4295. noprefix "false"
  4296. \end_inset
  4297. .
  4298. \end_layout
  4299. \end_inset
  4300. \end_layout
  4301. \end_inset
  4302. \end_layout
  4303. \begin_layout Standard
  4304. In Figure
  4305. \begin_inset CommandInset ref
  4306. LatexCommand ref
  4307. reference "fig:meanvar-sva-aw"
  4308. plural "false"
  4309. caps "false"
  4310. noprefix "false"
  4311. \end_inset
  4312. , we see the mean-variance trend for the same methylation array data, this
  4313. time with surrogate variables and sample quality weights estimated from
  4314. the data and included in the model.
  4315. As expected, the overall average variance is smaller, since the surrogate
  4316. variables account for some of the variance.
  4317. In addition, the uptick in variance in the middle of the M-value range
  4318. has disappeared, turning the W shape into a wide U shape.
  4319. This indicates that the excess variance in the probes with intermediate
  4320. M-values was explained by systematic variations not correlated with known
  4321. covariates, and these variations were modeled by the surrogate variables.
  4322. The result is a nearly flat variance trend for the entire intermediate
  4323. M-value range from about -3 to +3.
  4324. In contrast, the excess variance at the extremes was not
  4325. \begin_inset Quotes eld
  4326. \end_inset
  4327. absorbed
  4328. \begin_inset Quotes erd
  4329. \end_inset
  4330. by the surrogate variables and remains in the plot, indicating that this
  4331. variation has no systematic component: probes with extreme M-values are
  4332. uniformly more variable across all samples, as expected.
  4333. \end_layout
  4334. \begin_layout Standard
  4335. \begin_inset Float figure
  4336. wide false
  4337. sideways false
  4338. status collapsed
  4339. \begin_layout Plain Layout
  4340. \begin_inset Graphics
  4341. filename graphics/methylvoom/unadj.dupcor.sva.voomaw/meanvar-trends-PAGE2-CROP-RASTER.png
  4342. lyxscale 15
  4343. width 100col%
  4344. groupId raster-600ppi
  4345. \end_inset
  4346. \end_layout
  4347. \begin_layout Plain Layout
  4348. \begin_inset Caption Standard
  4349. \begin_layout Plain Layout
  4350. \series bold
  4351. \begin_inset CommandInset label
  4352. LatexCommand label
  4353. name "fig:meanvar-sva-voomaw"
  4354. \end_inset
  4355. Mean-variance trend after voom modeling in analysis C.
  4356. \series default
  4357. Interpretation is as in Figure
  4358. \begin_inset CommandInset ref
  4359. LatexCommand ref
  4360. reference "fig:meanvar-basic"
  4361. plural "false"
  4362. caps "false"
  4363. noprefix "false"
  4364. \end_inset
  4365. .
  4366. \end_layout
  4367. \end_inset
  4368. \end_layout
  4369. \end_inset
  4370. \end_layout
  4371. \begin_layout Standard
  4372. Figure
  4373. \begin_inset CommandInset ref
  4374. LatexCommand ref
  4375. reference "fig:meanvar-sva-voomaw"
  4376. plural "false"
  4377. caps "false"
  4378. noprefix "false"
  4379. \end_inset
  4380. shows the mean-variance trend after fitting the model with the observation
  4381. weights assigned by voom based on the mean-variance trend shown in Figure
  4382. \begin_inset CommandInset ref
  4383. LatexCommand ref
  4384. reference "fig:meanvar-sva-aw"
  4385. plural "false"
  4386. caps "false"
  4387. noprefix "false"
  4388. \end_inset
  4389. .
  4390. As expected, the weights exactly counteract the trend in the data, resulting
  4391. in a nearly flat trend centered vertically at 1 (i.e.
  4392. 0 on the log scale).
  4393. This shows that the observations with extreme M-values have been appropriately
  4394. down-weighted to account for the fact that the noise in those observations
  4395. has been amplified by the non-linear M-value transformation.
  4396. In turn, this gives relatively more weight to observations in the middle
  4397. region, which are more likely to correspond to probes measuring interesting
  4398. biology (not constitutively methylated or unmethylated).
  4399. \end_layout
  4400. \begin_layout Standard
  4401. \begin_inset Float table
  4402. wide false
  4403. sideways false
  4404. status collapsed
  4405. \begin_layout Plain Layout
  4406. \align center
  4407. \begin_inset Tabular
  4408. <lyxtabular version="3" rows="5" columns="3">
  4409. <features tabularvalignment="middle">
  4410. <column alignment="center" valignment="top">
  4411. <column alignment="center" valignment="top">
  4412. <column alignment="center" valignment="top">
  4413. <row>
  4414. <cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" usebox="none">
  4415. \begin_inset Text
  4416. \begin_layout Plain Layout
  4417. Covariate
  4418. \end_layout
  4419. \end_inset
  4420. </cell>
  4421. <cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" usebox="none">
  4422. \begin_inset Text
  4423. \begin_layout Plain Layout
  4424. Test used
  4425. \end_layout
  4426. \end_inset
  4427. </cell>
  4428. <cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" rightline="true" usebox="none">
  4429. \begin_inset Text
  4430. \begin_layout Plain Layout
  4431. p-value
  4432. \end_layout
  4433. \end_inset
  4434. </cell>
  4435. </row>
  4436. <row>
  4437. <cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
  4438. \begin_inset Text
  4439. \begin_layout Plain Layout
  4440. Transplant Status
  4441. \end_layout
  4442. \end_inset
  4443. </cell>
  4444. <cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
  4445. \begin_inset Text
  4446. \begin_layout Plain Layout
  4447. F-test
  4448. \end_layout
  4449. \end_inset
  4450. </cell>
  4451. <cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
  4452. \begin_inset Text
  4453. \begin_layout Plain Layout
  4454. 0.404
  4455. \end_layout
  4456. \end_inset
  4457. </cell>
  4458. </row>
  4459. <row>
  4460. <cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
  4461. \begin_inset Text
  4462. \begin_layout Plain Layout
  4463. Diabetes Diagnosis
  4464. \end_layout
  4465. \end_inset
  4466. </cell>
  4467. <cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
  4468. \begin_inset Text
  4469. \begin_layout Plain Layout
  4470. t-test
  4471. \end_layout
  4472. \end_inset
  4473. </cell>
  4474. <cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
  4475. \begin_inset Text
  4476. \begin_layout Plain Layout
  4477. 0.00106
  4478. \end_layout
  4479. \end_inset
  4480. </cell>
  4481. </row>
  4482. <row>
  4483. <cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
  4484. \begin_inset Text
  4485. \begin_layout Plain Layout
  4486. Sex
  4487. \end_layout
  4488. \end_inset
  4489. </cell>
  4490. <cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
  4491. \begin_inset Text
  4492. \begin_layout Plain Layout
  4493. t-test
  4494. \end_layout
  4495. \end_inset
  4496. </cell>
  4497. <cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
  4498. \begin_inset Text
  4499. \begin_layout Plain Layout
  4500. 0.148
  4501. \end_layout
  4502. \end_inset
  4503. </cell>
  4504. </row>
  4505. <row>
  4506. <cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" usebox="none">
  4507. \begin_inset Text
  4508. \begin_layout Plain Layout
  4509. Age
  4510. \end_layout
  4511. \end_inset
  4512. </cell>
  4513. <cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" usebox="none">
  4514. \begin_inset Text
  4515. \begin_layout Plain Layout
  4516. linear regression
  4517. \end_layout
  4518. \end_inset
  4519. </cell>
  4520. <cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" rightline="true" usebox="none">
  4521. \begin_inset Text
  4522. \begin_layout Plain Layout
  4523. 0.212
  4524. \end_layout
  4525. \end_inset
  4526. </cell>
  4527. </row>
  4528. </lyxtabular>
  4529. \end_inset
  4530. \end_layout
  4531. \begin_layout Plain Layout
  4532. \begin_inset Caption Standard
  4533. \begin_layout Plain Layout
  4534. \series bold
  4535. \begin_inset CommandInset label
  4536. LatexCommand label
  4537. name "tab:weight-covariate-tests"
  4538. \end_inset
  4539. Association of sample weights with clinical covariates in methylation array
  4540. data.
  4541. \series default
  4542. Computed sample quality log weights were tested for significant association
  4543. with each of the variables in the model (1st column).
  4544. An appropriate test was selected for each variable (2nd column).
  4545. P-values for significant association are shown in the 3rd column.
  4546. \end_layout
  4547. \end_inset
  4548. \end_layout
  4549. \end_inset
  4550. \end_layout
  4551. \begin_layout Standard
  4552. \begin_inset Flex TODO Note (inline)
  4553. status open
  4554. \begin_layout Plain Layout
  4555. Redo the sample weight boxplot with notches and without fill colors (and
  4556. update the legend)
  4557. \end_layout
  4558. \end_inset
  4559. \end_layout
  4560. \begin_layout Standard
  4561. \begin_inset Float figure
  4562. wide false
  4563. sideways false
  4564. status collapsed
  4565. \begin_layout Plain Layout
  4566. \begin_inset Graphics
  4567. filename graphics/methylvoom/unadj.dupcor.sva.voomaw/sample-weights-PAGE3-CROP.pdf
  4568. \end_inset
  4569. \end_layout
  4570. \begin_layout Plain Layout
  4571. \begin_inset Caption Standard
  4572. \begin_layout Plain Layout
  4573. \begin_inset CommandInset label
  4574. LatexCommand label
  4575. name "fig:diabetes-sample-weights"
  4576. \end_inset
  4577. \series bold
  4578. Boxplot of sample quality weights grouped by diabetes diagnosis.
  4579. \series default
  4580. Samples were grouped based on diabetes diagnosis, and the distribution of
  4581. sample quality weights for each diagnosis was plotted.
  4582. \end_layout
  4583. \end_inset
  4584. \end_layout
  4585. \begin_layout Plain Layout
  4586. \end_layout
  4587. \end_inset
  4588. \end_layout
  4589. \begin_layout Standard
  4590. To determine whether any of the known experimental factors had an impact
  4591. on data quality, the sample quality weights estimated from the data were
  4592. tested for association with each of the experimental factors (Table
  4593. \begin_inset CommandInset ref
  4594. LatexCommand ref
  4595. reference "tab:weight-covariate-tests"
  4596. plural "false"
  4597. caps "false"
  4598. noprefix "false"
  4599. \end_inset
  4600. ).
  4601. Diabetes diagnosis was found to have a potentially significant association
  4602. with the sample weights, with a t-test p-value of
  4603. \begin_inset Formula $1.06\times10^{-3}$
  4604. \end_inset
  4605. .
  4606. Figure
  4607. \begin_inset CommandInset ref
  4608. LatexCommand ref
  4609. reference "fig:diabetes-sample-weights"
  4610. plural "false"
  4611. caps "false"
  4612. noprefix "false"
  4613. \end_inset
  4614. shows the distribution of sample weights grouped by diabetes diagnosis.
  4615. The samples from patients with Type 2 diabetes were assigned significantly
  4616. lower weights than those from patients with Type 1 diabetes.
  4617. This indicates that the Type 2 diabetes samples had an overall higher variance
  4618. on average across all probes.
  4619. \end_layout
  4620. \begin_layout Standard
  4621. \begin_inset Float table
  4622. wide false
  4623. sideways false
  4624. status collapsed
  4625. \begin_layout Plain Layout
  4626. \align center
  4627. \begin_inset Flex TODO Note (inline)
  4628. status open
  4629. \begin_layout Plain Layout
  4630. Consider transposing this table and the next one
  4631. \end_layout
  4632. \end_inset
  4633. \end_layout
  4634. \begin_layout Plain Layout
  4635. \align center
  4636. \begin_inset Tabular
  4637. <lyxtabular version="3" rows="5" columns="4">
  4638. <features tabularvalignment="middle">
  4639. <column alignment="center" valignment="top">
  4640. <column alignment="center" valignment="top">
  4641. <column alignment="center" valignment="top">
  4642. <column alignment="center" valignment="top">
  4643. <row>
  4644. <cell alignment="center" valignment="top" usebox="none">
  4645. \begin_inset Text
  4646. \begin_layout Plain Layout
  4647. \end_layout
  4648. \end_inset
  4649. </cell>
  4650. <cell multicolumn="1" alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
  4651. \begin_inset Text
  4652. \begin_layout Plain Layout
  4653. Analysis
  4654. \end_layout
  4655. \end_inset
  4656. </cell>
  4657. <cell multicolumn="2" alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
  4658. \begin_inset Text
  4659. \begin_layout Plain Layout
  4660. \end_layout
  4661. \end_inset
  4662. </cell>
  4663. <cell multicolumn="2" alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
  4664. \begin_inset Text
  4665. \begin_layout Plain Layout
  4666. \end_layout
  4667. \end_inset
  4668. </cell>
  4669. </row>
  4670. <row>
  4671. <cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" usebox="none">
  4672. \begin_inset Text
  4673. \begin_layout Plain Layout
  4674. Contrast
  4675. \end_layout
  4676. \end_inset
  4677. </cell>
  4678. <cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" usebox="none">
  4679. \begin_inset Text
  4680. \begin_layout Plain Layout
  4681. A
  4682. \end_layout
  4683. \end_inset
  4684. </cell>
  4685. <cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" usebox="none">
  4686. \begin_inset Text
  4687. \begin_layout Plain Layout
  4688. B
  4689. \end_layout
  4690. \end_inset
  4691. </cell>
  4692. <cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" rightline="true" usebox="none">
  4693. \begin_inset Text
  4694. \begin_layout Plain Layout
  4695. C
  4696. \end_layout
  4697. \end_inset
  4698. </cell>
  4699. </row>
  4700. <row>
  4701. <cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
  4702. \begin_inset Text
  4703. \begin_layout Plain Layout
  4704. TX vs AR
  4705. \end_layout
  4706. \end_inset
  4707. </cell>
  4708. <cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
  4709. \begin_inset Text
  4710. \begin_layout Plain Layout
  4711. 0
  4712. \end_layout
  4713. \end_inset
  4714. </cell>
  4715. <cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
  4716. \begin_inset Text
  4717. \begin_layout Plain Layout
  4718. 25
  4719. \end_layout
  4720. \end_inset
  4721. </cell>
  4722. <cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
  4723. \begin_inset Text
  4724. \begin_layout Plain Layout
  4725. 22
  4726. \end_layout
  4727. \end_inset
  4728. </cell>
  4729. </row>
  4730. <row>
  4731. <cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
  4732. \begin_inset Text
  4733. \begin_layout Plain Layout
  4734. TX vs ADNR
  4735. \end_layout
  4736. \end_inset
  4737. </cell>
  4738. <cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
  4739. \begin_inset Text
  4740. \begin_layout Plain Layout
  4741. 7
  4742. \end_layout
  4743. \end_inset
  4744. </cell>
  4745. <cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
  4746. \begin_inset Text
  4747. \begin_layout Plain Layout
  4748. 338
  4749. \end_layout
  4750. \end_inset
  4751. </cell>
  4752. <cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
  4753. \begin_inset Text
  4754. \begin_layout Plain Layout
  4755. 369
  4756. \end_layout
  4757. \end_inset
  4758. </cell>
  4759. </row>
  4760. <row>
  4761. <cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" usebox="none">
  4762. \begin_inset Text
  4763. \begin_layout Plain Layout
  4764. TX vs CAN
  4765. \end_layout
  4766. \end_inset
  4767. </cell>
  4768. <cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" usebox="none">
  4769. \begin_inset Text
  4770. \begin_layout Plain Layout
  4771. 0
  4772. \end_layout
  4773. \end_inset
  4774. </cell>
  4775. <cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" usebox="none">
  4776. \begin_inset Text
  4777. \begin_layout Plain Layout
  4778. 231
  4779. \end_layout
  4780. \end_inset
  4781. </cell>
  4782. <cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" rightline="true" usebox="none">
  4783. \begin_inset Text
  4784. \begin_layout Plain Layout
  4785. 278
  4786. \end_layout
  4787. \end_inset
  4788. </cell>
  4789. </row>
  4790. </lyxtabular>
  4791. \end_inset
  4792. \end_layout
  4793. \begin_layout Plain Layout
  4794. \begin_inset Caption Standard
  4795. \begin_layout Plain Layout
  4796. \begin_inset CommandInset label
  4797. LatexCommand label
  4798. name "tab:methyl-num-signif"
  4799. \end_inset
  4800. \series bold
  4801. Number of probes significant at 10% FDR for each contrast in each analysis.
  4802. \series default
  4803. For each of the analyses in Table
  4804. \begin_inset CommandInset ref
  4805. LatexCommand ref
  4806. reference "tab:Summary-of-meth-analysis"
  4807. plural "false"
  4808. caps "false"
  4809. noprefix "false"
  4810. \end_inset
  4811. , the table shows the number of probes called significantly differentially
  4812. methylated at a threshold of 10% FDR for each comparison between TX and
  4813. the other 3 transplant statuses.
  4814. \end_layout
  4815. \end_inset
  4816. \end_layout
  4817. \end_inset
  4818. \end_layout
  4819. \begin_layout Standard
  4820. \begin_inset Float table
  4821. wide false
  4822. sideways false
  4823. status collapsed
  4824. \begin_layout Plain Layout
  4825. \align center
  4826. \begin_inset Tabular
  4827. <lyxtabular version="3" rows="5" columns="4">
  4828. <features tabularvalignment="middle">
  4829. <column alignment="center" valignment="top">
  4830. <column alignment="center" valignment="top">
  4831. <column alignment="center" valignment="top">
  4832. <column alignment="center" valignment="top">
  4833. <row>
  4834. <cell alignment="center" valignment="top" usebox="none">
  4835. \begin_inset Text
  4836. \begin_layout Plain Layout
  4837. \end_layout
  4838. \end_inset
  4839. </cell>
  4840. <cell multicolumn="1" alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
  4841. \begin_inset Text
  4842. \begin_layout Plain Layout
  4843. Analysis
  4844. \end_layout
  4845. \end_inset
  4846. </cell>
  4847. <cell multicolumn="2" alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
  4848. \begin_inset Text
  4849. \begin_layout Plain Layout
  4850. \end_layout
  4851. \end_inset
  4852. </cell>
  4853. <cell multicolumn="2" alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
  4854. \begin_inset Text
  4855. \begin_layout Plain Layout
  4856. \end_layout
  4857. \end_inset
  4858. </cell>
  4859. </row>
  4860. <row>
  4861. <cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" usebox="none">
  4862. \begin_inset Text
  4863. \begin_layout Plain Layout
  4864. Contrast
  4865. \end_layout
  4866. \end_inset
  4867. </cell>
  4868. <cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" usebox="none">
  4869. \begin_inset Text
  4870. \begin_layout Plain Layout
  4871. A
  4872. \end_layout
  4873. \end_inset
  4874. </cell>
  4875. <cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" usebox="none">
  4876. \begin_inset Text
  4877. \begin_layout Plain Layout
  4878. B
  4879. \end_layout
  4880. \end_inset
  4881. </cell>
  4882. <cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" rightline="true" usebox="none">
  4883. \begin_inset Text
  4884. \begin_layout Plain Layout
  4885. C
  4886. \end_layout
  4887. \end_inset
  4888. </cell>
  4889. </row>
  4890. <row>
  4891. <cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
  4892. \begin_inset Text
  4893. \begin_layout Plain Layout
  4894. TX vs AR
  4895. \end_layout
  4896. \end_inset
  4897. </cell>
  4898. <cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
  4899. \begin_inset Text
  4900. \begin_layout Plain Layout
  4901. 0
  4902. \end_layout
  4903. \end_inset
  4904. </cell>
  4905. <cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
  4906. \begin_inset Text
  4907. \begin_layout Plain Layout
  4908. 10,063
  4909. \end_layout
  4910. \end_inset
  4911. </cell>
  4912. <cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
  4913. \begin_inset Text
  4914. \begin_layout Plain Layout
  4915. 11,225
  4916. \end_layout
  4917. \end_inset
  4918. </cell>
  4919. </row>
  4920. <row>
  4921. <cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
  4922. \begin_inset Text
  4923. \begin_layout Plain Layout
  4924. TX vs ADNR
  4925. \end_layout
  4926. \end_inset
  4927. </cell>
  4928. <cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
  4929. \begin_inset Text
  4930. \begin_layout Plain Layout
  4931. 27
  4932. \end_layout
  4933. \end_inset
  4934. </cell>
  4935. <cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
  4936. \begin_inset Text
  4937. \begin_layout Plain Layout
  4938. 12,674
  4939. \end_layout
  4940. \end_inset
  4941. </cell>
  4942. <cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
  4943. \begin_inset Text
  4944. \begin_layout Plain Layout
  4945. 13,086
  4946. \end_layout
  4947. \end_inset
  4948. </cell>
  4949. </row>
  4950. <row>
  4951. <cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" usebox="none">
  4952. \begin_inset Text
  4953. \begin_layout Plain Layout
  4954. TX vs CAN
  4955. \end_layout
  4956. \end_inset
  4957. </cell>
  4958. <cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" usebox="none">
  4959. \begin_inset Text
  4960. \begin_layout Plain Layout
  4961. 966
  4962. \end_layout
  4963. \end_inset
  4964. </cell>
  4965. <cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" usebox="none">
  4966. \begin_inset Text
  4967. \begin_layout Plain Layout
  4968. 20,039
  4969. \end_layout
  4970. \end_inset
  4971. </cell>
  4972. <cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" rightline="true" usebox="none">
  4973. \begin_inset Text
  4974. \begin_layout Plain Layout
  4975. 20,955
  4976. \end_layout
  4977. \end_inset
  4978. </cell>
  4979. </row>
  4980. </lyxtabular>
  4981. \end_inset
  4982. \end_layout
  4983. \begin_layout Plain Layout
  4984. \begin_inset Caption Standard
  4985. \begin_layout Plain Layout
  4986. \begin_inset CommandInset label
  4987. LatexCommand label
  4988. name "tab:methyl-est-nonnull"
  4989. \end_inset
  4990. \series bold
  4991. Estimated number of non-null tests for each contrast in each analysis.
  4992. \series default
  4993. For each of the analyses in Table
  4994. \begin_inset CommandInset ref
  4995. LatexCommand ref
  4996. reference "tab:Summary-of-meth-analysis"
  4997. plural "false"
  4998. caps "false"
  4999. noprefix "false"
  5000. \end_inset
  5001. , the table shows the number of probes estimated to be differentially methylated
  5002. between TX and the other 3 transplant statuses.
  5003. \end_layout
  5004. \end_inset
  5005. \end_layout
  5006. \end_inset
  5007. \end_layout
  5008. \begin_layout Standard
  5009. \begin_inset Float figure
  5010. wide false
  5011. sideways false
  5012. status collapsed
  5013. \begin_layout Plain Layout
  5014. \begin_inset Flex TODO Note (inline)
  5015. status open
  5016. \begin_layout Plain Layout
  5017. Re-generate p-value histograms for all relevant contrasts in a single page,
  5018. then write an appropriate legend.
  5019. \end_layout
  5020. \end_inset
  5021. \end_layout
  5022. \begin_layout Plain Layout
  5023. \align center
  5024. \series bold
  5025. [Figure goes here]
  5026. \end_layout
  5027. \begin_layout Plain Layout
  5028. \begin_inset Caption Standard
  5029. \begin_layout Plain Layout
  5030. \series bold
  5031. \begin_inset CommandInset label
  5032. LatexCommand label
  5033. name "fig:meth-p-value-histograms"
  5034. \end_inset
  5035. Probe p-value histograms for each contrast in each analysis.
  5036. \end_layout
  5037. \end_inset
  5038. \end_layout
  5039. \begin_layout Plain Layout
  5040. \end_layout
  5041. \end_inset
  5042. \end_layout
  5043. \begin_layout Standard
  5044. Table
  5045. \begin_inset CommandInset ref
  5046. LatexCommand ref
  5047. reference "tab:methyl-num-signif"
  5048. plural "false"
  5049. caps "false"
  5050. noprefix "false"
  5051. \end_inset
  5052. shows the number of significantly differentially methylated probes reported
  5053. by each analysis for each comparison of interest at an FDR of 10%.
  5054. As expected, the more elaborate analyses, B and C, report more significant
  5055. probes than the more basic analysis A, consistent with the conclusions
  5056. above that the data contain hidden systematic variations that must be modeled.
  5057. Table
  5058. \begin_inset CommandInset ref
  5059. LatexCommand ref
  5060. reference "tab:methyl-est-nonnull"
  5061. plural "false"
  5062. caps "false"
  5063. noprefix "false"
  5064. \end_inset
  5065. shows the estimated number of differentially methylated probes for each test
  5066. from each analysis.
  5067. This was computed by estimating the proportion of null hypotheses that
  5068. were true using the method of
  5069. \begin_inset CommandInset citation
  5070. LatexCommand cite
  5071. key "Phipson2013"
  5072. literal "false"
  5073. \end_inset
  5074. and subtracting that fraction from the total number of probes, yielding
  5075. an estimate of the number of null hypotheses that are false based on the
  5076. distribution of p-values across the entire dataset.
  5077. Note that this does not identify which null hypotheses should be rejected
  5078. (i.e.
  5079. which probes are significant); it only estimates the true number of such
  5080. probes.
  5081. Once again, analyses B and C result in much larger estimates for the number
  5082. of differentially methylated probes.
  5083. In this case, analysis C, the only analysis that includes voom, estimates
  5084. the largest number of differentially methylated probes for all 3 contrasts.
  5085. If the assumptions of all the methods employed hold, then this represents
  5086. a gain in statistical power over the simpler analysis A.
  5087. Figure
  5088. \begin_inset CommandInset ref
  5089. LatexCommand ref
  5090. reference "fig:meth-p-value-histograms"
  5091. plural "false"
  5092. caps "false"
  5093. noprefix "false"
  5094. \end_inset
  5095. shows the p-value distributions for each test, from which the numbers in
  5096. Table
  5097. \begin_inset CommandInset ref
  5098. LatexCommand ref
  5099. reference "tab:methyl-est-nonnull"
  5100. plural "false"
  5101. caps "false"
  5102. noprefix "false"
  5103. \end_inset
  5104. were generated.
  5105. The distributions for analysis A all have a dip in density near zero, which
  5106. is a strong sign of a poor model fit.
  5107. The histograms for analyses B and C are more well-behaved, with a uniform
  5108. component stretching all the way from 0 to 1 representing the probes for
  5109. which the null hypothesis is true (no differential methylation), and a
  5110. zero-biased component representing the probes for which the null hypothesis
  5111. is false (differentially methylated).
  5112. These histograms do not indicate any major issues with the model fit.
  5113. \end_layout
  5114. \begin_layout Standard
  5115. \begin_inset Flex TODO Note (inline)
  5116. status open
  5117. \begin_layout Plain Layout
  5118. Maybe include the PCA plots before/after SVA effect subtraction?
  5119. \end_layout
  5120. \end_inset
  5121. \end_layout
  5122. \begin_layout Standard
  5123. \begin_inset ERT
  5124. status collapsed
  5125. \begin_layout Plain Layout
  5126. \backslash
  5127. FloatBarrier
  5128. \end_layout
  5129. \end_inset
  5130. \end_layout
  5131. \begin_layout Section
  5132. Discussion
  5133. \end_layout
  5134. \begin_layout Subsection
  5135. fRMA achieves clinically applicable normalization without sacrificing classifica
  5136. tion performance
  5137. \end_layout
  5138. \begin_layout Standard
  5139. As shown in Figure
  5140. \begin_inset CommandInset ref
  5141. LatexCommand ref
  5142. reference "fig:Classifier-probabilities-RMA"
  5143. plural "false"
  5144. caps "false"
  5145. noprefix "false"
  5146. \end_inset
  5147. , improper normalization, particularly separate normalization of training
  5148. and test samples, leads to unwanted biases in classification.
  5149. In a controlled experimental context, it is always possible to correct
  5150. this issue by normalizing all experimental samples together.
  5151. However, because it is not feasible to normalize all samples together in
  5152. a clinical context, a single-channel normalization is required.
  5153. \end_layout
  5154. \begin_layout Standard
  5155. The major concern in using a single-channel normalization is that non-single-cha
  5156. nnel methods can share information between arrays to improve the normalization,
  5157. and single-channel methods risk sacrificing the gains in normalization
  5158. accuracy that come from this information sharing.
  5159. In the case of RMA, this information sharing is accomplished through quantile
  5160. normalization and median polish steps.
  5161. The need for information sharing in quantile normalization can easily be
  5162. removed by learning a fixed set of quantiles from external data and normalizing
  5163. each array to these fixed quantiles, instead of the quantiles of the data
  5164. itself.
  5165. As long as the fixed quantiles are reasonable, the result will be similar
  5166. to standard RMA.
  5167. However, there is no analogous way to eliminate cross-array information
  5168. sharing in the median polish step, so fRMA replaces this with a weighted
  5169. average of probes on each array, with the weights learned from external
  5170. data.
  5171. This step of fRMA has the greatest potential to diverge from RMA in undesirable
  5172. ways.
  5173. \end_layout
  5174. \begin_layout Standard
  5175. However, when run on real data, fRMA performed at least as well as RMA in
  5176. both the internal validation and external validation tests.
  5177. This shows that fRMA can be used to normalize individual clinical samples
  5178. in a class prediction context without sacrificing the classifier performance
  5179. that would be obtained by using the more well-established RMA for normalization.
  5180. The other single-channel normalization method considered, SCAN, showed
  5181. some loss of AUC in the external validation test.
  5182. Based on these results, fRMA is the preferred normalization for clinical
  5183. samples in a class prediction context.
  5184. \end_layout
  5185. \begin_layout Subsection
  5186. Robust fRMA vectors can be generated for new array platforms
  5187. \end_layout
  5188. \begin_layout Standard
  5189. \begin_inset Flex TODO Note (inline)
  5190. status open
  5191. \begin_layout Plain Layout
  5192. Look up the exact numbers, do a find & replace for
  5193. \begin_inset Quotes eld
  5194. \end_inset
  5195. 850
  5196. \begin_inset Quotes erd
  5197. \end_inset
  5198. \end_layout
  5199. \end_inset
  5200. \end_layout
  5201. \begin_layout Standard
  5202. The published fRMA normalization vectors for the hgu133plus2 platform were
  5203. generated from a set of about 850 samples chosen from a wide range of tissues,
  5204. which the authors determined was sufficient to generate a robust set of
  5205. normalization vectors that could be applied across all tissues
  5206. \begin_inset CommandInset citation
  5207. LatexCommand cite
  5208. key "McCall2010"
  5209. literal "false"
  5210. \end_inset
  5211. .
  5212. Since we only had hthgu133pluspm data for 2 tissues of interest, our needs were
  5213. more modest.
  5214. Even using only 130 samples in 26 batches of 5 samples each for kidney
  5215. biopsies, we were able to train a robust set of fRMA normalization vectors
  5216. that were not meaningfully affected by the random selection of 5 samples
  5217. from each batch.
  5218. As expected, the training process was just as robust for the blood samples
  5219. with 230 samples in 46 batches of 5 samples each.
  5220. Because these vectors were each generated using training samples from a
  5221. single tissue, they are not suitable for general use, unlike the vectors
  5222. provided with fRMA itself.
  5223. They are purpose-built for normalizing a specific type of sample on a specific
  5224. platform.
  5225. This is a mostly acceptable limitation in the context of developing a machine
  5226. learning classifier for diagnosing a disease based on samples of a specific
  5227. tissue.
  5228. \end_layout
  5229. \begin_layout Standard
  5230. \begin_inset Flex TODO Note (inline)
  5231. status open
  5232. \begin_layout Plain Layout
  5233. How to bring up that these custom vectors were used in another project by
  5234. someone else that was never published?
  5235. \end_layout
  5236. \end_inset
  5237. \end_layout
  5238. \begin_layout Subsection
  5239. Methylation array data can be successfully analyzed using existing techniques,
  5240. but machine learning poses additional challenges
  5241. \end_layout
  5242. \begin_layout Standard
  5243. Analysis strategies B and C both yield a reasonable analysis, with
  5244. a mean-variance trend that matches the expected behavior for the non-linear
  5245. M-value transformation (Figure
  5246. \begin_inset CommandInset ref
  5247. LatexCommand ref
  5248. reference "fig:meanvar-sva-aw"
  5249. plural "false"
  5250. caps "false"
  5251. noprefix "false"
  5252. \end_inset
  5253. ) and well-behaved p-value distributions (Figure
  5254. \begin_inset CommandInset ref
  5255. LatexCommand ref
  5256. reference "fig:meth-p-value-histograms"
  5257. plural "false"
  5258. caps "false"
  5259. noprefix "false"
  5260. \end_inset
  5261. ).
  5262. These two analyses also yield similar numbers of significant probes (Table
  5263. \begin_inset CommandInset ref
  5264. LatexCommand ref
  5265. reference "tab:methyl-num-signif"
  5266. plural "false"
  5267. caps "false"
  5268. noprefix "false"
  5269. \end_inset
  5270. ) and similar estimates of the number of differentially methylated probes
  5271. (Table
  5272. \begin_inset CommandInset ref
  5273. LatexCommand ref
  5274. reference "tab:methyl-est-nonnull"
  5275. plural "false"
  5276. caps "false"
  5277. noprefix "false"
  5278. \end_inset
  5279. ).
  5280. The main difference between these two analyses is the method used to account
  5281. for the mean-variance trend.
  5282. In analysis B, the trend is estimated and applied at the probe level: each
  5283. probe's estimated variance is squeezed toward the trend using an empirical
  5284. Bayes procedure (Figure
  5285. \begin_inset CommandInset ref
  5286. LatexCommand ref
  5287. reference "fig:meanvar-sva-aw"
  5288. plural "false"
  5289. caps "false"
  5290. noprefix "false"
  5291. \end_inset
  5292. ).
  5293. In analysis C, the trend is still estimated at the probe level, but instead
  5294. of estimating a single variance value shared across all observations for
  5295. a given probe, the voom method computes an initial estimate of the variance
  5296. for each observation individually based on where its model-fitted M-value
  5297. falls on the trend line and then assigns inverse-variance weights to model
  5298. the difference in variance between observations.
  5299. An overall variance is still estimated for each probe using the same empirical
  5300. Bayes method, but now the residual trend is flat (Figure
  5301. \begin_inset CommandInset ref
  5302. LatexCommand ref
  5303. reference "fig:meanvar-sva-voomaw"
  5304. plural "false"
  5305. caps "false"
  5306. noprefix "false"
  5307. \end_inset
  5308. ), and the mean-variance trend is modeled by scaling the probe's estimated
  5309. variance for each observation using the weights computed by voom.
  5310. The difference between these two methods is analogous to the difference
  5311. between a t-test with equal variance and a t-test with unequal variance,
  5312. except that the unequal group variances used in the latter test are estimated
  5313. based on the mean-variance trend from all the probes rather than the data
  5314. for the specific probe being tested, thus stabilizing the group variance
  5315. estimates by sharing information between probes.
  5316. In practice, allowing voom to model the variance using observation weights
  5317. in this manner allows the linear model fit to concentrate statistical power
  5318. where it will do the most good.
  5319. For example, if a particular probe's M-values are always at the extreme
  5320. of the M-value range (e.g.
  5321. less than -4) for ADNR samples, but the M-values for that probe in TX and
  5322. CAN samples are within the flat region of the mean-variance trend (between
  5323. -3 and +3), voom is able to down-weight the contribution of the high-variance
  5324. M-values from the ADNR samples in order to gain more statistical power
  5325. while testing for differential methylation between TX and CAN.
  5326. In contrast, modeling the mean-variance trend only at the probe level would
  5327. combine the high-variance ADNR samples and lower-variance samples from
  5328. other conditions and estimate an intermediate variance for this probe.
  5329. In practice, analysis B shows that this approach is adequate, but the voom
  5330. approach in analysis C is at least as good on all model fit criteria and
  5331. yields a larger estimate for the number of differentially methylated genes.
  5332. \end_layout
  5333. \begin_layout Standard
  5334. The significant association of diabetes diagnosis with sample quality is
  5335. interesting.
  5336. The samples with Type 2 diabetes tended to have more variation, averaged
  5337. across all probes, than those with Type 1 diabetes.
  5338. This is consistent with the consensus that Type 2 diabetes and the associated
  5339. metabolic syndrome represent a broad dysregulation of the body's endocrine
  5340. signalling related to metabolism [citation needed].
  5341. This dysregulation could easily manifest as a greater degree of variation
  5342. in the DNA methylation patterns of affected tissues.
  5343. In contrast, Type 1 diabetes has a more specific cause and effect, so a
  5344. less variable methylation signature is expected.
  5345. \end_layout
  5346. \begin_layout Standard
  5347. This preliminary analysis suggests that some degree of differential methylation
  5348. exists between TX and each of the three types of transplant dysfunction
  5349. studied.
  5350. Hence, it may be feasible to train a classifier to diagnose transplant
  5351. dysfunction from DNA methylation array data.
  5352. However, the major importance of both SVA and sample quality weighting
  5353. for proper modeling of this data poses significant challenges for any attempt
  5354. at machine learning on data of similar quality.
  5355. While these are easily used in a modeling context with full sample information,
  5356. neither of these methods is directly applicable in a machine learning context,
  5357. where the diagnosis is not known ahead of time.
  5358. If a machine learning approach for methylation-based diagnosis is to be
  5359. pursued, it will either require machine-learning-friendly methods to address
  5360. the same systematic trends in the data that SVA and sample quality weighting
  5361. address, or it will require higher quality data with substantially less
  5362. systematic perturbation of the data.
  5363. \end_layout
  5364. \begin_layout Chapter
  5365. Globin-blocking for more effective blood RNA-seq analysis in primate animal
  5366. model
  5367. \end_layout
  5368. \begin_layout Standard
  5369. \begin_inset Flex TODO Note (inline)
  5370. status open
  5371. \begin_layout Plain Layout
  5372. Choose between above and the paper title: Optimizing yield of deep RNA sequencin
  5373. g for gene expression profiling by globin reduction of peripheral blood
  5374. samples from cynomolgus monkeys (Macaca fascicularis).
  5375. \end_layout
  5376. \end_inset
  5377. \end_layout
  5378. \begin_layout Standard
  5379. \begin_inset Flex TODO Note (inline)
  5380. status open
  5381. \begin_layout Plain Layout
  5382. Chapter author list: https://tex.stackexchange.com/questions/156862/displaying-aut
  5383. hor-for-each-chapter-in-book Every chapter gets an author list, which may
  5384. or may not be part of a citation to a published/preprinted paper.
  5385. \end_layout
  5386. \end_inset
  5387. \end_layout
  5388. \begin_layout Standard
  5389. \begin_inset Flex TODO Note (inline)
  5390. status open
  5391. \begin_layout Plain Layout
  5392. Preprint then cite the paper
  5393. \end_layout
  5394. \end_inset
  5395. \end_layout
  5396. \begin_layout Section*
  5397. Abstract
  5398. \end_layout
  5399. \begin_layout Paragraph
  5400. Background
  5401. \end_layout
  5402. \begin_layout Standard
  5403. Primate blood contains high concentrations of globin messenger RNA.
  5404. Globin reduction is a standard technique used to improve the expression
  5405. results obtained by DNA microarrays on RNA from blood samples.
  5406. However, with whole transcriptome RNA-sequencing (RNA-seq) quickly replacing
  5407. microarrays for many applications, the impact of globin reduction for RNA-seq
  5408. has not been previously studied.
  5409. Moreover, no off-the-shelf kits are available for globin reduction in nonhuman
  5410. primates.
  5411. \end_layout
  5412. \begin_layout Paragraph
  5413. Results
  5414. \end_layout
  5415. \begin_layout Standard
  5416. Here we report a protocol for RNA-seq in primate blood samples that uses
  5417. complementary oligonucleotides to block reverse transcription of the alpha
  5418. and beta globin genes.
  5419. In test samples from cynomolgus monkeys (Macaca fascicularis), this globin
  5420. blocking protocol approximately doubles the yield of informative (non-globin)
  5421. reads by greatly reducing the fraction of globin reads, while also improving
  5422. the consistency in sequencing depth between samples.
  5423. The increased yield enables detection of about 2000 more genes, significantly
  5424. increases the correlation in measured gene expression levels between samples,
  5425. and increases the sensitivity of differential gene expression tests.
  5426. \end_layout
  5427. \begin_layout Paragraph
  5428. Conclusions
  5429. \end_layout
  5430. \begin_layout Standard
  5431. These results show that globin blocking significantly improves the cost-effectiv
  5432. eness of mRNA sequencing in primate blood samples by doubling the yield
  5433. of useful reads, allowing detection of more genes, and improving the precision
  5434. of gene expression measurements.
  5435. Based on these results, a globin reducing or blocking protocol is recommended
  5436. for all RNA-seq studies of primate blood samples.
  5437. \end_layout
  5438. \begin_layout Section
  5439. Approach
  5440. \end_layout
  5441. \begin_layout Standard
  5442. \begin_inset Note Note
  5443. status open
  5444. \begin_layout Plain Layout
  5445. Consider putting some of this in the Intro chapter
  5446. \end_layout
  5447. \begin_layout Itemize
  5448. Cynomolgus monkeys as a model organism
  5449. \end_layout
  5450. \begin_deeper
  5451. \begin_layout Itemize
  5452. Highly related to humans
  5453. \end_layout
  5454. \begin_layout Itemize
  5455. Small size and short life cycle - good research animal
  5456. \end_layout
  5457. \begin_layout Itemize
  5458. Genomics resources still in development
  5459. \end_layout
  5460. \end_deeper
  5461. \begin_layout Itemize
  5462. Inadequacy of existing blood RNA-seq protocols
  5463. \end_layout
  5464. \begin_deeper
  5465. \begin_layout Itemize
  5466. Existing protocols use a separate globin pulldown step, slowing down processing
  5467. \end_layout
  5468. \end_deeper
  5469. \end_inset
  5470. \end_layout
  5471. \begin_layout Standard
  5472. Increasingly, researchers are turning to high-throughput mRNA sequencing
  5473. technologies (RNA-seq) in preference to expression microarrays for analysis
  5474. of gene expression
  5475. \begin_inset CommandInset citation
  5476. LatexCommand cite
  5477. key "Mutz2012"
  5478. literal "false"
  5479. \end_inset
  5480. .
  5481. The advantages are even greater for study of model organisms with no well-estab
  5482. lished array platforms available, such as the cynomolgus monkey (Macaca
  5483. fascicularis).
  5484. High fractions of globin mRNA are naturally present in mammalian peripheral
  5485. blood samples (up to 70% of total mRNA) and these are known to interfere
  5486. with the results of array-based expression profiling
  5487. \begin_inset CommandInset citation
  5488. LatexCommand cite
  5489. key "Winn2010"
  5490. literal "false"
  5491. \end_inset
  5492. .
  5493. The importance of globin reduction for RNA-seq of blood has only been evaluated
  5494. for a DeepSAGE protocol on human samples
  5495. \begin_inset CommandInset citation
  5496. LatexCommand cite
  5497. key "Mastrokolias2012"
  5498. literal "false"
  5499. \end_inset
  5500. .
  5501. In the present report, we evaluated globin reduction using custom blocking
  5502. oligonucleotides for deep RNA-seq of peripheral blood samples from a nonhuman
  5503. primate, cynomolgus monkey, using the Illumina technology platform.
  5504. We demonstrate that globin reduction significantly improves the cost-effectiven
  5505. ess of RNA-seq in blood samples.
  5506. Thus, our protocol offers a significant advantage to any investigator planning
  5507. to use RNA-seq for gene expression profiling of nonhuman primate blood
  5508. samples.
  5509. Our method can be generally applied to any species by designing complementary
  5510. oligonucleotide blocking probes to the globin gene sequences of that species.
  5511. Indeed, any highly expressed but biologically uninformative transcripts
  5512. can also be blocked to further increase sequencing efficiency and value
  5513. \begin_inset CommandInset citation
  5514. LatexCommand cite
  5515. key "Arnaud2016"
  5516. literal "false"
  5517. \end_inset
  5518. .
  5519. \end_layout
  5520. \begin_layout Section
  5521. Methods
  5522. \end_layout
  5523. \begin_layout Subsection
  5524. Sample collection
  5525. \end_layout
  5526. \begin_layout Standard
  5527. All research reported here was done under IACUC-approved protocols at the
  5528. University of Miami and complied with all applicable federal and state
  5529. regulations and ethical principles for nonhuman primate research.
  5530. Blood draws occurred between 16 April 2012 and 18 June 2015.
  5531. The experimental system involved intrahepatic pancreatic islet transplantation
  5532. into Cynomolgus monkeys with induced diabetes mellitus with or without
  5533. concomitant infusion of mesenchymal stem cells.
  5534. Blood was collected at serial time points before and after transplantation
  5535. into PAXgene Blood RNA tubes (PreAnalytiX/Qiagen, Valencia, CA) at the
  5536. precise volume:volume ratio of 2.5 ml whole blood into 6.9 ml of PAXgene
  5537. additive.
  5538. \end_layout
  5539. \begin_layout Subsection
  5540. Globin Blocking
  5541. \end_layout
  5542. \begin_layout Standard
  5543. Four oligonucleotides were designed to hybridize to the 3’ end of the transcript
  5544. s for Cynomolgus HBA1, HBA2 and HBB, with two hybridization sites for HBB
  5545. and two sites for HBA (the chosen sites were identical in both HBA genes).
  5546. All oligos were purchased from Sigma and were entirely composed of 2’O-Me
  5547. bases with a C3 spacer positioned at the 3’ ends to prevent any polymerase
  5548. mediated primer extension.
  5549. \end_layout
  5550. \begin_layout Quote
  5551. HBA1/2 site 1: GCCCACUCAGACUUUAUUCAAAG-C3spacer
  5552. \end_layout
  5553. \begin_layout Quote
  5554. HBA1/2 site 2: GGUGCAAGGAGGGGAGGAG-C3spacer
  5555. \end_layout
  5556. \begin_layout Quote
  5557. HBB site 1: AAUGAAAAUAAAUGUUUUUUAUUAG-C3spacer
  5558. \end_layout
  5559. \begin_layout Quote
  5560. HBB site 2: CUCAAGGCCCUUCAUAAUAUCCC-C3spacer
  5561. \end_layout
  5562. \begin_layout Subsection
  5563. RNA-seq Library Preparation
  5564. \end_layout
  5565. \begin_layout Standard
  5566. Sequencing libraries were prepared with 200ng total RNA from each sample.
  5567. Polyadenylated mRNA was selected from 200 ng aliquots of cynomolgus blood-deri
  5568. ved total RNA using Ambion Dynabeads Oligo(dT)25 beads (Invitrogen) following
  5569. manufacturer’s recommended protocol.
  5570. PolyA selected RNA was then combined with 8 pmol of HBA1/2 (site 1), 8
  5571. pmol of HBA1/2 (site 2), 12 pmol of HBB (site 1) and 12 pmol of HBB (site
  5572. 2) oligonucleotides.
  5573. In addition, 20 pmol of RT primer containing a portion of the Illumina
  5574. adapter sequence (B-oligo-dTV: GAGTTCCTTGGCACCCGAGAATTCCATTTTTTTTTTTTTTTTTTTV)
  5575. and 4 µL of 5X First Strand buffer (250 mM Tris-HCl pH 8.3, 375 mM KCl,
  5576. 15mM MgCl2) were added in a total volume of 15 µL.
  5577. The RNA was fragmented by heating this cocktail for 3 minutes at 95°C and
  5578. then placed on ice.
  5579. This was followed by the addition of 2 µL 0.1 M DTT, 1 µL RNaseOUT, 1 µL
  5580. 10mM dNTPs 10% biotin-16 aminoallyl-2’- dUTP and 10% biotin-16 aminoallyl-2’-
  5581. dCTP (TriLink Biotech, San Diego, CA), 1 µL Superscript II (200U/ µL, Thermo-Fi
  5582. sher).
  5583. A second “unblocked” library was prepared in the same way for each sample
  5584. but replacing the blocking oligos with an equivalent volume of water.
  5585. The reaction was carried out at 25°C for 15 minutes and 42°C for 40 minutes,
  5586. followed by incubation at 75°C for 10 minutes to inactivate the reverse
  5587. transcriptase.
  5588. \end_layout
  5589. \begin_layout Standard
  5590. The cDNA/RNA hybrid molecules were purified using 1.8X Ampure XP beads (Agencourt
  5591. ) following supplier’s recommended protocol.
  5592. The cDNA/RNA hybrid was eluted in 25 µL of 10 mM Tris-HCl pH 8.0, and then
  5593. bound to 25 µL of M280 Magnetic Streptavidin beads washed per recommended
  5594. protocol (Thermo-Fisher).
  5595. After 30 minutes of binding, beads were washed one time in 100 µL 0.1N NaOH
  5596. to denature and remove the bound RNA, followed by two 100 µL washes with
  5597. 1X TE buffer.
  5598. \end_layout
  5599. \begin_layout Standard
  5600. Subsequent attachment of the 5-prime Illumina A adapter was performed by
  5601. on-bead random primer extension of the following sequence (A-N8 primer:
  5602. TTCAGAGTTCTACAGTCCGACGATCNNNNNNNN).
  5603. Briefly, beads were resuspended in a 20 µL reaction containing 5 µM A-N8
  5604. primer, 40mM Tris-HCl pH 7.5, 20mM MgCl2, 50mM NaCl, 0.325U/µL Sequenase
  5605. 2.0 (Affymetrix, Santa Clara, CA), 0.0025U/µL inorganic pyrophosphatase (Affymetr
  5606. ix) and 300 µM each dNTP.
  5607. Reaction was incubated at 22°C for 30 minutes, then beads were washed 2
  5608. times with 1X TE buffer (200µL).
  5609. \end_layout
  5610. \begin_layout Standard
  5611. The magnetic streptavidin beads were resuspended in 34 µL nuclease-free
  5612. water and added directly to a PCR tube.
  5613. The two Illumina protocol-specified PCR primers were added at 0.53 µM (Illumina
  5614. TruSeq Universal Primer 1 and Illumina TruSeq barcoded PCR primer 2), along
  5615. with 40 µL 2X KAPA HiFi Hotstart ReadyMix (KAPA, Wilmington, MA) and thermocycl
  5616. ed as follows: starting with 98°C (2 min-hold); 15 cycles of 98°C, 20sec;
  5617. 60°C, 30sec; 72°C, 30sec; and finished with a 72°C (2 min-hold).
  5618. \end_layout
  5619. \begin_layout Standard
  5620. PCR products were purified with 1X Ampure Beads following manufacturer’s
  5621. recommended protocol.
  5622. Libraries were then analyzed using the Agilent TapeStation and quantitation
  5623. of desired size range was performed by “smear analysis”.
  5624. Samples were pooled in equimolar batches of 16 samples.
  5625. Pooled libraries were size selected on 2% agarose gels (E-Gel EX Agarose
  5626. Gels; Thermo-Fisher).
  5627. Products were cut between 250 and 350 bp (corresponding to insert sizes
  5628. of 130 to 230 bps).
  5629. Finished library pools were then sequenced on the Illumina NextSeq500 instrumen
  5630. t with 75 base read lengths.
  5631. \end_layout
  5632. \begin_layout Subsection
  5633. Read alignment and counting
  5634. \end_layout
  5635. \begin_layout Standard
  5636. Reads were aligned to the cynomolgus genome using STAR
  5637. \begin_inset CommandInset citation
  5638. LatexCommand cite
  5639. key "Dobin2013,Wilson2013"
  5640. literal "false"
  5641. \end_inset
  5642. .
  5643. Counts of uniquely mapped reads were obtained for every gene in each sample
  5644. with the “featureCounts” function from the Rsubread package, using each
  5645. of the three possibilities for the “strandSpecific” option: sense, antisense,
  5646. and unstranded
  5647. \begin_inset CommandInset citation
  5648. LatexCommand cite
  5649. key "Liao2014"
  5650. literal "false"
  5651. \end_inset
  5652. .
  5653. A few artifacts in the cynomolgus genome annotation complicated read counting.
  5654. First, no ortholog is annotated for alpha globin in the cynomolgus genome,
  5655. presumably because the human genome has two alpha globin genes with nearly
  5656. identical sequences, making the orthology relationship ambiguous.
  5657. However, two loci in the cynomolgus genome are annotated as “hemoglobin subunit alpha-lik
  5658. e” (LOC102136192 and LOC102136846).
  5659. LOC102136192 is annotated as a pseudogene while LOC102136846 is annotated
  5660. as protein-coding.
  5661. Our globin reduction protocol was designed to include blocking of these
  5662. two genes.
  5663. Indeed, these two genes have almost the same read counts in each library
  5664. as the properly-annotated HBB gene and much larger counts than any other
  5665. gene in the unblocked libraries, giving confidence that reads derived from
  5666. the real alpha globin are mapping to both genes.
  5667. Thus, reads from both of these loci were counted as alpha globin reads
  5668. in all further analyses.
  5669. The second artifact is a small, uncharacterized non-coding RNA gene (LOC1021365
  5670. 91), which overlaps the HBA-like gene (LOC102136192) on the opposite strand.
  5671. If counting is not performed in stranded mode (or if a non-strand-specific
  5672. sequencing protocol is used), many reads mapping to the globin gene will
  5673. be discarded as ambiguous due to their overlap with this ncRNA gene, resulting
  5674. in significant undercounting of globin reads.
  5675. Therefore, stranded sense counts were used for all further analysis in
  5676. the present study to ensure that we accurately accounted for globin transcript
  5677. reduction.
  5678. However, we note that stranded reads are not necessary for RNA-seq using
  5679. our protocol in standard practice.
  5680. \end_layout
  5681. \begin_layout Subsection
  5682. Normalization and Exploratory Data Analysis
  5683. \end_layout
  5684. \begin_layout Standard
  5685. Libraries were normalized by computing scaling factors using the edgeR package’s
  5686. Trimmed Mean of M-values method
  5687. \begin_inset CommandInset citation
  5688. LatexCommand cite
  5689. key "Robinson2010"
  5690. literal "false"
  5691. \end_inset
  5692. .
  5693. Log2 counts per million values (logCPM) were calculated using the cpm function
  5694. in edgeR for individual samples and aveLogCPM function for averages across
  5695. groups of samples, using those functions’ default prior count values to
  5696. avoid taking the logarithm of 0.
  5697. Genes were considered “present” if their average normalized logCPM values
  5698. across all libraries were at least -1.
  5699. Normalizing for gene length was unnecessary because the sequencing protocol
  5700. is 3’-biased and hence the expected read count for each gene is related
  5701. to the transcript’s copy number but not its length.
  5702. \end_layout
  5703. \begin_layout Standard
  5704. In order to assess the effect of blocking on reproducibility, Pearson and
  5705. Spearman correlation coefficients were computed between the logCPM values
  5706. for every pair of libraries within the globin-blocked (GB) and unblocked
  5707. (non-GB) groups, and edgeR's “estimateDisp” function was used to compute
  5708. negative binomial dispersions separately for the two groups
  5709. \begin_inset CommandInset citation
  5710. LatexCommand cite
  5711. key "Chen2014"
  5712. literal "false"
  5713. \end_inset
  5714. .
  5715. \end_layout
  5716. \begin_layout Subsection
  5717. Differential Expression Analysis
  5718. \end_layout
  5719. \begin_layout Standard
  5720. All tests for differential gene expression were performed using edgeR, by
  5721. first fitting a negative binomial generalized linear model to the counts
  5722. and normalization factors and then performing a quasi-likelihood F-test
  5723. with robust estimation of outlier gene dispersions
  5724. \begin_inset CommandInset citation
  5725. LatexCommand cite
  5726. key "Lund2012,Phipson2016"
  5727. literal "false"
  5728. \end_inset
  5729. .
  5730. To investigate the effects of globin blocking on each gene, an additive
  5731. model was fit to the full data with coefficients for globin blocking and
  5732. SampleID.
  5733. To test the effect of globin blocking on detection of differentially expressed
  5734. genes, the GB samples and non-GB samples were each analyzed independently
  5735. as follows: for each animal with both a pre-transplant and a post-transplant
  5736. time point in the data set, the pre-transplant sample and the earliest
  5737. post-transplant sample were selected, and all others were excluded, yielding
  5738. a pre-/post-transplant pair of samples for each animal (N=7 animals with
  5739. paired samples).
  5740. These samples were analyzed for pre-transplant vs.
  5741. post-transplant differential gene expression while controlling for inter-animal
  5742. variation using an additive model with coefficients for transplant and
  5743. animal ID.
  5744. In all analyses, p-values were adjusted using the Benjamini-Hochberg procedure
  5745. for FDR control
  5746. \begin_inset CommandInset citation
  5747. LatexCommand cite
  5748. key "Benjamini1995"
  5749. literal "false"
  5750. \end_inset
  5751. .
  5752. \end_layout
  5753. \begin_layout Standard
  5754. \begin_inset Note Note
  5755. status open
  5756. \begin_layout Itemize
  5757. New blood RNA-seq protocol to block reverse transcription of globin genes
  5758. \end_layout
  5759. \begin_layout Itemize
  5760. Blood RNA-seq time course after transplants with/without MSC infusion
  5761. \end_layout
  5762. \end_inset
  5763. \end_layout
  5764. \begin_layout Section
  5765. Results
  5766. \end_layout
  5767. \begin_layout Subsection
  5768. Globin blocking yields a larger and more consistent fraction of useful reads
  5769. \end_layout
  5770. \begin_layout Standard
  5771. The objective of the present study was to validate a new protocol for deep
  5772. RNA-seq of whole blood drawn into PAXgene tubes from cynomolgus monkeys
  5773. undergoing islet transplantation, with particular focus on minimizing the
  5774. loss of useful sequencing space to uninformative globin reads.
  5775. The details of the analysis with respect to transplant outcomes and the
  5776. impact of mesenchymal stem cell treatment will be reported in a separate
  5777. manuscript (in preparation).
  5778. To focus on the efficacy of our globin blocking protocol, 37 blood samples,
  5779. 16 from pre-transplant and 21 from post-transplant time points, were each
  5780. prepped once with and once without globin blocking oligos, and were then
  5781. sequenced on an Illumina NextSeq500 instrument.
  5782. The number of reads aligning to each gene in the cynomolgus genome was
  5783. counted.
  5784. Table 1 summarizes the distribution of read fractions among the GB and
  5785. non-GB libraries.
  5786. In the libraries with no globin blocking, globin reads made up an average
  5787. of 44.6% of total input reads, while reads assigned to all other genes made
  5788. up an average of 26.3%.
  5789. The remaining reads either aligned to intergenic regions (that include
  5790. long non-coding RNAs) or did not align with any annotated transcripts in
  5791. the current build of the cynomolgus genome.
  5792. In the GB libraries, globin reads made up only 3.48% and reads assigned
  5793. to all other genes increased to 50.4%.
  5794. Thus, globin blocking resulted in a 92.2% reduction in globin reads and
  5795. a 91.6% increase in yield of useful non-globin reads.
  5796. \end_layout
  5797. \begin_layout Standard
  5798. This reduction is not quite as efficient as the previous analysis showed
  5799. for human samples by DeepSAGE (<0.4% globin reads after globin reduction)
  5800. \begin_inset CommandInset citation
  5801. LatexCommand cite
  5802. key "Mastrokolias2012"
  5803. literal "false"
  5804. \end_inset
  5805. .
  5806. Nonetheless, this degree of globin reduction is sufficient to nearly double
  5807. the yield of useful reads.
  5808. Thus, globin blocking cuts the required sequencing effort (and costs) to
  5809. achieve a target coverage depth by almost 50%.
  5810. Consistent with this near doubling of yield, the average difference in
  5811. un-normalized logCPM across all genes between the GB libraries and non-GB
  5812. libraries is approximately 1 (mean = 1.01, median = 1.08), an overall 2-fold
  5813. increase.
  5814. Un-normalized values are used here because the TMM normalization correctly
  5815. identifies this 2-fold difference as biologically irrelevant and removes
  5816. it.
  5817. \end_layout
  5818. \begin_layout Standard
  5819. \begin_inset Float figure
  5820. wide false
  5821. sideways false
  5822. status open
  5823. \begin_layout Plain Layout
  5824. \align center
  5825. \begin_inset Graphics
  5826. filename graphics/Globin Paper/figure1 - globin-fractions.pdf
  5827. \end_inset
  5828. \end_layout
  5829. \begin_layout Plain Layout
  5830. \begin_inset Caption Standard
  5831. \begin_layout Plain Layout
  5832. \series bold
  5833. \begin_inset Argument 1
  5834. status collapsed
  5835. \begin_layout Plain Layout
  5836. Fraction of genic reads in each sample aligned to non-globin genes, with
  5837. and without globin blocking (GB).
  5838. \end_layout
  5839. \end_inset
  5840. \begin_inset CommandInset label
  5841. LatexCommand label
  5842. name "fig:Fraction-of-genic-reads"
  5843. \end_inset
  5844. Fraction of genic reads in each sample aligned to non-globin genes, with
  5845. and without globin blocking (GB).
  5846. \series default
  5847. All reads in each sequencing library were aligned to the cyno genome, and
  5848. the number of reads uniquely aligning to each gene was counted.
  5849. For each sample, counts were summed separately for all globin genes and
  5850. for the remainder of the genes (non-globin genes), and the fraction of
  5851. genic reads aligned to non-globin genes was computed.
  5852. Each point represents an individual sample.
  5853. Gray + signs indicate the means for globin-blocked libraries and unblocked
  5854. libraries.
  5855. The overall distribution for each group is represented as a notched box
  5856. plot.
  5857. Points are randomly spread vertically to avoid excessive overlapping.
  5858. \end_layout
  5859. \end_inset
  5860. \end_layout
  5861. \begin_layout Plain Layout
  5862. \end_layout
  5863. \end_inset
  5864. \end_layout
  5865. \begin_layout Standard
  5866. \begin_inset Float table
  5867. placement p
  5868. wide false
  5869. sideways true
  5870. status open
  5871. \begin_layout Plain Layout
  5872. \align center
  5873. \begin_inset Tabular
  5874. <lyxtabular version="3" rows="4" columns="7">
  5875. <features tabularvalignment="middle">
  5876. <column alignment="center" valignment="top">
  5877. <column alignment="center" valignment="top">
  5878. <column alignment="center" valignment="top">
  5879. <column alignment="center" valignment="top">
  5880. <column alignment="center" valignment="top">
  5881. <column alignment="center" valignment="top">
  5882. <column alignment="center" valignment="top">
  5883. <row>
  5884. <cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
  5885. \begin_inset Text
  5886. \begin_layout Plain Layout
  5887. \end_layout
  5888. \end_inset
  5889. </cell>
  5890. <cell multicolumn="1" alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
  5891. \begin_inset Text
  5892. \begin_layout Plain Layout
  5893. \family roman
  5894. \series medium
  5895. \shape up
  5896. \size normal
  5897. \emph off
  5898. \bar no
  5899. \strikeout off
  5900. \xout off
  5901. \uuline off
  5902. \uwave off
  5903. \noun off
  5904. \color none
  5905. Percent of Total Reads
  5906. \end_layout
  5907. \end_inset
  5908. </cell>
  5909. <cell multicolumn="2" alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
  5910. \begin_inset Text
  5911. \begin_layout Plain Layout
  5912. \end_layout
  5913. \end_inset
  5914. </cell>
  5915. <cell multicolumn="2" alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
  5916. \begin_inset Text
  5917. \begin_layout Plain Layout
  5918. \end_layout
  5919. \end_inset
  5920. </cell>
  5921. <cell multicolumn="2" alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
  5922. \begin_inset Text
  5923. \begin_layout Plain Layout
  5924. \end_layout
  5925. \end_inset
  5926. </cell>
  5927. <cell multicolumn="1" alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
  5928. \begin_inset Text
  5929. \begin_layout Plain Layout
  5930. \family roman
  5931. \series medium
  5932. \shape up
  5933. \size normal
  5934. \emph off
  5935. \bar no
  5936. \strikeout off
  5937. \xout off
  5938. \uuline off
  5939. \uwave off
  5940. \noun off
  5941. \color none
  5942. Percent of Genic Reads
  5943. \end_layout
  5944. \end_inset
  5945. </cell>
  5946. <cell multicolumn="2" alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
  5947. \begin_inset Text
  5948. \begin_layout Plain Layout
  5949. \end_layout
  5950. \end_inset
  5951. </cell>
  5952. </row>
  5953. <row>
  5954. <cell alignment="center" valignment="top" bottomline="true" leftline="true" usebox="none">
  5955. \begin_inset Text
  5956. \begin_layout Plain Layout
  5957. GB
  5958. \end_layout
  5959. \end_inset
  5960. </cell>
  5961. <cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" usebox="none">
  5962. \begin_inset Text
  5963. \begin_layout Plain Layout
  5964. \family roman
  5965. \series medium
  5966. \shape up
  5967. \size normal
  5968. \emph off
  5969. \bar no
  5970. \strikeout off
  5971. \xout off
  5972. \uuline off
  5973. \uwave off
  5974. \noun off
  5975. \color none
  5976. Non-globin Reads
  5977. \end_layout
  5978. \end_inset
  5979. </cell>
  5980. <cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" usebox="none">
  5981. \begin_inset Text
  5982. \begin_layout Plain Layout
  5983. \family roman
  5984. \series medium
  5985. \shape up
  5986. \size normal
  5987. \emph off
  5988. \bar no
  5989. \strikeout off
  5990. \xout off
  5991. \uuline off
  5992. \uwave off
  5993. \noun off
  5994. \color none
  5995. Globin Reads
  5996. \end_layout
  5997. \end_inset
  5998. </cell>
  5999. <cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" usebox="none">
  6000. \begin_inset Text
  6001. \begin_layout Plain Layout
  6002. \family roman
  6003. \series medium
  6004. \shape up
  6005. \size normal
  6006. \emph off
  6007. \bar no
  6008. \strikeout off
  6009. \xout off
  6010. \uuline off
  6011. \uwave off
  6012. \noun off
  6013. \color none
  6014. All Genic Reads
  6015. \end_layout
  6016. \end_inset
  6017. </cell>
  6018. <cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" usebox="none">
  6019. \begin_inset Text
  6020. \begin_layout Plain Layout
  6021. \family roman
  6022. \series medium
  6023. \shape up
  6024. \size normal
  6025. \emph off
  6026. \bar no
  6027. \strikeout off
  6028. \xout off
  6029. \uuline off
  6030. \uwave off
  6031. \noun off
  6032. \color none
  6033. All Aligned Reads
  6034. \end_layout
  6035. \end_inset
  6036. </cell>
  6037. <cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" usebox="none">
  6038. \begin_inset Text
  6039. \begin_layout Plain Layout
  6040. \family roman
  6041. \series medium
  6042. \shape up
  6043. \size normal
  6044. \emph off
  6045. \bar no
  6046. \strikeout off
  6047. \xout off
  6048. \uuline off
  6049. \uwave off
  6050. \noun off
  6051. \color none
  6052. Non-globin Reads
  6053. \end_layout
  6054. \end_inset
  6055. </cell>
  6056. <cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" rightline="true" usebox="none">
  6057. \begin_inset Text
  6058. \begin_layout Plain Layout
  6059. \family roman
  6060. \series medium
  6061. \shape up
  6062. \size normal
  6063. \emph off
  6064. \bar no
  6065. \strikeout off
  6066. \xout off
  6067. \uuline off
  6068. \uwave off
  6069. \noun off
  6070. \color none
  6071. Globin Reads
  6072. \end_layout
  6073. \end_inset
  6074. </cell>
  6075. </row>
  6076. <row>
  6077. <cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
  6078. \begin_inset Text
  6079. \begin_layout Plain Layout
  6080. \family roman
  6081. \series medium
  6082. \shape up
  6083. \size normal
  6084. \emph off
  6085. \bar no
  6086. \strikeout off
  6087. \xout off
  6088. \uuline off
  6089. \uwave off
  6090. \noun off
  6091. \color none
  6092. Yes
  6093. \end_layout
  6094. \end_inset
  6095. </cell>
  6096. <cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
  6097. \begin_inset Text
  6098. \begin_layout Plain Layout
  6099. \family roman
  6100. \series medium
  6101. \shape up
  6102. \size normal
  6103. \emph off
  6104. \bar no
  6105. \strikeout off
  6106. \xout off
  6107. \uuline off
  6108. \uwave off
  6109. \noun off
  6110. \color none
  6111. 50.4% ± 6.82
  6112. \end_layout
  6113. \end_inset
  6114. </cell>
  6115. <cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
  6116. \begin_inset Text
  6117. \begin_layout Plain Layout
  6118. \family roman
  6119. \series medium
  6120. \shape up
  6121. \size normal
  6122. \emph off
  6123. \bar no
  6124. \strikeout off
  6125. \xout off
  6126. \uuline off
  6127. \uwave off
  6128. \noun off
  6129. \color none
  6130. 3.48% ± 2.94
  6131. \end_layout
  6132. \end_inset
  6133. </cell>
  6134. <cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
  6135. \begin_inset Text
  6136. \begin_layout Plain Layout
  6137. \family roman
  6138. \series medium
  6139. \shape up
  6140. \size normal
  6141. \emph off
  6142. \bar no
  6143. \strikeout off
  6144. \xout off
  6145. \uuline off
  6146. \uwave off
  6147. \noun off
  6148. \color none
  6149. 53.9% ± 6.81
  6150. \end_layout
  6151. \end_inset
  6152. </cell>
  6153. <cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
  6154. \begin_inset Text
  6155. \begin_layout Plain Layout
  6156. \family roman
  6157. \series medium
  6158. \shape up
  6159. \size normal
  6160. \emph off
  6161. \bar no
  6162. \strikeout off
  6163. \xout off
  6164. \uuline off
  6165. \uwave off
  6166. \noun off
  6167. \color none
  6168. 89.7% ± 2.40
  6169. \end_layout
  6170. \end_inset
  6171. </cell>
  6172. <cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
  6173. \begin_inset Text
  6174. \begin_layout Plain Layout
  6175. \family roman
  6176. \series medium
  6177. \shape up
  6178. \size normal
  6179. \emph off
  6180. \bar no
  6181. \strikeout off
  6182. \xout off
  6183. \uuline off
  6184. \uwave off
  6185. \noun off
  6186. \color none
  6187. 93.5% ± 5.25
  6188. \end_layout
  6189. \end_inset
  6190. </cell>
  6191. <cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
  6192. \begin_inset Text
  6193. \begin_layout Plain Layout
  6194. \family roman
  6195. \series medium
  6196. \shape up
  6197. \size normal
  6198. \emph off
  6199. \bar no
  6200. \strikeout off
  6201. \xout off
  6202. \uuline off
  6203. \uwave off
  6204. \noun off
  6205. \color none
  6206. 6.49% ± 5.25
  6207. \end_layout
  6208. \end_inset
  6209. </cell>
  6210. </row>
  6211. <row>
  6212. <cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" usebox="none">
  6213. \begin_inset Text
  6214. \begin_layout Plain Layout
  6215. \family roman
  6216. \series medium
  6217. \shape up
  6218. \size normal
  6219. \emph off
  6220. \bar no
  6221. \strikeout off
  6222. \xout off
  6223. \uuline off
  6224. \uwave off
  6225. \noun off
  6226. \color none
  6227. No
  6228. \end_layout
  6229. \end_inset
  6230. </cell>
  6231. <cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" usebox="none">
  6232. \begin_inset Text
  6233. \begin_layout Plain Layout
  6234. \family roman
  6235. \series medium
  6236. \shape up
  6237. \size normal
  6238. \emph off
  6239. \bar no
  6240. \strikeout off
  6241. \xout off
  6242. \uuline off
  6243. \uwave off
  6244. \noun off
  6245. \color none
  6246. 26.3% ± 8.95
  6247. \end_layout
  6248. \end_inset
  6249. </cell>
  6250. <cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" usebox="none">
  6251. \begin_inset Text
  6252. \begin_layout Plain Layout
  6253. \family roman
  6254. \series medium
  6255. \shape up
  6256. \size normal
  6257. \emph off
  6258. \bar no
  6259. \strikeout off
  6260. \xout off
  6261. \uuline off
  6262. \uwave off
  6263. \noun off
  6264. \color none
  6265. 44.6% ± 16.6
  6266. \end_layout
  6267. \end_inset
  6268. </cell>
  6269. <cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" usebox="none">
  6270. \begin_inset Text
  6271. \begin_layout Plain Layout
  6272. \family roman
  6273. \series medium
  6274. \shape up
  6275. \size normal
  6276. \emph off
  6277. \bar no
  6278. \strikeout off
  6279. \xout off
  6280. \uuline off
  6281. \uwave off
  6282. \noun off
  6283. \color none
  6284. 70.1% ± 9.38
  6285. \end_layout
  6286. \end_inset
  6287. </cell>
  6288. <cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" usebox="none">
  6289. \begin_inset Text
  6290. \begin_layout Plain Layout
  6291. \family roman
  6292. \series medium
  6293. \shape up
  6294. \size normal
  6295. \emph off
  6296. \bar no
  6297. \strikeout off
  6298. \xout off
  6299. \uuline off
  6300. \uwave off
  6301. \noun off
  6302. \color none
  6303. 90.7% ± 5.16
  6304. \end_layout
  6305. \end_inset
  6306. </cell>
  6307. <cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" usebox="none">
  6308. \begin_inset Text
  6309. \begin_layout Plain Layout
  6310. \family roman
  6311. \series medium
  6312. \shape up
  6313. \size normal
  6314. \emph off
  6315. \bar no
  6316. \strikeout off
  6317. \xout off
  6318. \uuline off
  6319. \uwave off
  6320. \noun off
  6321. \color none
  6322. 38.8% ± 17.1
  6323. \end_layout
  6324. \end_inset
  6325. </cell>
  6326. <cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" rightline="true" usebox="none">
  6327. \begin_inset Text
  6328. \begin_layout Plain Layout
  6329. \family roman
  6330. \series medium
  6331. \shape up
  6332. \size normal
  6333. \emph off
  6334. \bar no
  6335. \strikeout off
  6336. \xout off
  6337. \uuline off
  6338. \uwave off
  6339. \noun off
  6340. \color none
  6341. 61.2% ± 17.1
  6342. \end_layout
  6343. \end_inset
  6344. </cell>
  6345. </row>
  6346. </lyxtabular>
  6347. \end_inset
  6348. \end_layout
  6349. \begin_layout Plain Layout
  6350. \begin_inset Caption Standard
  6351. \begin_layout Plain Layout
  6352. \series bold
  6353. \begin_inset Argument 1
  6354. status collapsed
  6355. \begin_layout Plain Layout
  6356. Fractions of reads mapping to genomic features in GB and non-GB samples.
  6357. \end_layout
  6358. \end_inset
  6359. \begin_inset CommandInset label
  6360. LatexCommand label
  6361. name "tab:Fractions-of-reads"
  6362. \end_inset
  6363. Fractions of reads mapping to genomic features in GB and non-GB samples.
  6364. \series default
  6365. All values are given as mean ± standard deviation.
  6366. \end_layout
  6367. \end_inset
  6368. \end_layout
  6369. \begin_layout Plain Layout
  6370. \end_layout
  6371. \end_inset
  6372. \end_layout
  6373. \begin_layout Standard
  6374. Another important aspect is that the standard deviations in Table
  6375. \begin_inset CommandInset ref
  6376. LatexCommand ref
  6377. reference "tab:Fractions-of-reads"
  6378. plural "false"
  6379. caps "false"
  6380. noprefix "false"
  6381. \end_inset
  6382. are uniformly smaller in the GB samples than the non-GB ones, indicating
  6383. much greater consistency of yield.
  6384. This is best seen in the percentage of non-globin reads as a fraction of
  6385. total reads aligned to annotated genes (genic reads).
  6386. For the non-GB samples, this measure ranges from 10.9% to 80.9%, while for
  6387. the GB samples it ranges from 81.9% to 99.9% (Figure
  6388. \begin_inset CommandInset ref
  6389. LatexCommand ref
  6390. reference "fig:Fraction-of-genic-reads"
  6391. plural "false"
  6392. caps "false"
  6393. noprefix "false"
  6394. \end_inset
  6395. ).
  6396. This means that for applications where it is critical that each sample
  6397. achieve a specified minimum coverage in order to provide useful information,
  6398. it would be necessary to budget up to 10 times the sequencing depth per
  6399. sample without globin blocking, even though the average yield improvement
  6400. for globin blocking is only 2-fold, because every sample has a chance of
  6401. being 90% globin and 10% useful reads.
  6402. Hence, the more consistent behavior of GB samples makes planning an experiment
  6403. easier and more efficient because it eliminates the need to over-sequence
  6404. every sample in order to guard against the worst case of a high-globin
  6405. fraction.
  6406. \end_layout
  6407. \begin_layout Subsection
  6408. Globin blocking lowers the noise floor and allows detection of about 2000
  6409. more genes
  6410. \end_layout
  6411. \begin_layout Standard
  6412. \begin_inset Flex TODO Note (inline)
  6413. status open
  6414. \begin_layout Plain Layout
  6415. Remove redundant titles from figures
  6416. \end_layout
  6417. \end_inset
  6418. \end_layout
  6419. \begin_layout Standard
  6420. \begin_inset Float figure
  6421. wide false
  6422. sideways false
  6423. status open
  6424. \begin_layout Plain Layout
  6425. \align center
  6426. \begin_inset Graphics
  6427. filename graphics/Globin Paper/figure2 - aveLogCPM-colored.pdf
  6428. \end_inset
  6429. \end_layout
  6430. \begin_layout Plain Layout
  6431. \begin_inset Caption Standard
  6432. \begin_layout Plain Layout
  6433. \series bold
  6434. \begin_inset Argument 1
  6435. status collapsed
  6436. \begin_layout Plain Layout
  6437. Distributions of average group gene abundances when normalized separately
  6438. or together.
  6439. \end_layout
  6440. \end_inset
  6441. \begin_inset CommandInset label
  6442. LatexCommand label
  6443. name "fig:logcpm-dists"
  6444. \end_inset
  6445. Distributions of average group gene abundances when normalized separately
  6446. or together.
  6447. \series default
  6448. All reads in each sequencing library were aligned to the cyno genome, and
  6449. the number of reads uniquely aligning to each gene was counted.
  6450. Genes with zero counts in all libraries were discarded.
  6451. Libraries were normalized using the TMM method.
  6452. Libraries were split into globin-blocked (GB) and non-GB groups and the
  6453. average abundance for each gene in both groups, measured in log2 counts
  6454. per million reads counted, was computed using the aveLogCPM function.
  6455. The distribution of average gene logCPM values was plotted for both groups
  6456. using a kernel density plot to approximate a continuous distribution.
  6457. The logCPM distributions for GB libraries are shown in red, and those for non-GB libraries in blue.
  6458. The black vertical line denotes the chosen detection threshold of -1.
  6459. Top panel: Libraries were split into GB and non-GB groups first and normalized
  6460. separately.
  6461. Bottom panel: Libraries were all normalized together first and then split
  6462. into groups.
  6463. \end_layout
  6464. \end_inset
  6465. \end_layout
  6466. \begin_layout Plain Layout
  6467. \end_layout
  6468. \end_inset
  6469. \end_layout
  6470. \begin_layout Standard
  6471. Since globin blocking yields more usable sequencing depth, it should also
  6472. allow detection of more genes at any given threshold.
  6473. When we looked at the distribution of average normalized logCPM values
  6474. across all libraries for genes with at least one read assigned to them,
  6475. we observed the expected bimodal distribution, with a high-abundance "signal"
  6476. peak representing detected genes and a low-abundance "noise" peak representing
  6477. genes whose read count did not rise above the noise floor (Figure
  6478. \begin_inset CommandInset ref
  6479. LatexCommand ref
  6480. reference "fig:logcpm-dists"
  6481. plural "false"
  6482. caps "false"
  6483. noprefix "false"
  6484. \end_inset
  6485. ).
  6486. Consistent with the 2-fold increase in raw counts assigned to non-globin
  6487. genes, the signal peak for GB samples is shifted to the right relative
  6488. to the non-GB signal peak.
  6489. When all the samples are normalized together, this difference is normalized
  6490. out, lining up the signal peaks, and this reveals that, as expected, the
  6491. noise floor for the GB samples is about 2-fold lower.
  6492. This greater separation between signal and noise peaks in the GB samples
  6493. means that low-expression genes should be more easily detected and more
  6494. precisely quantified than in the non-GB samples.
  6495. \end_layout
  6496. \begin_layout Standard
  6497. \begin_inset Float figure
  6498. wide false
  6499. sideways false
  6500. status open
  6501. \begin_layout Plain Layout
  6502. \align center
  6503. \begin_inset Graphics
  6504. filename graphics/Globin Paper/figure3 - detection.pdf
  6505. \end_inset
  6506. \end_layout
  6507. \begin_layout Plain Layout
  6508. \begin_inset Caption Standard
  6509. \begin_layout Plain Layout
  6510. \series bold
  6511. \begin_inset Argument 1
  6512. status collapsed
  6513. \begin_layout Plain Layout
  6514. Gene detections as a function of abundance thresholds in globin-blocked
  6515. (GB) and non-GB samples.
  6516. \end_layout
  6517. \end_inset
  6518. \begin_inset CommandInset label
  6519. LatexCommand label
  6520. name "fig:Gene-detections"
  6521. \end_inset
  6522. Gene detections as a function of abundance thresholds in globin-blocked
  6523. (GB) and non-GB samples.
  6524. \series default
  6525. Average abundance (logCPM,
  6526. \begin_inset Formula $\log_{2}$
  6527. \end_inset
  6528. counts per million reads counted) was computed by separate group normalization
  6529. as described in Figure
  6530. \begin_inset CommandInset ref
  6531. LatexCommand ref
  6532. reference "fig:logcpm-dists"
  6533. plural "false"
  6534. caps "false"
  6535. noprefix "false"
  6536. \end_inset
  6537. for both the GB and non-GB groups, as well as for all samples considered
  6538. as one large group.
  6539. For every integer threshold from -2 to 3, the number of genes detected
  6540. at or above that logCPM threshold was plotted for each group.
  6541. \end_layout
  6542. \end_inset
  6543. \end_layout
  6544. \begin_layout Plain Layout
  6545. \end_layout
  6546. \end_inset
  6547. \end_layout
  6548. \begin_layout Standard
  6549. Based on these distributions, we selected a detection threshold of -1, which
  6550. is approximately the leftmost edge of the trough between the signal and
  6551. noise peaks.
  6552. This represents the most liberal possible detection threshold that doesn't
  6553. call substantial numbers of noise genes as detected.
  6554. Among the full dataset, 13429 genes were detected at this threshold, and
  6555. 22276 were not.
  6556. When considering the GB libraries and non-GB libraries separately and re-comput
  6557. ing normalization factors independently within each group, 14535 genes were
  6558. detected in the GB libraries while only 12460 were detected in the non-GB
  6559. libraries.
  6560. Thus, GB allowed the detection of about 2000 extra genes that were buried under
  6561. the noise floor without GB.
  6562. This pattern of at least 2000 additional genes detected with GB was also
  6563. consistent across a wide range of possible detection thresholds, from -2
  6564. to 3 (see Figure
  6565. \begin_inset CommandInset ref
  6566. LatexCommand ref
  6567. reference "fig:Gene-detections"
  6568. plural "false"
  6569. caps "false"
  6570. noprefix "false"
  6571. \end_inset
  6572. ).
  6573. \end_layout
  6574. \begin_layout Subsection
  6575. Globin blocking does not add significant additional noise or decrease sample
  6576. quality
  6577. \end_layout
  6578. \begin_layout Standard
  6579. One potential worry is that the globin blocking protocol could perturb the
  6580. levels of non-globin genes.
  6581. There are two kinds of possible perturbations: systematic and random.
  6582. The former is not a major concern for detection of differential expression,
  6583. since a 2-fold change in every sample has no effect on the relative fold
  6584. change between samples.
  6585. In contrast, random perturbations would increase the noise and obscure
  6586. the signal in the dataset, reducing the capacity to detect differential
  6587. expression.
  6588. \end_layout
  6589. \begin_layout Standard
  6590. \begin_inset Float figure
  6591. wide false
  6592. sideways false
  6593. status open
  6594. \begin_layout Plain Layout
  6595. \align center
  6596. \begin_inset Graphics
  6597. filename graphics/Globin Paper/figure4 - maplot-colored.pdf
  6598. \end_inset
  6599. \end_layout
  6600. \begin_layout Plain Layout
  6601. \begin_inset Caption Standard
  6602. \begin_layout Plain Layout
  6603. \begin_inset Argument 1
  6604. status collapsed
  6605. \begin_layout Plain Layout
  6606. MA plot showing effects of globin blocking on each gene's abundance.
  6607. \end_layout
  6608. \end_inset
  6609. \begin_inset CommandInset label
  6610. LatexCommand label
  6611. name "fig:MA-plot"
  6612. \end_inset
  6613. \series bold
  6614. MA plot showing effects of globin blocking on each gene's abundance.
  6615. \series default
  6616. All libraries were normalized together as described in Figure
  6617. \begin_inset CommandInset ref
  6618. LatexCommand ref
  6619. reference "fig:logcpm-dists"
  6620. plural "false"
  6621. caps "false"
  6622. noprefix "false"
  6623. \end_inset
  6624. , and genes with an average logCPM below -1 were filtered out.
  6625. Each remaining gene was tested for differential abundance with respect
  6626. to globin blocking (GB) using edgeR’s quasi-likelihood F-test, fitting a
  6627. negative binomial generalized linear model to the table of read counts in each
  6628. library.
  6629. For each gene, edgeR reported average abundance (logCPM),
  6630. \begin_inset Formula $\log_{2}$
  6631. \end_inset
  6632. fold change (logFC), p-value, and Benjamini-Hochberg adjusted false discovery
  6633. rate (FDR).
  6634. Each gene's logFC was plotted against its logCPM, colored by FDR.
  6635. Red points are significant at ≤10% FDR, and blue are not significant at
  6636. that threshold.
  6637. The alpha and beta globin genes targeted for blocking are marked with large
  6638. triangles, while all other genes are represented as small points.
  6639. \end_layout
  6640. \end_inset
  6641. \end_layout
  6642. \begin_layout Plain Layout
  6643. \end_layout
  6644. \end_inset
  6645. \end_layout
  6646. \begin_layout Standard
  6647. \begin_inset Flex TODO Note (inline)
  6648. status open
  6649. \begin_layout Plain Layout
  6650. Standardize on
  6651. \begin_inset Quotes eld
  6652. \end_inset
  6653. log2
  6654. \begin_inset Quotes erd
  6655. \end_inset
  6656. notation
  6657. \end_layout
  6658. \end_inset
  6659. \end_layout
  6660. \begin_layout Standard
  6661. The data do indeed show small systematic perturbations in gene levels (Figure
  6662. \begin_inset CommandInset ref
  6663. LatexCommand ref
  6664. reference "fig:MA-plot"
  6665. plural "false"
  6666. caps "false"
  6667. noprefix "false"
  6668. \end_inset
  6669. ).
  6670. Other than the 3 designated alpha and beta globin genes, two other genes
  6671. stand out as having especially large negative log fold changes: HBD and
  6672. LOC1021365.
  6673. HBD, delta globin, is most likely targeted by the blocking oligos due to
  6674. high sequence homology with the other globin genes.
  6675. LOC1021365 is the aforementioned ncRNA that is reverse-complementary to
  6676. one of the alpha-like genes and that would be expected to be removed during
  6677. the globin blocking step.
  6678. All other genes appear in a cluster centered vertically at 0, and the vast
  6679. majority of genes in this cluster show an absolute log2(FC) of 0.5 or less.
  6680. Nevertheless, many of these small perturbations are still statistically
  6681. significant, indicating that the globin blocking oligos likely cause very
  6682. small but non-zero systematic perturbations in measured gene expression
  6683. levels.
  6684. \end_layout
  6685. \begin_layout Standard
  6686. \begin_inset Float figure
  6687. wide false
  6688. sideways false
  6689. status open
  6690. \begin_layout Plain Layout
  6691. \align center
  6692. \begin_inset Graphics
  6693. filename graphics/Globin Paper/figure5 - corrplot.pdf
  6694. \end_inset
  6695. \end_layout
  6696. \begin_layout Plain Layout
  6697. \begin_inset Caption Standard
  6698. \begin_layout Plain Layout
  6699. \series bold
  6700. \begin_inset Argument 1
  6701. status collapsed
  6702. \begin_layout Plain Layout
  6703. Comparison of inter-sample gene abundance correlations with and without
  6704. globin blocking.
  6705. \end_layout
  6706. \end_inset
  6707. \begin_inset CommandInset label
  6708. LatexCommand label
  6709. name "fig:gene-abundance-correlations"
  6710. \end_inset
  6711. Comparison of inter-sample gene abundance correlations with and without
  6712. globin blocking (GB).
  6713. \series default
  6714. All libraries were normalized together as described in Figure 2, and genes
  6715. with an average abundance (logCPM, log2 counts per million reads counted)
  6716. less than -1 were filtered out.
  6717. Each gene’s logCPM was computed in each library using the edgeR cpm function.
  6718. For each pair of biological samples, the Pearson correlation between those
  6719. samples' GB libraries was plotted against the correlation between the same
  6720. samples’ non-GB libraries.
  6721. Each point represents a unique pair of samples.
  6722. The solid gray line shows a quantile-quantile plot of the distribution of GB
  6723. correlations vs.
  6724. that of non-GB correlations.
  6725. The thin dashed line is the identity line, provided for reference.
  6726. \end_layout
  6727. \end_inset
  6728. \end_layout
  6729. \begin_layout Plain Layout
  6730. \end_layout
  6731. \end_inset
  6732. \end_layout
  6733. \begin_layout Standard
  6734. To evaluate the possibility of globin blocking causing random perturbations
  6735. and reducing sample quality, we computed the Pearson correlation between
  6736. logCPM values for every pair of samples with and without GB and plotted
  6737. them against each other (Figure
  6738. \begin_inset CommandInset ref
  6739. LatexCommand ref
  6740. reference "fig:gene-abundance-correlations"
  6741. plural "false"
  6742. caps "false"
  6743. noprefix "false"
  6744. \end_inset
  6745. ).
  6746. The plot indicated that the GB libraries have higher sample-to-sample correlati
  6747. ons than the non-GB libraries.
  6748. Parametric and nonparametric tests for differences between the correlations
  6749. with and without GB both confirmed that this difference was highly significant
  6750. (2-sided paired t-test: t = 37.2, df = 665, P ≪ 2.2e-16; 2-sided Wilcoxon
  6751. sign-rank test: V = 2195, P ≪ 2.2e-16).
  6752. Performing the same tests on the Spearman correlations gave the same conclusion
  6753. (t-test: t = 26.8, df = 665, P ≪ 2.2e-16; sign-rank test: V = 8781, P ≪ 2.2e-16).
  6754. The edgeR package was used to compute the overall biological coefficient
  6755. of variation (BCV) for GB and non-GB libraries, and we found that globin blocking
  6756. resulted in a negligible increase in the BCV (0.417 with GB vs.
  6757. 0.400 without).
  6758. The near equality of the BCVs for both sets indicates that the higher correlati
  6759. ons in the GB libraries are most likely a result of the increased yield
  6760. of useful reads, which reduces the contribution of Poisson counting uncertainty
  6761. to the overall variance of the logCPM values
  6762. \begin_inset CommandInset citation
  6763. LatexCommand cite
  6764. key "McCarthy2012"
  6765. literal "false"
  6766. \end_inset
  6767. .
  6768. This improves the precision of expression measurements and more than offsets
  6769. the negligible increase in BCV.
  6770. \end_layout
  6771. \begin_layout Subsection
  6772. More differentially expressed genes are detected with globin blocking
  6773. \end_layout
  6774. \begin_layout Standard
  6775. \begin_inset Float table
  6776. wide false
  6777. sideways false
  6778. status open
  6779. \begin_layout Plain Layout
  6780. \align center
  6781. \begin_inset Tabular
  6782. <lyxtabular version="3" rows="5" columns="5">
  6783. <features tabularvalignment="middle">
  6784. <column alignment="center" valignment="top">
  6785. <column alignment="center" valignment="top">
  6786. <column alignment="center" valignment="top">
  6787. <column alignment="center" valignment="top">
  6788. <column alignment="center" valignment="top">
  6789. <row>
  6790. <cell alignment="center" valignment="top" usebox="none">
  6791. \begin_inset Text
  6792. \begin_layout Plain Layout
  6793. \end_layout
  6794. \end_inset
  6795. </cell>
  6796. <cell alignment="center" valignment="top" usebox="none">
  6797. \begin_inset Text
  6798. \begin_layout Plain Layout
  6799. \end_layout
  6800. \end_inset
  6801. </cell>
  6802. <cell multicolumn="1" alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
  6803. \begin_inset Text
  6804. \begin_layout Plain Layout
  6805. \series bold
  6806. No Globin Blocking
  6807. \end_layout
  6808. \end_inset
  6809. </cell>
  6810. <cell multicolumn="2" alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" usebox="none">
  6811. \begin_inset Text
  6812. \begin_layout Plain Layout
  6813. \end_layout
  6814. \end_inset
  6815. </cell>
  6816. <cell multicolumn="2" alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" rightline="true" usebox="none">
  6817. \begin_inset Text
  6818. \begin_layout Plain Layout
  6819. \end_layout
  6820. \end_inset
  6821. </cell>
  6822. </row>
  6823. <row>
  6824. <cell alignment="center" valignment="top" usebox="none">
  6825. \begin_inset Text
  6826. \begin_layout Plain Layout
  6827. \end_layout
  6828. \end_inset
  6829. </cell>
  6830. <cell alignment="center" valignment="top" usebox="none">
  6831. \begin_inset Text
  6832. \begin_layout Plain Layout
  6833. \end_layout
  6834. \end_inset
  6835. </cell>
  6836. <cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
  6837. \begin_inset Text
  6838. \begin_layout Plain Layout
  6839. \series bold
  6840. Up
  6841. \end_layout
  6842. \end_inset
  6843. </cell>
  6844. <cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
  6845. \begin_inset Text
  6846. \begin_layout Plain Layout
  6847. \series bold
  6848. NS
  6849. \end_layout
  6850. \end_inset
  6851. </cell>
  6852. <cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
  6853. \begin_inset Text
  6854. \begin_layout Plain Layout
  6855. \series bold
  6856. Down
  6857. \end_layout
  6858. \end_inset
  6859. </cell>
  6860. </row>
  6861. <row>
  6862. <cell multirow="3" alignment="center" valignment="middle" topline="true" bottomline="true" leftline="true" usebox="none">
  6863. \begin_inset Text
  6864. \begin_layout Plain Layout
  6865. \series bold
  6866. Globin Blocking
  6867. \end_layout
  6868. \end_inset
  6869. </cell>
  6870. <cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
  6871. \begin_inset Text
  6872. \begin_layout Plain Layout
  6873. \series bold
  6874. Up
  6875. \end_layout
  6876. \end_inset
  6877. </cell>
  6878. <cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
  6879. \begin_inset Text
  6880. \begin_layout Plain Layout
  6881. \family roman
  6882. \series medium
  6883. \shape up
  6884. \size normal
  6885. \emph off
  6886. \bar no
  6887. \strikeout off
  6888. \xout off
  6889. \uuline off
  6890. \uwave off
  6891. \noun off
  6892. \color none
  6893. 231
  6894. \end_layout
  6895. \end_inset
  6896. </cell>
  6897. <cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
  6898. \begin_inset Text
  6899. \begin_layout Plain Layout
  6900. \family roman
  6901. \series medium
  6902. \shape up
  6903. \size normal
  6904. \emph off
  6905. \bar no
  6906. \strikeout off
  6907. \xout off
  6908. \uuline off
  6909. \uwave off
  6910. \noun off
  6911. \color none
  6912. 515
  6913. \end_layout
  6914. \end_inset
  6915. </cell>
  6916. <cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
  6917. \begin_inset Text
  6918. \begin_layout Plain Layout
  6919. \family roman
  6920. \series medium
  6921. \shape up
  6922. \size normal
  6923. \emph off
  6924. \bar no
  6925. \strikeout off
  6926. \xout off
  6927. \uuline off
  6928. \uwave off
  6929. \noun off
  6930. \color none
  6931. 2
  6932. \end_layout
  6933. \end_inset
  6934. </cell>
  6935. </row>
  6936. <row>
  6937. <cell multirow="4" alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
  6938. \begin_inset Text
  6939. \begin_layout Plain Layout
  6940. \end_layout
  6941. \end_inset
  6942. </cell>
  6943. <cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
  6944. \begin_inset Text
  6945. \begin_layout Plain Layout
  6946. \series bold
  6947. NS
  6948. \end_layout
  6949. \end_inset
  6950. </cell>
  6951. <cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
  6952. \begin_inset Text
  6953. \begin_layout Plain Layout
  6954. \family roman
  6955. \series medium
  6956. \shape up
  6957. \size normal
  6958. \emph off
  6959. \bar no
  6960. \strikeout off
  6961. \xout off
  6962. \uuline off
  6963. \uwave off
  6964. \noun off
  6965. \color none
  6966. 160
  6967. \end_layout
  6968. \end_inset
  6969. </cell>
  6970. <cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
  6971. \begin_inset Text
  6972. \begin_layout Plain Layout
  6973. \family roman
  6974. \series medium
  6975. \shape up
  6976. \size normal
  6977. \emph off
  6978. \bar no
  6979. \strikeout off
  6980. \xout off
  6981. \uuline off
  6982. \uwave off
  6983. \noun off
  6984. \color none
  6985. 11235
  6986. \end_layout
  6987. \end_inset
  6988. </cell>
  6989. <cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
  6990. \begin_inset Text
  6991. \begin_layout Plain Layout
  6992. \family roman
  6993. \series medium
  6994. \shape up
  6995. \size normal
  6996. \emph off
  6997. \bar no
  6998. \strikeout off
  6999. \xout off
  7000. \uuline off
  7001. \uwave off
  7002. \noun off
  7003. \color none
  7004. 136
  7005. \end_layout
  7006. \end_inset
  7007. </cell>
  7008. </row>
  7009. <row>
  7010. <cell multirow="4" alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" usebox="none">
  7011. \begin_inset Text
  7012. \begin_layout Plain Layout
  7013. \end_layout
  7014. \end_inset
  7015. </cell>
  7016. <cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" usebox="none">
  7017. \begin_inset Text
  7018. \begin_layout Plain Layout
  7019. \series bold
  7020. Down
  7021. \end_layout
  7022. \end_inset
  7023. </cell>
  7024. <cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" usebox="none">
  7025. \begin_inset Text
  7026. \begin_layout Plain Layout
  7027. \family roman
  7028. \series medium
  7029. \shape up
  7030. \size normal
  7031. \emph off
  7032. \bar no
  7033. \strikeout off
  7034. \xout off
  7035. \uuline off
  7036. \uwave off
  7037. \noun off
  7038. \color none
  7039. 0
  7040. \end_layout
  7041. \end_inset
  7042. </cell>
  7043. <cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" usebox="none">
  7044. \begin_inset Text
  7045. \begin_layout Plain Layout
  7046. \family roman
  7047. \series medium
  7048. \shape up
  7049. \size normal
  7050. \emph off
  7051. \bar no
  7052. \strikeout off
  7053. \xout off
  7054. \uuline off
  7055. \uwave off
  7056. \noun off
  7057. \color none
  7058. 548
  7059. \end_layout
  7060. \end_inset
  7061. </cell>
  7062. <cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" rightline="true" usebox="none">
  7063. \begin_inset Text
  7064. \begin_layout Plain Layout
  7065. \family roman
  7066. \series medium
  7067. \shape up
  7068. \size normal
  7069. \emph off
  7070. \bar no
  7071. \strikeout off
  7072. \xout off
  7073. \uuline off
  7074. \uwave off
  7075. \noun off
  7076. \color none
  7077. 127
  7078. \end_layout
  7079. \end_inset
  7080. </cell>
  7081. </row>
  7082. </lyxtabular>
  7083. \end_inset
  7084. \end_layout
  7085. \begin_layout Plain Layout
  7086. \begin_inset Caption Standard
  7087. \begin_layout Plain Layout
  7088. \series bold
  7089. \begin_inset Argument 1
  7090. status open
  7091. \begin_layout Plain Layout
  7092. Comparison of significantly differentially expressed genes with and without
  7093. globin blocking.
  7094. \end_layout
  7095. \end_inset
  7096. \begin_inset CommandInset label
  7097. LatexCommand label
  7098. name "tab:Comparison-of-significant"
  7099. \end_inset
  7100. Comparison of significantly differentially expressed genes with and without
  7101. globin blocking.
  7102. \series default
  7103. Up, Down: Genes significantly up/down-regulated in post-transplant samples
  7104. relative to pre-transplant samples, with a false discovery rate of 10%
  7105. or less.
  7106. NS: Non-significant genes (false discovery rate greater than 10%).
  7107. \end_layout
  7108. \end_inset
  7109. \end_layout
  7110. \begin_layout Plain Layout
  7111. \end_layout
  7112. \end_inset
  7113. \end_layout
  7114. \begin_layout Standard
  7115. To compare performance on differential gene expression tests, we took subsets
  7116. of both the GB and non-GB libraries with exactly one pre-transplant and
  7117. one post-transplant sample for each animal that had paired samples available
  7118. for analysis (N=7 animals, N=14 samples in each subset).
  7119. The same test for pre- vs.
  7120. post-transplant differential gene expression was performed on the same
  7121. 7 pairs of samples from GB libraries and non-GB libraries, in each case
  7122. using an FDR of 10% as the threshold of significance.
  7123. Out of 12954 genes that passed the detection threshold in both subsets,
  7124. 358 were called significantly differentially expressed in the same direction
  7125. in both sets; 1063 were differentially expressed in the GB set only; 296
  7126. were differentially expressed in the non-GB set only; 2 genes were called
  7127. significantly up in the GB set but significantly down in the non-GB set;
  7128. and the remaining 11235 were not called differentially expressed in either
  7129. set.
  7130. These data are summarized in Table
  7131. \begin_inset CommandInset ref
  7132. LatexCommand ref
  7133. reference "tab:Comparison-of-significant"
  7134. plural "false"
  7135. caps "false"
  7136. noprefix "false"
  7137. \end_inset
  7138. .
  7139. The differences in BCV calculated by EdgeR for these subsets of samples
  7140. were negligible (BCV = 0.302 for GB and 0.297 for non-GB).
  7141. \end_layout
  7142. \begin_layout Standard
  7143. The key point is that the GB data result in substantially more differentially
  7144. expressed calls than the non-GB data.
  7145. Since there is no gold standard for this dataset, it is impossible to be
  7146. certain whether this is due to under-calling of differential expression
  7147. in the non-GB samples or over-calling in the GB samples.
  7148. However, given that both datasets are derived from the same biological
  7149. samples and have nearly equal BCVs, it is more likely that the larger number
  7150. of DE calls in the GB samples are genuine detections that were enabled
  7151. by the higher sequencing depth and measurement precision of the GB samples.
  7152. Note that the same set of genes was considered in both subsets, so the
  7153. larger number of differentially expressed gene calls in the GB data set
  7154. reflects a greater sensitivity to detect significant differential gene
  7155. expression and not simply the larger total number of detected genes in
  7156. GB samples described earlier.
  7157. \end_layout
  7158. \begin_layout Section
  7159. Discussion
  7160. \end_layout
  7161. \begin_layout Standard
  7162. The original experience with whole blood gene expression profiling on DNA
  7163. microarrays demonstrated that the high concentration of globin transcripts
  7164. interfered with the detection of genes with relatively low expression
  7165. levels, in effect significantly reducing the sensitivity.
  7166. To address this limitation, commercial protocols for globin reduction were
  7167. developed based on strategies to block globin transcript amplification
  7168. during labeling or physically removing globin transcripts by affinity bead
  7169. methods
  7170. \begin_inset CommandInset citation
  7171. LatexCommand cite
  7172. key "Winn2010"
  7173. literal "false"
  7174. \end_inset
  7175. .
  7176. More recently, using the latest generation of labeling protocols and arrays,
  7177. it was determined that globin reduction was no longer necessary to obtain
  7178. sufficient sensitivity to detect differential transcript expression
  7179. \begin_inset CommandInset citation
  7180. LatexCommand cite
  7181. key "NuGEN2010"
  7182. literal "false"
  7183. \end_inset
  7184. .
  7185. However, we are not aware of any publications using these currently available
  7186. protocols with the latest generation of microarrays that actually compare
  7187. the detection sensitivity with and without globin reduction.
  7188. Nevertheless, in practice, omitting globin reduction has now been generally
  7189. adopted, driven primarily by concerns for cost control.
  7190. The main objective of our work was to directly test the impact of globin
  7191. gene transcripts and a new globin blocking protocol for application to
  7192. the newest generation of differential gene expression profiling determined
  7193. using next generation sequencing.
  7194. \end_layout
  7195. \begin_layout Standard
  7196. The challenge of doing global gene expression profiling in cynomolgus monkeys
  7197. is that the currently available arrays were never designed to comprehensively
  7198. cover this genome and have not been updated since the first assemblies
  7199. of the cynomolgus genome were published.
  7200. Therefore, we determined that the best strategy for peripheral blood profiling
  7201. was to do deep RNA-seq and inform the workflow using the latest available
  7202. genome assembly and annotation
  7203. \begin_inset CommandInset citation
  7204. LatexCommand cite
  7205. key "Wilson2013"
  7206. literal "false"
  7207. \end_inset
  7208. .
  7209. However, it was not immediately clear whether globin reduction was necessary
  7210. for RNA-seq or how much improvement in efficiency or sensitivity to detect
  7211. differential gene expression would be achieved for the added cost and work.
  7212. \end_layout
  7213. \begin_layout Standard
  7214. We only found one report that demonstrated that globin reduction significantly
  7215. improved the effective read yields for sequencing of human peripheral blood
  7216. cell RNA using a DeepSAGE protocol
  7217. \begin_inset CommandInset citation
  7218. LatexCommand cite
  7219. key "Mastrokolias2012"
  7220. literal "false"
  7221. \end_inset
  7222. .
  7223. The approach to DeepSAGE involves two different restriction enzymes that
  7224. purify and then tag small fragments of transcripts at specific locations
  7225. and thus, significantly reduces the complexity of the transcriptome.
  7226. Therefore, we could not determine how DeepSAGE results would translate
  7227. to the common strategy in the field for assaying the entire transcript
  7228. population by whole-transcriptome 3’-end RNA-seq.
  7229. Furthermore, if globin reduction was necessary, we also needed a globin
  7230. reduction method specific to cynomolgus globin sequences that would work
  7231. for an organism for which no kit is available off the shelf.
  7232. \end_layout
  7233. \begin_layout Standard
  7234. As mentioned above, the addition of globin blocking oligos has a very small
  7235. impact on measured gene expression levels.
  7236. However, this is a non-issue for the purposes of differential expression
  7237. testing, since a systematic change in a gene in all samples does not affect
  7238. relative expression levels between samples.
  7239. Nevertheless, we must acknowledge that simple comparisons of gene expression
  7240. data obtained by GB and non-GB protocols are not possible without additional
  7241. normalization.
  7242. \end_layout
  7243. \begin_layout Standard
  7244. More importantly, globin blocking not only nearly doubles the yield of usable
  7245. reads, it also increases inter-sample correlation and sensitivity to detect
  7246. differential gene expression relative to the same set of samples profiled
  7247. without blocking.
  7248. In addition, globin blocking does not add a significant amount of random
  7249. noise to the data.
  7250. Globin blocking thus represents a cost-effective way to squeeze more data
  7251. and statistical power out of the same blood samples and the same amount
  7252. of sequencing.
  7253. In conclusion, globin reduction greatly increases the yield of useful RNA-seq
  7254. reads mapping to the rest of the genome, with minimal perturbations in
  7255. the relative levels of non-globin genes.
  7256. Based on these results, globin transcript reduction using sequence-specific,
  7257. complementary blocking oligonucleotides is recommended for all deep RNA-seq
  7258. of cynomolgus and other nonhuman primate blood samples.
  7259. \end_layout
  7260. \begin_layout Chapter
  7261. Future Directions
  7262. \end_layout
  7263. \begin_layout Standard
  7264. \begin_inset Flex TODO Note (inline)
  7265. status open
  7266. \begin_layout Plain Layout
  7267. Consider per-chapter future directions.
  7268. Check instructions.
  7269. \end_layout
  7270. \end_inset
  7271. \end_layout
  7272. \begin_layout Itemize
  7273. Study other epigenetic marks in more contexts
  7274. \end_layout
  7275. \begin_deeper
  7276. \begin_layout Itemize
  7277. DNA methylation, histone marks, chromatin accessibility & conformation in
  7278. CD4 T-cells
  7279. \end_layout
  7280. \begin_layout Itemize
  7281. Also look at other types of lymphocytes: CD8 T-cells, B-cells, NK cells
  7282. \end_layout
  7283. \end_deeper
  7284. \begin_layout Itemize
  7285. Use CV or bootstrap to better evaluate classifiers
  7286. \end_layout
  7287. \begin_layout Itemize
  7288. fRMAtools could be adapted to not require equal-sized groups
  7289. \end_layout
  7290. \begin_layout Standard
  7291. \begin_inset ERT
  7292. status open
  7293. \begin_layout Plain Layout
  7294. % Call it "References" instead of "Bibliography"
  7295. \end_layout
  7296. \begin_layout Plain Layout
  7297. \backslash
  7298. renewcommand{
  7299. \backslash
  7300. bibname}{References}
  7301. \end_layout
  7302. \end_inset
  7303. \end_layout
  7304. \begin_layout Standard
  7305. \begin_inset Flex TODO Note (inline)
  7306. status open
  7307. \begin_layout Plain Layout
  7308. Check bib entry formatting & sort order
  7309. \end_layout
  7310. \end_inset
  7311. \end_layout
  7312. \begin_layout Standard
  7313. \begin_inset CommandInset bibtex
  7314. LatexCommand bibtex
  7315. btprint "btPrintCited"
  7316. bibfiles "refs,code-refs"
  7317. options "bibtotoc,unsrt"
  7318. \end_inset
  7319. \end_layout
  7320. \end_body
  7321. \end_document