probe-selection.R.html 22 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206
  1. <!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01//EN"
  2. "http://www.w3.org/TR/html4/strict.dtd">
  3. <html>
  4. <head>
  5. <title></title>
  6. <meta http-equiv="content-type" content="text/html; charset=utf-8">
  7. <style type="text/css">
  8. td.linenos { background-color: #f0f0f0; padding-right: 10px; }
  9. span.lineno { background-color: #f0f0f0; padding: 0 5px 0 5px; }
  10. pre { line-height: 125%; }
  11. body .hll { background-color: #ffffcc }
  12. body { background: #f8f8f8; }
  13. body .c { color: #408080; font-style: italic } /* Comment */
  14. body .err { border: 1px solid #FF0000 } /* Error */
  15. body .k { color: #008000; font-weight: bold } /* Keyword */
  16. body .o { color: #666666 } /* Operator */
  17. body .ch { color: #408080; font-style: italic } /* Comment.Hashbang */
  18. body .cm { color: #408080; font-style: italic } /* Comment.Multiline */
  19. body .cp { color: #BC7A00 } /* Comment.Preproc */
  20. body .cpf { color: #408080; font-style: italic } /* Comment.PreprocFile */
  21. body .c1 { color: #408080; font-style: italic } /* Comment.Single */
  22. body .cs { color: #408080; font-style: italic } /* Comment.Special */
  23. body .gd { color: #A00000 } /* Generic.Deleted */
  24. body .ge { font-style: italic } /* Generic.Emph */
  25. body .gr { color: #FF0000 } /* Generic.Error */
  26. body .gh { color: #000080; font-weight: bold } /* Generic.Heading */
  27. body .gi { color: #00A000 } /* Generic.Inserted */
  28. body .go { color: #888888 } /* Generic.Output */
  29. body .gp { color: #000080; font-weight: bold } /* Generic.Prompt */
  30. body .gs { font-weight: bold } /* Generic.Strong */
  31. body .gu { color: #800080; font-weight: bold } /* Generic.Subheading */
  32. body .gt { color: #0044DD } /* Generic.Traceback */
  33. body .kc { color: #008000; font-weight: bold } /* Keyword.Constant */
  34. body .kd { color: #008000; font-weight: bold } /* Keyword.Declaration */
  35. body .kn { color: #008000; font-weight: bold } /* Keyword.Namespace */
  36. body .kp { color: #008000 } /* Keyword.Pseudo */
  37. body .kr { color: #008000; font-weight: bold } /* Keyword.Reserved */
  38. body .kt { color: #B00040 } /* Keyword.Type */
  39. body .m { color: #666666 } /* Literal.Number */
  40. body .s { color: #BA2121 } /* Literal.String */
  41. body .na { color: #7D9029 } /* Name.Attribute */
  42. body .nb { color: #008000 } /* Name.Builtin */
  43. body .nc { color: #0000FF; font-weight: bold } /* Name.Class */
  44. body .no { color: #880000 } /* Name.Constant */
  45. body .nd { color: #AA22FF } /* Name.Decorator */
  46. body .ni { color: #999999; font-weight: bold } /* Name.Entity */
  47. body .ne { color: #D2413A; font-weight: bold } /* Name.Exception */
  48. body .nf { color: #0000FF } /* Name.Function */
  49. body .nl { color: #A0A000 } /* Name.Label */
  50. body .nn { color: #0000FF; font-weight: bold } /* Name.Namespace */
  51. body .nt { color: #008000; font-weight: bold } /* Name.Tag */
  52. body .nv { color: #19177C } /* Name.Variable */
  53. body .ow { color: #AA22FF; font-weight: bold } /* Operator.Word */
  54. body .w { color: #bbbbbb } /* Text.Whitespace */
  55. body .mb { color: #666666 } /* Literal.Number.Bin */
  56. body .mf { color: #666666 } /* Literal.Number.Float */
  57. body .mh { color: #666666 } /* Literal.Number.Hex */
  58. body .mi { color: #666666 } /* Literal.Number.Integer */
  59. body .mo { color: #666666 } /* Literal.Number.Oct */
  60. body .sa { color: #BA2121 } /* Literal.String.Affix */
  61. body .sb { color: #BA2121 } /* Literal.String.Backtick */
  62. body .sc { color: #BA2121 } /* Literal.String.Char */
  63. body .dl { color: #BA2121 } /* Literal.String.Delimiter */
  64. body .sd { color: #BA2121; font-style: italic } /* Literal.String.Doc */
  65. body .s2 { color: #BA2121 } /* Literal.String.Double */
  66. body .se { color: #BB6622; font-weight: bold } /* Literal.String.Escape */
  67. body .sh { color: #BA2121 } /* Literal.String.Heredoc */
  68. body .si { color: #BB6688; font-weight: bold } /* Literal.String.Interpol */
  69. body .sx { color: #008000 } /* Literal.String.Other */
  70. body .sr { color: #BB6688 } /* Literal.String.Regex */
  71. body .s1 { color: #BA2121 } /* Literal.String.Single */
  72. body .ss { color: #19177C } /* Literal.String.Symbol */
  73. body .bp { color: #008000 } /* Name.Builtin.Pseudo */
  74. body .fm { color: #0000FF } /* Name.Function.Magic */
  75. body .vc { color: #19177C } /* Name.Variable.Class */
  76. body .vg { color: #19177C } /* Name.Variable.Global */
  77. body .vi { color: #19177C } /* Name.Variable.Instance */
  78. body .vm { color: #19177C } /* Name.Variable.Magic */
  79. body .il { color: #666666 } /* Literal.Number.Integer.Long */
  80. </style>
  81. </head>
  82. <body>
  83. <h2></h2>
  84. <div class="highlight"><pre><span></span><span class="c1"># We will use base 2 logarithms for now</span>
  85. log.base <span class="o">=</span> <span class="m">2</span>
  86. <span class="c1"># Read the data in from the file</span>
  87. filename.data <span class="o">=</span> <span class="s">&quot;Processed_Data_Files/Normalized_Pair_Files/All_norm_pair.txt&quot;</span>
  88. <span class="c1">#filename.data = &quot;Raw_Data_Files/Pair_Files/All_pair.txt&quot;</span>
  89. intensity.lin <span class="o">&lt;-</span> read.table<span class="p">(</span>file<span class="o">=</span>filename.data<span class="p">,</span> header<span class="o">=</span><span class="kc">TRUE</span><span class="p">,</span>row.names<span class="o">=</span><span class="s">&quot;PROBE_ID&quot;</span><span class="p">)</span>
  90. <span class="c1"># Column names and numbers used to categorize the data</span>
  91. intensity.maincols <span class="o">=</span> <span class="kt">c</span><span class="p">(</span><span class="s">&quot;GENE_EXPR_OPTION&quot;</span><span class="p">,</span> <span class="s">&quot;SEQ_ID&quot;</span><span class="p">,</span> <span class="s">&quot;POSITION&quot;</span><span class="p">)</span>
  92. intensity.maincolnums <span class="o">=</span> <span class="kp">which</span><span class="p">(</span><span class="kp">names</span><span class="p">(</span>intensity.lin<span class="p">)</span> <span class="o">%in%</span> intensity.maincols<span class="p">)</span>
  93. <span class="c1"># Column names and numbers containing intensity data</span>
  94. intensity.datacolnums <span class="o">=</span> <span class="p">(</span><span class="m">1</span><span class="o">:</span><span class="kp">dim</span><span class="p">(</span>intensity.lin<span class="p">)[</span><span class="m">2</span><span class="p">])[</span><span class="o">-</span>intensity.maincolnums<span class="p">]</span> <span class="c1"># i.e. &quot;the rest&quot;</span>
  95. intensity.datacols <span class="o">=</span> <span class="kp">names</span><span class="p">(</span>intensity.lin<span class="p">)[</span>intensity.datacolnums<span class="p">]</span>
  96. <span class="c1"># Take the logarithm of the data</span>
  97. intensity.log <span class="o">&lt;-</span> <span class="kt">data.frame</span><span class="p">(</span>intensity.lin<span class="p">[</span>intensity.maincols<span class="p">],</span><span class="kp">log</span><span class="p">(</span>intensity.lin<span class="p">[</span>intensity.datacols<span class="p">],</span> base <span class="o">=</span> log.base<span class="p">))</span>
  98. <span class="c1"># Separate into random (i.e. background) and data probes</span>
  99. <span class="c1"># Split into &quot;intensity$rand&quot; and &quot;intensity$data&quot;</span>
  100. intensity <span class="o">&lt;-</span> <span class="kp">split</span><span class="p">(</span>x <span class="o">=</span> intensity.log<span class="p">,</span> f <span class="o">=</span> <span class="p">(</span><span class="kp">ifelse</span><span class="p">(</span>intensity.log<span class="o">$</span>GENE_EXPR_OPTION <span class="o">==</span> <span class="s">&quot;RANDOM&quot;</span><span class="p">,</span><span class="s">&quot;rand&quot;</span><span class="p">,</span><span class="s">&quot;data&quot;</span><span class="p">)))</span>
  101. <span class="kp">rm</span><span class="p">(</span>intensity.lin<span class="p">,</span> intensity.log<span class="p">)</span> <span class="c1"># Discard unused stuff to free memory</span>
  102. <span class="c1"># This gives a QQ plot of the data against the noise. I used it to select my cutoffs.</span>
  103. <span class="c1">#qqplot( y=as.vector(as.matrix(intensity$data[sample(1:dim(intensity$data)[1],5000),intensity.datacols])), x=as.vector(as.matrix(intensity$rand[intensity.datacols])), ylab=&quot;Data&quot;, xlab=&quot;Random Control&quot;,main = &quot;Log10 QQ plot of Specific Probe Intensities vs. Random Controls&quot;)</span>
  104. <span class="c1"># Detection (low) threshold is 2 sd above mean random background</span>
  105. intensity.rand.vector <span class="o">=</span> <span class="kp">as.vector</span><span class="p">(</span><span class="kp">as.matrix</span><span class="p">(</span>intensity<span class="o">$</span>rand<span class="p">[</span>intensity.datacols<span class="p">]))</span>
  106. threshold.low <span class="o">=</span> <span class="kp">mean</span><span class="p">(</span>intensity.rand.vector<span class="p">)</span> <span class="o">+</span> <span class="m">2</span><span class="o">*</span>sd<span class="p">(</span>intensity.rand.vector<span class="p">)</span>
  107. <span class="kp">rm</span><span class="p">(</span>intensity.rand.vector<span class="p">)</span>
  108. <span class="c1"># Saturation (high) threshold is 2-fold down from max possible</span>
  109. threshold.high <span class="o">=</span> <span class="kp">log</span><span class="p">(</span><span class="m">65535</span><span class="o">/</span><span class="m">2</span><span class="p">,</span> base<span class="o">=</span>log.base<span class="p">)</span>
  110. <span class="kr">if</span> <span class="p">(</span>threshold.high <span class="o">&lt;=</span> threshold.low<span class="p">)</span> <span class="kp">print</span><span class="p">(</span><span class="s">&quot;Error: low threshold is too high&quot;</span><span class="p">)</span>
  111. <span class="c1"># Compute needed row statistics</span>
  112. intensity<span class="o">$</span>data<span class="o">$</span>MAX <span class="o">&lt;-</span> <span class="kp">apply</span><span class="p">(</span>intensity<span class="o">$</span>data<span class="p">[</span>intensity.datacols<span class="p">],</span><span class="m">1</span><span class="p">,</span><span class="kp">max</span><span class="p">)</span>
  113. <span class="c1"># Actually, we only need the max, it seems. Uncomment these if needed.</span>
  114. <span class="c1">#intensity$data$MIN &lt;- apply(intensity$data[intensity.datacols],1,min)</span>
  115. <span class="c1">#intensity$data$MEAN &lt;- rowMeans(intensity$data[intensity.datacols]) # Don&#39;t need that one</span>
  116. <span class="c1">#intensity$data$SD &lt;- apply(intensity$data[intensity.datacols],1,sd) # Don&#39;t need that one</span>
  117. <span class="c1"># Sort probes into three bins: absent, present, and saturated</span>
  118. <span class="c1"># The integer value of BIN also serves as a rank:</span>
  119. <span class="c1"># present &lt; saturated &lt; absent; lower is better</span>
  120. intensity<span class="o">$</span>data<span class="o">$</span>BIN <span class="o">&lt;-</span> <span class="kp">factor</span><span class="p">(</span><span class="m">1</span> <span class="o">+</span> <span class="p">(</span>intensity<span class="o">$</span>data<span class="o">$</span>MAX <span class="o">&gt;</span> threshold.high<span class="p">)</span> <span class="o">+</span> <span class="p">(</span>intensity<span class="o">$</span>data<span class="o">$</span>MAX <span class="o">&lt;</span> threshold.low<span class="p">)</span> <span class="o">*</span> <span class="m">2</span><span class="p">,</span> labels <span class="o">=</span> <span class="kt">c</span><span class="p">(</span><span class="s">&quot;present&quot;</span><span class="p">,</span> <span class="s">&quot;saturated&quot;</span><span class="p">,</span> <span class="s">&quot;absent&quot;</span><span class="p">))</span>
  121. <span class="c1"># Count how many probes from each probeset (SEQ_ID) went into each bin</span>
  122. <span class="c1"># Also count total probes per set</span>
  123. <span class="c1"># Counting is done by measuring length of data aggregated by SEQ_ID</span>
  124. num.probes <span class="o">&lt;-</span> <span class="kp">lapply</span><span class="p">(</span><span class="kt">c</span><span class="p">(</span><span class="kt">list</span><span class="p">(</span>total<span class="o">=</span>intensity<span class="o">$</span>data<span class="p">),</span><span class="kp">split</span><span class="p">(</span>x <span class="o">=</span> intensity<span class="o">$</span>data<span class="p">,</span> f <span class="o">=</span> intensity<span class="o">$</span>data<span class="o">$</span>BIN<span class="p">)),</span> FUN <span class="o">=</span> <span class="kr">function</span> <span class="p">(</span>y<span class="p">)</span> aggregate<span class="p">(</span>x<span class="o">=</span><span class="kp">rep</span><span class="p">(</span><span class="kc">NA</span><span class="p">,</span><span class="kp">dim</span><span class="p">(</span>y<span class="p">)[</span><span class="m">1</span><span class="p">]),</span>by<span class="o">=</span><span class="kt">list</span><span class="p">(</span>SEQ_ID<span class="o">=</span>y<span class="o">$</span>SEQ_ID<span class="p">),</span>FUN<span class="o">=</span><span class="kp">length</span><span class="p">))</span>
  125. <span class="kr">if</span><span class="p">(</span><span class="kp">mean</span><span class="p">(</span>num.probes<span class="o">$</span>present<span class="o">$</span>x<span class="p">)</span> <span class="o">&lt;</span> <span class="m">3</span><span class="p">)</span> <span class="p">{</span> <span class="kp">print</span><span class="p">(</span><span class="s">&quot;Error: Not enough present probes.&quot;</span><span class="p">)</span> <span class="p">}</span>
  126. <span class="c1"># Something below this needs updating</span>
  127. <span class="c1"># 2 rankings: Nimblegen&#39;s and distance from probeset mean</span>
  128. <span class="c1"># Nimblegen&#39;s rank is read from the design file</span>
  129. <span class="c1"># Probeinfo file</span>
  130. <span class="c1"># This is information parsed from the design file.</span>
  131. <span class="c1"># The design file can&#39;t be used directly because info on probe</span>
  132. <span class="c1"># selection is not in separate columns.</span>
  133. filename.probeinfo <span class="o">=</span> <span class="s">&quot;Design_Files/071031_U_Va_Tobacco_Expr.probeinfo&quot;</span>
  134. probeinfo <span class="o">&lt;-</span> read.table<span class="p">(</span>filename.probeinfo<span class="p">,</span>header<span class="o">=</span><span class="kc">TRUE</span><span class="p">,</span>row.names<span class="o">=</span><span class="s">&quot;PROBE_ID&quot;</span><span class="p">,</span>as.is<span class="o">=</span><span class="s">&quot;SEQ&quot;</span><span class="p">)</span>
  135. <span class="c1"># Use probe names as row names for indexing</span>
  136. <span class="c1">#row.names(probeinfo) &lt;- as.character(probeinfo$PROBE_ID)</span>
  137. <span class="c1"># Add Nimblegen rank to the main data frame</span>
  138. intensity<span class="o">$</span>data<span class="p">[</span><span class="kt">c</span><span class="p">(</span><span class="s">&quot;RANK&quot;</span><span class="p">,</span><span class="s">&quot;SEQ&quot;</span><span class="p">)]</span> <span class="o">&lt;-</span> probeinfo<span class="p">[</span><span class="kp">row.names</span><span class="p">(</span>intensity<span class="o">$</span>data<span class="p">),</span><span class="kt">c</span><span class="p">(</span><span class="s">&quot;RANK&quot;</span><span class="p">,</span><span class="s">&quot;SEQ&quot;</span><span class="p">)]</span>
  139. <span class="c1"># For present probes, rank is based on correlation to probeset mean</span>
  140. <span class="c1"># Read probeset means from calls file</span>
  141. filename.calls <span class="o">=</span> <span class="s">&quot;Processed_Data_Files/Normalized_Calls_Files/All_norm_calls.txt&quot;</span>
  142. calls.lin <span class="o">&lt;-</span> read.table<span class="p">(</span>filename.calls<span class="p">,</span>header<span class="o">=</span><span class="kc">TRUE</span><span class="p">,</span>row.names<span class="o">=</span><span class="s">&quot;SEQ_ID&quot;</span><span class="p">)</span>
  143. <span class="c1"># This line simultaneously logs the data and gives the same column order as the intensity table</span>
  144. probeset.means <span class="o">=</span> <span class="kp">log</span><span class="p">(</span>calls.lin<span class="p">[</span>intensity.datacols<span class="p">],</span> base<span class="o">=</span>log.base<span class="p">)</span>
  145. <span class="kp">rm</span><span class="p">(</span>calls.lin<span class="p">)</span>
  146. <span class="c1"># Collect the relevant data into matrices for efficiency</span>
  147. probes.present.data <span class="o">=</span> <span class="kp">t</span><span class="p">(</span>intensity<span class="o">$</span>data<span class="p">[</span>intensity<span class="o">$</span>data<span class="o">$</span>BIN <span class="o">==</span> <span class="s">&quot;present&quot;</span><span class="p">,</span>intensity.datacols<span class="p">])</span>
  148. probeset.means.data <span class="o">=</span> <span class="kp">t</span><span class="p">(</span>probeset.means<span class="p">[</span>intensity<span class="o">$</span>data<span class="o">$</span>SEQ_ID<span class="p">[</span>intensity<span class="o">$</span>data<span class="o">$</span>BIN <span class="o">==</span> <span class="s">&quot;present&quot;</span><span class="p">],])</span>
  149. <span class="c1"># We invert the correlation so that lower is better</span>
  150. intensity<span class="o">$</span>data<span class="o">$</span>RANK<span class="p">[</span>intensity<span class="o">$</span>data<span class="o">$</span>BIN <span class="o">==</span> <span class="s">&quot;present&quot;</span><span class="p">]</span> <span class="o">&lt;-</span> <span class="kp">sapply</span><span class="p">(</span><span class="m">1</span><span class="o">:</span><span class="kp">dim</span><span class="p">(</span>probes.present.data<span class="p">)[</span><span class="m">2</span><span class="p">],</span><span class="kr">function</span> <span class="p">(</span>x<span class="p">)</span> <span class="p">{</span> <span class="o">-</span>cor<span class="p">(</span>probes.present.data<span class="p">[,</span>x<span class="p">],</span>probeset.means.data<span class="p">[,</span>x<span class="p">])</span> <span class="p">})</span>
  151. <span class="c1"># Done with these</span>
  152. <span class="kp">rm</span><span class="p">(</span><span class="s">&quot;probes.present.data&quot;</span><span class="p">,</span><span class="s">&quot;probeset.means.data&quot;</span><span class="p">)</span>
  153. <span class="c1"># Sort by bin, then rank</span>
  154. intensity<span class="o">$</span>data.ranked <span class="o">=</span> intensity<span class="o">$</span>data<span class="p">[</span><span class="kp">order</span><span class="p">(</span>intensity<span class="o">$</span>data<span class="o">$</span>BIN<span class="p">,</span>intensity<span class="o">$</span>data<span class="o">$</span>RANK<span class="p">),]</span>
  155. <span class="c1"># Split by SEQ_ID</span>
  156. probes.ranked <span class="o">&lt;-</span> <span class="kp">split</span><span class="p">(</span>x<span class="o">=</span><span class="kp">row.names</span><span class="p">(</span>intensity<span class="o">$</span>data.ranked<span class="p">),</span>
  157. f<span class="o">=</span>intensity<span class="o">$</span>data.ranked<span class="o">$</span>SEQ_ID<span class="p">,</span>
  158. drop<span class="o">=</span><span class="kc">TRUE</span><span class="p">)</span>
  159. <span class="c1"># Set the desired number of probes per sequence</span>
  160. num.probes.desired <span class="o">&lt;-</span> <span class="m">3</span>
  161. <span class="c1"># A function to make any vector have length n, by truncating longer ones and padding shorter ones with NA</span>
  162. firstN <span class="o">&lt;-</span> <span class="kr">function</span> <span class="p">(</span>v<span class="p">,</span>n<span class="p">)</span> <span class="kt">c</span><span class="p">(</span>v<span class="p">,</span><span class="kp">rep</span><span class="p">(</span><span class="kc">NA</span><span class="p">,</span>n<span class="p">))[</span><span class="m">1</span><span class="o">:</span>n<span class="p">]</span>
  163. <span class="c1"># A function to take a string and append P1, P2, P3, etc. up to PN.</span>
  164. probenamesN <span class="o">&lt;-</span> <span class="kr">function</span> <span class="p">(</span>s<span class="p">,</span>n<span class="p">)</span> <span class="p">(</span><span class="kp">paste</span><span class="p">(</span>s<span class="p">,</span><span class="s">&quot;P&quot;</span><span class="p">,</span><span class="m">1</span><span class="o">:</span>n<span class="p">,</span>sep<span class="o">=</span><span class="s">&quot;&quot;</span><span class="p">))[</span><span class="m">1</span><span class="o">:</span>n<span class="p">]</span>
  165. probes.selected <span class="o">&lt;-</span> <span class="kt">c</span><span class="p">(</span><span class="kp">sapply</span><span class="p">(</span>probes.ranked<span class="p">,</span>firstN<span class="p">,</span>num.probes.desired<span class="p">))</span>
  166. <span class="kp">names</span><span class="p">(</span>probes.selected<span class="p">)</span> <span class="o">&lt;-</span> <span class="kt">c</span><span class="p">(</span><span class="kp">sapply</span><span class="p">(</span><span class="kp">names</span><span class="p">(</span>probes.ranked<span class="p">),</span>probenamesN<span class="p">,</span>num.probes.desired<span class="p">))</span>
  167. <span class="c1"># Filter NA</span>
  168. probes.selected <span class="o">&lt;-</span> probes.selected<span class="p">[</span><span class="o">!</span><span class="kp">is.na</span><span class="p">(</span>probes.selected<span class="p">)]</span>
  169. probes.selected.fasta <span class="o">&lt;-</span> <span class="kp">paste</span><span class="p">(</span>sep<span class="o">=</span><span class="s">&quot;&quot;</span><span class="p">,</span><span class="s">&quot;&gt;&quot;</span><span class="p">,</span><span class="kp">names</span><span class="p">(</span>probes.selected<span class="p">),</span><span class="s">&quot;\n&quot;</span><span class="p">,</span>intensity<span class="o">$</span>data<span class="p">[</span>probes.selected<span class="p">,</span><span class="s">&quot;SEQ&quot;</span><span class="p">])</span>
  170. <span class="kp">cat</span><span class="p">(</span>sep<span class="o">=</span><span class="s">&quot;\n&quot;</span><span class="p">,</span>probes.selected.fasta<span class="p">,</span>file<span class="o">=</span><span class="s">&quot;selected_probes.fasta&quot;</span><span class="p">)</span>
  171. <span class="kp">save.image</span><span class="p">(</span>file<span class="o">=</span><span class="s">&quot;probesel.rda&quot;</span><span class="p">)</span>
  172. </pre></div>
  173. </body>
  174. </html>