probe-selection.R 5.8 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117
  1. # We will use base 2 logarithms for now
  2. log.base = 2
  3. # Read the data in from the file
  4. filename.data = "Processed_Data_Files/Normalized_Pair_Files/All_norm_pair.txt"
  5. #filename.data = "Raw_Data_Files/Pair_Files/All_pair.txt"
  6. intensity.lin <- read.table(file=filename.data, header=TRUE,row.names="PROBE_ID")
  7. # Column names and numbers used to categorize the data
  8. intensity.maincols = c("GENE_EXPR_OPTION", "SEQ_ID", "POSITION")
  9. intensity.maincolnums = which(names(intensity.lin) %in% intensity.maincols)
  10. # Column names and numbers containing intensity data
  11. intensity.datacolnums = (1:dim(intensity.lin)[2])[-intensity.maincolnums] # i.e. "the rest"
  12. intensity.datacols = names(intensity.lin)[intensity.datacolnums]
  13. # Take the logarithm of the data
  14. intensity.log <- data.frame(intensity.lin[intensity.maincols],log(intensity.lin[intensity.datacols], base = log.base))
  15. # Separate into random (i.e. background) and data probes
  16. # Split into "intensity$rand" and "intensity$data"
  17. intensity <- split(x = intensity.log, f = (ifelse(intensity.log$GENE_EXPR_OPTION == "RANDOM","rand","data")))
  18. rm(intensity.lin, intensity.log) # Discard unused stuff to free memory
  19. # This gives a QQ plot of the data against the noise. I used it to select my cutoffs.
  20. #qqplot( y=as.vector(as.matrix(intensity$data[sample(1:dim(intensity$data)[1],5000),intensity.datacols])), x=as.vector(as.matrix(intensity$rand[intensity.datacols])), ylab="Data", xlab="Random Control",main = "Log10 QQ plot of Specific Probe Intensities vs. Random Controls")
  21. # Detection (low) threshold is 2 sd above mean random background
  22. intensity.rand.vector = as.vector(as.matrix(intensity$rand[intensity.datacols]))
  23. threshold.low = mean(intensity.rand.vector) + 2*sd(intensity.rand.vector)
  24. rm(intensity.rand.vector)
  25. # Saturation (high) threshold is 2-fold down from max possible
  26. threshold.high = log(65535/2, base=log.base)
  27. if (threshold.high <= threshold.low) print("Error: low threshold is too high")
  28. # Compute needed row statistics
  29. intensity$data$MAX <- apply(intensity$data[intensity.datacols],1,max)
  30. # Actually, we only need the max, it seems. Uncomment these if needed.
  31. #intensity$data$MIN <- apply(intensity$data[intensity.datacols],1,min)
  32. #intensity$data$MEAN <- rowMeans(intensity$data[intensity.datacols]) # Don't need that one
  33. #intensity$data$SD <- apply(intensity$data[intensity.datacols],1,sd) # Don't need that one
  34. # Sort probes into three bins: absent, present, and saturated
  35. # The integer value of BIN also serves as a rank:
  36. # present < saturated < absent; lower is better
  37. intensity$data$BIN <- factor(1 + (intensity$data$MAX > threshold.high) + (intensity$data$MAX < threshold.low) * 2, labels = c("present", "saturated", "absent"))
  38. # Count how many probes from each probeset (SEQ_ID) went into each bin
  39. # Also count total probes per set
  40. # Counting is done by measuring length of data aggregated by SEQ_ID
  41. num.probes <- lapply(c(list(total=intensity$data),split(x = intensity$data, f = intensity$data$BIN)), FUN = function (y) aggregate(x=rep(NA,dim(y)[1]),by=list(SEQ_ID=y$SEQ_ID),FUN=length))
  42. if(mean(num.probes$present$x) < 3) { print("Error: Not enough present probes.") }
  43. # Something below this needs updating
  44. # 2 rankings: Nimblegen's and distance from probeset mean
  45. # Nimblegen's rank is read from the design file
  46. # Probeinfo file
  47. # This is information parsed from the design file.
  48. # The design file can't be used directly because info on probe
  49. # selection is not in separate columns.
  50. filename.probeinfo = "Design_Files/071031_U_Va_Tobacco_Expr.probeinfo"
  51. probeinfo <- read.table(filename.probeinfo,header=TRUE,row.names="PROBE_ID",as.is="SEQ")
  52. # Use probe names as row names for indexing
  53. #row.names(probeinfo) <- as.character(probeinfo$PROBE_ID)
  54. # Add Nimblegen rank to the main data frame
  55. intensity$data[c("RANK","SEQ")] <- probeinfo[row.names(intensity$data),c("RANK","SEQ")]
  56. # For present probes, rank is based on correlation to probeset mean
  57. # Read probeset means from calls file
  58. filename.calls = "Processed_Data_Files/Normalized_Calls_Files/All_norm_calls.txt"
  59. calls.lin <- read.table(filename.calls,header=TRUE,row.names="SEQ_ID")
  60. # This line simultaneously logs the data and gives the same column order as the intensity table
  61. probeset.means = log(calls.lin[intensity.datacols], base=log.base)
  62. rm(calls.lin)
  63. # Collect the relevant data into matrices for efficiency
  64. probes.present.data = t(intensity$data[intensity$data$BIN == "present",intensity.datacols])
  65. probeset.means.data = t(probeset.means[intensity$data$SEQ_ID[intensity$data$BIN == "present"],])
  66. # We invert the correlation so that lower is better
  67. intensity$data$RANK[intensity$data$BIN == "present"] <- sapply(1:dim(probes.present.data)[2],function (x) { -cor(probes.present.data[,x],probeset.means.data[,x]) })
  68. # Done with these
  69. rm("probes.present.data","probeset.means.data")
  70. # Sort by bin, then rank
  71. intensity$data.ranked = intensity$data[order(intensity$data$BIN,intensity$data$RANK),]
  72. # Split by SEQ_ID
  73. probes.ranked <- split(x=row.names(intensity$data.ranked),
  74. f=intensity$data.ranked$SEQ_ID,
  75. drop=TRUE)
  76. # Set the desired number of probes per sequence
  77. num.probes.desired <- 3
  78. # A function to make any vector have length n, by truncating longer ones and padding shorter ones with NA
  79. firstN <- function (v,n) c(v,rep(NA,n))[1:n]
  80. # A function to take a string and append P1, P2, P3, etc. up to PN.
  81. probenamesN <- function (s,n) (paste(s,"P",1:n,sep=""))[1:n]
  82. probes.selected <- c(sapply(probes.ranked,firstN,num.probes.desired))
  83. names(probes.selected) <- c(sapply(names(probes.ranked),probenamesN,num.probes.desired))
  84. # Filter NA
  85. probes.selected <- probes.selected[!is.na(probes.selected)]
  86. probes.selected.fasta <- paste(sep="",">",names(probes.selected),"\n",intensity$data[probes.selected,"SEQ"])
  87. cat(sep="\n",probes.selected.fasta,file="selected_probes.fasta")
  88. save.image(file="probesel.rda")