# We will use base 2 logarithms for now
log.base = 2

# Read the data in from the file
filename.data = "Processed_Data_Files/Normalized_Pair_Files/All_norm_pair.txt"
#filename.data = "Raw_Data_Files/Pair_Files/All_pair.txt"
intensity.lin <- read.table(file=filename.data, header=TRUE, row.names="PROBE_ID")

# Column names and numbers used to categorize the data
intensity.maincols = c("GENE_EXPR_OPTION", "SEQ_ID", "POSITION")
intensity.maincolnums = which(names(intensity.lin) %in% intensity.maincols)

# Column names and numbers containing intensity data
intensity.datacolnums = (1:dim(intensity.lin)[2])[-intensity.maincolnums] # i.e. "the rest"
intensity.datacols = names(intensity.lin)[intensity.datacolnums]

# Take the logarithm of the data
intensity.log <- data.frame(intensity.lin[intensity.maincols],
                            log(intensity.lin[intensity.datacols], base = log.base))

# Separate into random (i.e. background) and data probes:
# split into "intensity$rand" and "intensity$data"
intensity <- split(x = intensity.log,
                   f = ifelse(intensity.log$GENE_EXPR_OPTION == "RANDOM", "rand", "data"))
rm(intensity.lin, intensity.log) # Discard unused objects to free memory

# This gives a QQ plot of the data against the noise. I used it to select my cutoffs.
#qqplot(y = as.vector(as.matrix(intensity$data[sample(1:dim(intensity$data)[1], 5000),
#                                              intensity.datacols])),
#       x = as.vector(as.matrix(intensity$rand[intensity.datacols])),
#       ylab = "Data", xlab = "Random Control",
#       main = paste0("Log", log.base,
#                     " QQ plot of Specific Probe Intensities vs. Random Controls"))

# Detection (low) threshold is 2 sd above the mean random background
intensity.rand.vector = as.vector(as.matrix(intensity$rand[intensity.datacols]))
threshold.low = mean(intensity.rand.vector) + 2*sd(intensity.rand.vector)
rm(intensity.rand.vector)

# Saturation (high) threshold is 2-fold down from the maximum possible
# intensity (65535 on a 16-bit scanner)
threshold.high = log(65535/2, base=log.base)
if (threshold.high <= threshold.low) stop("Low threshold is above the high threshold")

# Compute needed row statistics
intensity$data$MAX <- apply(intensity$data[intensity.datacols], 1, max)
# Actually, we only need the max, it seems. Uncomment these if needed.
#intensity$data$MIN <- apply(intensity$data[intensity.datacols], 1, min)
#intensity$data$MEAN <- rowMeans(intensity$data[intensity.datacols]) # Don't need that one
#intensity$data$SD <- apply(intensity$data[intensity.datacols], 1, sd) # Don't need that one

# Sort probes into three bins: absent, present, and saturated.
# The integer value of BIN also serves as a rank:
# present < saturated < absent; lower is better.
# Levels are given explicitly so the labels stay correct even if a bin is empty.
intensity$data$BIN <- factor(1 + (intensity$data$MAX > threshold.high)
                               + (intensity$data$MAX < threshold.low) * 2,
                             levels = 1:3,
                             labels = c("present", "saturated", "absent"))

# Count how many probes from each probeset (SEQ_ID) went into each bin,
# and also the total probes per set. Counting is done by measuring the
# length of the data aggregated by SEQ_ID.
num.probes <- lapply(c(list(total = intensity$data),
                       split(x = intensity$data, f = intensity$data$BIN)),
                     FUN = function (y) aggregate(x = rep(NA, dim(y)[1]),
                                                  by = list(SEQ_ID = y$SEQ_ID),
                                                  FUN = length))
if (mean(num.probes$present$x) < 3) stop("Not enough present probes.")
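
# Not part of the original pipeline: a quick, optional sanity check of the
# binning, as a minimal sketch using only base R ("bin.summary" is my own
# name, not from the original). table() gives the overall bin sizes, and
# merging the per-probeset counts from num.probes shows how many present
# probes each probeset contributes.
print(table(intensity$data$BIN))
bin.summary <- merge(num.probes$total, num.probes$present,
                     by = "SEQ_ID", all.x = TRUE,
                     suffixes = c(".total", ".present"))
print(head(bin.summary))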

# Something below this needs updating

# Two rankings: Nimblegen's, and correlation with the probeset mean.
# Nimblegen's rank is read from the design file.

# Probeinfo file: this is information parsed from the design file.
# The design file can't be used directly because info on probe
# selection is not in separate columns.
filename.probeinfo = "Design_Files/071031_U_Va_Tobacco_Expr.probeinfo"
probeinfo <- read.table(filename.probeinfo, header=TRUE,
                        row.names="PROBE_ID", as.is="SEQ")
# Probe names are already the row names, for indexing
#row.names(probeinfo) <- as.character(probeinfo$PROBE_ID)

# Add the Nimblegen rank to the main data frame
intensity$data[c("RANK","SEQ")] <- probeinfo[row.names(intensity$data), c("RANK","SEQ")]

# For present probes, rank is based on correlation to the probeset mean.
# Read probeset means from the calls file.
filename.calls = "Processed_Data_Files/Normalized_Calls_Files/All_norm_calls.txt"
calls.lin <- read.table(filename.calls, header=TRUE, row.names="SEQ_ID")
# This simultaneously logs the data and gives the same column order as the intensity table
probeset.means = log(calls.lin[intensity.datacols], base=log.base)
rm(calls.lin)

# Collect the relevant data into matrices for efficiency.
# as.character() makes sure the row lookup is by name; indexing with a
# factor would silently use its integer codes instead.
probes.present.data = t(intensity$data[intensity$data$BIN == "present", intensity.datacols])
probeset.means.data = t(probeset.means[as.character(
    intensity$data$SEQ_ID[intensity$data$BIN == "present"]), ])

# We negate the correlation so that lower is better
intensity$data$RANK[intensity$data$BIN == "present"] <-
    sapply(1:dim(probes.present.data)[2],
           function (x) -cor(probes.present.data[,x], probeset.means.data[,x]))

# Done with these
rm(probes.present.data, probeset.means.data)

# Sort by bin, then rank
intensity$data.ranked = intensity$data[order(intensity$data$BIN, intensity$data$RANK),]

# Split by SEQ_ID
probes.ranked <- split(x = row.names(intensity$data.ranked),
                       f = intensity$data.ranked$SEQ_ID, drop=TRUE)

# Set the desired number of probes per sequence
num.probes.desired <- 3

# A function to force any vector to length n, truncating longer ones
# and padding shorter ones with NA
firstN <- function (v, n) c(v, rep(NA, n))[1:n]

# A function to take a string and append P1, P2, P3, etc. up to Pn
probenamesN <- function (s, n) paste(s, "P", 1:n, sep="")

probes.selected <- c(sapply(probes.ranked, firstN, num.probes.desired))
names(probes.selected) <- c(sapply(names(probes.ranked), probenamesN, num.probes.desired))

# Filter out the NA padding from probesets with fewer than num.probes.desired probes
probes.selected <- probes.selected[!is.na(probes.selected)]

# Write the selected probes out in FASTA format
probes.selected.fasta <- paste(sep="", ">", names(probes.selected), "\n",
                               intensity$data[probes.selected, "SEQ"])
cat(sep="\n", probes.selected.fasta, file="selected_probes.fasta")
save.image(file="probesel.rda")
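
# Not part of the original pipeline: a quick, optional check of the output,
# as a minimal sketch ("fasta.lines" and "n.records" are my own names).
# Re-read selected_probes.fasta and confirm that the number of FASTA records
# matches the number of selected probes.
fasta.lines <- readLines("selected_probes.fasta")
n.records <- sum(substr(fasta.lines, 1, 1) == ">")
if (n.records != length(probes.selected))
    warning("FASTA record count does not match the number of selected probes")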