## BEGIN snpCDF.R
#############################################################
#
#  May 2005 James W. MacDonald
#
#  Functions to parse the cdf list that Ben Bolstad's function
#  CDFintoRList() creates. The two files needed are the
#  (binary) .CDF from Affy, and the FASTA probe sequence file
#  also from Affy.
#
#  Output is a list containing:
#  Probe ID
#  X coordinate
#  Y coordinate
#  PM/MM
#  Sequence
#  Sense/antisense
#  SNP position
#
#  Note that these functions don't have any error checking
#  Added to makePlatform design on May 16, 2005
#
#  June 6, 2005 Added probe_id to link PM/MM pairs
#  as well as genomic_location and chromosome to allow
#  localization (and future plotting functions). This requires
#  parsing the Affy .csv annotation file.
#
#  July 19, 2005 Removed the chromosome (but retained the
#  genomic location). Which chromosome the SNP is on will have
#  to come from an annotation package.
#
#  Added parsing of the AFFX probes so the indexing will work
#  out. Will only include the feature_set_name, X, and Y location.
#  The remaining rows will contain NAs.
#
#  July 22, 2005 - modified the object being returned
#  list of lists to not mess up the order and then we can
#  get the chip dimensions. (BC)
#
#  July 25, 2005 Added SNP nucleotide to feature_type_3, as well as
#  "Control" to feature_type_1 for 'Affx' probes
#
#  Also sorted the columns to match the celfiles and added
#  rows with NA where missing.
#
#  August 4, 2005 sorted the feature_ID so that pm() and friends
#  from oligo will extract data in order(target_strand, feature_type_3,
#  feature_type_2)
#
#  November 15, 2005 modified the order of how the information
#  is stored. Now we're following order(feature_set_name, feature_type_1, ...
#  ..., feature_type_4). Therefore, pm does not need to sort everytime,
#  since the information is already sorted in the PDenv. (BC)
#
#  November 15, 2005 modified the names of some variables:
#   feature_type_1: PM/MM
#   feature_type_2: Allele
#   feature_type_3: Target Strandness
#   feature_type_4: SNP Location
#
#
#  December 9, 2005 modified the names of the pdenvs columns (BC):
##  OLD		MEANING					NEW
##  feature_type_1: feature_type (PM/MM)			feature_type
##  feature_type_2: calls (for SNP chips)			allele
##  feature_type_3: sense/antisense (for SNP chips)		target_strand
##  feature_type_4: SNP (for SNP chips)			snp_location
##
##
## July 5, 2006 Added alleleAB and middle_base to PDenv. The alleleAB is a factor
##              that maps the actual allele to 'A' or 'B', consistent with Affy.
##              The middle_base is a factor (AC, AG, AT, CG, CT, GT) that denotes
##              the two choices for the middle base pair for a given probeset.
##  
###########################################################


### BC: Feb 28, 2006
### oligo/makePlatformDesign now depend on affyio
### so, we're always using Ben's code for parsing...

## ## This function needs to be added to makecdfenv
## CDFintoRList <- function (filename, cdf.path = getwd()){
##     .Call("ReadCDFFileIntoRList", file.path(path.expand(cdf.path), 
##         filename), TRUE, PACKAGE = "makePlatformDesign")
## }

isComp <- function(x){
  ## Check to see if SNPs are complementary or not
  if(all(c("C", "G") %in% unique(x)) ||
     all(c("A", "T") %in% unique(x)))
    TRUE else FALSE
}

mmSNP <- function (seq, start = 1, stop = 0) 
  .Call("MP_ncomplementSeq", seq, as.integer(start), as.integer(stop), 
        PACKAGE = "makePlatformDesign")

snpChange <- function(x, base){
  ## Simple function to replace FASTA SNP
  ## letter with the actual SNP base
  tmp <- strsplit(x, NULL)
  if(length(tmp[[1]]) == 51){
    tmp[[1]][26] <- base
  }else{
    tmp[[1]][17] <- base
  }
  tmp
}
subSeq <- function(x, index,  subdir){
  ## The call to rev() is required to keep the
  ## indexing correct; Affy counts from the 5' end
  ## of the *sense* strand, so if the FASTA sequence
  ## is the antisense strand, we have to reverse after
  ## subsetting.
  if(length(x) == 51){
    start <- index - 3
    end  <- index + 21
    if(subdir == 1)
      subseq <- rev(x[start:end])
    if(subdir == 2)
      subseq <- x[start:end]
  }
  ## So far I only know of 51 and 33-mers
  if(length(x) == 33){
    start <- index - 12
    end <- index + 12
    if(subdir == 1)
      subseq <- rev(x[start:end])
    if(subdir == 2)
      subseq <- x[start:end]
  }
  if(length(x) != 33 && length(x) != 51){
    stop(paste("Incorrect sequence length", length(x), sep=" "))
  }
  subseq
}
readFasta <- function(x){
  ## Used to read in sequences from Affy's FASTA file format
  ## x is the name of the _probe_fasta file
  fasta <- scan(x, what = "character", quiet = TRUE)
  ## Put into data frame so it can be subsetted based on row.names
  tmp <- data.frame(I(toupper(fasta[grep("SNP", fasta) + 1])), 
                    row.names=sub(">affx:", "", fasta[grep("SNP", fasta)]))
}

readAnnot <- function(x){
  ## Used to read in genomic location and chromosomal location
  ## data from Affy's .csv annotation file
  head <- scan(x, what = character(0), nlines = 1, sep = ",", quiet = TRUE)
  head <- gsub(" ", "\\.", head)
  if(!all(c("Probe.Set.ID","Chromosome", "Physical.Position") %in% head))
    stop(paste("Incorrect header information in", x,".\n",
               "Is this the correct annotation file?"))
  tmp <- scan(x, what = strsplit(rep("character", length(head)), ""),
              sep = ",", skip = 1, quiet = TRUE, na.strings = "---")
  names(tmp) <- head
  tmp <- data.frame(tmp$Chromosome, as.numeric(as.character(tmp$Physical.Position)),
                    tmp$Allele.A, tmp$Allele.B, row.names = tmp$Probe.Set.ID)
  tmp
}
  
getSeq <- function(x, direction, fullSeq){
  ## Gets probe sequence at the $Block level
  snp <- x$Name
  index <- x$UnitInfo[,4]
  subdir <- x$Header[4]
  goodSeq <- snpChange(fullSeq, snp)
  tmp <- matrix(NA, nr = length(index), nc = 2)
  ## All these gyrations are required because Affy reports the sequence
  ## from either strand of DNA. Which strand they used is determined by which 'direction'
  ## is listed first in the CDF. If direction = 1, then the sequence is the sense(forward)
  ## strand, if direction = 2, the opposite is true.
  if(direction == 1){
    if(subdir == 1){
      for(i in 1:length(index))
        tmp[i,] <- c(paste(complementSeq(subSeq(goodSeq[[1]], index[i], subdir)),
                        collapse = ""), "Antisense")
      
    }
    if(subdir == 2){
      for(i in 1:length(index))
        tmp[i,] <- c(paste(subSeq(goodSeq[[1]], index[i], subdir), collapse = ""),
                     "Sense")
    }
  }
  if(direction == 2){
    if(subdir == 1){
      for(i in 1:length(index))
        tmp[i,] <- c(paste(subSeq(rev(goodSeq[[1]]), index[i], subdir),
                        collapse = ""), "Antisense")
    }
    if(subdir == 2){
      for(i in 1:length(index))
         tmp[i,] <- c(paste(complementSeq(subSeq(rev(goodSeq[[1]]), index[i], subdir)),
                        collapse = ""), "Sense")
    }
  }
  tmp
}
readSNPCDF <- function(cdf, fasta, annot){
  ## Gets probe sequence using SNP .CDF file and an Affy
  ## _probe_fasta file
  ## require(matchprobes, quietly = TRUE)
  ## require(makePlatformDesign, quietly = TRUE)
  x <- read.cdffile.list(cdf)
  ## BC: Fri Jul 22, 2005 added sizex and sizey
  sizex <- as.integer(x[[1]][[1]][[3]])
  sizey <- as.integer(x[[1]][[1]][[4]])
  
  xy2i <- function(x, y){
    sizey * y + x + 1
  }
  
  seqFrame <- readFasta(fasta)
  locDat <- readAnnot(annot)
  ## Get locations of AFFX and SNP probes
  numaffx <- grep("AFFX", x$UnitNames)
  numsnps <- grep("SNP", x$UnitNames)

  ## Allocate vectors long enough for all data
  len <- sizex * sizey
  FeatureID <- character(len)
  X <- numeric(len)
  Y <- numeric(len)
  PMMM <- character(len)
  Sequence <- character(len)
  SenseAntisense <- character(len)
  SNP <- numeric(len)
  ProbeID <- numeric(len)
  GeneLoc <- numeric(len)
  calls <- character(len)
  Allele <- character(len)
  midbase <- character(len)
  
  cat("Creating environment")
  ## PM and MM info for the AFFX probesets
  for(j in numaffx){
    tmp <- do.call("rbind", lapply(x$Units[[j]]$Block, function(x) x$UnitInfo[,2:3]))
    len <- dim(tmp)[1]
    index <- xy2i(tmp[,1], tmp[,2])
    FeatureID[index] <- rep(x$UnitNames[j], len)
    X[index] <- tmp[,1] + 1
    Y[index] <- tmp[,2] + 1
    
    ## No info for these probes, use NA as placeholder
    PMMM[index] <- "control"
    Sequence[index] <- NA
    SenseAntisense[index] <- NA
    ProbeID[index] <- NA
    GeneLoc[index] <- NA
    SNP[index] <- NA
    calls[index] <- NA
    Allele[index] <- NA
    midbase[index] <- NA
    if(j%%1000 == 0)
      cat(".")
  }
  for(j in numsnps){
    ## Get sequence information
    fullSeq <- seqFrame[x$UnitNames[j],]
    direction <- x$Units[[j]]$Block[[1]]$Header[4]
    sequence <- do.call("rbind", lapply(x$Units[[j]]$Block, function(x) getSeq(x, direction, fullSeq)))
    ## Get PM/MM, X, Y, and ProbeIDs
    tmp <- do.call("rbind", lapply(x$Units[[j]]$Block, function(x) x$UnitInfo[,2:4]))
    len <- dim(tmp)[1]
    ## Extract the SNP nucleotides
    snp.nuc <- unlist(mapply(rep, sapply(x$Units[[j]]$Block, function(x) x$Name),
                             sapply(x$Units[[j]]$Block, function(x) length(x$UnitInfo[,2])),
                             SIMPLIFY = FALSE), use.names = FALSE)
    ## Get SNP position
    snp.pos <- ifelse(sequence[,2] == "Antisense",tmp[,3] - 4, 30 - tmp[,3])
    ## Get Allele (A or B)
    if(snp.nuc[1] == locDat[x$UnitNames[j], 3])
      allele <- ifelse(snp.nuc == locDat[x$UnitNames[j], 3], "A","B")
    else
      allele <- ifelse(snp.nuc == complementSeq(as.character(locDat[x$UnitNames[j], 3])),
                       "A","B")
    ## Get base pairs for middle base
    mid.base <- paste(sort(unique(snp.nuc)), collapse="")
    ## Order things to figure out PMs and MMs - Note that using pbase == tbase
    ## won't work here because if the PM is at the SNP position and the tbase == SNP
    ## then tbase != pbase for both PM and MM
    #org.ord <- 1:len
    #mix <- org.ord[order(tmp[,2], tmp[,1])]
    #pms <- rep(c("PM","MM"), len/2)[order(mix)]
    a <- by(tmp[,2], rep(1:(len/2), each = 2), order)
    pms <- ifelse(unlist(a) == 1, "PM", "MM")
    ## Correct the MM base in the sequences
    mm.ind <- which(pms == "MM")
    sequence[mm.ind,1] <- ifelse(isComp(snp.nuc[mm.ind]) & snp.pos[mm.ind] == 13,
                                 mmSNP(sequence[mm.ind,1], start = 13, stop = 13),
                                 complementSeq(sequence[mm.ind,1], start = 13, stop = 13))
    ## add numeric indicator that links each PM to the corresponding MM
    nums <- rep(1:20, each = 2)
    ## This is probably the slowest part of the function
    index <- xy2i(tmp[,1], tmp[,2])
    FeatureID[index] <- rep(x$UnitNames[j], len)
    X[index] <- tmp[,1] + 1
    Y[index] <- tmp[,2] + 1
    PMMM[index] <- pms
    Sequence[index] <- sequence[,1]
    SenseAntisense[index] <- sequence[,2]
    GeneLoc[index] <- rep(locDat[x$UnitNames[j], 2], len)
    calls[index] <- snp.nuc
    SNP[index] <- snp.pos
    Allele[index] <- allele
    midbase[index] <- mid.base
    ## order ProbeIDs so probes are extracted in correct order
    p.indx <- 1:len %% 2 == 0
    p.ord <- order(order(snp.nuc[p.indx], sequence[,2][p.indx],snp.pos[p.indx]))
    ##    p.ord <- order(order(sequence[,2][p.indx], snp.nuc[p.indx], snp.pos[p.indx]))
    ProbeID[index] <- rep(p.ord, each = 2)
    if(j%%1000 == 0)
      cat(".")
  }
  ## Set oligo-B2 probes to NA
  index <- X == 0
  X <- rep(1:sizex, each = 1, times = sizey)
  Y <- rep(1:sizex, each = sizey, times = 1)
##  FeatureID[index] <- NA

  ## new eSet breaks if name is NA :/
  FeatureID[index] <- "NotIncluded"
  PMMM[index] <- "control"
  Sequence[index] <- NA 
  SenseAntisense[index] <- NA
  ProbeID[index] <- NA
  GeneLoc[index] <- NA
  calls[index] <- NA
  SNP[index] <- NA
  Allele[index] <- NA
  midbase[index] <- NA

  out <- data.frame("middle_base" = midbase,
                    "alleleAB" = Allele,
                    "snp_location" = SNP,
                    "target_strand" = SenseAntisense,
                    "allele" = calls,
                    "feature_type" = PMMM,
                    "genomic_location" = GeneLoc,
                    "sequence" = I(Sequence),
                    "feature_ID" = ProbeID,
                    "feature_set_name" = FeatureID,
                    "Y" = Y,
                    "X" = X)
  nlines <- nrow(out)
  out$order_index <- 1:nlines
  out <- out[order(out$feature_set_name,out$feature_type,out$allele,out$target_strand,out$snp_location),]
  rownames(out) <- 1:nlines
  cat("Done.\n")
  
  return(list(out, list("ncol" = sizex,
                        "nrow" = sizey)))
  
}

## END snpCDF.R
