## BEGIN snpTextCDF.R
#############################################################
#
#  May 2005 James W. MacDonald
#
#  Functions to parse the cdf list that Ben Bolstad's function
#  CDFintoRList() creates. The two files needed are the
#  (binary) .CDF from Affy, and the FASTA probe sequence file
#  also from Affy.
#
#  Output is a list containing:
#  Probe ID
#  X coordinate
#  Y coordinate
#  PM/MM
#  Sequence
#  Sense/antisense
#  SNP position
#
#  Note that these functions don't have any error checking
#  Added to makePlatform design on May 16, 2005
#
#  June 6, 2005 Added probe_id to link PM/MM pairs
#  as well as genomic_location and chromosome to allow
#  localization (and future plotting functions). This requires
#  parsing the Affy .csv annotation file.
#
#  July 19, 2005 Removed the chromosome (but retained the
#  genomic location). Which chromosome the SNP is on will have
#  to come from an annotation package.
#
#  Added parsing of the AFFX probes so the indexing will work
#  out. Will only include the feature_set_name, X, and Y location.
#  The remaining rows will contain NAs.
#
#  July 25, 2005 Added SNP nucleotide to feature_type_3, as well as
#  "Control" to feature_type_1 for 'Affx' probes
#
#  Also sorted the columns to match the celfiles and added
#  rows with NA where missing.
#
#  August 4, 2005 sorted the feature_ID so that pm() and friends
#  from oligo will extract data in order(target_strand, feature_type_3,
#  feature_type_2)
#
#  December 9, 2005 modified the names of the pdenvs columns (BC):
##  OLD		MEANING					NEW
##  feature_type_1: feature_type (PM/MM)			feature_type
##  feature_type_2: calls (for SNP chips)			allele
##  feature_type_3: sense/antisense (for SNP chips)		target_strand
##  feature_type_4: SNP (for SNP chips)			snp_location
##  
###########################################################

####################################
### BC: SEPTEMBER 2005
### ADAPTED FROM James MacDonald
### to read txt CDF (snp chips)
### needs more tests and then i'll
### make the code more integrated
### to the package
####################################

readSNPtxtCDF <- function(cdf, fasta, annot){
  ## Gets probe sequence using SNP .CDF file and an Affy
  ## _probe_fasta file
  ## require(matchprobes, quietly = TRUE)
  ## require(makePlatformDesign, quietly = TRUE)
  x <- txtCDFintoRList(cdf)
  ## BC: Fri Jul 22, 2005 added sizex and sizey
  sizex <- as.integer(x$Chip$Cols)
  sizey <- as.integer(x$Chip$Rows)
  
  xy2i <- function(x, y){
    sizey * y + x + 1
  }
  
  seqFrame <- readFasta(fasta)
  locDat <- readAnnot(annot)
  ## Get locations of AFFX and SNP probes
  x$UnitNames <- sapply(x$Unit, function(obj) obj$Name)
  numaffx <- grep("AFFX", x$UnitNames)
  numsnps <- grep("SNP", x$UnitNames)

  ## Allocate vectors long enough for all data
  len <- sizex * sizey
  FeatureID <- character(len)
  X <- numeric(len)
  Y <- numeric(len)
  PMMM <- character(len)
  Sequence <- character(len)
  SenseAntisense <- character(len)
  SNP <- numeric(len)
  ProbeID <- numeric(len)
  GeneLoc <- numeric(len)
  calls <- character(len)
  
  cat("Creating environment")
  ## PM and MM info for the AFFX probesets
  for(j in numaffx){
    tmp <- do.call("rbind", lapply(x$Unit[[j]]$Unit_Block, function(obj) obj$Unit_Block_Cells[,1:2]))
    len <- nrow(tmp)
    
    ## To get indexes for txt CDFs, it suffices:
    ## do.call("rbind", lapply(x$Unit[[j]]$Unit_Block, function(obj) obj$Unit_Block_Cells[,12]))+1
    ## Not doing that for now to try to unify the codes
    index <- xy2i(tmp[,1], tmp[,2])

    FeatureID[index] <- rep(x$UnitNames[j], len)
    X[index] <- tmp[,1] + 1
    Y[index] <- tmp[,2] + 1
    
    ## No info for these probes, use NA as placeholder
    PMMM[index] <- "control"
    Sequence[index] <- NA
    SenseAntisense[index] <- NA
    ProbeID[index] <- NA
    GeneLoc[index] <- NA
    SNP[index] <- NA
    calls[index] <- NA
    if(j%%1000 == 0)
      cat(".")
  }
  txt2xba <- function(tt){
    Header <- c(n.atoms = tt$NumAtoms,
                n.cells = tt$NumCells,
                n.cellsperatom = 2,
                Direction = tt$Direction,
                firstatom = tt$StartPosition - 1,
                unused = NA)
    Name <- tt$Name
    UnitInfo <- tt$Unit_Block_Cells[,c(11,1:2,6,9:10)]
    UnitInfo[,5] <- tolower(UnitInfo[,5])
    UnitInfo[,6] <- tolower(UnitInfo[,6])
    names(UnitInfo) <- c("atom.number","x","y","index.position","pbase","tbase")
    return(list(Header = Header, Name = Name, UnitInfo = UnitInfo))
  }
  for(j in numsnps){
    ## Get sequence information
    fullSeq <- seqFrame[x$UnitNames[j],]
    
    ## BC: Why the direction of the 1st block?
    ##     Would it work the direction on the block header?
    direction <- x$Unit[[j]]$Unit_Block[[1]]$Direction
    sequence <- do.call("rbind",
                        lapply(x$Unit[[j]]$Unit_Block,
                               function(x){
                                 x <- txt2xba(x)
                                 if (!is.na(fullSeq) & !is.na(direction)){
                                   return(getSeq(x, direction, fullSeq))
                                 }else{
                                   nrows <- nrow(x$UnitInfo)
                                   return(matrix(NA,ncol=2,nrow=nrows))
                                 }
                               }))

    ## Get PM/MM, X, Y, and ProbeIDs
    tmp <- do.call("rbind", lapply(x$Unit[[j]]$Unit_Block, function(x) x$Unit_Block_Cells[,c(1:2,6)]))
    len <- nrow(tmp)
    ## Extract the SNP nucleotides
    snp.nuc <- unlist(mapply(rep, sapply(x$Unit[[j]]$Unit_Block, function(x) x$Name),
                             sapply(x$Unit[[j]]$Unit_Block, function(x) length(x$Unit_Block_Cells[,1])),
                             SIMPLIFY = FALSE), use.names = FALSE)
    ## Get SNP position
    snp.pos <- ifelse(sequence[,2] == "Antisense",tmp[,3] - 4, 30 - tmp[,3])
    ## Order things to figure out PMs and MMs - Note that using pbase == tbase
    ## won't work here because if the PM is at the SNP position and the tbase == SNP
    ## then tbase != pbase for both PM and MM
    org.ord <- 1:len
    mix <- org.ord[order(tmp[,2], tmp[,1])]
    pms <- rep(c("PM","MM"), len/2)[order(mix)]
    ## Correct the MM base in the sequences
    mm.ind <- which(pms == "MM")
    if ( all(!is.na(sequence)))
      sequence[mm.ind,1] <- ifelse(isComp(snp.nuc[mm.ind]) & snp.pos[mm.ind] == 13,
                                 mmSNP(sequence[mm.ind,1], start = 13, stop = 13),
                                 complementSeq(sequence[mm.ind,1], start = 13, stop = 13))
    ## add numeric indicator that links each PM to the corresponding MM
    nums <- rep(1:20, each = 2)
    ## This is probably the slowest part of the function
    index <- xy2i(tmp[,1], tmp[,2])
    FeatureID[index] <- rep(x$UnitNames[j], len)
    X[index] <- tmp[,1] + 1
    Y[index] <- tmp[,2] + 1
    PMMM[index] <- pms
    Sequence[index] <- sequence[,1]
    SenseAntisense[index] <- sequence[,2]
    GeneLoc[index] <- rep(locDat[x$UnitNames[j], 2], len)
    calls[index] <- snp.nuc
    SNP[index] <- snp.pos
    ## order ProbeIDs so probes are extracted in correct order
    p.indx <- 1:len %% 2 == 0
    p.ord <- order(order(snp.nuc[p.indx], sequence[,2][p.indx],snp.pos[p.indx]))
    ##    p.ord <- order(order(sequence[,2][p.indx], snp.nuc[p.indx], snp.pos[p.indx]))
    ProbeID[index] <- rep(p.ord, each = 2)
    if(j%%1000 == 0)
      cat(".")
  }
  ## Set oligo-B2 probes to NA
  index <- X == 0
  X <- rep(1:sizex, each = 1, times = sizey)
  Y <- rep(1:sizex, each = sizey, times = 1)
  FeatureID[index] <- "NotIncluded"
  PMMM[index] <- "control"
  Sequence[index] <- NA 
  SenseAntisense[index] <- NA
  ProbeID[index] <- NA
  GeneLoc[index] <- NA
  calls[index] <- NA
  SNP[index] <- NA

  out <- data.frame("snp_location" = SNP,
                    "target_strand" = SenseAntisense,
                    "allele" = calls,
                    "feature_type" = PMMM,
                    "genomic_location" = GeneLoc,
                    "sequence" = I(Sequence),
                    "feature_ID" = ProbeID,
                    "feature_set_name" = FeatureID,
                    "Y" = Y,
                    "X" = X)
  nlines <- nrow(out)
  out$order_index <- 1:nlines
  out <- out[order(out$feature_set_name,out$feature_type,out$allele,out$target_strand,out$snp_location),]
  rownames(out) <- 1:nlines
  cat("Done.\n")
  
  ## BC: Fri Jul 22, 2005 - modified the object being returned
  ##     list of lists to not mess up the order and then we can
  ##     get the chip dimensions
  return(list(out, list("ncol" = sizex, "nrow" = sizey)))
  
}

## END snpTextCDF.R
