## BEGIN makeCDFenv.R
##------------------------------------------------------------
## (C) Rafael Irizarry, Wolfgang Huber 2003
##
## Modifications by B. M. Bolstad for binary CDF files Feb 2005
## Note that the binary parser ignores the cdffile object
## altogether and creates much of the structure in the c code
##
##------------------------------------------------------------
##
## Modified on May, 06 2005 to support PlatformDesign
## Benilton Carvalho
##
##
## BC: should improve the notation
## BC: this file refers *only* to **expression** arrays
## BC: for SNP arrays, check makeSNPenv.R
## BC: we're assuming that expression arrays are coming
##     with a tab-delim. sequence file.

## BC: On May 26, I changed the name of the function
##     from make.cdf.env to makeCDFenv (affy has the
##     same function)

## December 9, 2005: Modification of the variable names.
## OLD		MEANING					NEW
## feature_type_1: feature_type (PM/MM)			feature_type
## feature_type_2: calls (for SNP chips)			allele
## feature_type_3: sense/antisense (for SNP chips)		target_strand
## feature_type_4: SNP (for SNP chips)			snp_location
## 
## feature_type_1: feature_type (PM/MM)			feature_type
## feature_type_2: sense/antisense (expr chips)		target_strand
## feature_type_3: SNP (for SNP chips)			snp_location
##
##
## February 28, 2006: Fixed bug on reading expression arrays.
##                    Package now depends on affyio in order to use the parsers

makeCDFenv <- function(filename, seqfile, cdf.path = getwd(),
                       compress = FALSE, verbose = TRUE) {
  
  stopifnot(is.logical(verbose), is.logical(compress),
            is.character(cdf.path), is.character(filename))
  stopifnot(length(filename)==1, length(cdf.path)==1,
            length(verbose)==1, length(compress)==1)

  if(verbose) cat("Reading CDF File\n")

  cdflist <- read.cdffile.list(filename)
  cdfType <- check.cdf.type(file.path(path.expand(cdf.path), filename))
  
  if (cdfType=="text"){
    sizex <- cdflist$Chip$Cols
    sizey <- cdflist$Chip$Rows

##     extractQC <- function(qcList){
##       df <- data.frame(X=qcList$QCCells$x,
##                        Y=qcList$QCCells$y,
##                        feature_set_name="QC",
##                        target_strand=NA,
##                        feature_ID=qcList$QCCells$Atom,
##                        sequence=qcList$QCCells$Probe,
##                        feature_type=NA)
##       return(df)
##     }
##     QC <- do.call("rbind", lapply(cdflist$QC, extractQC))

    extractUnit <- function(unitList){
      df <- data.frame(X=unitList$Unit_Block[[1]]$Unit_Block_Cells$x,
                       Y=unitList$Unit_Block[[1]]$Unit_Block_Cells$y,
                       feature_set_name=unitList$Unit_Block[[1]]$Unit_Block_Cells$Qual,
                       target_strand=unitList$Direction,
                       feature_ID=unitList$Unit_Block[[1]]$Unit_Block_Cells$Atom,
                       sequence=unitList$Unit_Block[[1]]$Unit_Block_Cells$Probe,
                       feature_type=!(unitList$Unit_Block[[1]]$Unit_Block_Cells$pbase==
                                      unitList$Unit_Block[[1]]$Unit_Block_Cells$tbase))
      return(df)
    }
    rv <- do.call("rbind", lapply(cdflist$Unit, extractUnit))
    rm(cdflist)
    gc()
    i1 <- rv$feature_type==TRUE
    rv$feature_type[i1] <- "PM"
    rv$feature_type[!i1] <- "MM"

##    rv <- rbind(QC, units)

    tmp <- expand.grid(Y=0:(sizey-1),X=0:(sizex-1))
    tmp$index <- tmp$Y*sizex + tmp$X + 1
    indexin <- rv$Y*sizex + rv$X + 1
    toinclude <- tmp[ !tmp$index %in% indexin,2:1]
    toinclude$feature_set_name <- "NotIncluded"
    toinclude$target_strand <- NA
    toinclude$feature_ID <- NA
    toinclude$sequence <- NA
    toinclude$feature_type <- NA

    rv <- rbind(rv, toinclude)
    rv <- rv[order(rv$Y, rv$X),]
    rv$order_index <- 1:nrow(rv)
  } else if (cdfType=="xda") {

    Info <- function(cdflist1){
      tmp <- cdflist1$Block[[1]]$UnitInfo
      n <- nrow(tmp)
      tmp$name <- rep(cdflist1$Block[[1]]$Name,n)
      tmp$index.position <- NULL
      tmp$target_strand <- rep(as.integer(cdflist1$UnitHeader[2]),cdflist1$UnitHeader[5])
      return(tmp)
    }

    ExtractCol <- function(UnitList,col){
      UnitList[,col]
    }

##    QCInfo <- function(cdflist1){
##      tmp <- cdflist1$QCUnitInfo
##      tmp$X <- tmp$x
##      tmp$Y <- tmp$y
##      tmp$x <- NULL
##      tmp$y <- NULL
##      tmp$feature_type <- NA
##      tmp$feature_type[tmp$PMFlag==1] <- "PM"
##      tmp$feature_type[tmp$PMFlag==0] <- "MM"
##      tmp$location <- rep(as.integer(cdflist1$QCUnitHeader[1]),length(tmp$X))
##      tmp$ProbeLength <- NULL
##      tmp$PMFlag <- NULL
##      tmp$BGProbeFlag <- NULL
##      return(tmp)
##    }

    ## Reading Chip Description
    sizex <- as.integer(cdflist[[1]][[1]][[3]])
    sizey <- as.integer(cdflist[[1]][[1]][[4]])

    ## Getting Unit Info
    units <- lapply(cdflist$Units,Info)
    X = unlist(lapply(units,ExtractCol,2))
    Y = unlist(lapply(units,ExtractCol,3))

    ## NOT BEING USED RIGHT NOW
    ## Getting QC Unit Info
##    qcunits <- lapply(cdflist$QCUnits,QCInfo)
##    Xqc <- unlist(lapply(qcunits,ExtractCol,1))
##    Yqc <- unlist(lapply(qcunits,ExtractCol,2))
##    ft1qc <- unlist(lapply(qcunits,ExtractCol,3))
##    ft2qc <- rep("control",length(Xqc))
##    ft3qc <- unlist(lapply(qcunits,ExtractCol,4))
    ###########################################
    
    tmp <- expand.grid(Y=0:(sizey-1),X=0:(sizex-1))
    tmp$index <- tmp$Y*sizex + tmp$X + 1
    indexin <- Y*sizex + X + 1
    toinclude <- tmp[ !tmp$index %in% indexin,2:1]
    qtde <- length(toinclude[,1])
##    toinclude$feature_set_name <- rep(NA,qtde)
    toinclude$feature_set_name <- "NotIncluded"
    toinclude$target_strand <- rep(NA,qtde)
    toinclude$pbase <- rep(NA,qtde)
    toinclude$tbase <- rep(NA,qtde)
    toinclude$feature_ID <- rep(NA,qtde)
    sequence <- NA

    rm(tmp,indexin,qtde)
    X <- c(X,toinclude$X)
    Y <- c(Y,toinclude$Y)
    feature_set_name <- as.factor(c(unlist(lapply(units,ExtractCol,6)),toinclude$feature_set_name))
    target_strand = c(unlist(lapply(units,ExtractCol,7)),toinclude$target_strand)
    pbase = as.factor(c(toupper(unlist(lapply(units,ExtractCol,4))),toinclude$pbase))
    tbase = as.factor(c(toupper(unlist(lapply(units,ExtractCol,5))),toinclude$tbase))
    feature_ID = as.integer(c(unlist(lapply(units,ExtractCol,1)),toinclude$feature_ID))
    rv <- data.frame(X, Y, feature_set_name, target_strand, pbase, tbase, feature_ID, sequence)

    rm(toinclude,X,Y)
    pm.or.mm <- pmormm2(rv)
    rv$feature_type[pm.or.mm] <- "PM"
    rv$feature_type[!pm.or.mm] <- "MM"
    rv$feature_type <- as.factor(rv$feature_type)
    rm(pm.or.mm,units)
    rv$pbase <- NULL
    rv$tbase <- NULL

    rv <- rv[order(rv$Y,rv$X),]
    rv$order_index <- 1:nrow(rv)
  } else {
    stop(paste("File format for",filename,"not recognized."))
  }

  rv$target_strand[rv$target_strand==1] <- "Sense"
  rv$target_strand[rv$target_strand==2] <- "Antisense"
  rv$target_strand <- as.factor(rv$target_strand)
  
  # getting sequence **only for PMs**
  indrv <- rv$X + sizex*(rv$Y-1)
  seqinfo <- getTabSeq(seqfile)
  indseq <- seqinfo$X + sizex*(seqinfo$Y-1)
  lines <- match(indrv,indseq)
  rv$sequence <- seqinfo$sequence[lines]
  rv$location <- seqinfo$genomic_location[lines]
  rm(seqinfo,indrv,indseq,lines)

  rv <- rv[order(rv$feature_set_name,rv$feature_type,rv$target_strand,rv$location),]

  rv$feature_set_name <- as.character(rv$feature_set_name)
  rv$sequence <- as.character(rv$sequence)
  
  # BC: (X,Y) starts at (1,1)
  rv$X <- rv$X+1
  rv$Y <- rv$Y+1
  
  gc()

  new("platformDesign",
      featureInfo=assign2env(rv),
      featureTypeDescription = list(
        feature_type=list(
                       description="This column distinguishes PM and MM features.\nAnything else is left as an NA",
                       levels=list(PM="Perfect Match probe", MM="Mismatch probe")
                       ),
        target_strand = list(
                       description = "Target Strandness",
                       levels = list(Sense = "Sense", Antisense = "Antisense")
                       ),
        location = list(
                       description = "Genomic Location"
                       )),
      manufacturer="Affymetrix",
      type="expression",
      nrow=sizey,
      ncol=sizex)

}

assign2env <- function(df){
  ## df is a data.frame
  ## the function will return an environment
  ##    containing the variable in the data.frame

  newenv <- new.env()

  ## putting things in the correct order

  vars <- names(df)
  
  ## it seems that if i don't use rev, the variables
  ##  are stored in the reverse order...
  vars <- rev(vars)
  for (i in 1:length(vars)){
    assign(vars[i],
           eval(parse(text=paste("df$",vars[i],sep=""))),
           newenv)
  }
  return(newenv)
}

## END makeCDFenv.R
