##################################################################
#   MAERlibr - library for interfacing MAExplorer with R         #
#   http://maexplorer.sourceforge.net/                           #
#   $Date: 2003/06/14 01:32:08 $ $Revision: 1.1.1.3 $            # 
#   P. Lemkin (NCI), S. Sundaram (SRA/CIT), G. Thornwall(SAIC),  #
#   G. Alvord (CSS/DMS), M. Lubomirski (CSS/DMS)                 #
##################################################################

##
# mae.readHSfile(file) - read horizontally stacked array expression data
# Args: file is the data file name
# Value: data frame with 
# Value: data frame with
#    hData         - data frame of horizontally stacked data
#    sampleNames   - sample names
#    condNames     - unique condition names
#    allCondNames  - condition names for each sample (factor)
#    nSamples      - # of samples
#    nConds        - # of conditions
#    nGenes        - # of nGenes
#    isVertStacked - FALSE 
##
mae.readHSfile <- function(file)
{ # mae.readHSfile
  sampleNames <- scan(file, sep="\t", what="", nlines=1)
  allCondNames <- scan(file, sep="\t", what="", nlines=1, skip=1)
  condNames <- unique(allCondNames [-1])
  hData <- read.table(file, header=TRUE, row.names=1,skip=2,
                       col.names=sampleNames)
  nSamples <- length(sampleNames)
  nConds <- length(condNames)
  nGenes <- nrow(hData)
  return(hData=hData,sampleNames=sampleNames,condNames=condNames,
         allCondNames=allCondNames,nSamples=nSamples,nConds=nConds,
         nGenes=nGenes,isVertStacked=FALSE)

} # end of mae.readHSfile


##
# mae.readVSfile(file) - read vertically stacked array expression data
# Args: file is the data file name
# Value: data frame with
#    vData         - data frame of vertically stacked data
#    sampleNames   - sample names
#    condNames     - unique condition names
#    allCondNames  - condition names for each sample (factor)
#    nSamples      - # of samples
#    nConds        - # of conditions
#    nGenes        - # of nGenes 
#    isVertStacked - TRUE 
#
# Developer: P. Lemkin, G. Thornwall
# Date: 6/10/2003
##
mae.readVSfile <- function(file)
{ # readVSfile
  vData <- read.table(file, header=TRUE)
  allSampleNames <- factor(vData$sampleName)
  allCondNames <- factor(vData$condName)
  sampleNames <- unique(allSampleNames)
  condNames <- unique(allCondNames)
  nSamples <- length(allSampleNames)
  nConds <- length(condNames)
  nGenes <- sum(vData$sampleName==sampleNames[1])
  return(vData=vData,sampleNames=sampleNames,condNames=condNames,
         allCondNames=allCondNames,nSamples=nSamples,nConds=nConds,
         nGenes=nGenes,isVertStacked=TRUE)
} # end of mae.readVSfile


##
# mae.getVSfromHScondData() - extract vertically stacked condition n from 
# horizontally stacked data frame hTbl where hTbl was 
# read with readHSfile().
# Args: hTbl is the horizontally stacked multi-class table
#       n is the condition to extract from hTbl and stack
# Value: vertically stacked data frame with (scn)
# If it is already vertically stacked, then just return hTbl.
#
# Developer: P. Lemkin, G. Thornwall
# Date: 6/10/2003
##
mae.getVSfromHScondData<- function(hTbl, n)
{ # mae.getVSfromHScondData
  if(hTbl$isVertStacked)
    return(scn=hTbl$hTbl)
  p <- hTbl$hData[hTbl$allCondNames==hTbl$condNames[n]]
  scn <- stack(p)
  return(scn=scn)
} # end of mae.getVSfromHScondData


##
# mae.getCondHData() - extract condition set condition n from
# horizontally stacked data frame hTbl where hTbl was read
# with readHSfile().
# Args: hTbl is the horizontally stacked multi-class table
#       n is the condition to extract from hTbl and stack
# Value: data frame with (condData)
#
# Developer: P. Lemkin, G. Thornwall
# Date: 6/10/2003
##
mae.getCondHData<- function(hTbl, n)
{ # mae.getCondHData
  hT <- hTbl$allCondNames[2:length(hTbl$allCondNames)]==hTbl$condNames[n]
  condData <- hTbl$hData[, hT]
  return(condData=condData)
} # end of mae.getCondHData


##
# mae.getCondVData() - extract condition set condition n vertically
# stacked data in data frame vTbl where vTbl was read 
# with readVSfile().
# Args: vTbl is the vertically stacked multi-class table
#       n is the condition to extract from vTbl and stack
# Value: data frame with (condData)
#
# Developer: P. Lemkin, G. Thornwall
# Date: 6/10/2003
##
mae.getCondVData<- function(vTbl, n)
{ # mae.getCondVData
  vertOK <- vTbl$allCondNames[2:length(vTbl$allCondNames)]==vTbl$condNames[n]
  condData <- vTbl$vData[vertOK,]
  return(condData=condData)
} # end of mae.getCondVDatay


##
# mae.maXYpvalue() - simple function for vectorized 2-sided t-test 
# for 2 condition sets of samples. This simple method assumes 
# equal variance. 
# Parameters:
#   xSet and ySet are [1:nGene,1:nX or 1:nY] matrices or data frames
# @return 2-sided p-value list from doing t.test for each gene
#
# Developer: Mariusz Lubomirski,Greg Alvord, CSS/DMS, NCI-Frederick 
# Date: 04/1/2003
##
mae.maXYpvalue <- function(xSet,ySet) 
{ # mae.maXYpvalue 
  xSet.na.mat <- is.na(xSet);
  ySet.na.mat <- is.na(ySet);

  # Collect numeric values only
  nX <- apply(!xSet.na.mat,1,sum);
  nY <- apply(!ySet.na.mat,1,sum);

  # Calculate degrees of freedom
  ndf <- nX + nY - 2;

  s2 <- ( (nX-1)*apply(xSet,1,var,na.rm=T) + 
          (nY-1)*apply(ySet,1,var,na.rm=T) ) / ndf;
  t.stat <- ( apply(xSet,1,mean,na.rm=T) - 
              apply(ySet,1,mean,na.rm=T) ) / 
            sqrt(s2*(1/nX+1/nY));
  p.val.2.sided <- 2 * (1-pt(abs(t.stat), ndf));

  return(p.value=p.val.2.sided); 
} # end of mae.maXYpvalue 


##
# mae.colorBased3dPlotting() - 3d scatter Plot that will generate coding 
# for groups.
# Parameters:
#   groups - Vector of unique groups
#   group  - Matrix of the group for the rows of data
#   data   - The data matrix
# Developer: Shyam ( shyamss@hotmail.com)
# Date: 05/29/2003
##
mae.colorBased3dPlotting <- function(groups,group,data,
                                     xindex=1,yindex=2,zindex=3,
                                     xlabel="PC1",
                                     ylabel="PC2",
                                     zlabel="PC3",mainTitle,
                                     clr=cbind("Yellow","Red","Blue",
                                               "Green","Magenta",
                                               "Pink","Cyan","Black"))
{ # mae.colorBased3dPlotting 
 if(!require(scatterplot3d))
 {
   print("library scatterplot3d not installed, install it from CRAN")
   return;
 } 

 groups <-as.matrix(groups)
 group <- as.matrix(group)
 data <- as.matrix(data)

 clrLoop <- 0
 
 maxValue <- max(abs(data))
 range <- c(-maxValue,maxValue)
 loop <- 1
 for(x in groups)
 { # x loop
 
  clrLoop <- clrLoop+1
   if (clrLoop > 8)
    clrLoop <- 1
  

  boolX <- as.matrix(group[,1]==x)
   pts <- as.matrix(data[boolX,])
 
   if (loop==1)  
     s3d <- scatterplot3d(pts[,xindex],pts[,yindex],pts[,zindex],
                          color=clr[1,clrLoop],xlab=xlabel,ylab=ylabel,
                          zlab=zlabel,main=mainTitle,pch=20,type="p",
                          box=F,grid=T,xlim= range,ylim=range,zlim=range)
   else
     s3d$points3d(pts[,xindex],pts[,yindex],col=clr[1,clrLoop],
                  pch=20,type="p")
   loop <- loop + 1                      # next group
 } # x loop
} # mae.colorBased3dPlotting 


## 
# mae.ColorBasedMatrixPlotting() - plot to generate coding for groups with color.
# Parameters:
#   uniqGroups - Vector of unique groups
#   group  - Matrix of the group for the rows of x
#   data   - The data matrix
# Shyam(shyamss@hotmail.com)
# Date : 07/18/2002
##
mae.colorBasedMatrixPlotting <- function(uniqGroups ,group, data, xindex=1,
                                         yindex=2, gbgcolor="black",
                                         xlabel="PC1", ylabel="PC2", mainTitle,
                                         xrange=range(data),yrange=range(data))
{ # mae.colorBasedMatrixPlotting
  uniqGroups <- as.matrix(uniqGroups )
  group <- as.matrix(group)
  data <- as.matrix(data)

  loop <- 1
  fgcolor <-"black"
  if(gbgcolor == "white") 
    colors <- cbind("yellow","red","blue","green","magenta","pink",
                    "cyan","black")
  else
    colors <- cbind("yellow","red","blue","green","magenta","pink",
                    "cyan","white")

 clr <- 0
 for( x in uniqGroups )
 { # x_loop
   clr <- clr+1
   if (clr > 8)
    clr <- 1
   boolX <- as.matrix(group[,1]==x)
   pts <- matrix(data[boolX,], nc=ncol(data))
   colors[clr]
   if (loop==1)
   { # main plot 1st time through the loop
     plot(xrange,yrange,col="transparent",xlim=xrange,ylim=yrange,xlab=xlabel,
          ylab=ylabel,main=mainTitle,pch=20,col.lab=fgcolor,col.axis=fgcolor,
          col.main=fgcolor)
     if (gbgcolor == "black")
     {
       rect(xrange[1]-(.039*(xrange[2]-xrange[1])),
            yrange[1]-(.039*(yrange[2]-yrange[1])),
            xrange[2]+(.039*(xrange[2]-xrange[1])), 
            yrange[2]+(.039*(yrange[2]-yrange[1])),
            col=gbgcolor, border=gbgcolor)
     }
   }
   points(pts[,xindex],pts[,yindex], col=colors[clr],pch=20 )
   loop <- loop + 1            # next group
  } # x_loop
} # end of mae.colorBasedMatrixPlotting


##
# mae.writeEigenData() - Write the three Eigen data files from
# simplePCAanalysis() computed prcomp() PCA data frame.
# Args:
#   sPCA - the simplePCAanalysis() computed PCA data frame
#   eigenValFile - name of eigen values data file
#   eigenVecsFile - name of eigen vectors (1:6) data file
#   points3Dfile - name of 3D coordinate points for plotting data file
# Developer: Shyam(shyamss@hotmail.com). Edited by P. Lemkin
# Date : 07/18/2002
##
mae.writeEigenData <- function(sPCA,
                               eigenValFile="EigenValues.out", 
                               eigenVecsFile="EigenVectors.out", 
                               points3Dfile="Points3D.out")
{ # mae.writeEigenData 
  # [1] Output the PCA results
  eigenVals <- sPCA$sdev^2
  write.table(file=eigenValFile, eigenVals,
              sep="\t", eol="\n", row.names=FALSE, col.names=FALSE)
  eigenVec <- sPCA$rotation[,1:6]
  write.table(file=eigenVecsFile, eigenVec,
              sep="\t", eol="\n", row.names=FALSE, col.names=FALSE)

  # [2] Output the 3D data for the points.
  points3D <- cbind(sPCA$x[,1],sPCA$x[,2],sPCA$x[,3])
  write.table(file=points3Dfile, points3D,
              sep="\t", eol="\n", row.names=FALSE, col.names=FALSE)
} # end of mae.writeEigenData 


##
# mae.simplePCAanalysis() - compute PCA on data and generate plots in a PDF file.
# For using the color based plotting function, the (uniqGroups, group)
# are required.
# It creates one PDF file with 7 plots:
#    Variance X Components,  
#    three plots: (PC1 vs PC2, PC2 vs PC3, PC1 vs PC3) for "arrays" or "genes",
#    three sorted Loadings plots for PC1, PC2 and PC3.
# Args:
#   data - the horizontally stacked array data (genes X array samples)
#   plotFile - optional name of the PDF file, else use "simplePCA".
#   mode - defines how the data is to be analyzed for computing PCA:
#      "arrays" - analyzed by gene components (Gene Principal Components)
#      "genes" - analyzed by array components (Array Principal Components)
#   scale.data - apply scaling in prcomp, else do not scale
#   uniqGroups - the unique condition names. 
#   group - the condition names/sample. This vector size = number samples
#   allowableEigenVectors - the number of allowable EigenVectors (default 6)
# Return: pca, eigenVals, eigenVec, points3D, loadings
# Developer: Shyam(shyamss@hotmail.com). Edited by P. Lemkin
# Date : 07/18/2002
#
# Notes:
# 1. This routine does the Principal components analysis(mva) for data either
#    [Genes(rows), Arrays(columns)] or vice versa. This makes use of the PCA
#    algorithm in the "mva" package of "R". We use the prcomp as the preferred
#    as the performance is better than the princomp method. The eigen vectors of
#    of the first two components may have a "-1" difference. This does not 
#    affect the analysis, it is just the coordinate that will have the shift
#    amongst the two methods.
# 2. Generation of the detailed information about the PCA 
#    including an analysis file with more details, pdf file with the plots 
#    File that will contain the 3D data points that can be used by a 3D
#    viewer program.
# 3. Experimenting with the plotting of the coefficients of the variables 
#    across the components.
# 4. Also added to the code is the sorting of returned eigen vector(loading)
#    values, so that for the first three components we know the effect of the
#    original variable.
# 5. When the matrix(rows,columns) based on the mode ( genes/arrays) has the 
#    the condition of the NumberOfRows < NumberOfColumns, the Eigen values 
#    upto NumberOfRows will be non-zero values and the remainder will have 
#    zero/singular values. Please read the following reference for more 
#    information. Numerical recipes(SVD): 
#    http://www.library.cornell.edu/nr/bookcpdf/c2-6.pdf
# 6. Noticed that the number of printable eigen vectors were restricted to the
#    first 6 but in cases when we have less than 6 we need to handle likewise.
# 7. This code is also used in the mAdb system (http://nciarray.nci.nih.gov/)
#    from which it has been adapted.
##
mae.simplePCAanalysis <- function(data, plotFile="simplePCA",
                                  mode="arrays", scale.data= TRUE,  
                                  uniqGroups, group, allowableEigenVectors=6 )
{ # mae.simplePCAanalysis  
  if(!require(mva))
    return("library mva not found")

  # [1] Computing the PCA
  if (mode == "arrays")
  { # for arrays, compute PCA on the transpose
    d1pca <- prcomp(t(data),scale=scale.data)
  }
  else
  { #for genes, compute the PCA
    d1pca <- prcomp(data,scale=scale.data)
  }

  # [2] Compute the number of printable Eigen Vectors
  # We limit it here to 6 which can of course be set to higher
  # values but the first few ( 3 or so ) should account for 
  # maximum variance.
  allowableEigenVectors <- 6
  curEigenVectors <- ncol(d1pca$rotation)

  if (curEigenVectors < allowableEigenVectors)
    nEigenVecs <- min(curEigenVectors,allowableEigenVectors)
  else
    nEigenVecs <-allowableEigenVectors

  # [3] Create simple vectors of the PCA results
  eigenVals <- d1pca$sdev^2
  eigenVec <- d1pca$rotation[,1:6]
  # Create 3D data for the points.
  points3D <- cbind(d1pca$x[,1],d1pca$x[,2],d1pca$x[,3])

  # [4] Sorting the loadings
  L1 <- data.frame(ID=c(1:nrow(d1pca$rotation)), VAL=d1pca$rotation[,1])
  L2 <- data.frame(ID=c(1:nrow(d1pca$rotation)), VAL=d1pca$rotation[,2])
  L3 <- data.frame(ID=c(1:nrow(d1pca$rotation)), VAL=d1pca$rotation[,3])
  L1s <- L1[sort.list(L1$VAL),]
  L2s <- L2[sort.list(L2$VAL),]
  L3s <- L3[sort.list(L3$VAL),]

  loadings <- list(L1s, L2s, L3s)    # for return list

  # [5] Put all analysis plots in one PDF file as 2x2 panels/page.
  # 1. Variance across principal components
  # 2. PC1 vs PC2 (arrays or genes)
  # 3. PC2 vs PC3 (arrays or genes)
  # 4. PC1 vs PC3 (arrays or genes)
  # 5. Loading: coefficient PC1 vs variable effects
  # 6. Loading: coefficient PC2 vs variable effects
  # 7. Loading: coefficient PC3 vs variable effects
  #
  pdf(plotFile,encoding="ISOLatin1",width=12,height=12)
  opar <- par(mfrow=c(2,2),pch=19,lty=2)
  
  maxComp1 <- max(abs(d1pca$x[,1]))
  xrange <- c(-maxComp1,maxComp1)
  yrange <- c(-maxComp1,maxComp1)

 # [5.1] Plot Variance across Principal Components for arrays or genes 
 if (mode == "arrays")
 { # process arrays data  
   plot(d1pca,main="Variance across Principal Components by Arrays")

   # Group Color based plots
   mae.colorBasedMatrixPlotting(uniqGroups,group,d1pca$x,xindex=1,yindex=2,
                                gbgcolor="white", 
                                mainTitle="Arrays: PC1 Vs PC2",
                                xlabel="PC1",ylabel="PC2",
                                xrange=xrange,yrange=yrange)  
   mae.colorBasedMatrixPlotting(uniqGroups,group,d1pca$x,xindex=2,yindex=3,
                                gbgcolor="white", 
                                mainTitle="Arrays: PC2 Vs PC3",
                                xlabel="PC2",ylabel="PC3",
                                xrange=xrange,yrange=yrange)
   mae.colorBasedMatrixPlotting(uniqGroups,group,d1pca$x,xindex=1,yindex=3,
                                gbgcolor="white", 
                                mainTitle="Arrays: PC1 Vs PC3",
                                xlabel="PC1",ylabel="PC3",
                                xrange=xrange,yrange=yrange)
 } # process arrays data

 else
 { # process genes data
   plot(d1pca,main="Variance across Principal Components by Genes")

   plot(d1pca$x[,1],d1pca$x[,2],xlim=xrange,ylim=yrange,
        main="Genes: PC1 Vs PC2",xlab="PC1",ylab="PC2",pch=20)
   plot(d1pca$x[,2],d1pca$x[,3],xlim=xrange,ylim=yrange,
        main="Genes: PC2 Vs PC3",xlab="PC2",ylab="PC3",pch=20)
   plot(d1pca$x[,1],d1pca$x[,3],xlim=xrange,ylim=yrange,
        main="Genes: PC1 Vs PC3",xlab="PC1",ylab="PC3",pch=20)
 } # process genes data

 # [5.2] Plot Sorted loadings
 # Eigen Vectors(contained in the columns of the loadings)
 if (mode == "arrays")
   mainLTitle= "Sorted genes coefficient PC1 loading"
 else
   mainLTitle= "Sorted arrays coefficient PC1 loading"
 plot(L1s$VAL[nrow(L1s):1],xlab="Variable Effect",ylab="Coefficient PC1",
      main=mainLTitle,type="l")
 text(L1s$VAL[nrow(L1s):1],labels=paste(L1s$ID),col=4)

 if (mode == "arrays")
   mainLTitle= "Sorted genes coefficient PC2 loading"
 else
   mainLTitle= "Sorted arrays coefficient PC2 loading"
 plot(L2s$VAL[nrow(L2s):1],xlab="Variable Effect",ylab="Coefficient PC2",
      main=mainLTitle,type="l")
 text(L2s$VAL[nrow(L2s):1],labels=paste(L2s$ID),col=4)

 if (mode == "arrays")
   mainLTitle= "Sorted genes coefficient PC3 loading"
 else
   mainLTitle= "Sorted arrays coefficient PC3 loading"
 plot(L3s$VAL[nrow(L1s):1],xlab="Variable Effect",ylab="Coefficient PC3",
      main=mainLTitle,type="l")
 text(L3s$VAL[nrow(L3s):1],labels=paste(L3s$ID),col=4)

 dev.off();

 return(pca=d1pca, eigenVals=eigenVals, eigenVec=eigenVec,
        nEigenVecs=nEigenVecs, points3D=points3D, loadings=loadings)
} # end of mae.simplePCAanalysis


##
# mae.loessPlot() - generate loess line and fitted plots for 
# for two samples if doPlot is true. The plots include:
#      M vs A (loess line)
#      M vs A (loess fitted) plots
# Parameters:
#   x and y single sample matrices or data frames
#   xName and yName are sample names
#   doPlot = TRUE, if do a plot, else just compute the loess data
# Return:
#   AValue is c(log10(as.matrix(x)) + log10(as.matrix(y)))/2
#   MValue is c(log2(as.matrix(x)/as.matrix(y)))
#   dMvsA is data.frame(ID=(cbind(x,y)[,1]),AVAL=AValue, MVAL=MValue)
#   dMvsA.lo is loess fit of (MValue ~ AValue)
#   fitted is (MValue-dMvsA.lo$fitted) 
#   sNames is c(xName,yName)
#
# Developer: S. Sundaram, P. Lemkin, G. Thornwal
# Date: 06/30/2003
##
mae.loessPlot <- function(x,y,xName,yName,doPlot=TRUE)
{ # mae.loessPlot
  AValue <- c(log10(as.matrix(x)) + log10(as.matrix(y)))/2
  MValue <- c(log2(as.matrix(x)/as.matrix(y)))
  dMvsA <- data.frame(ID=(cbind(x,y)[,1]), AVAL=AValue, MVAL=MValue)
  dMvsA.lo <- loess(MVAL ~ AVAL, dMvsA, degree=1, span=.5)
  samplesTitle <- paste(xName,"vs.",yName)
  #
  mTitleL <- paste("M vs A (loess line for",samplesTitle,")")
  if(doPlot) plot(AValue,MValue,main=mTitleL,xlab="A(log10)",ylab="M(log2)")
  points(AValue,dMvsA.lo$fitted, col="red", pch=18);
  mTitleF <- paste("M vs. A (loess fitted for",samplesTitle,")")
  fitted <- MValue-dMvsA.lo$fitted
  if(doPlot) plot(AValue,fitted,main=mTitleF,xlab="A(log10)", ylab="M(log2)")
  sNames=c(xName,yName)
  return(AValue=AValue, MValue=MValue, dMvsA=dMvsA, dMvsA.lo=dMvsA.lo,
         fitted=fitted, sNames=sNames)
} # end of mae.loessPlot


# --- end of MAERlibr --- #