################################################################## # MAERlibr - library for interfacing MAExplorer with R # # http://maexplorer.sourceforge.net/ # # $Date: 2003/06/14 01:32:08 $ $Revision: 1.1.1.3 $ # # P. Lemkin (NCI), S. Sundaram (SRA/CIT), G. Thornwall(SAIC), # # G. Alvord (CSS/DMS), M. Lubomirski (CSS/DMS) # ################################################################## ## # mae.readHSfile(file) - read horizontally stacked array expression data # Args: file is the data file name # Value: data frame with # Value: data frame with # hData - data frame of horizontally stacked data # sampleNames - sample names # condNames - unique condition names # allCondNames - condition names for each sample (factor) # nSamples - # of samples # nConds - # of conditions # nGenes - # of nGenes # isVertStacked - FALSE ## mae.readHSfile <- function(file) { # mae.readHSfile sampleNames <- scan(file, sep="\t", what="", nlines=1) allCondNames <- scan(file, sep="\t", what="", nlines=1, skip=1) condNames <- unique(allCondNames [-1]) hData <- read.table(file, header=TRUE, row.names=1,skip=2, col.names=sampleNames) nSamples <- length(sampleNames) nConds <- length(condNames) nGenes <- nrow(hData) return(hData=hData,sampleNames=sampleNames,condNames=condNames, allCondNames=allCondNames,nSamples=nSamples,nConds=nConds, nGenes=nGenes,isVertStacked=FALSE) } # end of mae.readHSfile ## # mae.readVSfile(file) - read vertically stacked array expression data # Args: file is the data file name # Value: data frame with # vData - data frame of vertically stacked data # sampleNames - sample names # condNames - unique condition names # allCondNames - condition names for each sample (factor) # nSamples - # of samples # nConds - # of conditions # nGenes - # of nGenes # isVertStacked - TRUE # # Developer: P. Lemkin, G. Thornwall # Date: 6/10/2003 ## mae.readVSfile <- function(file) { # readVSfile vData <- read.table(file, header=TRUE) allSampleNames <- factor(vData$sampleName) allCondNames <- factor(vData$condName) sampleNames <- unique(allSampleNames) condNames <- unique(allCondNames) nSamples <- length(allSampleNames) nConds <- length(condNames) nGenes <- sum(vData$sampleName==sampleNames[1]) return(vData=vData,sampleNames=sampleNames,condNames=condNames, allCondNames=allCondNames,nSamples=nSamples,nConds=nConds, nGenes=nGenes,isVertStacked=TRUE) } # end of mae.readVSfile ## # mae.getVSfromHScondData() - extract vertically stacked condition n from # horizontally stacked data frame hTbl where hTbl was # read with readHSfile(). # Args: hTbl is the horizontally stacked multi-class table # n is the condition to extract from hTbl and stack # Value: vertically stacked data frame with (scn) # If it is already vertically stacked, then just return hTbl. # # Developer: P. Lemkin, G. Thornwall # Date: 6/10/2003 ## mae.getVSfromHScondData<- function(hTbl, n) { # mae.getVSfromHScondData if(hTbl$isVertStacked) return(scn=hTbl$hTbl) p <- hTbl$hData[hTbl$allCondNames==hTbl$condNames[n]] scn <- stack(p) return(scn=scn) } # end of mae.getVSfromHScondData ## # mae.getCondHData() - extract condition set condition n from # horizontally stacked data frame hTbl where hTbl was read # with readHSfile(). # Args: hTbl is the horizontally stacked multi-class table # n is the condition to extract from hTbl and stack # Value: data frame with (condData) # # Developer: P. Lemkin, G. Thornwall # Date: 6/10/2003 ## mae.getCondHData<- function(hTbl, n) { # mae.getCondHData hT <- hTbl$allCondNames[2:length(hTbl$allCondNames)]==hTbl$condNames[n] condData <- hTbl$hData[, hT] return(condData=condData) } # end of mae.getCondHData ## # mae.getCondVData() - extract condition set condition n vertically # stacked data in data frame vTbl where vTbl was read # with readVSfile(). # Args: vTbl is the vertically stacked multi-class table # n is the condition to extract from vTbl and stack # Value: data frame with (condData) # # Developer: P. Lemkin, G. Thornwall # Date: 6/10/2003 ## mae.getCondVData<- function(vTbl, n) { # mae.getCondVData vertOK <- vTbl$allCondNames[2:length(vTbl$allCondNames)]==vTbl$condNames[n] condData <- vTbl$vData[vertOK,] return(condData=condData) } # end of mae.getCondVDatay ## # mae.maXYpvalue() - simple function for vectorized 2-sided t-test # for 2 condition sets of samples. This simple method assumes # equal variance. # Parameters: # xSet and ySet are [1:nGene,1:nX or 1:nY] matrices or data frames # @return 2-sided p-value list from doing t.test for each gene # # Developer: Mariusz Lubomirski,Greg Alvord, CSS/DMS, NCI-Frederick # Date: 04/1/2003 ## mae.maXYpvalue <- function(xSet,ySet) { # mae.maXYpvalue xSet.na.mat <- is.na(xSet); ySet.na.mat <- is.na(ySet); # Collect numeric values only nX <- apply(!xSet.na.mat,1,sum); nY <- apply(!ySet.na.mat,1,sum); # Calculate degrees of freedom ndf <- nX + nY - 2; s2 <- ( (nX-1)*apply(xSet,1,var,na.rm=T) + (nY-1)*apply(ySet,1,var,na.rm=T) ) / ndf; t.stat <- ( apply(xSet,1,mean,na.rm=T) - apply(ySet,1,mean,na.rm=T) ) / sqrt(s2*(1/nX+1/nY)); p.val.2.sided <- 2 * (1-pt(abs(t.stat), ndf)); return(p.value=p.val.2.sided); } # end of mae.maXYpvalue ## # mae.colorBased3dPlotting() - 3d scatter Plot that will generate coding # for groups. # Parameters: # groups - Vector of unique groups # group - Matrix of the group for the rows of data # data - The data matrix # Developer: Shyam ( shyamss@hotmail.com) # Date: 05/29/2003 ## mae.colorBased3dPlotting <- function(groups,group,data, xindex=1,yindex=2,zindex=3, xlabel="PC1", ylabel="PC2", zlabel="PC3",mainTitle, clr=cbind("Yellow","Red","Blue", "Green","Magenta", "Pink","Cyan","Black")) { # mae.colorBased3dPlotting if(!require(scatterplot3d)) { print("library scatterplot3d not installed, install it from CRAN") return; } groups <-as.matrix(groups) group <- as.matrix(group) data <- as.matrix(data) clrLoop <- 0 maxValue <- max(abs(data)) range <- c(-maxValue,maxValue) loop <- 1 for(x in groups) { # x loop clrLoop <- clrLoop+1 if (clrLoop > 8) clrLoop <- 1 boolX <- as.matrix(group[,1]==x) pts <- as.matrix(data[boolX,]) if (loop==1) s3d <- scatterplot3d(pts[,xindex],pts[,yindex],pts[,zindex], color=clr[1,clrLoop],xlab=xlabel,ylab=ylabel, zlab=zlabel,main=mainTitle,pch=20,type="p", box=F,grid=T,xlim= range,ylim=range,zlim=range) else s3d$points3d(pts[,xindex],pts[,yindex],col=clr[1,clrLoop], pch=20,type="p") loop <- loop + 1 # next group } # x loop } # mae.colorBased3dPlotting ## # mae.ColorBasedMatrixPlotting() - plot to generate coding for groups with color. # Parameters: # uniqGroups - Vector of unique groups # group - Matrix of the group for the rows of x # data - The data matrix # Shyam(shyamss@hotmail.com) # Date : 07/18/2002 ## mae.colorBasedMatrixPlotting <- function(uniqGroups ,group, data, xindex=1, yindex=2, gbgcolor="black", xlabel="PC1", ylabel="PC2", mainTitle, xrange=range(data),yrange=range(data)) { # mae.colorBasedMatrixPlotting uniqGroups <- as.matrix(uniqGroups ) group <- as.matrix(group) data <- as.matrix(data) loop <- 1 fgcolor <-"black" if(gbgcolor == "white") colors <- cbind("yellow","red","blue","green","magenta","pink", "cyan","black") else colors <- cbind("yellow","red","blue","green","magenta","pink", "cyan","white") clr <- 0 for( x in uniqGroups ) { # x_loop clr <- clr+1 if (clr > 8) clr <- 1 boolX <- as.matrix(group[,1]==x) pts <- matrix(data[boolX,], nc=ncol(data)) colors[clr] if (loop==1) { # main plot 1st time through the loop plot(xrange,yrange,col="transparent",xlim=xrange,ylim=yrange,xlab=xlabel, ylab=ylabel,main=mainTitle,pch=20,col.lab=fgcolor,col.axis=fgcolor, col.main=fgcolor) if (gbgcolor == "black") { rect(xrange[1]-(.039*(xrange[2]-xrange[1])), yrange[1]-(.039*(yrange[2]-yrange[1])), xrange[2]+(.039*(xrange[2]-xrange[1])), yrange[2]+(.039*(yrange[2]-yrange[1])), col=gbgcolor, border=gbgcolor) } } points(pts[,xindex],pts[,yindex], col=colors[clr],pch=20 ) loop <- loop + 1 # next group } # x_loop } # end of mae.colorBasedMatrixPlotting ## # mae.writeEigenData() - Write the three Eigen data files from # simplePCAanalysis() computed prcomp() PCA data frame. # Args: # sPCA - the simplePCAanalysis() computed PCA data frame # eigenValFile - name of eigen values data file # eigenVecsFile - name of eigen vectors (1:6) data file # points3Dfile - name of 3D coordinate points for plotting data file # Developer: Shyam(shyamss@hotmail.com). Edited by P. Lemkin # Date : 07/18/2002 ## mae.writeEigenData <- function(sPCA, eigenValFile="EigenValues.out", eigenVecsFile="EigenVectors.out", points3Dfile="Points3D.out") { # mae.writeEigenData # [1] Output the PCA results eigenVals <- sPCA$sdev^2 write.table(file=eigenValFile, eigenVals, sep="\t", eol="\n", row.names=FALSE, col.names=FALSE) eigenVec <- sPCA$rotation[,1:6] write.table(file=eigenVecsFile, eigenVec, sep="\t", eol="\n", row.names=FALSE, col.names=FALSE) # [2] Output the 3D data for the points. points3D <- cbind(sPCA$x[,1],sPCA$x[,2],sPCA$x[,3]) write.table(file=points3Dfile, points3D, sep="\t", eol="\n", row.names=FALSE, col.names=FALSE) } # end of mae.writeEigenData ## # mae.simplePCAanalysis() - compute PCA on data and generate plots in a PDF file. # For using the color based plotting function, the (uniqGroups, group) # are required. # It creates one PDF file with 7 plots: # Variance X Components, # three plots: (PC1 vs PC2, PC2 vs PC3, PC1 vs PC3) for "arrays" or "genes", # three sorted Loadings plots for PC1, PC2 and PC3. # Args: # data - the horizontally stacked array data (genes X array samples) # plotFile - optional name of the PDF file, else use "simplePCA". # mode - defines how the data is to be analyzed for computing PCA: # "arrays" - analyzed by gene components (Gene Principal Components) # "genes" - analyzed by array components (Array Principal Components) # scale.data - apply scaling in prcomp, else do not scale # uniqGroups - the unique condition names. # group - the condition names/sample. This vector size = number samples # allowableEigenVectors - the number of allowable EigenVectors (default 6) # Return: pca, eigenVals, eigenVec, points3D, loadings # Developer: Shyam(shyamss@hotmail.com). Edited by P. Lemkin # Date : 07/18/2002 # # Notes: # 1. This routine does the Principal components analysis(mva) for data either # [Genes(rows), Arrays(columns)] or vice versa. This makes use of the PCA # algorithm in the "mva" package of "R". We use the prcomp as the preferred # as the performance is better than the princomp method. The eigen vectors of # of the first two components may have a "-1" difference. This does not # affect the analysis, it is just the coordinate that will have the shift # amongst the two methods. # 2. Generation of the detailed information about the PCA # including an analysis file with more details, pdf file with the plots # File that will contain the 3D data points that can be used by a 3D # viewer program. # 3. Experimenting with the plotting of the coefficients of the variables # across the components. # 4. Also added to the code is the sorting of returned eigen vector(loading) # values, so that for the first three components we know the effect of the # original variable. # 5. When the matrix(rows,columns) based on the mode ( genes/arrays) has the # the condition of the NumberOfRows < NumberOfColumns, the Eigen values # upto NumberOfRows will be non-zero values and the remainder will have # zero/singular values. Please read the following reference for more # information. Numerical recipes(SVD): # http://www.library.cornell.edu/nr/bookcpdf/c2-6.pdf # 6. Noticed that the number of printable eigen vectors were restricted to the # first 6 but in cases when we have less than 6 we need to handle likewise. # 7. This code is also used in the mAdb system (http://nciarray.nci.nih.gov/) # from which it has been adapted. ## mae.simplePCAanalysis <- function(data, plotFile="simplePCA", mode="arrays", scale.data= TRUE, uniqGroups, group, allowableEigenVectors=6 ) { # mae.simplePCAanalysis if(!require(mva)) return("library mva not found") # [1] Computing the PCA if (mode == "arrays") { # for arrays, compute PCA on the transpose d1pca <- prcomp(t(data),scale=scale.data) } else { #for genes, compute the PCA d1pca <- prcomp(data,scale=scale.data) } # [2] Compute the number of printable Eigen Vectors # We limit it here to 6 which can of course be set to higher # values but the first few ( 3 or so ) should account for # maximum variance. allowableEigenVectors <- 6 curEigenVectors <- ncol(d1pca$rotation) if (curEigenVectors < allowableEigenVectors) nEigenVecs <- min(curEigenVectors,allowableEigenVectors) else nEigenVecs <-allowableEigenVectors # [3] Create simple vectors of the PCA results eigenVals <- d1pca$sdev^2 eigenVec <- d1pca$rotation[,1:6] # Create 3D data for the points. points3D <- cbind(d1pca$x[,1],d1pca$x[,2],d1pca$x[,3]) # [4] Sorting the loadings L1 <- data.frame(ID=c(1:nrow(d1pca$rotation)), VAL=d1pca$rotation[,1]) L2 <- data.frame(ID=c(1:nrow(d1pca$rotation)), VAL=d1pca$rotation[,2]) L3 <- data.frame(ID=c(1:nrow(d1pca$rotation)), VAL=d1pca$rotation[,3]) L1s <- L1[sort.list(L1$VAL),] L2s <- L2[sort.list(L2$VAL),] L3s <- L3[sort.list(L3$VAL),] loadings <- list(L1s, L2s, L3s) # for return list # [5] Put all analysis plots in one PDF file as 2x2 panels/page. # 1. Variance across principal components # 2. PC1 vs PC2 (arrays or genes) # 3. PC2 vs PC3 (arrays or genes) # 4. PC1 vs PC3 (arrays or genes) # 5. Loading: coefficient PC1 vs variable effects # 6. Loading: coefficient PC2 vs variable effects # 7. Loading: coefficient PC3 vs variable effects # pdf(plotFile,encoding="ISOLatin1",width=12,height=12) opar <- par(mfrow=c(2,2),pch=19,lty=2) maxComp1 <- max(abs(d1pca$x[,1])) xrange <- c(-maxComp1,maxComp1) yrange <- c(-maxComp1,maxComp1) # [5.1] Plot Variance across Principal Components for arrays or genes if (mode == "arrays") { # process arrays data plot(d1pca,main="Variance across Principal Components by Arrays") # Group Color based plots mae.colorBasedMatrixPlotting(uniqGroups,group,d1pca$x,xindex=1,yindex=2, gbgcolor="white", mainTitle="Arrays: PC1 Vs PC2", xlabel="PC1",ylabel="PC2", xrange=xrange,yrange=yrange) mae.colorBasedMatrixPlotting(uniqGroups,group,d1pca$x,xindex=2,yindex=3, gbgcolor="white", mainTitle="Arrays: PC2 Vs PC3", xlabel="PC2",ylabel="PC3", xrange=xrange,yrange=yrange) mae.colorBasedMatrixPlotting(uniqGroups,group,d1pca$x,xindex=1,yindex=3, gbgcolor="white", mainTitle="Arrays: PC1 Vs PC3", xlabel="PC1",ylabel="PC3", xrange=xrange,yrange=yrange) } # process arrays data else { # process genes data plot(d1pca,main="Variance across Principal Components by Genes") plot(d1pca$x[,1],d1pca$x[,2],xlim=xrange,ylim=yrange, main="Genes: PC1 Vs PC2",xlab="PC1",ylab="PC2",pch=20) plot(d1pca$x[,2],d1pca$x[,3],xlim=xrange,ylim=yrange, main="Genes: PC2 Vs PC3",xlab="PC2",ylab="PC3",pch=20) plot(d1pca$x[,1],d1pca$x[,3],xlim=xrange,ylim=yrange, main="Genes: PC1 Vs PC3",xlab="PC1",ylab="PC3",pch=20) } # process genes data # [5.2] Plot Sorted loadings # Eigen Vectors(contained in the columns of the loadings) if (mode == "arrays") mainLTitle= "Sorted genes coefficient PC1 loading" else mainLTitle= "Sorted arrays coefficient PC1 loading" plot(L1s$VAL[nrow(L1s):1],xlab="Variable Effect",ylab="Coefficient PC1", main=mainLTitle,type="l") text(L1s$VAL[nrow(L1s):1],labels=paste(L1s$ID),col=4) if (mode == "arrays") mainLTitle= "Sorted genes coefficient PC2 loading" else mainLTitle= "Sorted arrays coefficient PC2 loading" plot(L2s$VAL[nrow(L2s):1],xlab="Variable Effect",ylab="Coefficient PC2", main=mainLTitle,type="l") text(L2s$VAL[nrow(L2s):1],labels=paste(L2s$ID),col=4) if (mode == "arrays") mainLTitle= "Sorted genes coefficient PC3 loading" else mainLTitle= "Sorted arrays coefficient PC3 loading" plot(L3s$VAL[nrow(L1s):1],xlab="Variable Effect",ylab="Coefficient PC3", main=mainLTitle,type="l") text(L3s$VAL[nrow(L3s):1],labels=paste(L3s$ID),col=4) dev.off(); return(pca=d1pca, eigenVals=eigenVals, eigenVec=eigenVec, nEigenVecs=nEigenVecs, points3D=points3D, loadings=loadings) } # end of mae.simplePCAanalysis ## # mae.loessPlot() - generate loess line and fitted plots for # for two samples if doPlot is true. The plots include: # M vs A (loess line) # M vs A (loess fitted) plots # Parameters: # x and y single sample matrices or data frames # xName and yName are sample names # doPlot = TRUE, if do a plot, else just compute the loess data # Return: # AValue is c(log10(as.matrix(x)) + log10(as.matrix(y)))/2 # MValue is c(log2(as.matrix(x)/as.matrix(y))) # dMvsA is data.frame(ID=(cbind(x,y)[,1]),AVAL=AValue, MVAL=MValue) # dMvsA.lo is loess fit of (MValue ~ AValue) # fitted is (MValue-dMvsA.lo$fitted) # sNames is c(xName,yName) # # Developer: S. Sundaram, P. Lemkin, G. Thornwal # Date: 06/30/2003 ## mae.loessPlot <- function(x,y,xName,yName,doPlot=TRUE) { # mae.loessPlot AValue <- c(log10(as.matrix(x)) + log10(as.matrix(y)))/2 MValue <- c(log2(as.matrix(x)/as.matrix(y))) dMvsA <- data.frame(ID=(cbind(x,y)[,1]), AVAL=AValue, MVAL=MValue) dMvsA.lo <- loess(MVAL ~ AVAL, dMvsA, degree=1, span=.5) samplesTitle <- paste(xName,"vs.",yName) # mTitleL <- paste("M vs A (loess line for",samplesTitle,")") if(doPlot) plot(AValue,MValue,main=mTitleL,xlab="A(log10)",ylab="M(log2)") points(AValue,dMvsA.lo$fitted, col="red", pch=18); mTitleF <- paste("M vs. A (loess fitted for",samplesTitle,")") fitted <- MValue-dMvsA.lo$fitted if(doPlot) plot(AValue,fitted,main=mTitleF,xlab="A(log10)", ylab="M(log2)") sNames=c(xName,yName) return(AValue=AValue, MValue=MValue, dMvsA=dMvsA, dMvsA.lo=dMvsA.lo, fitted=fitted, sNames=sNames) } # end of mae.loessPlot # --- end of MAERlibr --- #