#------------------------------------------------------------------------------- # # P-values correction for gene set analysis (SnpEff gsa) # # # Pablo Cingolani 2013 #------------------------------------------------------------------------------- #------------------------------------------------------------------------------- # Main #------------------------------------------------------------------------------- show = TRUE # Used for debugging show = FALSE # Used for production pvalCoefThreshold = 10^-12 # Adjust only if linearmodel p-value is less than this #--- # Parse command line arguments #--- if( ! exists('fileName') ) { args <- commandArgs(trailingOnly = TRUE) fileName = args[1] outFileName = args[2] minScoreCount = args[3] } #--- # Read data #--- d = read.table(fileName, sep="\t", header=TRUE) # p-values or scores. # Note: the conversion 'as.numeric(as.character())' is necesary # because R reads '0.6049666666666666' as a number, # but reads '0.60496666666666666' as a string (yes, R is amazing!) p = as.numeric(as.character(d$score)) c = d$scoreCount # number of variants # Filter out p-values of zero OR counts of zero (because of log) #keep = (p > 0) & (c > 0) keep = (p > 0) & (p < 1 ) & (c > 0) & (c > minScoreCount) p = p[keep] lp = qnorm(p) # Convert to z-scores using an inverse normal distribution c = c[keep] lc = log10(c) #--- # Linear model #--- lmfit = lm( lp ~ lc ) sumLmfit = summary(lmfit) pvalCoef = sumLmfit$coefficients[2,4] res = lmfit$residuals # Residuals padj = p # Keep some un-adjusted values padj[keep] = pnorm(res) # Adjusted p-values if( show ) { print(sumLmfit) cat('Slope:\t', sumLmfit$coefficients[2], '\tp-value:\t', pvalCoef, 'File:\t', fileName, '\n'); par(mfrow=c(2,2)) smoothScatter( lc, lp, main="Z-Scores", xlab="Number of scores", ylab="Z-Score" ) lines(lowess(lc, lp ), col='orange') smoothScatter( lc, res, main="Adjusted Z-Scores", xlab="Number of scores", ylab="Z-Score" ) lines(lowess(lc, res ), col='orange') plot( density(lp) , main="Z-Scores distribution", xlab="red:Adjusted black:Unadjusted") lines( density(res) , col='red') plot( density(lc) , main="log10(Number of scores)", xlab="") } #--- # Decide whether to use corrected scores or not. # Not very significant? Use original scores #--- if( pvalCoef > pvalCoefThreshold ) { padj = d$score } #--- # Create output file #--- dout = data.frame( geneId = d$geneId, score = padj ) write.table( dout, file=outFileName, quote=FALSE, row.names = FALSE, col.names = FALSE, sep="\t")