#-------------------------------------------------------------------------------
#
# Bayes-factor correction for gene set analysis (SnpEff gsa)
#
#
#															Pablo Cingolani 2013
#-------------------------------------------------------------------------------


#-------------------------------------------------------------------------------
# Main
#-------------------------------------------------------------------------------

show = TRUE		# Used for debugging
show = FALSE	# Used for production

pvalCoefThreshold = 10^-12	# Adjust only if linearmodel p-value is less than this

#---
# Parse command line arguments
#---
if( ! exists('fileName') ) {
    args <- commandArgs(trailingOnly = TRUE)
    fileName = args[1]
    outFileName = args[2]
    minScoreCount = args[3]
}

#---
# Read data
#---
d = read.table(fileName, sep="\t", header=TRUE)

# p-values or scores. 
# Note: the conversion 'as.numeric(as.character())' is necesary 
# because R reads '0.6049666666666666' as a number, 
# but reads       '0.60496666666666666' as a string (yes, R is amazing!)
p = as.numeric(as.character(d$score))	
c = d$scoreCount						# number of variants 

keep = (c > 0)

p = p[keep]
mu = mean(p)
sigma = sd(p)
lp = as.vector(scale(p[keep]))		# Scale to mean=0 and var=1 => Convert to z-scores using an inverse normal distribution

c = c[keep]
lc = log10(c)

#---
# Linear model
#---
lmfit = lm( lp ~ lc )
sumLmfit =  summary(lmfit)
pvalCoef = sumLmfit$coefficients[2,4]

res = lmfit$residuals		# Residuals
padj = sigma * res + mu	# Adjusted bayesian factor

# Don't adjust under 'minScoreCount'
padj[ c <= minScoreCount ] = p[ c <= minScoreCount ]

if( show ) {
	print(sumLmfit)
	cat('Slope:\t', sumLmfit$coefficients[2], '\tp-value:\t', pvalCoef, 'File:\t', fileName, '\n');

	par(mfrow=c(2,2))
	smoothScatter( lc, p, main="Z-Scores", xlab="Number of scores", ylab="Z-Score" )
	lines(lowess(lc, p ), col='orange')

	smoothScatter( lc, padj, main="Adjusted Z-Scores", xlab="Number of scores", ylab="Z-Score" )
	lines(lowess(lc, padj ), col='orange')

	plot( density(p) , main="Bayes-Factors distribution", xlab="red:Adjusted black:Unadjusted")
	lines( density(padj) , col='red')

	plot( density(lc) , main="log10(Number of scores)", xlab="")
}

#---
# Decide whether to use corrected scores or not
#---
if( pvalCoef < pvalCoefThreshold ) { 
	so = padj		# Significant? Then use correction
	so[ c <= minScoreCount ] = p[ c <= minScoreCount ]
} else {
	so = d$score	# Not very significant? Use original scores
}

#---
# Create output file
#---
dout = data.frame( geneId = d$geneId[keep], score = so )
write.table( dout, file=outFileName, quote=FALSE, row.names = FALSE, col.names = FALSE, sep="\t")