#!/usr/bin/python2
# 

"""Module Description

Copyright (c) 2009 H. Gene Shin <shin@jimmy.harvard.edu>

This code is free software; you can redistribute it and/or modify it
under the terms of the BSD License (see the file COPYING included with
the distribution).

site.py gives an average enrichment profile of given regions of interest (eg, binding sites or motifs).

@status:  experimental
@version: $Revision$
@author:  H. Gene Shin
@contact: shin@jimmy.harvard.edu
"""

# ------------------------------------
# python modules
# ------------------------------------
import os
import sys
import time
import subprocess
import string
import logging
import re
import itertools
from optparse import OptionParser

# -----------------------------------
# my modules
# -----------------------------------
import CEAS.inout as inout
import CEAS.corelib as corelib
import CEAS.R as R

# ------------------------------------
# constants
# ------------------------------------
logging.basicConfig(level=20,
                    format='%(levelname)-5s @ %(asctime)s: %(message)s ',
                    datefmt='%a, %d %b %Y %H:%M:%S',
                    stream=sys.stderr,
                    filemode="w"
                    )

# ------------------------------------
# Misc functions
# ------------------------------------
error   = logging.critical		# function alias
warn    = logging.warning
debug   = logging.debug
info    = logging.info

# ------------------------------------
# classes
# ------------------------------------
class WigProfilerwBed:
    """WigProfiler for regions in Bed"""
    
    def __init__(self, span=1000, step=20, dir=False):
        """Constructor"""
        
        # parameters
        self.span = span
        self.step = step
        self.dir = dir
        
        # output values
        self.names = []
        self.profiles = []
        self.xaxis = []
        
        
    def set_params(self, span, step, dir):
        """Set parameters
        
        Parameters:
        1. wig: an Wig object in Cistrome.Assoc.inout
        2. bed: a Bed object in Cistrome.Assoc.inout
        3. span: span size from the center of each region
        4. dir: if set True, the start and end of '-' will be regarded as the end and start.
        """

        self.span=span
        self.step=step
        self.dir=dir
        
        
    def capture_regions(self, wig, bed):
        """Capture the regions that will be profiled"""

        wigint = self.estimate_wig_interval(wig)    # estimate the wig interval for where2 
        step = self.step        
        hf = 1.0*step/2
        binned = []
        chroms = list(set(wig.get_chroms()).intersection(set(bed.get_chroms())))
        chroms = corelib.sort_chroms(chroms)
        
        for chrom in chroms:
            x=wig[chrom][0]
            y=wig[chrom][1]
            
            # if the direction is considered, include strand info from the BED
            if self.dir:
                try:
                    bediter=itertools.izip(bed[chrom]['start'],bed[chrom]['end'], bed[chrom]['strand'])
                except KeyError:
                    dummy=['' for i in xrange(len(bed[chrom]['start']))]
                    bediter=itertools.izip(bed[chrom]['start'],bed[chrom]['end'], dummy)
            else:
                dummy=['' for i in xrange(len(bed[chrom]['start']))]
                bediter=itertools.izip(bed[chrom]['start'],bed[chrom]['end'], dummy)
            
            # to reduce the search time, the initial search point is updated everytime. 
            #init = 0  
            for begin, cease, strand in bediter:
                # get the center, right edge and left edge for search
                center = (begin+cease) /2
                left = center -self.span - hf
                right = center + self.span + hf
                n = int(right-left)/step
                
                # get the region from the wig, binning
                #start, end= corelib.where2(left, right, x[init:], wigint)
                #start, end = corelib.where(left, right, x[init:])
                #regionx = x[init+start:init+end]
                #regiony = y[init+start:init+end]
                # test if updating the initial search point everytime might have caused a problem
                start, end = corelib.where(left, right, x)
                regionx = x[start:end]
                regiony = y[start:end]
                
                if regionx and regiony:
                    bins = corelib.linspace(left, right, n+1)
                    this = corelib.binxy_equibin(bins, regionx, regiony, binfunc='middle', NaN=False)
                    if strand == '-': this.reverse()
                else:
                    #this = [0] * n
                    this = []
                    
                # save the binned signal
                binned.append(this)
                    
                # update initial search point   
                #init+=start
                
        return binned
    
    
    def estimate_wig_interval(self, wig):
        """Estimate the interval between two consecutive points. 
        This method is exactly the same as estimate_wig_interval function in inout.py.
        
        This methods select randomly 10 regions in each chromosome and take the median of two consecutive intervals.
        """   
        
        chroms = wig.get_chroms()
        if not chroms: return None
    
        n_random_positions = 10
        intervals = []
        for chrom in chroms:
            len_this_chr = len(wig[chrom][0])
            a = corelib.randints(0, len_this_chr - 2, 2 * n_random_positions)   # we need at least two element array to get difference
            a.sort()
            starts = [a[i] for i in xrange(len(a)) if i%2 == 0]
            ends = [a[i] + 2 for i in xrange(len(a)) if i%2 == 1]# we need at least two element array to get difference

            for start, end in itertools.izip(starts, ends):
                intervals.append(corelib.median(corelib.diff(wig[chrom][0][start:end])))
             
        return corelib.median(intervals)
        
        
    def get_breaks(self, start, end):
        """Return breaks for bins
        
        Parameters:
        1. start: the start value
        2. end: the end value. This end value is included in the resulting breaks.
        
        """
        step = self.step
        n = (end-start) / step + 1
        breaks = map(lambda x: int(round(x)), corelib.linspace(start, end, n))
        
        return breaks
       
   
    def profile(self, wig, bed):
        """Wrapper function of WigProfilewBed. 
        Through this function, the user can set the parameters for profiling and get 
        a list of profiles of the regions.
        
        """
        
        start = -1 * self.span
        end = self.span
        return self.capture_regions(wig, bed)
   
   
# ------------------------------------
# functions
# ------------------------------------
def prepare_optparser ():
    """Prepare optparser object. New options will be added in this
    function first.
    
    """
    usage = "usage: %prog <-w wig -b bed> [options]"
    description = "sitepro -- Average profile around given genomic sites"
    
    # option processor
    optparser = OptionParser(version="%prog 0.6.6 (package version 1.0.2)",description=description,usage=usage,add_help_option=False)
    optparser.add_option("-h","--help",action="help",help="Show this help message and exit.")
    optparser.add_option("-w","--wig",dest="wig",type="string", action="append",\
                         help="input WIG file. WARNING: both fixedStep and variableStep WIG formats are accepted. Multiple WIG files can be given via -w (--wig) individually (eg -w WIG1.wig, -w WIG2.wig). WARNING! multiple wig and bed files are not allowed.")
    optparser.add_option("-b","--bed",dest="bed",type="string", action="append",\
                         help="BED file of regions of interest. (eg, binding sites or motif locations) Multiple BED files can be given via -b (--bed) individually (eg -b BED1.bed -b BED2.bed). WARNING! multiple wig and bed files are not allowed.")
    optparser.add_option("--span",dest="span",type="int",\
                         help="Span from the center of each BED region in both directions(+/-) (eg, [c - span, c + span], where c is the center of a region), default:1000 bp", default=1000)   
    optparser.add_option("--pf-res", dest="pf_res", type="int",\
                          help="Profiling resolution, default: 50 bp", default=50) 
    optparser.add_option("--dir",action="store_true",dest="dir",\
                         help="If set, the direction (+/-) is considered in profiling. If no strand info given in the BED, this option is ignored.",default=False)
    optparser.add_option("--dump",action="store_true",dest="dump",\
                         help="If set, profiles are dumped as a TXT file",default=False)
    optparser.add_option("--name",dest="name",type="string",
                         help="Name of this run. If not given, the body of the bed file name will be used,")
    optparser.add_option("-l","--label",dest="label",type="string", action="append",\
                         help="Labels of the wig files. If given, they are used as the legends of the plot and in naming the TXT files of profile dumps; otherwise, the WIG file names will be used as the labels. Multiple labels can be given via -l (--label) individually (eg, -l LABEL1 -l LABEL2). WARNING! The number and order of the labels must be the same as the WIG files.", default=None)
    #optparser.add_option("--log",action="store_true",dest="log",\
    #                     help="If set, a log file is recorded in the current working directory.",default=False) 
    return optparser


def opt_validate (optparser):
    """Validate options from a OptParser object.

    Ret: Validated options object.
    """
    (options,args) = optparser.parse_args()
    
    # input BED file and GDB must be given 
    if not (options.wig and options.bed):
        optparser.print_help()
        sys.exit(1)
    else:
        if len(options.wig) > 1 and len(options.bed) > 1:
            error("Either a single BED file and multiple WIG files or multiple BED files and a single WIG file are allowed.")
            sys.exit(1)

    # split the wig file names
    
    if options.wig:
        for wig in options.wig:
            if not os.path.isfile(wig):
                error("Check -w (--wig). No such file as '%s'" %wig)
                sys.exit(1)
    
    if options.bed:
        for bed in options.bed:
            if not os.path.isfile(bed):
                error('Check -b (--bed). No such file exists:%s' %bed)
                sys.exit(1)
            
     # get namename
    if not options.name:
        #options.name=os.path.split(options.bed)[-1].rsplit('.bed',2)[0]
        options.name="sitepro_%s" %(time.strftime("%Y.%b.%d.%H-%M-%S", time.localtime()))
        
    # get the aliases
    if len(options.wig) > 1:
        if options.label:
            if len(options.label) != len(options.wig):
                error("The number and order of the labels must be the same as the WIG files. Check -w and -l options.")
                sys.exit(1)
        else:
            options.label = map(lambda x: os.path.split(x)[1].rsplit('.wig')[0], options.wig)    
    elif len(options.bed) > 1:
        if options.label:
            if len(options.label) != len(options.bed):
                error("The number and order of the labels must be the same as the BED files. Check -b and -l options.")
                sys.exit(1)
        else:
            options.label = map(lambda x: os.path.split(x)[1].rsplit('.bed')[0], options.bed)
    else:  # when given very standard inputs, one bed and one wig
        if options.label:
            if len(options.label) != 1:
                error("Only one label must be given with one BED and one WIG. Check -l option.")
                sys.exit(1)
        else:
            options.label = [os.path.split(options.wig[0])[1].rsplit('.wig')[0]]
            
    # print arguments 
    options.argtxt = "# ARGUMENTS:\n"
    options.argtxt += "\n".join(("# name: %s" %options.name, "# BED file(s): %s" %str(options.bed)[1:-1], "# WIG file(s): %s" %str(options.wig)[1:-1], "# span: %s bp" %str(options.span), "# resolution: %s bp" %str(options.pf_res), "# direction (+/-): %s" %("ON"*options.dir + "OFF"*(not options.dir))))
    
    # if logfile is written.
    #if options.log:
    #    logf = open(options.name + ".log", 'w')
    #    logf.write(options.argtxt + "\n")
    #    logf.close()
        
    return options


def draw_siteprofiles(sitebreaks, avg_siteprof):
    """Return a R script that draws the average profile on the given sites"""
    
    #comment
    R.comment('')
    R.comment('Draw wig profiles around binding sites')
    R.comment('')
    
    if avg_siteprof == None:
        trim_siteprof = [0]*len(sitebreaks)
    else:
        trim_siteprof = avg_siteprof
    minlen = min([len(sitebreaks), len(trim_siteprof)])
    rscript=inout.draw_single_profile(sitebreaks[:minlen], trim_siteprof[:minlen],col=["red"],main='Average Profile around the Center of Sites',xlab='Relative Distance from the Center (bp)',ylab='Average Profile',ylim=[],v=0)
    
    return rscript
    
    
def draw_multiple_siteprofiles(sitebreaks, avg_siteprofs, legends):
    """Return a R script that draws multiple average profiles on the given sites"""
    
    #comment
    R.comment('')
    R.comment('Draw multiple wig profiles around binding sites')
    R.comment('')
        
    # handle if None is in the avg_siteprofs
    trim_siteprofs = []
    for siteprof in avg_siteprofs:
        if siteprof == None:
            temp = [0]*len(sitebreaks)
        else:
            temp = siteprof
        trim_siteprofs.append(temp)
    minlen = min([len(sitebreaks), min(map(len, trim_siteprofs))])
    trim_siteprofs = map(lambda x: x[:minlen], trim_siteprofs)
    rscript=inout.draw_multiple_profiles2(sitebreaks[:minlen], trim_siteprofs, cols=[], main='Average Profiles around the Center of Sites',xlab='Relative Distance from the Center (bp)',ylab='Average Profile',ylim=[],v=0, legends=legends)
    
    return rscript
    

def dump(chrom, sites, siteprofs):
    """Dump the sites and their profiles in a long string
    
    """
    
    starts = sites['start']
    ends = sites['end']
    
    # if no names are allowed, put no names for the output
    try:
        names = sites['name']
    except KeyError:
        names = [''] * len(starts)

    txt = ''
    for start, end, name, siteprof in itertools.izip(starts, ends, names, siteprofs):
        s = map(str, siteprof)
        # print out the regions
        if name == '':
            txt += "%s\t%d\t%d\t%s\n" %(chrom, start, end, ','.join(s)+',')
        else:
            txt += "%s\t%d\t%d\t\t%s\t%s\n" %(chrom, start, end, name, ','.join(s)+',')

    return txt


def catBEDs(BEDlist):
    """mergeBEDs merges multiple BED files in BEDlist into a one BED object. 
    In this case, a unique ID is given to the regions of each BED object to be merged
    """
    
    cBED = inout.Bed()
    cBED.bed = {}
    IDs = xrange(len(BEDlist))
    for BED, ID in itertools.izip(BEDlist, IDs) :
        chroms = BED.get_chroms()
        for chrom in chroms:
        
            # if the first bed to this chromosome, initialize
            if not cBED.has_key(chrom):
               cBED[chrom] = {'start': array('l', []), 'end': array('l', []), 'name': [], \
               'score': array('d', []), 'strand':[]}
            
            # chromosome length
            chrom_len = len(BED[chrom]['start'])
            
            # copy into the merged BED object
            cBED[chrom]['start'].extend(BED[chrom]['start'])
            cBED[chrom]['end'].extend(BED[chrom]['end'])
            cBED[chrom]['name'].extend([str(ID)] * chrom_len)
            cBED[chrom]['score'].extend([0] * chrom_len)
            try:
            	cBED[chrom]['strand'].extend(BED[chrom]['strand'])
            except KeyError:
                cBED[chrom]['strand'].extend([''] * chrom_len)
    
    return cBED
            
# ------------------------------------
# Main function
# ------------------------------------
def main():
    
    # read the options and validate them
    options=opt_validate(prepare_optparser())
    
    info ("\n" + options.argtxt)
    
    # reading a gene annotation table   
    jobcount=1
    
    # read regions of interest (bed file)
    info("#%d read the bed file(s) of regions of interest..." %jobcount)
        
    Siteslist = []
    for bed in options.bed:
        BED = inout.Bed()
        BED.read(bed)
        BED.sort()
        Siteslist.append(BED)
    
    # cat the BEd files to one and give a unique ID to the regions of each BED
    #Sites = catBEDs(Siteslist)
    
    # delete the individual BEDs
    #del Siteslist
    
    # sort for profiling
    #Sites.sort()
    jobcount += 1
    
    # create a profiler object
    profwbed=WigProfilerwBed(span=options.span, step=options.pf_res, dir=options.dir)
    # get the breaks
    sitebreaks = profwbed.get_breaks(-1*options.span, options.span)
       
    # initialize avg_siteprofs with empty list
    super_avg_siteprofs = []
    # run multiple wig files
    wigl = 0
    for wig in options.wig:
        # Doing profiling
        chrom=''
        chrcount=1
        FIRST=True
        avg_siteprofs = map(lambda x: [], Siteslist)
        avg_spcounts = map(lambda x: [], Siteslist)
        fixedStep = False
        
        info( "#%d run sitepro on %s..." %(jobcount, os.path.split(wig)[-1]))
            
        for line in open(wig,'r').xreadlines():
            if not line: continue
            # read a chromosome
            if re.search(r'track',line): 
                try:
                    description=re.search(r'description="(\w+)"\s',line).group(1)
                except AttributeError:
                    pass
                continue
            
            # check if fixedStep or variableStep
            if re.search(r'fixedStep', line):
                fixedStep = True
                step = int(re.search(r'step=(\S+)\s', line).group(1))
                position = int(re.search(r'start=(\S+)\s', line).group(1))
            
            if re.search(r'chrom=(\S+)\s',line):
                newchrom=re.search(r'chrom=(\S+)\s',line).group(1)
                try:
                    newchrom=inout.standard_chroms[newchrom]
                except KeyError:
                    pass
                continue
        	
            l=line.strip().split()

            # the beginning
            if chrom=='' and chrom!=newchrom:
                chrom=newchrom
            
                # if this chromosome is not in chromosome list of the BED, just ignore it
                info("#%d-%d read and process %s..." %(jobcount, chrcount, chrom))                        
                input=inout.Wig()
                
                # if fixedStep, calculate the position from start and step
                if fixedStep:
                    row = [position, l[-1]]
                    position += step
                else:
                   	row = l
                   	
                # add the new line to the Wig object
                input.add_line(chrom, row)
                chrcount += 1

            elif chrom!='' and chrom!=newchrom:    # new chromosome
            
                #for Sites, avg_siteprof, avg_spcount in itertools.izip(Siteslist, avg_siteprofs, avg_spcounts):
                for i in xrange(len(Siteslist)): 
                    chroms_bed = Siteslist[i].get_chroms()
                    if chrom in chroms_bed:    
                        # wig profiling for given regions of interest
                        siteprofs=profwbed.profile(input, Siteslist[i])
                        avg_sp, spcount=corelib.mean_col_by_col(siteprofs, counts=True)
                                        
                        if not FIRST:    # if not first chromosome    
                            # average site profiles
                            avg_siteprofs[i],avg_spcounts[i]=corelib.weight_mean_col_by_col([avg_siteprofs[i],avg_sp],[avg_spcounts[i],spcount],counts=True)
                            
                            # if --dump, dump the profiles along with their corresponding BED regions
                            if options.dump:
                                nm = options.label[wigl + i] + '_dump.txt'
                                dfhd = open(nm, 'a')
                                dfhd.write(dump(chrom, Siteslist[i][chrom], siteprofs))
                                dfhd.close()
                    
                            del avg_sp,spcount
                
                        else:   # if first chromosome
                            avg_siteprofs[i]=avg_sp
                            avg_spcounts[i]=spcount
                            
                            # open a txt file to dump profiles if --dump is set
                            if options.dump:
                    	        nm = options.label[wigl + i] + '_dump.txt'
                                dfhd = open(nm, 'w')
                                dfhd.write(dump(chrom, Siteslist[i][chrom], siteprofs))
                                dfhd.close()
                    
                        # de1lete unnucessary variables to maximize usable memory
                        del siteprofs   
                 
                # set chrom to the new chromosome
                if FIRST: FIRST = False
                chrom=newchrom
                info("#%d-%d read and process %s..." %(jobcount, chrcount, chrom))
                input=inout.Wig()
                	
                # if fixedStep, calculate the position from start and step
                if fixedStep:
                    row = [position, l[-1]]
                    position += step
                else:
                   	row = l
                   	
                # add the new line to the Wig object
                input.add_line(chrom, row)
                chrcount+=1
            else:    # in the middle of chromosome
                # if fixedStep, calculate the position from start and step
                if fixedStep:
                    row = [position, l[-1]]
                    position += step
                else:
                   	row = l
                   	
                # add the new line to the Wig object
                input.add_line(chrom, row)
                
		# the last chromosome
        #for Sites, avg_siteprof, avg_spcount in xrange(len(Siteslist)): #itertools.izip(Siteslist, avg_siteprofs, avg_spcounts):
        for i in xrange(len(Siteslist)): #itertools.izip(Siteslist, avg_siteprofs, avg_spcounts):
            
            chroms_bed = Siteslist[i].get_chroms()
            if chrom in chroms_bed:            
                # doing profiling!

                siteprofs=profwbed.profile(input, Siteslist[i])
                avg_sp,spcount=corelib.mean_col_by_col(siteprofs,counts=True)           
                
                if not FIRST:    # the first chromosome profiling
                    avg_siteprofs[i],avg_spcounts[i] = corelib.weight_mean_col_by_col([avg_siteprofs[i],avg_sp],[avg_spcounts[i],spcount],counts=True)
                    
                    # if --dump, dump the profiles along with their corresponding BED regions
                    if options.dump:
                        nm = options.label[wigl+i] + '_dump.txt'
                        dfhd = open(nm, 'a')
                        dfhd.write(dump(chrom, Siteslist[i][chrom], siteprofs))
                        dfhd.close()
            
                    del avg_sp,spcount        
                else:
                    # average site profiles
                    avg_siteprofs[i]=avg_sp
                    avg_spcounts[i]=spcount
            
                    # if --dump, dump the profiles along with their corresponding BED regions
                    if options.dump:
            	        nm = options.label[wigl+i] + '_dump.txt'
                        dfhd = open(nm, 'w')
                        dfhd.write(dump(chrom, Siteslist[i][chrom], siteprofs))
                        dfhd.close()
            
                # delete unnecessary variables
                del siteprofs

        #if options.dump:
        #    dfhd.close()
        
        #print avg_siteprofs
        # save the site profile of current wig file
        super_avg_siteprofs.extend(avg_siteprofs)
        
    	jobcount+=1
    	wigl += 1
    
    # write the R script
    info('#%d writing R script of profiling...' %jobcount)
    ofhd=open(options.name+'.R','w')
    pdfname=options.name+'.pdf'
    rscript=R.pdf(pdfname, height=6, width=8.5)  
    
    # if single profile; otherwise multiple profiles
    if len(options.wig)==1 and len(options.bed) == 1:
        rscript += draw_siteprofiles(sitebreaks, super_avg_siteprofs[0])
    else:
        #print super_avg_siteprofs
        rscript += draw_multiple_siteprofiles(sitebreaks, super_avg_siteprofs, options.label)
    ofhd.write(rscript)    # write wig profiling
    ofhd.write(R.devoff())
    ofhd.close()
   
    jobcount+=1
     
    # Run R directly - if any exceptions, just pass
    try:
        p = subprocess.Popen("R" + " --vanilla < %s"  %(options.name+'.R'), shell=True)
        sts = os.waitpid(p.pid, 0)
        if options.dump:
            info ('#... cong! See %s for the graphical result of sitepro and %s for the dumped profiles!' %(options.name+'.pdf', options.name+'.txt'))
        else:    
            info ('#... cong! See %s for the graphical result of sitepro!' % (options.name+'.pdf'))
            
    except:       
        info ('#... cong! Run %s using R for the graphical result of sitepro! sitepro could not run R directly.' %(options.name+'.R'))
        
# program running
if __name__ == '__main__':
    try:
        main()
    except KeyboardInterrupt:
        warn("User interrupts me! ;-) See you!")
        sys.exit(0)