#!/usr/bin/env python3

################################################################
# copyright (c) 2022 by William R. Pearson and The Rector &
# Visitors of the University of Virginia */
################################################################
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under this License is distributed on an "AS
# IS" BASIS, WITHOUT WRRANTIES OR CONDITIONS OF ANY KIND, either
# express or implied.  See the License for the specific language
# governing permissions and limitations under the License. 
################################################################

# ann_pfam_www0.py takes an annotation file from fasta36 -V with a line of the form:

# sp|P0810|GSTM1_RAT [tab] seqlen
#
# and returns:
# >P08010|GSTM2_RAT
# 3	-	81	GST_N~1
# 105	-	189	GST_C~2

# This version has been re-written in python to use the EBI/InterPro API parsing json
# to get Pfam protein information:  https://www.ebi.ac.uk/interpro/api/entry/pfam/protein/uniprot/P09488
# to get Pfam domain information: https://www.ebi.ac.uk/interpro/api/entry/pfam/PF02798
#
# currently, it does not provide clan information, because the
# EBI/Interpro/Pfam API does not provide clan information
#
# to get clan (set) information:
# https://www.ebi.ac.uk/interpro/api/set/pfam/entry/pfam/PF02798
#
# which provides (among other things):
# "results": {"metadata": {
#                "accession": "CL0172",
#                "name": "Thioredoxin",
#                "source_database": "pfam"}, ... }

import fileinput
import sys
import re
import json
import argparse
import urllib.request
import urllib.error

interpro_prot_url = "https://www.ebi.ac.uk/interpro/api/entry/pfam/protein/uniprot/"
interpro_domain_url = "https://www.ebi.ac.uk/interpro/api/entry/pfam/"
interpro_clan_url = "https://www.ebi.ac.uk/interpro/api/set/pfam/entry/pfam/"

def get_pfam_id_www( acc):

    if (acc == 'NODOM'):
        return 'NODOM'

    try:
        req = urllib.request.urlopen(interpro_domain_url + acc)

    except urllib.error.URLError as e:
        prot_info = ''
        sys.stderr.write(e.read().decode('utf-8')+'\n')

    else:
        prot_info = req.read().decode('utf-8')


    json_info= json.loads(prot_info)

## json looks like:
##  'results': [
##      {'metadata': {'accession': 'PF02798', 'name': 'Glutathione S-transferase, N-terminal domain', 'source_database': 'pfam', 'type': 'domain', 'integrated': 'IPR004045', 'member_databases': None, 'go_terms': None}, 
##       'proteins': [
##           {'accession': 'p09488', 
##            'protein_length': 218,
##            'source_database': 'reviewed',
##            'organism': '9606',
##            'entry_protein_locations': [
##                {'fragments': [{'start': 5,'end': 82,'dc-status': 'CONTINUOUS'}],
##                 'model': 'PF02798',
##                 'score': 5.2e-21}]
##            }
##       ]

    pf_dom_id  = json_info['metadata']['name']['short']

    return(pf_dom_id)

def get_clan_info_www(pf_acc):

    if (pf_acc=='NODOM'):
        return {'name':'NODOM','accession':'NODOM'}

    try:
        req = urllib.request.urlopen(interpro_clan_url + pf_acc)

    except urllib.error.URLError as e:
        prot_info = ''
        sys.stderr.write(e.read().decode('utf-8')+'\n')

    else:
        clan_info = req.read().decode('utf-8')


    json_info= json.loads(clan_info)

## results look like:
## "results": [
##      {
##          "metadata": {
##              "accession": "CL0172",
##              "name": "Thioredoxin",
##              "source_database": "pfam"
##          },
## ...
##      }
## ]
##

    if ('results') in json_info:
        return(json_info['results'][0]['metadata'])
    else:
        return None

def get_seq_acc(seq_id):

    if (re.search(r'^gi\|',seq_id)):
        (tmp, gi, sdb, acc, id) = seq_id.split('|')

    elif (re.search(r'^(sp|tr|up)\|', seq_id)):
        (sdb, acc, id) = seq_id.split('|')
    else:
      acc = re.split(r'\s',seq_id)[0]

    acc = re.sub(r'\.\d+$','',acc)

    return acc

def get_pfam_www(acc):

    try:
        req = urllib.request.urlopen(interpro_prot_url + acc)

    except urllib.error.URLError as e:
        prot_info = ''
        sys.stderr.write(e.read().decode('utf-8')+'\n')
        return ([], 0)

    else:
        prot_info = req.read().decode('utf-8')

        if (len(prot_info) == 0):
            return ([],0)

    json_info= json.loads(prot_info)

    pf_dom_list = []

    prot_len = json_info['results'][0]['proteins'][0]['protein_length']

    for result in json_info['results']:
        for protein in result['proteins']:
            for entry in protein['entry_protein_locations']:
                for frag in entry['fragments']:
                    pf_dom_list.append({'pf_acc':entry['model'], 'start':frag['start'], 'end':frag['end'], 'score':entry['score']})
                
    pf_dom_list.sort(key = lambda x: x['start'])

    return(pf_dom_list, prot_len)

def add_nodoms(pf_dom_list, seq_len, min_nodom=10):

    if (len(pf_dom_list) == 0):
        return pf_dom_list

    prev_dom = {'end':0}

    npf_domains = []

    for curr_dom in pf_dom_list:
        if(curr_dom['start'] - prev_dom['end'] > min_nodom):
            new_dom = {'start':prev_dom['end']+1, 'end':curr_dom['start']-1, 'pf_acc':'NODOM'}
            npf_domains.append(new_dom)
        npf_domains.append(curr_dom)
        prev_dom = {'end':curr_dom['end']}

    if (seq_len - pf_dom_list[-1]['end'] > min_nodom):
        new_dom = {'start':pf_dom_list[-1]['end']+1, 'end':seq_len, 'pf_acc':'NODOM'}
        npf_domains.append(new_dom)

    return npf_domains

def print_doms(seq_id, color_ix, args, dom_colors, dom_names, clan_info):
               
    this_acc = get_seq_acc(seq_id)

    if (args.lav):
        print_fmt = "%d\t%d\t%s~%s"
    else:
        print_fmt = "%d\t-\t%d\t%s~%s"

    (pf_dom_list, prot_len)  = get_pfam_www(this_acc)

    ## add no-doms if requested
    if (args.neg_doms):
        pf_dom_list = add_nodoms(pf_dom_list, prot_len, args.min_nodom)

    for dom in pf_dom_list:

        pf_acc = dom['pf_acc']

        ## check if domain has color number
        if (pf_acc in dom_colors):
            dom_color = dom_colors[pf_acc]
        else:
            dom_color = dom_colors[pf_acc] = str(color_ix)
            color_ix += 1

        ## check if domain_acc has short name
        if (pf_acc not in dom_names):
            pf_id = dom_names[pf_acc] = get_pfam_id_www(pf_acc)
            pf_clan_info = clan_info[pf_acc] = get_clan_info_www(pf_acc)
        else:
            pf_id = dom_names[pf_acc]
            pf_clan_info = clan_info[pf_acc]


        this_clan_info = clan_info[pf_acc]


        ## display id or acc?
        if (this_clan_info):
            pf_info = 'C.'+ this_clan_info['name']
        else:
            pf_info = pf_id

        if (args.pfam_acc):
            if (pf_info and not args.no_clans):
                pf_info = this_clan_info['accession']
            else:
                pf_info = pf_acc

        if (args.acc_comment):
            pf_info = "%s{%s}"%(pf_info,pf_acc)

        if (args.bound_comment):
            dom_color = "%d:%d"%(dom['start'],dom['end'])

        print(print_fmt%(dom['start'],dom['end'],pf_info,dom_color))

    return color_ix

def read_print_fd(fd, args):

    dom_colors = {'NODOM':'0'}
    dom_names = {}
    clan_info =  {'NODOM':{'name':'NODOM','accession':'NODOM'}}

    color_ix = 1

    for line in fd:
        line = line.strip('\n')
        seq_id = line.split('\t')[0]

        print(">%s"%(seq_id))
        color_ix = print_doms(seq_id, color_ix, args, dom_colors, dom_names, clan_info)

def main() :

    parser=argparse.ArgumentParser(description='ann_pfam_www.py P12345')
    parser.add_argument('--lav',dest='lav',action='store_true',default=False)
    parser.add_argument('--acc_comment',dest='acc_comment',action='store_true',default=False)
    parser.add_argument('--bound_comment',dest='bound_comment',action='store_true',default=False)
    parser.add_argument('--neg_doms',dest='neg_doms',action='store_true',default=False)
    parser.add_argument('--neg-doms',dest='neg_doms',action='store_true',default=False)
    parser.add_argument('--no-clans',dest='no_clans',action='store_true',default=False)
    parser.add_argument('--no_clans',dest='no_clans',action='store_true',default=False)
    parser.add_argument('--min_nodom',dest='min_nodom',action='store',default=10)
    parser.add_argument('--no_over',dest='no_over',action='store_true',default=False)
    parser.add_argument('--no-over',dest='no_over',action='store_true',default=False)
    parser.add_argument('--pfacc',dest='pfam_acc',action='store_true',default=False)
    parser.add_argument('--pfam_acc',dest='pfam_acc',action='store_true',default=False)

    parser.add_argument('files', metavar='FILE', help='files to read, stdin if empty', nargs='*')
    args=parser.parse_args()

    ## check for stdin input
    if (len(args.files) == 0):
        read_print_fd(sys.stdin, args)
        return

    try:
        fd = open(args.files[0],'r')
        read_print_fd(fd, args)
        fd.close()

    except:
        color_ix = 1
        dom_colors = {'NODOM':'0'}
        dom_names = {}
        clan_info = {'NODOM':{'name':'NODOM','accession':'NODOM'}}

        for seq_id in args.files:
            print(">%s"%(seq_id))
            print_doms(seq_id, color_ix, args, dom_colors, dom_names, clan_info)

if __name__ == '__main__':
    main()