#!/usr/bin/python

## get_protein.py -- 
## get a protein sequence from Uniprot or NCBI/Refseq using the accession
##

import sys
import re
import textwrap
from urllib2 import urlopen

ncbi_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?" 
uniprot_url = "https://www.uniprot.org/uniprot/"

sub_range = ''
for acc in sys.argv[1:]:

  if (re.search(r':',acc)):
    (acc, sub_range) = acc.split(':')

  if (re.match(r'^(sp|tr|iso|ref)\|',acc)):
      acc=acc.split('|')[1]

  if (re.match(r'[NX]P_',acc)):
    db_type="protein"

    seq_args = "db=%s&id=" % (db_type) + ",".join(sys.argv[1:])  + "&rettype=fasta"
    seq_html = urlopen(ncbi_url + seq_args).read()
  else:
    seq_html = urlopen(uniprot_url + acc + ".fasta").read()

  header=''
  seq = ''
  for line in seq_html.split('\n'):
    if (line and line[0]=='>'):
      # print out old one if there
      if (header):
        if (sub_range):
          start, stop = sub_range.split('-')
          start, stop = int(start), int(stop)
          if (start > 0):
            start -= 1
          new_seq = seq[start:stop]
        else:
          start = 0
          new_seq = seq

        if (start > 0):
          print "%s @C%d" %(header, start+1)
        else:
          print header
        print '\n'.join(textwrap.wrap(new_seq))

      header = line;
      seq = ''
    else:
      seq += line

start=0
if (sub_range):
  start, stop = sub_range.split('-')
  start, stop = int(start), int(stop)
  if (start > 0):
    start -= 1
    new_seq = seq[start:stop]
else:
  new_seq = seq

if (start > 0):
  print "%s @C:%d" %(header, start+1)
else:
  print header

print '\n'.join(textwrap.wrap(new_seq))