#!/usr/bin/env python3

## get_protein.py --
## get a protein sequence from Uniprot or NCBI/Refseq using the accession
##
## modified to work with mysql.connector, urllib.request 7-Nov-2022
##

import sys
import re
import textwrap
import time
import urllib.request
import urllib.error

import mysql.connector

## fall-back web services, used when the accession is not in the local databases
ncbi_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?"
uniprot_url = "https://rest.uniprot.org/uniprotkb/"

host, user, passwd = ("wrpxdb.bioch.virginia.edu", "web_user", "fasta_www")

db_r = mysql.connector.connect(host=host, user=user, password=passwd, database="seqdb_demox")
db_u = mysql.connector.connect(host=host, user=user, password=passwd, database="uniprot")

cur_r = db_r.cursor(dictionary=True, buffered=True)
cur_u = db_u.cursor(dictionary=True, buffered=True)

## parameterized queries -- mysql.connector substitutes the %s placeholders safely
get_refseq_sql = '''select acc, descr, seq from annot join protein using(prot_id) where acc=%s'''

get_up_sql = '''select db, acc, id, descr, seq from uniprot.annot2 join uniprot.protein using(acc) where acc=%s'''

get_up_iso_sql = '''select db, acc, id, descr, seq from uniprot.annot2_iso join uniprot.protein_iso using(acc) where acc=%s'''

def get_ncbi_sql(acc, cur):
    '''look up a RefSeq protein in the local seqdb_demox database; return a FASTA entry or False'''
    cur.execute(get_refseq_sql, (acc,))
    row = cur.fetchone()
    if row:
        ## wrap the sequence at 60 residues per line
        seq = re.sub(r'(.{60})', r'\g<1>\n', row['seq'])
        return ">%s %s\n%s\n" % (row['acc'], row['descr'], seq)
    else:
        return False

def get_ncbi_www(acc):
    '''fetch a protein FASTA entry from NCBI EUtils efetch'''
    db_type = "protein"
    seq_args = "db=%s&id=%s&rettype=fasta" % (db_type, acc)
    try:
        req = urllib.request.urlopen(ncbi_url + seq_args)
    except urllib.error.URLError as e:
        seq_html = ''
        ## HTTPError carries a response body; other URLErrors only have a reason
        msg = e.read().decode('utf-8') if hasattr(e, 'read') else str(e.reason)
        print(msg, file=sys.stderr)
    else:
        seq_html = req.read().decode('utf-8')

    time.sleep(0.3)   # stay under the NCBI request-rate limit
    return seq_html

def get_uniprot_sql(acc, cur):
    '''look up a Uniprot accession (canonical, then isoform) in the local uniprot database'''
    cur.execute(get_up_sql, (acc,))
    row = cur.fetchone()
    if not row:
        ## not a canonical entry -- try the isoform tables
        cur.execute(get_up_iso_sql, (acc,))
        row = cur.fetchone()

    if row:
        return ">%s|%s|%s %s\n%s\n" % (row['db'], row['acc'], row['id'], row['descr'], row['seq'])
    else:
        return False

def get_uniprot_www(acc):
    '''fetch a FASTA entry from the Uniprot REST service'''
    try:
        up_req = urllib.request.urlopen(uniprot_url + acc + ".fasta")
    except urllib.error.URLError as e:
        seq_html = ''
        msg = e.read().decode('utf-8') if hasattr(e, 'read') else str(e.reason)
        print(msg, file=sys.stderr)
    else:
        seq_html = up_req.read().decode('utf-8')

    return seq_html

def print_seq(seq_html, sub_range):
    '''print a FASTA entry, optionally restricted to a start-stop sub-range'''
    if not seq_html:
        return

    lines = seq_html.split('\n')
    header = lines[0]
    seq = ''.join(lines[1:])

    if sub_range:
        start, stop = (int(x) for x in sub_range.split('-'))
        if start > 0:
            start -= 1        # convert 1-based coordinate to 0-based index
        new_seq = seq[start:stop]
    else:
        start = 0
        new_seq = seq

    if start > 0:
        ## note the coordinate offset in the header
        print("%s @C%d" % (header, start + 1))
    else:
        print(header)

    print('\n'.join(textwrap.wrap(new_seq)))

## main loop -- accessions may be given as acc, db|acc|id, or acc:start-stop
for acc in sys.argv[1:]:
    sub_range = ''
    if re.search(r':', acc):
        acc, sub_range = acc.split(':')

    if re.match(r'^(sp|tr|iso|ref)\|', acc):
        acc = acc.split('|')[1]

    if re.match(r'[NXYW]P_', acc):
        ## RefSeq accession -- local database first, then NCBI EUtils
        seq_html = get_ncbi_sql(acc, cur_r)
        if not seq_html:
            seq_html = get_ncbi_www(acc)
    else:
        ## Uniprot accession -- local database first, then rest.uniprot.org
        seq_html = get_uniprot_sql(acc, cur_u)
        if not seq_html:
            seq_html = get_uniprot_www(acc)

    print_seq(seq_html, sub_range)
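
## Example invocation (a sketch -- the accessions below are placeholders showing the
## accepted forms, not verified entries; quote db|acc|id arguments so the shell does
## not interpret the '|' characters):
##
##   ./get_protein.py P12345 "sp|P12345|EXAMPLE_HUMAN" NP_000001.1:1-100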