#!/usr/bin/env python3 ## get_protein_www.py -- ## get a protein sequence from the Uniprot or NCBI/Refseq web sites using the accession ## ## modified to work with urllib.request 7-Nov-2022 ## modified to allow argparse arguments for identifier 20-Mar-2023 import argparse import sys import re import textwrap import time import urllib.request import urllib.error def main(): ncbi_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?" uniprot_url = "https://rest.uniprot.org/uniprotkb/" sub_range = '' parser=argparse.ArgumentParser(description='get protein sequences from uniprot/ncbi') parser.add_argument('--id', help='substitute id',action='store',default='') parser.add_argument('accs', nargs='*', help='accessions') args=parser.parse_args() for acc in args.accs: if (re.search(r':',acc)): (acc, sub_range) = acc.split(':') if (re.match(r'^(sp|tr|iso|ref)\|',acc)): acc=acc.split('|')[1] if (re.match(r'[A-Z]P_\d+',acc)): # get refseq db_type="protein" seq_args = "db=%s&id=" % (db_type) + acc + "&rettype=fasta" url_string = ncbi_url + seq_args else: # get uniprot acc_fields = acc.split('|') if (len(acc_fields)==1): url_string = uniprot_url + acc + ".fasta" else: url_string = uniprot_url + acc_fields[0] + ".fasta" try: req = urllib.request.urlopen(url_string) except urllib.error.URLError as e: seq_html = '' sys.stderr.write(e.read().decode('utf-8')+'\n') continue else: seq_html=req.read().decode('utf-8') time.sleep(0.3) if (not sub_range): if (args.id): seq_html = re.sub('>','>%s '%(args.id),seq_html) print(seq_html) else: (start, stop) = sub_range.split('-') (start, stop) = (int(start), int(stop)) lines = seq_html.split('\n') header=lines[0] seq = ''.join(lines[1:]) if (start > 0): start -= 1 new_seq = seq[start:stop] ## print the header if (start > 0): print("%s @C:%d" %(header, start+1)) else: print(header) print('\n'.join(textwrap.wrap(new_seq))) if __name__ == '__main__': main()