#!/usr/bin/env python # Copyright 2001 by Gavin E. Crooks. All rights reserved. # This code is part of the Biopython distribution and governed by its # license. Please see the LICENSE file that should have been included # as part of this package. import getopt import sys import types import urllib from Bio.SCOP import * def usage() : print \ """Extract a SCOP domain's ATOM and HETATOM records from the relevant PDB file. For example: scop_pdb.py astral-rapid-access-1.55.raf dir.cla.scop.txt_1.55 d3hbib_ A result file, d3hbib_.ent, will be generated in the working directory. The required RAF file can be found at [http://astral.stanford.edu/raf.html], and the SCOP CLA file at [http://scop.berkeley.edu/parse/index.html]. Note: Errors will occur if the PDB file has been altered since the creation of the SCOP CLA and ASTRAL RAF files. Usage: scop_pdb [-h] [-i file] [-o file] [-p pdb_url_prefix] raf_url cla_url [sid] [sid] [sid] ... -h -- Print this help message. -i file -- Input file name. Each line should start with an sid (Scop domain identifier). Blank lines, and lines starting with '#' are ignored. If file is '-' then data is read from stdin. If not given then sids are taken from the command line. -o file -- Output file name. If '-' then data is written to stdout. If not given then data is written to files named sid+'.ent'. -p pdb_url-- A URL for PDB files. The token '%s' will be replaced with the 4 character PDB ID. If the pdb_url is not given then the latest PDB file is retrieved directly from rcsb.org. raf_url -- The URL or filename of an ASTRAL Rapid Access File sequence map. See [http://astral.stanford.edu/raf.html] cla_url -- The URL or filename of a SCOP parsable CLA file. See [http://scop.berkeley.edu/parse/index.html] sid -- A SCOP domain identifier. e.g. d3hbib_ """ default_pdb_url = "http://www.rcsb.org/pdb/cgi/export.cgi/somefile.pdb?" \ "format=PDB&pdbId=%s&compression=None" #default_pdb_url = "file://usr/local/db/pdb/data/010331/snapshot/all/pdb%s.ent" def open_pdb(pdbid, pdb_url=None) : if pdb_url ==None: pdb_url = default_pdb_url url = pdb_url % pdbid fn, header = urllib.urlretrieve(url) return open(fn) def main(): try: opts, args = getopt.getopt(sys.argv[1:], "hp:o:i:", ["help", "usage","pdb=","output=","input="]) except getopt.GetoptError: # print help information and exit: usage() sys.exit(2) input= None in_handle = None output = None pdb_url = None cla_url = None raf_url = None for o, a in opts: if o in ("-h", "--help","--usage"): usage() sys.exit() elif o in ("-o", "--output"): output = a elif o in ("-i", "--input"): input = a elif o in ("-p", "--pdb"): pdb_url = a if len(args) <2 : print >>sys.stderr, \ "Not enough arguments. Try --help for more details." sys.exit(2) raf_url = args[0] cla_url = args[1] (raf_filename, headers) = urllib.urlretrieve(raf_url) seqMapIndex = Raf.SeqMapIndex(raf_filename) (cla_filename, headers) = urllib.urlretrieve(cla_url) claIndex = Cla.Index(cla_filename) if input == None : sids = args[2:] elif input == '-' : sids = sys.stdin.xreadlines() else : in_handle = open(input) sids = in_handle.xreadlines() try: for sid in sids : if not sid or sid[0:1]=='#': continue id = sid[0:7] pdbid=id[1:5] s = pdbid[0:1] if s=='0' or s=='s' : print >>sys.stderr,"No coordinates for domain "+id continue if output == None : filename = id+".ent" out_handle = open(filename, "w+") elif output == '-' : out_handle = sys.stdout else : out_handle = open(output, "w+") try: try: claRec = claIndex[id] residues = claRec.residues seqMap = seqMapIndex.getSeqMap(residues) pdbid = residues.pdbid f = open_pdb(pdbid, pdb_url) try: seqMap.getAtoms(f, out_handle) finally : f.close() except (IOError, KeyError, RuntimeError), e: print >>sys.stderr, "I cannot do SCOP domain ",id,":",e finally : out_handle.close() finally : if in_handle != None : in_handle.close() if __name__ == "__main__": main()