"""Parser for FSSP files, used in a database of protein fold classifications.

This is a module to handle FSSP files. For now it parses only the header,
summary and alignment sections.

See: Holm and Sander (1996) The FSSP database: fold classification based on
structure-structure alignment of proteins.

functions: read_fssp(file_handle): reads an fssp file into the records. Returns
a tuple of three instances: (FSSPHeader, FSSPSumDict, FSSPAlignDict).
mult_align: returns a Biopython alignment object
"""
import re

from . import fssp_rec
from Bio.Align import Generic
from Bio import Alphabet

fff_rec = fssp_rec.fff_rec

# One anchored pattern per header keyword; the dictionary key is also the
# name of the FSSPHeader attribute that receives the parsed value.
header_records = {
    'database': re.compile(r'^DATABASE'),
    'pdbid': re.compile(r'^PDBID'),
    'header': re.compile(r'^HEADER'),
    'compnd': re.compile(r'^COMPND'),
    'author': re.compile(r'^AUTHOR'),
    'source': re.compile(r'^SOURCE'),
    'seqlength': re.compile(r'^SEQLENGTH'),
    'nalign': re.compile(r'^NALIGN'),
}

# Section titles and record shapes used by read_fssp to navigate the file.
summary_title = re.compile(r'## +SUMMARY')
summary_rec = re.compile(r' *[0-9]+: +[1-9][0-9a-z]{3,3}')
alignments_title = re.compile(r'## +ALIGNMENTS')
alignments_rec = re.compile(r' *[0-9]+ +-{0,1}[0-9]+')
equiv_title = re.compile(r'## +EQUIVALENCES')


class FSSPHeader:
    """Values collected from the header lines of an FSSP file."""

    def __init__(self):
        self.database = None
        self.pdbid = ''
        self.header = ''
        self.compnd = ''
        self.source = ''
        self.author = []
        self.seqlength = 0
        self.nalign = 0

    def fill_header(self, inline):
        """Store the value carried by one header line, if it is recognised.

        The line is tested against every pattern in ``header_records``;
        lines matching none of them are ignored.

        - database, seqlength, nalign: second whitespace token, as int
        - compnd, author: list of all tokens after the keyword
        - source, header: text after the first space, stripped
        - anything else (pdbid): second whitespace token, as str
        """
        for name, pattern in header_records.items():
            if not pattern.match(inline):
                continue
            if name in ('database', 'seqlength', 'nalign'):
                value = int(inline.split()[1])
            elif name in ('compnd', 'author'):
                value = inline.split()[1:]
            elif name in ('source', 'header'):
                value = inline[inline.find(' ') + 1:].strip()
            else:
                value = inline.split()[1]
            setattr(self, name, value)


class PosAlign:
    """One cell of the alignment section: a residue and its structure code.

    Attributes:
        aa:  residue letter, or '-' for a gap.
        gap: 1 when the cell is the gap marker '..', otherwise 0.
        ss:  upper-cased second character of the cell, or '0' when the cell
             has only one character.  Not set for gap cells.
    """

    def __init__(self, inStr):
        inStr = inStr.strip()
        if len(inStr) != 1 and len(inStr) != 2:
            raise ValueError('PosAlign: length not 1 or 2 chars: ' + inStr)
        if inStr == '..':
            self.aa = '-'
            self.gap = 1
        else:
            self.gap = 0
            self.aa = inStr[0]
            # A residue that equals its own lower-case form is recorded as
            # 'C'.  NOTE(review): presumably the DSSP convention of
            # lower-case letters for bridged cysteines -- confirm.  The test
            # is also true for non-letter characters.
            if self.aa == self.aa.lower():
                self.aa = 'C'
            if len(inStr) == 2:
                self.ss = inStr[1].upper()
            else:
                self.ss = '0'

    def __repr__(self):
        if self.gap:
            return '..'
        return self.aa + self.ss.lower()

    __str__ = __repr__


def _split_pdb_id(field, label):
    """Split a summary-record PDB field into (4-char id, chain id).

    A 4-character field has no chain and gets chain '0'; a 5-character
    field carries the chain as its last character.  Any other length
    raises ValueError('Bad PDB ID <label>').
    """
    if len(field) == 4:
        return field[:4], '0'
    if len(field) == 5:
        return field[:4], field[4]
    raise ValueError('Bad PDB ID ' + label)


class FSSPSumRec:
    """Contains info from an FSSP summary record."""

    def __init__(self, in_str):
        """Parse one whitespace-separated summary line.

        Raises ValueError if either PDB field is not 4 or 5 characters long
        or a numeric column cannot be converted.
        """
        self.raw = in_str
        in_rec = in_str.strip().split()
        # The first token looks like "12:"; drop the trailing character.
        self.nr = int(in_rec[0][:-1])
        self.pdb1, self.chain1 = _split_pdb_id(in_rec[1], '1')
        self.pdb2, self.chain2 = _split_pdb_id(in_rec[2], '2')
        self.zscore = float(in_rec[3])
        self.rmsd = float(in_rec[4])
        self.lali = float(in_rec[5])
        self.lseq2 = float(in_rec[6])
        self.pID = float(in_rec[7])
        self.revers = int(in_rec[8])
        self.permut = int(in_rec[9])
        self.nfrag = int(in_rec[10])
        self.topo = in_rec[11]
        # Remaining tokens form the free-text description.
        self.doc = ' '.join(in_rec[12:]) + '\n'

    def __repr__(self):
        return self.raw

    __str__ = __repr__


class FSSPAlignRec:
    """One residue row of the alignments section.

    Fixed-column fields are read from ``in_fff_rec`` using the slices
    defined in ``fssp_rec.align``.
    """

    def __init__(self, in_fff_rec):
        self.abs_res_num = int(in_fff_rec[fssp_rec.align.abs_res_num])
        self.pdb_res_num = in_fff_rec[fssp_rec.align.pdb_res_num].strip()
        self.chain_id = in_fff_rec[fssp_rec.align.chain_id]
        if self.chain_id == ' ':
            # A blank chain column is stored as '0'.
            self.chain_id = '0'
        self.res_name = in_fff_rec[fssp_rec.align.res_name]
        # Same rule as PosAlign: a name equal to its lower-case form
        # becomes 'C'.
        if self.res_name == self.res_name.lower():
            self.res_name = 'C'
        self.ss1 = in_fff_rec[fssp_rec.align.ss1]
        self.turn3 = in_fff_rec[fssp_rec.align.turn3]
        self.turn4 = in_fff_rec[fssp_rec.align.turn4]
        self.turn5 = in_fff_rec[fssp_rec.align.turn5]
        self.pos_align_dict = {}
        self.PosAlignList = []

    def add_align_list(self, align_list):
        """Append a PosAlign for every cell string in ``align_list``."""
        self.PosAlignList.extend(PosAlign(cell) for cell in align_list)

    def pos_align_list2dict(self):
        """Copy PosAlignList into pos_align_dict, keyed 1, 2, 3, ..."""
        for position, cell in enumerate(self.PosAlignList, 1):
            self.pos_align_dict[position] = cell


class FSSPAlignDict(dict):
    """Dictionary of FSSPAlignRec objects with residue-number indexes."""

    def __init__(self):
        dict.__init__(self)
        # The following two dictionaries are pointers to records in self
        # The first dictionary is a "pdb_residue_number: self_key"
        # The second dictionary is a "absolute_residue_number: self_key"
        self.pdb_res_dict = {}
        self.abs_res_dict = {}
        self.data = {}

    def build_resnum_list(self):
        """(Re)build both residue-number indexes from the stored records."""
        for key in self:
            self.abs_res_dict[self[key].abs_res_num] = key
            self.pdb_res_dict[self[key].pdb_res_num] = key

    def abs(self, num):
        """Return the record whose absolute residue number is ``num``."""
        return self[self.abs_res_dict[num]]

    def pdb(self, num):
        """Return the record whose PDB residue number is ``num``."""
        return self[self.pdb_res_dict[num]]

    def sequence(self, num):
        """Return aligned sequence ``num`` as a string.

        Residues are taken in ascending absolute residue number.
        """
        return ''.join(self.abs(pos).pos_align_dict[num].aa
                       for pos in sorted(self.abs_res_dict))

    def fasta_mult_align(self):
        """Return every aligned sequence as FASTA-formatted text.

        The set of sequences is taken from the record with absolute residue
        number 1.  Records are visited in ascending absolute residue number,
        the same order sequence() uses, rather than in dictionary order.
        """
        mult_align_dict = {}
        for j in self.abs(1).pos_align_dict:
            mult_align_dict[j] = []
        for rec in sorted(self.values(), key=lambda r: r.abs_res_num):
            for j in rec.pos_align_dict:
                mult_align_dict[j].append(rec.pos_align_dict[j].aa)

        out = []
        for i in sorted(mult_align_dict):
            out.append('> %d\n' % i)
            # A line break is written before every 72nd residue, so the
            # first line holds 71 residues and later lines hold 72.
            for k, residue in enumerate(mult_align_dict[i], 1):
                if k % 72 == 0:
                    out.append('\n')
                out.append(residue)
            out.append('\n')
        return ''.join(out)


class FSSPSumDict(dict):
    """Dictionary of FSSPSumRec objects."""

    pass


#
# Process a fssp file into its constituents. Return a 3-tuple containing
# the header record, a dictionary of FSSPSumRecs and a dictionary of
# alignment records.
#
def read_fssp(fssp_handle):
    """Parse an FSSP file from an open, line-readable handle.

    Args:
        fssp_handle: object with a ``readline()`` method that returns an
            empty string at end of file.

    Returns:
        A tuple ``(header, sum_dict, align_dict)``: the FSSPHeader, an
        FSSPSumDict keyed by summary record number, and an FSSPAlignDict
        keyed by chain id + residue name + PDB residue number.

    Raises:
        ValueError: if the SUMMARY title is missing, or the file ends
            before an ALIGNMENTS or EQUIVALENCES title is found.
        EOFError: if the file ends inside a block of alignment records.
    """
    header = FSSPHeader()
    sum_dict = FSSPSumDict()
    align_dict = FSSPAlignDict()

    # Header section: everything before the SUMMARY title.  readline()
    # returns '' at end of file, so stop there instead of looping forever.
    curline = fssp_handle.readline()
    while curline and not summary_title.match(curline):
        header.fill_header(curline)
        curline = fssp_handle.readline()
    if not summary_title.match(curline):
        raise ValueError('Bad FSSP file: no summary record found')

    curline = fssp_handle.readline()  # Read the title line, discard
    curline = fssp_handle.readline()  # Read the next line

    # Process the summary records into a dictionary keyed by record number
    while summary_rec.match(curline):
        cur_sum_rec = FSSPSumRec(curline)
        sum_dict[cur_sum_rec.nr] = cur_sum_rec
        curline = fssp_handle.readline()

    # Outer loop: process everything up to the EQUIVALENCES title record
    while not equiv_title.match(curline):
        # Skip ahead to the next ALIGNMENTS or EQUIVALENCES title, giving
        # up at end of file.
        while (curline
               and not alignments_title.match(curline)
               and not equiv_title.match(curline)):
            curline = fssp_handle.readline()
        if equiv_title.match(curline):
            break
        if not alignments_title.match(curline):
            raise ValueError('Bad FSSP file: no alignments title record found')

        # If we got to this point, this means that we have matched an
        # alignments title. Parse the alignment records in a loop.
        curline = fssp_handle.readline()  # Read the title line, discard
        curline = fssp_handle.readline()  # Read the next line
        while alignments_rec.match(curline):
            align_rec = FSSPAlignRec(fff_rec(curline))
            key = (align_rec.chain_id + align_rec.res_name
                   + str(align_rec.pdb_res_num))
            align_list = curline[fssp_rec.align.start_aa_list:].strip().split()
            # The same residue appears once per alignment block; keep the
            # first record and append each block's cells to it.
            if key not in align_dict:
                align_dict[key] = align_rec
            align_dict[key].add_align_list(align_list)
            curline = fssp_handle.readline()
            if not curline:
                raise EOFError('Bad FSSP file: unexpected end of file in '
                               'alignments section')

    # Convert each record's accumulated list into its 1-based dictionary
    # and drop the temporary list.
    for rec in align_dict.values():
        rec.pos_align_list2dict()
        del rec.PosAlignList
    align_dict.build_resnum_list()
    return (header, sum_dict, align_dict)