# Copyright 2008 by Peter Cock. All rights reserved. # # This code is part of the Biopython distribution and governed by its # license. Please see the LICENSE file that should have been included # as part of this package. from Bio.Align.Generic import Alignment from Interfaces import AlignmentIterator, SequentialAlignmentWriter class EmbossWriter(SequentialAlignmentWriter) : """Emboss alignment writer Writes a simplfied version of the EMBOSS pairs/simple file format. A lot of the information their tools record in their headers is not available and is ommitted. """ def write_header(self) : handle = self.handle handle.write("########################################\n") handle.write("# Program: Biopython\n") try : handle.write("# Report_file: %s\n" % handle.name) except AttributeError : pass handle.write("########################################\n") def write_footer(self) : handle = self.handle handle.write("#---------------------------------------\n") handle.write("#---------------------------------------\n") def write_alignment(self, alignment) : """Use this to write (another) single alignment to an open file.""" handle = self.handle records = alignment.get_all_seqs() handle.write("#=======================================\n") handle.write("#\n") handle.write("# Aligned_sequences: %i\n" % len(records)) for i, record in enumerate(records) : handle.write("# %i: %s\n" % (i+1, record.id)) handle.write("#\n") handle.write("# Length: %i\n" % alignment.get_alignment_length()) handle.write("#\n") handle.write("#=======================================\n") handle.write("\n") #... assert False class EmbossIterator(AlignmentIterator) : """Emboss alignment iterator For reading the (pairwise) alignments from EMBOSS tools in what they call the "pairs" and "simple" formats. """ def next(self) : handle = self.handle try : #Header we saved from when we were parsing #the previous alignment. line = self._header del self._header except AttributeError: line = handle.readline() if not line: return None while line.rstrip() <> "#=======================================" : line = handle.readline() if not line : return None length_of_seqs = None number_of_seqs = None ids = [] seqs = [] while line[0] == "#" : #Read in the rest of this alignment header, #try and discover the number of records expected #and their length parts = line[1:].split(":",1) key = parts[0].lower().strip() if key == "aligned_sequences" : number_of_seqs = int(parts[1].strip()) assert len(ids) == 0 # Should now expect the record identifiers... for i in range(number_of_seqs) : line = handle.readline() parts = line[1:].strip().split(":",1) assert i+1 == int(parts[0].strip()) ids.append(parts[1].strip()) assert len(ids) == number_of_seqs if key == "length" : length_of_seqs = int(parts[1].strip()) #And read in another line... line = handle.readline() if number_of_seqs is None : raise SyntaxError("Number of sequences missing!") if length_of_seqs is None : raise SyntaxError("Length of sequences missing!") if self.records_per_alignment is not None \ and self.records_per_alignment <> number_of_seqs : raise ValueError("Found %i records in this alignment, told to expect %i" \ % (number_of_seqs, self.records_per_alignment)) seqs = ["" for id in ids] index = 0 #Parse the seqs while line : if len(line) > 21 : id_start = line[:21].strip().split(None, 1) seq_end = line[21:].strip().split(None, 1) if len(id_start) == 2 and len(seq_end) == 2: #identifier, seq start position, seq, seq end position #(an aligned seq is broken up into multiple lines) id, start = id_start seq, end = seq_end #The identifier is truncated... assert 0 <= index and index < number_of_seqs, \ "Expected index %i in range [0,%i)" \ % (index, number_of_seqs) assert id==ids[index] or id == ids[index][:len(id)] #Check the start... assert int(start) - 1 == len(seqs[index].replace("-","")), \ "Found %i chars so far for %s, file says start %i:\n%s" \ % (len(seqs[index]), id, int(start), seqs[index]) seqs[index] += seq #Check the end ... assert int(end) == len(seqs[index].replace("-","")), \ "Found %i chars so far for %s, file says end %i:\n%s" \ % (len(seqs[index]), id, int(end), seqs[index]) index += 1 if index >= number_of_seqs : index = 0 else : #just a start value, this is just alignment annotation (?) #print "Skipping: " + line.rstrip() pass elif line.strip() == "" : #Just a spacer? pass else : print line assert False line = handle.readline() if line.rstrip() == "#---------------------------------------" \ or line.rstrip() == "#=======================================" : #End of alignment self._header = line break assert index == 0 if self.records_per_alignment is not None \ and self.records_per_alignment <> len(ids) : raise ValueError("Found %i records in this alignment, told to expect %i" \ % (len(ids), self.records_per_alignment)) alignment = Alignment(self.alphabet) for id, seq in zip(ids, seqs) : if len(seq) <> length_of_seqs : raise SyntaxError("Error parsing alignment - sequences of different length?") alignment.add_sequence(id, seq) return alignment if __name__ == "__main__" : print "Running a quick self-test" #http://emboss.sourceforge.net/docs/themes/alnformats/align.simple simple_example = \ """######################################## # Program: alignret # Rundate: Wed Jan 16 17:16:13 2002 # Report_file: stdout ######################################## #======================================= # # Aligned_sequences: 4 # 1: IXI_234 # 2: IXI_235 # 3: IXI_236 # 4: IXI_237 # Matrix: EBLOSUM62 # Gap_penalty: 10.0 # Extend_penalty: 0.5 # # Length: 131 # Identity: 95/131 (72.5%) # Similarity: 127/131 (96.9%) # Gaps: 25/131 (19.1%) # Score: 100.0 # # #======================================= IXI_234 1 TSPASIRPPAGPSSRPAMVSSRRTRPSPPGPRRPTGRPCCSAAPRRPQAT 50 IXI_235 1 TSPASIRPPAGPSSR---------RPSPPGPRRPTGRPCCSAAPRRPQAT 41 IXI_236 1 TSPASIRPPAGPSSRPAMVSSR--RPSPPPPRRPPGRPCCSAAPPRPQAT 48 IXI_237 1 TSPASLRPPAGPSSRPAMVSSRR-RPSPPGPRRPT----CSAAPRRPQAT 45 |||||:|||||||||::::::: |||||:||||:::::|||||:||||| IXI_234 51 GGWKTCSGTCTTSTSTRHRGRSGWSARTTTAACLRASRKSMRAACSRSAG 100 IXI_235 42 GGWKTCSGTCTTSTSTRHRGRSGW----------RASRKSMRAACSRSAG 81 IXI_236 49 GGWKTCSGTCTTSTSTRHRGRSGWSARTTTAACLRASRKSMRAACSR--G 96 IXI_237 46 GGYKTCSGTCTTSTSTRHRGRSGYSARTTTAACLRASRKSMRAACSR--G 93 ||:||||||||||||||||||||:::::::::::||||||||||||| | IXI_234 101 SRPNRFAPTLMSSCITSTTGPPAWAGDRSHE 131 IXI_235 82 SRPNRFAPTLMSSCITSTTGPPAWAGDRSHE 112 IXI_236 97 SRPPRFAPPLMSSCITSTTGPPPPAGDRSHE 127 IXI_237 94 SRPNRFAPTLMSSCLTSTTGPPAYAGDRSHE 124 |||:||||:|||||:|||||||::||||||| #--------------------------------------- #--------------------------------------- """ #http://emboss.sourceforge.net/docs/themes/alnformats/align.pair pair_example = \ """######################################## # Program: water # Rundate: Wed Jan 16 17:23:19 2002 # Report_file: stdout ######################################## #======================================= # # Aligned_sequences: 2 # 1: IXI_234 # 2: IXI_235 # Matrix: EBLOSUM62 # Gap_penalty: 10.0 # Extend_penalty: 0.5 # # Length: 131 # Identity: 112/131 (85.5%) # Similarity: 112/131 (85.5%) # Gaps: 19/131 (14.5%) # Score: 591.5 # # #======================================= IXI_234 1 TSPASIRPPAGPSSRPAMVSSRRTRPSPPGPRRPTGRPCCSAAPRRPQAT 50 ||||||||||||||| |||||||||||||||||||||||||| IXI_235 1 TSPASIRPPAGPSSR---------RPSPPGPRRPTGRPCCSAAPRRPQAT 41 IXI_234 51 GGWKTCSGTCTTSTSTRHRGRSGWSARTTTAACLRASRKSMRAACSRSAG 100 |||||||||||||||||||||||| |||||||||||||||| IXI_235 42 GGWKTCSGTCTTSTSTRHRGRSGW----------RASRKSMRAACSRSAG 81 IXI_234 101 SRPNRFAPTLMSSCITSTTGPPAWAGDRSHE 131 ||||||||||||||||||||||||||||||| IXI_235 82 SRPNRFAPTLMSSCITSTTGPPAWAGDRSHE 112 #--------------------------------------- #--------------------------------------- """ pair_example2 = \ """######################################## # Program: needle # Rundate: Sun 27 Apr 2007 17:20:35 # Commandline: needle # [-asequence] Spo0F.faa # [-bsequence] paired_r.faa # -sformat2 pearson # Align_format: srspair # Report_file: ref_rec .needle ######################################## #======================================= # # Aligned_sequences: 2 # 1: ref_rec # 2: gi|94968718|receiver # Matrix: EBLOSUM62 # Gap_penalty: 10.0 # Extend_penalty: 0.5 # # Length: 124 # Identity: 32/124 (25.8%) # Similarity: 64/124 (51.6%) # Gaps: 17/124 (13.7%) # Score: 112.0 # # #======================================= ref_rec 1 KILIVDD----QYGIRILLNEVFNKEGYQTFQAANGLQALDIVTKERPDL 46 :|:.|| :.|.|::|.: :.|.....:|.:|.||:.:..:..|.: gi|94968718|r 1 -VLLADDHALVRRGFRLMLED--DPEIEIVAEAGDGAQAVKLAGELHPRV 47 ref_rec 47 VLLDMKIPGMDGIEILKRMKVIDENIRVIIMTAYGELDMIQESKELGALT 96 |::|..:|||.|::..|:::....:|.|:::|.:.|...::.:.|.||.. gi|94968718|r 48 VVMDCAMPGMSGMDATKQIRTQWPDIAVLMLTMHSEDTWVRLALEAGANG 97 ref_rec 97 HFAK-PFDIDEIRDAV-------- 111 :..| ..|:|.|: || gi|94968718|r 98 YILKSAIDLDLIQ-AVRRVANGET 120 #======================================= # # Aligned_sequences: 2 # 1: ref_rec # 2: gi|94968761|receiver # Matrix: EBLOSUM62 # Gap_penalty: 10.0 # Extend_penalty: 0.5 # # Length: 119 # Identity: 34/119 (28.6%) # Similarity: 58/119 (48.7%) # Gaps: 9/119 ( 7.6%) # Score: 154.0 # # #======================================= ref_rec 1 KILIVDDQYGIRILLNEVFNKEGYQTFQAANGLQALDIVTKERPDLVLLD 50 ||||||:......|:..|...|::.....|.::||:|...:..||:|.| gi|94968761|r 1 -ILIVDDEANTLASLSRAFRLAGHEATVCDNAVRALEIAKSKPFDLILSD 49 ref_rec 51 MKIPGMDGIEILKRMKVIDENIRVIIMTAYGELDMIQESKELGALTHFAK 100 :.:||.||:.:|:.:|.......|::|:....::|..::..||||....| gi|94968761|r 50 VVMPGRDGLTLLEDLKTAGVQAPVVMMSGQAHIEMAVKATRLGALDFLEK 99 ref_rec 101 PFDIDEIRDAV-------- 111 |...|::...| gi|94968761|r 100 PLSTDKLLLTVENALKLKR 118 #======================================= # # Aligned_sequences: 2 # 1: ref_rec # 2: gi|94967506|receiver # Matrix: EBLOSUM62 # Gap_penalty: 10.0 # Extend_penalty: 0.5 # # Length: 120 # Identity: 29/120 (24.2%) # Similarity: 53/120 (44.2%) # Gaps: 9/120 ( 7.5%) # Score: 121.0 # # #======================================= ref_rec 1 -KILIVDDQYGIRILLNEVFNKEGYQTFQAANGLQALDIVTKERPDLVLL 49 .|::|||..|..:.:..||.:.|:..........|.:.:.....||.:: gi|94967506|r 1 LHIVVVDDDPGTCVYIESVFAELGHTCKSFVRPEAAEEYILTHPVDLAIV 50 ref_rec 50 DMKIPGMDGIEILKRMKVIDENIRVIIMTAYGELDMIQESKELGALTHFA 99 |:.:....|:|:|:|.:|....:..:|:|....|:|...|...||:.:.. gi|94967506|r 51 DVYLGSTTGVEVLRRCRVHRPKLYAVIITGQISLEMAARSIAEGAVDYIQ 100 ref_rec 100 KPFDIDEIRDAV-------- 111 ||.|||.:.:.. gi|94967506|r 101 KPIDIDALLNIAERALEHKE 120 #======================================= # # Aligned_sequences: 2 # 1: ref_rec # 2: gi|94970045|receiver # Matrix: EBLOSUM62 # Gap_penalty: 10.0 # Extend_penalty: 0.5 # # Length: 118 # Identity: 30/118 (25.4%) # Similarity: 64/118 (54.2%) # Gaps: 9/118 ( 7.6%) # Score: 126.0 # # #======================================= ref_rec 1 KILIVDDQYGIRILLNEVFNKEGYQTFQAANGLQALDIVTK--ERPDLVL 48 :|:|:|:..:|....:.....||:...|.:|.:||.:.:| ||.|::: gi|94970045|r 1 -VLLVEDEEALRAAAGDFLETRGYKIMTARDGTEALSMASKFAERIDVLI 49 ref_rec 49 LDMKIPGMDGIEILKRMKVIDENIRVIIMTAYGELDMIQESKELGALTHF 98 .|:.:||:.|..:.:.:..|....:|:.|:.|.: :.:..:.|:.:.:.| gi|94970045|r 50 TDLVMPGISGRVLAQELVKIHPETKVMYMSGYDD-ETVMVNGEIDSSSAF 98 ref_rec 99 -AKPFDID----EIRDAV 111 .|||.:| :||:.: gi|94970045|r 99 LRKPFRMDALSAKIREVL 116 #======================================= # # Aligned_sequences: 2 # 1: ref_rec # 2: gi|94970041|receiver # Matrix: EBLOSUM62 # Gap_penalty: 10.0 # Extend_penalty: 0.5 # # Length: 125 # Identity: 35/125 (28.0%) # Similarity: 70/125 (56.0%) # Gaps: 18/125 (14.4%) # Score: 156.5 # # #======================================= ref_rec 1 KILIVDDQYGIRILLNEVFNKEGYQTFQAANGLQALDIV--TKERPDLVL 48 .:|:|:|:.|:|.|:..:.:::||...:|.:|.:||:|| :.::.|::| gi|94970041|r 1 TVLLVEDEEGVRKLVRGILSRQGYHVLEATSGEEALEIVRESTQKIDMLL 50 ref_rec 49 LDMKIPGMDGIEILKRMKVIDENIRVIIMTAYGELDMIQESKELGALTHF 98 .|:.:.||.|.|:.:|:::...:::||.|:.|.:..:::. |.||.. gi|94970041|r 51 SDVVLVGMSGRELSERLRIQMPSLKVIYMSGYTDDAIVRH----GVLTES 96 ref_rec 99 A----KPFDIDEIRDAV-------- 111 | |||..|.:...| gi|94970041|r 97 AEFLQKPFTSDSLLRKVRAVLQKRQ 121 #--------------------------------------- #--------------------------------------- """ from StringIO import StringIO alignments = list(EmbossIterator(StringIO(pair_example))) assert len(alignments) == 1 assert len(alignments[0].get_all_seqs()) == 2 assert [r.id for r in alignments[0].get_all_seqs()] \ == ["IXI_234", "IXI_235"] alignments = list(EmbossIterator(StringIO(simple_example))) assert len(alignments) == 1 assert len(alignments[0].get_all_seqs()) == 4 assert [r.id for r in alignments[0].get_all_seqs()] \ == ["IXI_234", "IXI_235", "IXI_236", "IXI_237"] alignments = list(EmbossIterator(StringIO(pair_example + simple_example))) assert len(alignments) == 2 assert len(alignments[0].get_all_seqs()) == 2 assert len(alignments[1].get_all_seqs()) == 4 assert [r.id for r in alignments[0].get_all_seqs()] \ == ["IXI_234", "IXI_235"] assert [r.id for r in alignments[1].get_all_seqs()] \ == ["IXI_234", "IXI_235", "IXI_236", "IXI_237"] #for a in EmbossIterator(StringIO(pair_example2)) : # print "Next:" # for r in a.get_all_seqs() : # print r.seq.tostring()[:20] + "...", r.id alignments = list(EmbossIterator(StringIO(pair_example2))) assert len(alignments) == 5 assert len(alignments[0].get_all_seqs()) == 2 assert [r.id for r in alignments[0].get_all_seqs()] \ == ["ref_rec", "gi|94968718|receiver"] assert [r.id for r in alignments[4].get_all_seqs()] \ == ["ref_rec", "gi|94970041|receiver"] print "Done"