# Copyright 2019, National Marrow Donor Program (NMPD). All rights reserved. # Written by Peter Cock, The James Hutton Institute, under contract to NMDP. # # This file is part of the Biopython distribution and governed by your # choice of the "Biopython License Agreement" or the "BSD 3-Clause License". # Please see the LICENSE file that should have been included as part of this # package. """Bio.AlignIO support for GCG MSF format. The file format was produced by the GCG PileUp and and LocalPileUp tools, and later tools such as T-COFFEE and MUSCLE support it as an optional output format. The original GCG toola would write gaps at ends of each sequence which could be missing data as tildes (``~``), whereas internal gaps were periods (``.``) instead. This parser replaces both with minus signs (``-``) for consistency with the rest of ``Bio.AlignIO``. You are expected to use this module via the Bio.AlignIO functions (or the Bio.SeqIO functions if you want to work directly with the gapped sequences). """ from __future__ import print_function from Bio.Seq import Seq from Bio.SeqRecord import SeqRecord from Bio.Align import MultipleSeqAlignment from .Interfaces import AlignmentIterator class MsfIterator(AlignmentIterator): """GCG MSF alignment iterator.""" _header = None # for caching lines between __next__ calls def __next__(self): """Parse the next alignment from the handle.""" handle = self.handle if self._header is None: line = handle.readline() else: # Header we saved from when we were parsing # the previous alignment. line = self._header self._header = None if not line: raise StopIteration # Whitelisted headers we know about. known_headers = ["!!NA_MULTIPLE_ALIGNMENT", "!!AA_MULTIPLE_ALIGNMENT", "PileUp"] # Examples in "Molecular Biology Software Training Manual GCG version 10" # by BBSRC Bioscuences IT Services (BITS), Harpenden, UK, Copyright 1996-2001 # would often start as follows: # # !!AA_MUTIPLE_ALIGNMENT 1.0 # PileUp of: @/usr/users2/culhane/... # # etc with other seemingly free format text before getting to the # MSF/Type/Check line and the following Name: lines block and // line. # # MUSCLE just has a line "PileUp", while other sources just use the line # "!!AA_MULTIPLE_ALIGNMENT" (amino acid) or "!!NA_MULTIPLE_ALIGNMENT" # (nucleotide). if line.strip().split()[0] not in known_headers: raise ValueError( "%s is not a known GCG MSF header: %s" % (line.strip().split()[0], ", ".join(known_headers)) ) while line and " MSF: " not in line: line = handle.readline() if not line: raise ValueError("Reached end of file without MSF/Type/Check header line") # Quoting from "Molecular Biology Software Training Manual GCG version 10" # by BBSRC Bioscuences IT Services (BITS), Harpenden, UK. Copyright 1996-2001. # Page 31: # # "Header information is before a .. (double dot) in a GCG format file. # The file will also have a checksum specific for that file." # # This was followed by a single non-aligned sequence, but this convention # appears to also be used in the GCG MSF files. Quoting other examples in # this reference, page 31: # # localpileup_17.msf MSF: 195 Type: P January 6, 2000 15:41 Check: 4365 .. # # Except from page 148: # # localpileup_106.msf MSF: 457 Type: P November 28, 2000 16:09 Check: 2396 .. # # Quoting output from MUSCLE v3.8, have two leading spaces and a zero checksum: # # MSF: 689 Type: N Check: 0000 .. # # By observation, the MSF value is the column count, type is N (nucleotide) # or P (protein / amino acid). # # In a possible bug, EMBOSS v6.6.0.0 uses CompCheck: rather than Check: as shown, # # $ seqret -sequence Tests/Fasta/f002 -auto -stdout -osformat msf # !!NA_MULTIPLE_ALIGNMENT 1.0 # # stdout MSF: 633 Type: N 01/08/19 CompCheck: 8543 .. # # Name: G26680 Len: 633 Check: 4334 Weight: 1.00 # Name: G26685 Len: 633 Check: 3818 Weight: 1.00 # Name: G29385 Len: 633 Check: 391 Weight: 1.00 # # // # parts = line.strip("\n").split() offset = parts.index("MSF:") if ( parts[offset + 2] != "Type:" or parts[-3] not in ("Check:", "CompCheck:") or parts[-1] != ".." ): raise ValueError( "GCG MSF header line should be " "' MSF: Type: Check: ..', " " not: %r" % line ) try: aln_length = int(parts[offset + 1]) except ValueError: aln_length = -1 if aln_length < 0: raise ValueError( "GCG MSF header line should have MDF: for column count, not %r" % parts[offset + 1] ) seq_type = parts[offset + 3] if seq_type not in ["P", "N"]: raise ValueError( "GCG MSF header line should have 'Type: P' (protein) " "or 'Type: N' (nucleotide), not 'Type: %s'" % seq_type ) # There should be a blank line after that header line, then the Name: lines # # In a possible bug, T-COFFEE v12.00 adds 'oo' after the names, as shown here, # # PileUp # # # # MSF: 628 Type: P Check: 147 .. # # Name: AK1H_ECOLI/1-378 oo Len: 628 Check: 3643 Weight: 1.000 # Name: AKH_HAEIN/1-382 oo Len: 628 Check: 6504 Weight: 1.000 # # // ids = [] lengths = [] checks = [] weights = [] line = handle.readline() while line and line.strip() != "//": line = handle.readline() if line.strip().startswith("Name: "): if " Len: " in line and " Check: " in line and " Weight: " in line: rest = line[line.index("Name: ") + 6 :].strip() name, rest = rest.split(" Len: ") length, rest = rest.split(" Check: ") check, weight = rest.split(" Weight: ") name = name.strip() if name.endswith(" oo"): # T-COFFEE oddity, ignore this name = name[:-3] if name in ids: raise ValueError("Duplicated ID of %r" % name) if " " in name: raise NotImplementedError("Space in ID %r" % name) ids.append(name) # Expect aln_length <= int(length.strip()), see below lengths.append(int(length.strip())) checks.append(int(check.strip())) weights.append(float(weight.strip())) else: raise ValueError("Malformed GCG MSF name line: %r" % line) if not line: raise ValueError("End of file while looking for end of header // line.") if aln_length != max(lengths): # In broken examples from IMGTHLA was possible to continue # https://github.com/ANHIG/IMGTHLA/issues/201 max_length = max(lengths) max_count = sum(1 for _ in lengths if _ == max_length) raise ValueError( "GCG MSF header said alignment length %i, but %s of %i sequences said Len: %s" % (aln_length, max_count, len(ids), max_length) ) line = handle.readline() if not line: raise ValueError("End of file after // line, expected sequences.") if line.strip(): raise ValueError("After // line, expected blank line before sequences.") # Now load the sequences seqs = [[] for _ in ids] # list of empty lists completed_length = 0 while completed_length < aln_length: # Note might have a coordinate header line (seems to be optional) for idx, name in enumerate(ids): line = handle.readline() if idx == 0 and not line.strip(): # T-COFFEE uses two blank lines between blocks, rather than one while line and not line.strip(): line = handle.readline() if not line: raise ValueError("End of file where expecting sequence data.") # print("Looking for seq for %s in line: %r" % (name, line)) words = line.strip().split() # Should we use column numbers, rather than assuming no spaces in names? if idx == 0 and words and words[0] != name: # print("Actually have a coord line") # Hopefully this is a coordinate header before the first seq try: i = int(words[0]) except ValueError: i = -1 if i != completed_length + 1: raise ValueError( "Expected GCG MSF coordinate line starting %i, got: %r" % (completed_length + 1, line) ) if len(words) > 1: # Final block usually not full 50 chars, so expect start only. if len(words) != 2: i = -1 else: try: i = int(words[1]) except ValueError: i = -1 if i != ( completed_length + 50 if completed_length + 50 < aln_length else aln_length ): raise ValueError( "Expected GCG MSF coordinate line %i to %i, got: %r" % ( completed_length + 1, completed_length + 50 if completed_length + 50 < aln_length else aln_length, line, ) ) line = handle.readline() words = line.strip().split() # print("Still looking for seq for %s in line: %r" % (name, line)) # Dealt with any coordinate header line, should now be sequence if not words: # Should be sequence here, but perhaps its a short one? if ( lengths[idx] < aln_length and len("".join(seqs[idx])) == lengths[idx] ): # Is this actually allowed in the format? Personally I would # expect a line with name and a block of trailing ~ here. pass else: raise ValueError( "Expected sequence for %s, got: %r" % (name, line) ) elif words[0] == name: assert len(words) > 1, line # print(i, name, repr(words)) seqs[idx].extend(words[1:]) else: raise ValueError("Expected sequence for %r, got: %r" % (name, line)) # TODO - check the sequence lengths thus far are consistent # with blocks of 50? completed_length += 50 line = handle.readline() if line.strip(): raise ValueError("Expected blank line, got: %r" % line) # Skip over any whitespace at the end... while True: line = handle.readline() if not line: # End of file, no more alignments break elif not line.strip(): # Blank line, ignore pass elif line.strip().split()[0] in known_headers: # Looks like the start of another alignment: self._header = line break else: raise ValueError("Unexpected line after GCG MSF alignment: %r" % line) # Combine list of strings into single string, remap gaps seqs = ["".join(s).replace("~", "-").replace(".", "-") for s in seqs] # Apply any trailing padding for short sequences padded = False for idx, (length, s) in enumerate(zip(lengths, seqs)): if len(s) < aln_length and len(s) == length: padded = True seqs[idx] = s + "-" * (aln_length - len(s)) if padded: import warnings from Bio import BiopythonParserWarning warnings.warn( "One of more alignment sequences were truncated and have been gap padded", BiopythonParserWarning, ) records = ( SeqRecord( Seq(s, self.alphabet), id=i, name=i, description=i, annotations={"weight": w}, ) for (i, s, w) in zip(ids, seqs, weights) ) # This will check alignment lengths are self-consistent: align = MultipleSeqAlignment(records, self.alphabet) # Check matches the header: if align.get_alignment_length() != aln_length: raise ValueError( "GCG MSF headers said alignment length %i, but have %i" % (aln_length, align.get_alignment_length()) ) return align