# Copyright 2018 by Ariel Aptekmann. # All rights reserved. # # This file is part of the Biopython distribution and governed by your # choice of the "Biopython License Agreement" or the "BSD 3-Clause License". # Please see the LICENSE file that should have been included as part of this # package. from __future__ import print_function from Bio.Alphabet import IUPAC from Bio import Seq from Bio import motifs import math def read(handle): """Parse the text output of the MEME program into a meme.Record object. Example: >>> from Bio.motifs import minimal >>> with open("meme.output.txt") as f: ... record = meme.read(f) ... >>> for motif in record: ... print(motif.name, motif.evalue) ... You can access individual motifs in the record by their index or find a motif by its name: Example: >>> from Bio import motifs >>> with open("test_minimal.meme") as f: ... record = motifs.parse(f, 'minimal') ... >>> motif = record[0] >>> print(motif.name) LEXA >>> motif = record['LEXA'] >>> print(motif.name) LEXA This function wont retrieve instances, as there are none in minimal meme format. """ motif_number = 0 record = Record() _read_version(record, handle) _read_alphabet(record, handle) _read_background(record, handle) while True: for line in handle: if line.startswith('MOTIF'): break else: return record name = line.split()[1] motif_number += 1 length, num_occurrences, evalue = _read_motif_statistics(line, handle) counts = _read_lpm(line, handle) # {'A': 0.25, 'C': 0.25, 'T': 0.25, 'G': 0.25} motif = motifs.Motif(alphabet=record.alphabet, counts=counts) motif.background = record.background motif.length = length motif.num_occurrences = num_occurrences motif.evalue = evalue motif.name = name record.append(motif) assert len(record) == motif_number return record class Record(list): """Class for holding the results of a minimal MEME run.""" def __init__(self): """Initialize record class values.""" self.version = "" self.datafile = "" self.command = "" self.alphabet = None self.background = {} self.sequences = [] def __getitem__(self, key): if isinstance(key, str): for motif in self: if motif.name == key: return motif else: return list.__getitem__(self, key) # Everything below is private def _read_background(record, handle): for line in handle: if line.startswith('Background letter frequencies'): break else: raise ValueError("Improper input file. File should contain a line starting background frequencies.") try: line = next(handle) except StopIteration: raise ValueError("Unexpected end of stream: Expected to find line starting background frequencies.") line = line.strip() ls = line.split() A, C, G, T = float(ls[1]), float(ls[3]), float(ls[5]), float(ls[7]) record.background = {'A': A, 'C': C, 'G': G, 'T': T} def _read_version(record, handle): for line in handle: if line.startswith('MEME version'): break else: raise ValueError("Improper input file. File should contain a line starting MEME version.") line = line.strip() ls = line.split() record.version = ls[2] def _read_alphabet(record, handle): for line in handle: if line.startswith('ALPHABET'): break else: raise ValueError("Unexpected end of stream: Expected to find line starting with 'ALPHABET'") if not line.startswith('ALPHABET= '): raise ValueError("Line does not start with 'ALPHABET':\n%s" % line) line = line.strip().replace('ALPHABET= ', '') if line == 'ACGT': al = IUPAC.unambiguous_dna else: al = IUPAC.protein record.alphabet = al def _read_lpm(line, handle): counts = [[], [], [], []] for line in handle: freqs = line.split() if len(freqs) != 4: break counts[0].append(int(float(freqs[0]) * 1000000)) counts[1].append(int(float(freqs[1]) * 1000000)) counts[2].append(int(float(freqs[2]) * 1000000)) counts[3].append(int(float(freqs[3]) * 1000000)) c = {} c['A'] = counts[0] c['C'] = counts[1] c['G'] = counts[2] c['T'] = counts[3] return c def _read_motif_statistics(line, handle): # minimal : # letter-probability matrix: alength= 4 w= 19 nsites= 17 E= 4.1e-009 for line in handle: if line.startswith('letter-probability matrix:'): break num_occurrences = int(line.split("nsites=")[1].split()[0]) length = int(line.split("w=")[1].split()[0]) evalue = float(line.split("E=")[1].split()[0]) return length, num_occurrences, evalue def _read_motif_name(handle): for line in handle: if 'sorted by position p-value' in line: break else: raise ValueError('Unexpected end of stream: Failed to find motif name') line = line.strip() words = line.split() name = " ".join(words[0:2]) return name