# Copyright 2015 by Gert Hulselmans. All rights reserved. # This file is part of the Biopython distribution and governed by your # choice of the "Biopython License Agreement" or the "BSD 3-Clause License". # Please see the LICENSE file that should have been included as part of this # package. """Parse XMS motif files.""" from Bio import motifs from xml.dom import minidom, Node import re class XMSScanner: """Class for scanning XMS XML file.""" def __init__(self, doc): """Generate motif Record from xms document, an XML-like motif pfm file.""" self.record = Record() for child in doc.getElementsByTagName("motif"): if child.nodeType == Node.ELEMENT_NODE: self.handle_motif(child) def handle_motif(self, node): """Read the motif's name and column from the node and add the motif record.""" motif_name = self.get_text(node.getElementsByTagName("name")) nucleotide_counts = {"A": [], "C": [], "G": [], "T": []} for column in node.getElementsByTagName("column"): [nucleotide_counts[nucleotide].append(float(nucleotide_count)) for nucleotide, nucleotide_count in zip(["A", "C", "G", "T"], self.get_acgt(column))] motif = motifs.Motif(alphabet="GATC", counts=nucleotide_counts) motif.name = motif_name self.record.append(motif) def get_property_value(self, node, key_name): """Extract the value of the motif's property named key_name from node.""" for cur_property in node.getElementsByTagName("prop"): right_property = False cur_value = None for child in cur_property.childNodes: if child.nodeType != Node.ELEMENT_NODE: continue if child.tagName == "key" and self.get_text([child]) == key_name: right_property = True if child.tagName == "value": cur_value = self.get_text([child]) if right_property: return cur_value return None def get_acgt(self, node): """Get and return the motif's weights of A, C, G, T.""" a, c, g, t = 0.0, 0.0, 0.0, 0.0 for weight in node.getElementsByTagName("weight"): if weight.getAttribute("symbol") == "adenine": a = float(self.get_text([weight])) elif weight.getAttribute("symbol") == "cytosine": c = float(self.get_text([weight])) elif weight.getAttribute("symbol") == "guanine": g = float(self.get_text([weight])) elif weight.getAttribute("symbol") == "thymine": t = float(self.get_text([weight])) return a, c, g, t def get_text(self, nodelist): """Return a string representation of the motif's properties listed on nodelist .""" retlist = [] for node in nodelist: if node.nodeType == Node.TEXT_NODE: retlist.append(node.wholeText) elif node.hasChildNodes: retlist.append(self.get_text(node.childNodes)) return re.sub(r"\s+", " ", "".join(retlist)) class Record(list): """Class to store the information in a XMS matrix table. The record inherits from a list containing the individual motifs. """ def __str__(self): return "\n".join(str(motif) for motif in self) def read(handle): """Read motifs in XMS matrix format from a file handle. XMS is an XML format for describing regulatory motifs and PSSMs. This format was defined by Thomas Down, and used in the NestedMICA and MotifExplorer programs. """ xms_doc = minidom.parse(handle) record = XMSScanner(xms_doc).record return record