# Copyright (C) 2011, 2016 by Brandon Invergo (b.invergo@gmail.com) # This code is part of the Biopython distribution and governed by its # license. Please see the LICENSE file that should have been included # as part of this package. import re line_floats_re = re.compile(r"-*\d+\.\d+") try: float("nan") _nan_float = float except ValueError: # Happens prior to Python 2.6 depending on C library, e.g. breaks on WinXP def _nan_float(text): try: return float(text) except ValueError: if text.lower() == "nan": import struct return struct.unpack('d', struct.pack('Q', 0xfff8000000000000))[0] else: raise def parse_basics(lines, results): """Parse the basic information that should be present in most codeml output files.""" # multi_models is used to designate there being results for more than one # model in the file multi_models = False multi_genes = False version_re = re.compile(r".+ \(in paml version (\d+\.\d+[a-z]*).*") model_re = re.compile(r"Model:\s+(.+)") num_genes_re = re.compile(r"\(([0-9]+) genes: separate data\)") # In codeml 4.1, the codon substitution model is headed by: # "Codon frequencies:" # In codeml 4.3+ it is headed by: # "Codon frequency model:" codon_freq_re = re.compile(r"Codon frequenc[a-z\s]{3,7}:\s+(.+)") siteclass_re = re.compile(r"Site-class models:\s*([^\s]*)") for line in lines: # Find all floating point numbers in this line line_floats_res = line_floats_re.findall(line) line_floats = [_nan_float(val) for val in line_floats_res] # Get the program version number version_res = version_re.match(line) if version_res is not None: results["version"] = version_res.group(1) continue # Find the model description at the beginning of the file. model_res = model_re.match(line) if model_res is not None: results["model"] = model_res.group(1) # Find out if more than one genes are analyzed num_genes_res = num_genes_re.search(line) if num_genes_res is not None: results["genes"] = [] num_genes = int(num_genes_res.group(1)) for n in range(num_genes): results["genes"].append({}) multi_genes = True continue # Get the codon substitution model codon_freq_res = codon_freq_re.match(line) if codon_freq_res is not None: results["codon model"] = codon_freq_res.group(1) continue # Find the site-class model name at the beginning of the file. # This exists only if a NSsites class other than 0 is used. # If there's no site class model listed, then there are multiple # models in the results file # Example match: "Site-class models: PositiveSelection" siteclass_res = siteclass_re.match(line) if siteclass_res is not None: siteclass_model = siteclass_res.group(1) if siteclass_model != "": results["site-class model"] = siteclass_model multi_models = False else: multi_models = True # Get the maximum log-likelihood if "ln Lmax" in line and line_floats: results["lnL max"] = line_floats[0] return (results, multi_models, multi_genes) def parse_nssites(lines, results, multi_models, multi_genes): """Determine which NSsites models are present and parse them.""" ns_sites = {} model_re = re.compile(r"Model (\d+):\s+(.+)") gene_re = re.compile(r"Gene\s+([0-9]+)\s+.+") siteclass_model = results.get("site-class model") if not multi_models: # If there's only one model in the results, find out # which one it is and then parse it. if siteclass_model is None: siteclass_model = "one-ratio" current_model = {"one-ratio": 0, "NearlyNeutral": 1, "PositiveSelection": 2, "discrete": 3, "beta": 7, "beta&w>1": 8, "M2a_rel": 22}[siteclass_model] if multi_genes: genes = results["genes"] current_gene = None gene_start = None model_results = None for line_num, line in enumerate(lines): gene_res = gene_re.match(line) if gene_res: if current_gene is not None: assert model_results is not None parse_model(lines[gene_start:line_num], model_results) genes[current_gene - 1] = model_results gene_start = line_num current_gene = int(gene_res.group(1)) model_results = {"description": siteclass_model} if len(genes[current_gene - 1]) == 0: model_results = parse_model(lines[gene_start:], model_results) genes[current_gene - 1] = model_results else: model_results = {"description": siteclass_model} model_results = parse_model(lines, model_results) ns_sites[current_model] = model_results else: # If there are multiple models in the results, scan through # the file and send each model's text to be parsed individually. current_model = None model_start = None for line_num, line in enumerate(lines): # Find model names. If this is found on a line, # all of the following lines until the next time this matches # contain results for Model X. # Example match: "Model 1: NearlyNeutral (2 categories)" model_res = model_re.match(line) if model_res: if current_model is not None: # We've already been tracking a model, so it's time # to send those lines off for parsing before beginning # a new one. parse_model(lines[model_start:line_num], model_results) ns_sites[current_model] = model_results model_start = line_num current_model = int(model_res.group(1)) model_results = {"description": model_res.group(2)} if ns_sites.get(current_model) is None: # When we reach the end of the file, we'll still have one more # model to parse. model_results = parse_model(lines[model_start:], model_results) ns_sites[current_model] = model_results # Only add the ns_sites dict to the results if we really have results. # Model M0 is added by default in some cases, so if it exists, make sure # it's not empty if len(ns_sites) == 1: m0 = ns_sites.get(0) if not m0 or len(m0) > 1: results["NSsites"] = ns_sites elif len(ns_sites) > 1: results["NSsites"] = ns_sites return results def parse_model(lines, results): """Parse an individual NSsites model's results.""" parameters = {} SEs_flag = False dS_tree_flag = False dN_tree_flag = False w_tree_flag = False num_params = None tree_re = re.compile(r"^\([\w #:',.()]*\);\s*$") branch_re = re.compile(r"\s+(\d+\.\.\d+)[\s+\d+\.\d+]+") model_params_re = re.compile(r"(?