# Copyright (C) 2009 by Eric Talevich (eric.talevich@gmail.com)
# Based on Bio.Nexus, copyright 2005-2008 by Frank Kauff & Cymon J. Cox.
# All rights reserved.
#
# This file is part of the Biopython distribution and governed by your
# choice of the "Biopython License Agreement" or the "BSD 3-Clause License".
# Please see the LICENSE file that should have been included as part of this
# package.

"""I/O function wrappers for the Newick file format.

See: http://evolution.genetics.washington.edu/phylip/newick_doc.html
"""

import re
from Bio._py3k import StringIO

from Bio.Phylo import Newick


class NewickError(Exception):
    """Exception raised when Newick object construction cannot continue."""

    pass


tokens = [
    (r"\(", "open parens"),
    (r"\)", "close parens"),
    (r"[^\s\(\)\[\]\'\:\;\,]+", "unquoted node label"),
    (r"\:\ ?[+-]?[0-9]*\.?[0-9]+([eE][+-]?[0-9]+)?", "edge length"),
    (r"\,", "comma"),
    (r"\[(\\.|[^\]])*\]", "comment"),
    (r"\'(\\.|[^\'])*\'", "quoted node label"),
    (r"\;", "semicolon"),
    (r"\n", "newline"),
]
tokenizer = re.compile("(%s)" % "|".join(token[0] for token in tokens))
token_dict = {name: re.compile(token) for token, name in tokens}


# ---------------------------------------------------------
# Public API


def parse(handle, **kwargs):
    """Iterate over the trees in a Newick file handle.

    :returns: generator of Bio.Phylo.Newick.Tree objects.

    """
    return Parser(handle).parse(**kwargs)


def write(trees, handle, plain=False, **kwargs):
    """Write a trees in Newick format to the given file handle.

    :returns: number of trees written.

    """
    return Writer(trees).write(handle, plain=plain, **kwargs)


# ---------------------------------------------------------
# Input


def _parse_confidence(text):
    if text.isdigit():
        return int(text)
        # NB: Could make this more consistent by treating as a percentage
        # return int(text) / 100.
    try:
        return float(text)
        # NB: This should be in [0.0, 1.0], but who knows what people will do
        # assert 0 <= current_clade.confidence <= 1
    except ValueError:
        return None


def _format_comment(text):
    return "[%s]" % (text.replace("[", "\\[").replace("]", "\\]"))


def _get_comment(clade):
    if hasattr(clade, "comment") and clade.comment:
        return _format_comment(str(clade.comment))
    else:
        return ""


class Parser(object):
    """Parse a Newick tree given a file handle.

    Based on the parser in ``Bio.Nexus.Trees``.
    """

    def __init__(self, handle):
        """Initialize file handle for the Newick Tree."""
        self.handle = handle

    @classmethod
    def from_string(cls, treetext):
        """Instantiate the Newick Tree class from the given string."""
        handle = StringIO(treetext)
        return cls(handle)

    def parse(
        self, values_are_confidence=False, comments_are_confidence=False, rooted=False
    ):
        """Parse the text stream this object was initialized with."""
        self.values_are_confidence = values_are_confidence
        self.comments_are_confidence = comments_are_confidence
        self.rooted = rooted
        buf = ""
        unicodeChecked = False
        unicodeLines = ("\xef", "\xff", "\xfe", "\x00")
        for line in self.handle:
            if not unicodeChecked:
                # check for unicode byte order marks on first line only,
                # these lead to parsing errors (on Python 2)
                if line.startswith(unicodeLines):
                    raise NewickError(
                        "The file or stream you attempted to parse includes "
                        "unicode byte order marks.  You must convert it to "
                        "ASCII before it can be parsed."
                    )
                unicodeChecked = True
            buf += line.rstrip()
            if buf.endswith(";"):
                yield self._parse_tree(buf)
                buf = ""
        if buf:
            # Last tree is missing a terminal ';' character -- that's OK
            yield self._parse_tree(buf)

    def _parse_tree(self, text):
        """Parse the text representation into an Tree object (PRIVATE)."""
        tokens = re.finditer(tokenizer, text.strip())

        new_clade = self.new_clade
        root_clade = new_clade()

        current_clade = root_clade
        entering_branch_length = False

        lp_count = 0
        rp_count = 0
        for match in tokens:
            token = match.group()

            if token.startswith("'"):
                # quoted label; add characters to clade name
                current_clade.name = token[1:-1]

            elif token.startswith("["):
                # comment
                current_clade.comment = token[1:-1]
                if self.comments_are_confidence:
                    # Try to use this comment as a numeric support value
                    current_clade.confidence = _parse_confidence(current_clade.comment)

            elif token == "(":
                # start a new clade, which is a child of the current clade
                current_clade = new_clade(current_clade)
                entering_branch_length = False
                lp_count += 1

            elif token == ",":
                # if the current clade is the root, then the external parentheses
                # are missing and a new root should be created
                if current_clade is root_clade:
                    root_clade = new_clade()
                    current_clade.parent = root_clade
                # start a new child clade at the same level as the current clade
                parent = self.process_clade(current_clade)
                current_clade = new_clade(parent)
                entering_branch_length = False

            elif token == ")":
                # done adding children for this parent clade
                parent = self.process_clade(current_clade)
                if not parent:
                    raise NewickError("Parenthesis mismatch.")
                current_clade = parent
                entering_branch_length = False
                rp_count += 1

            elif token == ";":
                break

            elif token.startswith(":"):
                # branch length or confidence
                value = float(token[1:])
                if self.values_are_confidence:
                    current_clade.confidence = value
                else:
                    current_clade.branch_length = value

            elif token == "\n":
                pass

            else:
                # unquoted node label
                current_clade.name = token

        if not lp_count == rp_count:
            raise NewickError("Number of open/close parentheses do not match.")

        # if ; token broke out of for loop, there should be no remaining tokens
        try:
            next_token = next(tokens)
            raise NewickError(
                "Text after semicolon in Newick tree: %s" % next_token.group()
            )
        except StopIteration:
            pass

        self.process_clade(current_clade)
        self.process_clade(root_clade)
        return Newick.Tree(root=root_clade, rooted=self.rooted)

    def new_clade(self, parent=None):
        """Return new Newick.Clade, optionally with temporary reference to parent."""
        clade = Newick.Clade()
        if parent:
            clade.parent = parent
        return clade

    def process_clade(self, clade):
        """Remove node's parent and return it. Final processing of parsed clade."""
        if (
            (clade.name)
            and not (self.values_are_confidence or self.comments_are_confidence)
            and (clade.confidence is None)
            and (clade.clades)
        ):
            clade.confidence = _parse_confidence(clade.name)
            if clade.confidence is not None:
                clade.name = None

        if hasattr(clade, "parent"):
            parent = clade.parent
            parent.clades.append(clade)
            del clade.parent
            return parent


# ---------------------------------------------------------
# Output


class Writer(object):
    """Based on the writer in Bio.Nexus.Trees (str, to_string)."""

    def __init__(self, trees):
        """Initialize parameter for Tree Writer object."""
        self.trees = trees

    def write(self, handle, **kwargs):
        """Write this instance's trees to a file handle."""
        count = 0
        for treestr in self.to_strings(**kwargs):
            handle.write(treestr + "\n")
            count += 1
        return count

    def to_strings(
        self,
        confidence_as_branch_length=False,
        branch_length_only=False,
        plain=False,
        plain_newick=True,
        ladderize=None,
        max_confidence=1.0,
        format_confidence="%1.2f",
        format_branch_length="%1.5f",
    ):
        """Return an iterable of PAUP-compatible tree lines."""
        # If there's a conflict in the arguments, we override plain=True
        if confidence_as_branch_length or branch_length_only:
            plain = False
        make_info_string = self._info_factory(
            plain,
            confidence_as_branch_length,
            branch_length_only,
            max_confidence,
            format_confidence,
            format_branch_length,
        )

        def newickize(clade):
            """Convert a node tree to a Newick tree string, recursively."""
            label = clade.name or ""
            if label:
                unquoted_label = re.match(token_dict["unquoted node label"], label)
                if (not unquoted_label) or (unquoted_label.end() < len(label)):
                    label = "'%s'" % label.replace("\\", "\\\\").replace("'", "\\'")

            if clade.is_terminal():  # terminal
                return label + make_info_string(clade, terminal=True)
            else:
                subtrees = (newickize(sub) for sub in clade)
                return "(%s)%s" % (",".join(subtrees), label + make_info_string(clade))

        # Convert each tree to a string
        for tree in self.trees:
            if ladderize in ("left", "LEFT", "right", "RIGHT"):
                # Nexus compatibility shim, kind of
                tree.ladderize(reverse=(ladderize in ("right", "RIGHT")))
            rawtree = newickize(tree.root) + ";"
            if plain_newick:
                yield rawtree
                continue
            # Nexus-style (?) notation before the raw Newick tree
            treeline = ["tree", (tree.name or "a_tree"), "="]
            if tree.weight != 1:
                treeline.append("[&W%s]" % round(float(tree.weight), 3))
            if tree.rooted:
                treeline.append("[&R]")
            treeline.append(rawtree)
            yield " ".join(treeline)

    def _info_factory(
        self,
        plain,
        confidence_as_branch_length,
        branch_length_only,
        max_confidence,
        format_confidence,
        format_branch_length,
    ):
        """Return a function that creates a nicely formatted node tag (PRIVATE)."""
        if plain:
            # Plain tree only. That's easy.
            def make_info_string(clade, terminal=False):
                return _get_comment(clade)

        elif confidence_as_branch_length:
            # Support as branchlengths (eg. PAUP), ignore actual branchlengths
            def make_info_string(clade, terminal=False):
                if terminal:
                    # terminal branches have 100% support
                    return (":" + format_confidence % max_confidence) + _get_comment(
                        clade
                    )
                else:
                    return (":" + format_confidence % clade.confidence) + _get_comment(
                        clade
                    )

        elif branch_length_only:
            # write only branchlengths, ignore support
            def make_info_string(clade, terminal=False):
                return (
                    ":" + format_branch_length % clade.branch_length
                ) + _get_comment(clade)

        else:
            # write support and branchlengths (e.g. .con tree of mrbayes)
            def make_info_string(clade, terminal=False):
                if (
                    terminal
                    or not hasattr(clade, "confidence")
                    or clade.confidence is None
                ):
                    return (":" + format_branch_length) % (
                        clade.branch_length or 0.0
                    ) + _get_comment(clade)
                else:
                    return (format_confidence + ":" + format_branch_length) % (
                        clade.confidence,
                        clade.branch_length or 0.0,
                    ) + _get_comment(clade)

        return make_info_string