# Copyright 2019 by Michiel de Hoon.  All rights reserved.
# Based on code contributed and copyright 2016 by Peter Cock.
#
# This file is part of the Biopython distribution and governed by your
# choice of the "Biopython License Agreement" or the "BSD 3-Clause License".
# Please see the LICENSE file that should have been included as part of this
# package.
"""Bio.SeqIO support for the UCSC nib file format.

Nib stands for nibble (4 bit) representation of nucleotide sequences.
The two nibbles in a byte each store one nucleotide, represented numerically
as follows:

    - ``0`` - T
    - ``1`` - C
    - ``2`` - A
    - ``3`` - G
    - ``4`` - N (unknown)

As the first bit in a nibble is set if the nucleotide is soft-masked, we
additionally have:

    - ``8`` - t
    - ``9`` - c
    - ``a`` - a
    - ``b`` - g
    - ``c`` - n (unknown)

A nib file contains only one sequence record.
You are expected to use this module via the Bio.SeqIO functions under
the format name "nib":

    >>> from Bio import SeqIO
    >>> record = SeqIO.read("Nib/test_even_bigendian.nib", "nib")
    >>> print("%i %s..." % (len(record), record.seq[:20]))
    50 nAGAAGagccgcNGgCActt...

For detailed information on the file format, please see the UCSC
description at https://genome.ucsc.edu/FAQ/FAQformat.html.
"""

from __future__ import print_function

from Bio.File import as_handle
from Bio.SeqIO.Interfaces import SequenceWriter
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
import struct
import sys

try:
    hex2bytes = bytes.fromhex  # python3
except AttributeError:
    # python 2
    hex2bytes = lambda s: s.decode("hex")  # noqa: E731

if sys.version_info < (3,):
    # python2
    import binascii

    bytes2hex = binascii.hexlify
elif sys.version_info < (3, 5):
    # python 3.4
    import binascii

    bytes2hex = lambda b: binascii.hexlify(b).decode("ascii")  # noqa: E731
else:
    # python 3.5 and later
    bytes2hex = lambda b: b.hex()  # noqa: E731

try:
    int.from_bytes  # python3
except AttributeError:

    def byte2int(b, byteorder):
        """Convert byte array to integer."""
        if byteorder == "little":
            return struct.unpack("<i", b)[0]
        elif byteorder == "big":
            return struct.unpack(">i", b)[0]


else:
    # python 3
    byte2int = lambda b, byteorder: int.from_bytes(b, byteorder)  # noqa: E731


try:
    maketrans = str.maketrans  # python3
except AttributeError:
    import string

    maketrans = string.maketrans


# This is a generator function!
def NibIterator(handle, alphabet=None):
    """Iterate over a nib file and yield a SeqRecord.

        - handle - input file in the nib file format as defined by UCSC.
          This must be opened in binary mode!
        - alphabet - always ignored.

    Note that a nib file always contains only one sequence record.
    The sequence of the resulting SeqRecord object should match the sequence
    generated by Jim Kent's nibFrag utility run with the -masked option.

    This function is used internally via the Bio.SeqIO functions:

    >>> from Bio import SeqIO
    >>> record = SeqIO.read("Nib/test_even_bigendian.nib", "nib")
    >>> print("%s %i" % (record.seq, len(record)))
    nAGAAGagccgcNGgCActtGAnTAtCGTCgcCacCaGncGncTtGNtGG 50

    You can also call it directly:

    >>> with open("Nib/test_even_bigendian.nib", "rb") as handle:
    ...     for record in NibIterator(handle):
    ...         print("%s %i" % (record.seq, len(record)))
    ...
    nAGAAGagccgcNGgCActtGAnTAtCGTCgcCacCaGncGncTtGNtGG 50

    """
    if alphabet is not None:
        raise ValueError("Alphabets are ignored.")
    with as_handle(handle, "rb") as handle:
        word = handle.read(4)

        # check if file is empty
        if not word:
            raise ValueError("Empty file.")

        signature = bytes2hex(word)
        if signature == "3a3de96b":
            byteorder = "little"  # little-endian
        elif signature == "6be93d3a":
            byteorder = "big"  # big-endian
        else:
            raise ValueError("unexpected signature in Nib header")
        number = handle.read(4)
        length = byte2int(number, byteorder)
        data = handle.read()
        indices = bytes2hex(data)
        if length % 2 == 0:
            if len(indices) != length:
                raise ValueError("Unexpected file size")
        elif length % 2 == 1:
            if len(indices) != length + 1:
                raise ValueError("Unexpected file size")
            indices = indices[:length]
        if not set(indices).issubset("0123489abc"):
            raise ValueError("Unexpected sequence data found in file")
        table = maketrans("0123489abc", "TCAGNtcagn")
        nucleotides = indices.translate(table)
        sequence = Seq(nucleotides)
        record = SeqRecord(sequence)
        yield record


class NibWriter(SequenceWriter):
    """Nib file writer."""

    def __init__(self, handle):
        """Initialize an Nib writer object.

        Arguments:
         - handle - Output handle, in binary write mode.
        """
        self.handle = handle
        byteorder = sys.byteorder
        if byteorder == "little":  # little-endian
            signature = "3a3de96b"
        elif byteorder == "big":  # big-endian
            signature = "6be93d3a"
        else:
            raise RuntimeError("unexpected system byte order %s" % byteorder)
        handle.write(hex2bytes(signature))

    def write_file(self, records):
        """Use this to write an entire file containing the given record."""
        count = 0
        for record in records:
            count += 1
        if count == 0:
            raise ValueError("Must have one sequence")
        if count > 1:
            raise ValueError("More than one sequence found")
        handle = self.handle
        sequence = record.seq
        nucleotides = str(sequence)
        length = len(sequence)
        handle.write(struct.pack("i", length))
        table = maketrans("TCAGNtcagn", "0123489abc")
        padding = length % 2
        suffix = padding * "T"
        nucleotides += suffix
        if not set(nucleotides).issubset("ACGTNacgtn"):
            raise ValueError("Sequence should contain A,C,G,T,N,a,c,g,t,n only")
        indices = nucleotides.translate(table)
        handle.write(hex2bytes(indices))
        return count


if __name__ == "__main__":
    from Bio._utils import run_doctest

    run_doctest(verbose=0)