# Copyright 2019 by Michiel de Hoon. All rights reserved. # Based on code contributed and copyright 2016 by Peter Cock. # # This file is part of the Biopython distribution and governed by your # choice of the "Biopython License Agreement" or the "BSD 3-Clause License". # Please see the LICENSE file that should have been included as part of this # package. """Bio.SeqIO support for the UCSC nib file format. Nib stands for nibble (4 bit) representation of nucleotide sequences. The two nibbles in a byte each store one nucleotide, represented numerically as follows: - ``0`` - T - ``1`` - C - ``2`` - A - ``3`` - G - ``4`` - N (unknown) As the first bit in a nibble is set if the nucleotide is soft-masked, we additionally have: - ``8`` - t - ``9`` - c - ``a`` - a - ``b`` - g - ``c`` - n (unknown) A nib file contains only one sequence record. You are expected to use this module via the Bio.SeqIO functions under the format name "nib": >>> from Bio import SeqIO >>> record = SeqIO.read("Nib/test_even_bigendian.nib", "nib") >>> print("%i %s..." % (len(record), record.seq[:20])) 50 nAGAAGagccgcNGgCActt... For detailed information on the file format, please see the UCSC description at https://genome.ucsc.edu/FAQ/FAQformat.html. """ from __future__ import print_function from Bio.File import as_handle from Bio.SeqIO.Interfaces import SequenceWriter from Bio.Seq import Seq from Bio.SeqRecord import SeqRecord import struct import sys try: hex2bytes = bytes.fromhex # python3 except AttributeError: # python 2 hex2bytes = lambda s: s.decode("hex") # noqa: E731 if sys.version_info < (3,): # python2 import binascii bytes2hex = binascii.hexlify elif sys.version_info < (3, 5): # python 3.4 import binascii bytes2hex = lambda b: binascii.hexlify(b).decode("ascii") # noqa: E731 else: # python 3.5 and later bytes2hex = lambda b: b.hex() # noqa: E731 try: int.from_bytes # python3 except AttributeError: def byte2int(b, byteorder): """Convert byte array to integer.""" if byteorder == "little": return struct.unpack("i", b)[0] else: # python 3 byte2int = lambda b, byteorder: int.from_bytes(b, byteorder) # noqa: E731 try: maketrans = str.maketrans # python3 except AttributeError: import string maketrans = string.maketrans # This is a generator function! def NibIterator(handle, alphabet=None): """Iterate over a nib file and yield a SeqRecord. - handle - input file in the nib file format as defined by UCSC. This must be opened in binary mode! - alphabet - always ignored. Note that a nib file always contains only one sequence record. The sequence of the resulting SeqRecord object should match the sequence generated by Jim Kent's nibFrag utility run with the -masked option. This function is used internally via the Bio.SeqIO functions: >>> from Bio import SeqIO >>> record = SeqIO.read("Nib/test_even_bigendian.nib", "nib") >>> print("%s %i" % (record.seq, len(record))) nAGAAGagccgcNGgCActtGAnTAtCGTCgcCacCaGncGncTtGNtGG 50 You can also call it directly: >>> with open("Nib/test_even_bigendian.nib", "rb") as handle: ... for record in NibIterator(handle): ... print("%s %i" % (record.seq, len(record))) ... nAGAAGagccgcNGgCActtGAnTAtCGTCgcCacCaGncGncTtGNtGG 50 """ if alphabet is not None: raise ValueError("Alphabets are ignored.") with as_handle(handle, "rb") as handle: word = handle.read(4) # check if file is empty if not word: raise ValueError("Empty file.") signature = bytes2hex(word) if signature == "3a3de96b": byteorder = "little" # little-endian elif signature == "6be93d3a": byteorder = "big" # big-endian else: raise ValueError("unexpected signature in Nib header") number = handle.read(4) length = byte2int(number, byteorder) data = handle.read() indices = bytes2hex(data) if length % 2 == 0: if len(indices) != length: raise ValueError("Unexpected file size") elif length % 2 == 1: if len(indices) != length + 1: raise ValueError("Unexpected file size") indices = indices[:length] if not set(indices).issubset("0123489abc"): raise ValueError("Unexpected sequence data found in file") table = maketrans("0123489abc", "TCAGNtcagn") nucleotides = indices.translate(table) sequence = Seq(nucleotides) record = SeqRecord(sequence) yield record class NibWriter(SequenceWriter): """Nib file writer.""" def __init__(self, handle): """Initialize an Nib writer object. Arguments: - handle - Output handle, in binary write mode. """ self.handle = handle byteorder = sys.byteorder if byteorder == "little": # little-endian signature = "3a3de96b" elif byteorder == "big": # big-endian signature = "6be93d3a" else: raise RuntimeError("unexpected system byte order %s" % byteorder) handle.write(hex2bytes(signature)) def write_file(self, records): """Use this to write an entire file containing the given record.""" count = 0 for record in records: count += 1 if count == 0: raise ValueError("Must have one sequence") if count > 1: raise ValueError("More than one sequence found") handle = self.handle sequence = record.seq nucleotides = str(sequence) length = len(sequence) handle.write(struct.pack("i", length)) table = maketrans("TCAGNtcagn", "0123489abc") padding = length % 2 suffix = padding * "T" nucleotides += suffix if not set(nucleotides).issubset("ACGTNacgtn"): raise ValueError("Sequence should contain A,C,G,T,N,a,c,g,t,n only") indices = nucleotides.translate(table) handle.write(hex2bytes(indices)) return count if __name__ == "__main__": from Bio._utils import run_doctest run_doctest(verbose=0)