# Copyright 2009 by Peter Cock. All rights reserved. # This code is part of the Biopython distribution and governed by its # license. Please see the LICENSE file that should have been included # as part of this package. """Unittests for the Seq objects.""" from __future__ import print_function import warnings import unittest import sys from Bio import BiopythonWarning from Bio.Alphabet import generic_protein, generic_nucleotide from Bio.Alphabet import generic_dna, generic_rna from Bio.Alphabet import _check_type_compatible from Bio.Alphabet.IUPAC import protein, extended_protein from Bio.Alphabet.IUPAC import unambiguous_dna, ambiguous_dna, ambiguous_rna from Bio.Data.IUPACData import ambiguous_dna_values, ambiguous_rna_values from Bio.Seq import Seq, UnknownSeq, MutableSeq, translate from Bio.Data.CodonTable import TranslationError, CodonTable if sys.version_info[0] < 3: from string import maketrans else: maketrans = str.maketrans # This is just the standard table with less stop codons # (replaced with coding for O as an artifical example) special_table = CodonTable(forward_table={ 'TTT': 'F', 'TTC': 'F', 'TTA': 'L', 'TTG': 'L', 'TCT': 'S', 'TCC': 'S', 'TCA': 'S', 'TCG': 'S', 'TAT': 'Y', 'TAC': 'Y', 'TAA': 'O', 'TGT': 'C', 'TGC': 'C', 'TGA': 'O', 'TGG': 'W', 'CTT': 'L', 'CTC': 'L', 'CTA': 'L', 'CTG': 'L', 'CCT': 'P', 'CCC': 'P', 'CCA': 'P', 'CCG': 'P', 'CAT': 'H', 'CAC': 'H', 'CAA': 'Q', 'CAG': 'Q', 'CGT': 'R', 'CGC': 'R', 'CGA': 'R', 'CGG': 'R', 'ATT': 'I', 'ATC': 'I', 'ATA': 'I', 'ATG': 'M', 'ACT': 'T', 'ACC': 'T', 'ACA': 'T', 'ACG': 'T', 'AAT': 'N', 'AAC': 'N', 'AAA': 'K', 'AAG': 'K', 'AGT': 'S', 'AGC': 'S', 'AGA': 'R', 'AGG': 'R', 'GTT': 'V', 'GTC': 'V', 'GTA': 'V', 'GTG': 'V', 'GCT': 'A', 'GCC': 'A', 'GCA': 'A', 'GCG': 'A', 'GAT': 'D', 'GAC': 'D', 'GAA': 'E', 'GAG': 'E', 'GGT': 'G', 'GGC': 'G', 'GGA': 'G', 'GGG': 'G'}, start_codons=['TAA', 'TAG', 'TGA'], stop_codons=['TAG']) Chilodonella_uncinata_table = CodonTable(forward_table={ 'TTT': 'F', 'TTC': 'F', 'TTA': 'L', 'TTG': 'L', 'TCT': 'S', 'TCC': 'S', 'TCA': 'S', 'TCG': 'S', 'TAT': 'Y', 'TAC': 'Y', 'TAG': 'Q', 'TGT': 'C', 'TGC': 'C', 'TGA': 'W', 'TGG': 'W', 'CTT': 'L', 'CTC': 'L', 'CTA': 'L', 'CTG': 'L', 'CCT': 'P', 'CCC': 'P', 'CCA': 'P', 'CCG': 'P', 'CAT': 'H', 'CAC': 'H', 'CAA': 'Q', 'CAG': 'Q', 'CGT': 'R', 'CGC': 'R', 'CGA': 'R', 'CGG': 'R', 'ATT': 'I', 'ATC': 'I', 'ATA': 'I', 'ATG': 'M', 'ACT': 'T', 'ACC': 'T', 'ACA': 'T', 'ACG': 'T', 'AAT': 'N', 'AAC': 'N', 'AAA': 'K', 'AAG': 'K', 'AGT': 'S', 'AGC': 'S', 'AGA': 'R', 'AGG': 'R', 'GTT': 'V', 'GTC': 'V', 'GTA': 'V', 'GTG': 'V', 'GCT': 'A', 'GCC': 'A', 'GCA': 'A', 'GCG': 'A', 'GAT': 'D', 'GAC': 'D', 'GAA': 'E', 'GAG': 'E', 'GGT': 'G', 'GGC': 'G', 'GGA': 'G', 'GGG': 'G'}, start_codons=['ATG'], stop_codons=['TAA']) class StringMethodTests(unittest.TestCase): _examples = [ # These are length 9, a multiple of 3 for translation tests: Seq("ACGTGGGGT", generic_protein), Seq("ACGTGGGGT", generic_nucleotide), Seq("ACGTGGGGT", generic_dna), Seq("ACGUGGGGU", generic_rna), Seq("GG", generic_protein), Seq("GG", generic_nucleotide), Seq("GG", generic_dna), Seq("GG", generic_rna), Seq("A", generic_protein), Seq("A", generic_nucleotide), Seq("A", generic_dna), Seq("A", generic_rna), UnknownSeq(1), UnknownSeq(1, character="n"), UnknownSeq(1, generic_rna), UnknownSeq(1, generic_rna, "n"), UnknownSeq(1, generic_rna, "N"), UnknownSeq(12, generic_rna, "N"), UnknownSeq(12, generic_dna, "N"), UnknownSeq(12, generic_nucleotide, "N"), UnknownSeq(12, generic_protein, "X"), UnknownSeq(12, character="X"), UnknownSeq(12), ] for seq in _examples[:]: if isinstance(seq, Seq): _examples.append(seq.tomutable()) _start_end_values = [0, 1, 2, 1000, -1, -2, -999] def _test_method(self, method_name, pre_comp_function=None, start_end=False): """Check this method matches the plain string's method.""" self.assertTrue(isinstance(method_name, str)) for example1 in self._examples: if not hasattr(example1, method_name): # e.g. MutableSeq does not support find continue str1 = str(example1) for example2 in self._examples: if not hasattr(example2, method_name): # e.g. MutableSeq does not support find continue str2 = str(example2) i = getattr(example1, method_name)(str2) j = getattr(str1, method_name)(str2) if pre_comp_function: i = pre_comp_function(i) j = pre_comp_function(j) if i != j: raise ValueError("%s.%s(%s) = %i, not %i" % (repr(example1), method_name, repr(str2), i, j)) try: i = getattr(example1, method_name)(example2) j = getattr(str1, method_name)(str2) if pre_comp_function: i = pre_comp_function(i) j = pre_comp_function(j) if i != j: raise ValueError("%s.%s(%s) = %i, not %i" % (repr(example1), method_name, repr(example2), i, j)) except TypeError: # TODO - Check the alphabets do clash! pass if start_end: for start in self._start_end_values: i = getattr(example1, method_name)(str2, start) j = getattr(str1, method_name)(str2, start) if pre_comp_function: i = pre_comp_function(i) j = pre_comp_function(j) if i != j: raise ValueError("%s.%s(%s, %i) = %i, not %i" % (repr(example1), method_name, repr(str2), start, i, j)) for end in self._start_end_values: i = getattr(example1, method_name)(str2, start, end) j = getattr(str1, method_name)(str2, start, end) if pre_comp_function: i = pre_comp_function(i) j = pre_comp_function(j) if i != j: raise ValueError("%s.%s(%s, %i, %i) = %i, not %i" % (repr(example1), method_name, repr(str2), start, end, i, j)) def test_str_count(self): """Check matches the python string count method.""" self._test_method("count", start_end=True) def test_str_find(self): """Check matches the python string find method.""" self._test_method("find", start_end=True) def test_str_rfind(self): """Check matches the python string rfind method.""" self._test_method("rfind", start_end=True) def test_str_startswith(self): """Check matches the python string startswith method.""" self._test_method("startswith", start_end=True) self.assertTrue("ABCDE".startswith(("ABE", "OBE", "ABC"))) # Now check with a tuple of sub sequences for example1 in self._examples: if not hasattr(example1, "startswith"): # e.g. MutableSeq does not support this continue subs = tuple([example1[start:start + 2] for start in range(0, len(example1) - 2, 3)]) subs_str = tuple([str(s) for s in subs]) self.assertEqual(str(example1).startswith(subs_str), example1.startswith(subs)) self.assertEqual(str(example1).startswith(subs_str), example1.startswith(subs_str)) # strings! self.assertEqual(str(example1).startswith(subs_str, 3), example1.startswith(subs, 3)) self.assertEqual(str(example1).startswith(subs_str, 2, 6), example1.startswith(subs, 2, 6)) def test_str_endswith(self): """Check matches the python string endswith method.""" self._test_method("endswith", start_end=True) self.assertTrue("ABCDE".endswith(("ABE", "OBE", "CDE"))) # Now check with a tuple of sub sequences for example1 in self._examples: if not hasattr(example1, "endswith"): # e.g. MutableSeq does not support this continue subs = tuple([example1[start:start + 2] for start in range(0, len(example1) - 2, 3)]) subs_str = tuple([str(s) for s in subs]) self.assertEqual(str(example1).endswith(subs_str), example1.endswith(subs)) self.assertEqual(str(example1).startswith(subs_str), example1.startswith(subs_str)) # strings! self.assertEqual(str(example1).endswith(subs_str, 3), example1.endswith(subs, 3)) self.assertEqual(str(example1).endswith(subs_str, 2, 6), example1.endswith(subs, 2, 6)) def test_str_strip(self): """Check matches the python string strip method.""" self._test_method("strip", pre_comp_function=str) def test_str_rstrip(self): """Check matches the python string rstrip method.""" self._test_method("rstrip", pre_comp_function=str) def test_str_split(self): """Check matches the python string rstrip method.""" # Calling (r)split should return a list of Seq-like objects, we'll # just apply str() to each of them so it matches the string method self._test_method("rstrip", pre_comp_function=lambda x: [str(y) for y in x]) def test_str_rsplit(self): """Check matches the python string rstrip method.""" # Calling (r)split should return a list of Seq-like objects, we'll # just apply str() to each of them so it matches the string method self._test_method("rstrip", pre_comp_function=lambda x: [str(y) for y in x]) def test_str_lsplit(self): """Check matches the python string rstrip method.""" # Calling (r)split should return a list of Seq-like objects, we'll # just apply str() to each of them so it matches the string method self._test_method("rstrip", pre_comp_function=lambda x: [str(y) for y in x]) def test_str_length(self): """Check matches the python string __len__ method.""" for example1 in self._examples: str1 = str(example1) self.assertEqual(len(example1), len(str1)) def test_str_upper(self): """Check matches the python string upper method.""" for example1 in self._examples: if isinstance(example1, MutableSeq): continue str1 = str(example1) self.assertEqual(str(example1.upper()), str1.upper()) def test_str_lower(self): """Check matches the python string lower method.""" for example1 in self._examples: if isinstance(example1, MutableSeq): continue str1 = str(example1) self.assertEqual(str(example1.lower()), str1.lower()) def test_str_hash(self): for example1 in self._examples: if isinstance(example1, MutableSeq): continue with warnings.catch_warnings(): # Silence change in behaviour warning warnings.simplefilter('ignore', BiopythonWarning) self.assertEqual(hash(str(example1)), hash(example1), "Hash mismatch, %r for %r vs %r for %r" % (hash(str(example1)), id(example1), hash(example1), example1)) def test_str_comparison(self): for example1 in self._examples: for example2 in self._examples: with warnings.catch_warnings(): # Silence alphabet warning warnings.simplefilter('ignore', BiopythonWarning) self.assertEqual(str(example1) == str(example2), example1 == example2, "Checking %r == %r" % (example1, example2)) self.assertEqual(str(example1) != str(example2), example1 != example2, "Checking %r != %r" % (example1, example2)) self.assertEqual(str(example1) < str(example2), example1 < example2, "Checking %r < %r" % (example1, example2)) self.assertEqual(str(example1) <= str(example2), example1 <= example2, "Checking %r <= %r" % (example1, example2)) self.assertEqual(str(example1) > str(example2), example1 > example2, "Checking %r > %r" % (example1, example2)) self.assertEqual(str(example1) >= str(example2), example1 >= example2, "Checking %r >= %r" % (example1, example2)) def test_str_getitem(self): """Check slicing and indexing works like a string.""" for example1 in self._examples: str1 = str(example1) for i in self._start_end_values: if abs(i) < len(example1): self.assertEqual(str(example1[i]), str1[i]) self.assertEqual(str(example1[:i]), str1[:i]) self.assertEqual(str(example1[i:]), str1[i:]) for j in self._start_end_values: self.assertEqual(str(example1[i:j]), str1[i:j]) for step in range(-3, 4): if step == 0: try: print(example1[i:j:step]) self._assert(False) # Should fail! except ValueError: pass else: self.assertEqual(str(example1[i:j:step]), str1[i:j:step]) def test_tomutable(self): """Check obj.tomutable() method.""" for example1 in self._examples: if isinstance(example1, MutableSeq): continue mut = example1.tomutable() self.assertTrue(isinstance(mut, MutableSeq)) self.assertEqual(str(mut), str(example1)) self.assertEqual(mut.alphabet, example1.alphabet) def test_toseq(self): """Check obj.toseq() method.""" for example1 in self._examples: try: seq = example1.toseq() except AttributeError: self.assertTrue(isinstance(example1, Seq)) continue self.assertTrue(isinstance(seq, Seq)) self.assertEqual(str(seq), str(example1)) self.assertEqual(seq.alphabet, example1.alphabet) def test_the_complement(self): """Check obj.complement() method.""" mapping = "" for example1 in self._examples: if isinstance(example1, MutableSeq): continue try: comp = example1.complement() except ValueError as e: self.assertEqual(str(e), "Proteins do not have complements!") continue str1 = str(example1) # This only does the unambiguous cases if any(("U" in str1, "u" in str1, example1.alphabet == generic_rna)): mapping = maketrans("ACGUacgu", "UGCAugca") elif any(("T" in str1, "t" in str1, example1.alphabet == generic_dna, example1.alphabet == generic_nucleotide)): mapping = maketrans("ACGTacgt", "TGCAtgca") elif "A" not in str1 and "a" not in str1: mapping = maketrans("CGcg", "GCgc") else: # TODO - look at alphabet? raise ValueError(example1) self.assertEqual(str1.translate(mapping), str(comp)) self.assertEqual(comp.alphabet, example1.alphabet) def test_the_reverse_complement(self): """Check obj.reverse_complement() method.""" mapping = "" for example1 in self._examples: if isinstance(example1, MutableSeq): continue try: comp = example1.reverse_complement() except ValueError as e: self.assertEqual(str(e), "Proteins do not have complements!") continue str1 = str(example1) # This only does the unambiguous cases if any(("U" in str1, "u" in str1, example1.alphabet == generic_rna)): mapping = maketrans("ACGUacgu", "UGCAugca") elif any(("T" in str1, "t" in str1, example1.alphabet == generic_dna, example1.alphabet == generic_nucleotide)): mapping = maketrans("ACGTacgt", "TGCAtgca") elif "A" not in str1 and "a" not in str1: mapping = maketrans("CGcg", "GCgc") else: # TODO - look at alphabet? continue self.assertEqual(str1.translate(mapping)[::-1], str(comp)) self.assertEqual(comp.alphabet, example1.alphabet) def test_the_transcription(self): """Check obj.transcribe() method.""" mapping = "" for example1 in self._examples: if isinstance(example1, MutableSeq): continue try: tran = example1.transcribe() except ValueError as e: if str(e) == "Proteins cannot be transcribed!": continue if str(e) == "RNA cannot be transcribed!": continue raise e str1 = str(example1) if len(str1) % 3 != 0: # TODO - Check for or silence the expected warning? continue self.assertEqual(str1.replace("T", "U").replace("t", "u"), str(tran)) self.assertEqual(tran.alphabet, generic_rna) # based on limited examples def test_the_back_transcription(self): """Check obj.back_transcribe() method.""" mapping = "" for example1 in self._examples: if isinstance(example1, MutableSeq): continue try: tran = example1.back_transcribe() except ValueError as e: if str(e) == "Proteins cannot be back transcribed!": continue if str(e) == "DNA cannot be back transcribed!": continue raise e str1 = str(example1) self.assertEqual(str1.replace("U", "T").replace("u", "t"), str(tran)) self.assertEqual(tran.alphabet, generic_dna) # based on limited examples def test_the_translate(self): """Check obj.translate() method.""" mapping = "" for example1 in self._examples: if isinstance(example1, MutableSeq): continue if len(example1) % 3 != 0: # TODO - Check for or silence the expected warning? continue try: tran = example1.translate() except ValueError as e: if str(e) == "Proteins cannot be translated!": continue raise e # This is based on the limited example not having stop codons: if tran.alphabet not in [extended_protein, protein, generic_protein]: print(tran.alphabet) self.fail() # TODO - check the actual translation, and all the optional args def test_the_translation_of_stops(self): """Check obj.translate() method with stop codons.""" misc_stops = "TAATAGTGAAGAAGG" for nuc in [Seq(misc_stops), Seq(misc_stops, generic_nucleotide), Seq(misc_stops, generic_dna), Seq(misc_stops, unambiguous_dna)]: self.assertEqual("***RR", str(nuc.translate())) self.assertEqual("***RR", str(nuc.translate(1))) self.assertEqual("***RR", str(nuc.translate("SGC0"))) self.assertEqual("**W**", str(nuc.translate(table=2))) self.assertEqual("**WRR", str(nuc.translate(table='Yeast Mitochondrial'))) self.assertEqual("**WSS", str(nuc.translate(table=5))) self.assertEqual("**WSS", str(nuc.translate(table=9))) self.assertEqual("**CRR", str(nuc.translate(table='Euplotid Nuclear'))) self.assertEqual("***RR", str(nuc.translate(table=11))) self.assertEqual("***RR", str(nuc.translate(table='11'))) self.assertEqual("***RR", str(nuc.translate(table='Bacterial'))) self.assertEqual("**GRR", str(nuc.translate(table=25))) self.assertEqual("", str(nuc.translate(to_stop=True))) self.assertEqual("O*ORR", str(nuc.translate(table=special_table))) self.assertEqual("*QWRR", str(nuc.translate(table=Chilodonella_uncinata_table))) # These test the Bio.Seq.translate() function - move these?: self.assertEqual("*QWRR", translate(str(nuc), table=Chilodonella_uncinata_table)) self.assertEqual("O*ORR", translate(str(nuc), table=special_table)) self.assertEqual("", translate(str(nuc), to_stop=True)) self.assertEqual("***RR", translate(str(nuc), table='Bacterial')) self.assertEqual("***RR", translate(str(nuc), table='11')) self.assertEqual("***RR", translate(str(nuc), table=11)) self.assertEqual("**W**", translate(str(nuc), table=2)) self.assertEqual(str(Seq("TAT").translate()), "Y") self.assertEqual(str(Seq("TAR").translate()), "*") self.assertEqual(str(Seq("TAN").translate()), "X") self.assertEqual(str(Seq("NNN").translate()), "X") self.assertEqual(str(Seq("TAt").translate()), "Y") self.assertEqual(str(Seq("TaR").translate()), "*") self.assertEqual(str(Seq("TaN").translate()), "X") self.assertEqual(str(Seq("nnN").translate()), "X") self.assertEqual(str(Seq("tat").translate()), "Y") self.assertEqual(str(Seq("tar").translate()), "*") self.assertEqual(str(Seq("tan").translate()), "X") self.assertEqual(str(Seq("nnn").translate()), "X") def test_the_translation_of_invalid_codons(self): """Check obj.translate() method with invalid codons.""" for codon in ["TA?", "N-N", "AC_", "Ac_"]: for nuc in [Seq(codon), Seq(codon, generic_nucleotide), Seq(codon, generic_dna), Seq(codon, unambiguous_dna)]: try: print(nuc.translate()) self.fail("Translating %s should fail" % codon) except TranslationError: pass def test_the_translation_of_ambig_codons(self): """Check obj.translate() method with ambiguous codons.""" for letters, ambig_values in [(ambiguous_dna.letters, ambiguous_dna_values), (ambiguous_rna.letters, ambiguous_rna_values)]: ambig = set(letters) for c1 in ambig: for c2 in ambig: for c3 in ambig: values = set(str(Seq(a + b + c).translate()) for a in ambig_values[c1] for b in ambig_values[c2] for c in ambig_values[c3]) t = str(Seq(c1 + c2 + c3).translate()) if t == "*": self.assertEqual(values, set("*")) elif t == "X": self.assertTrue(len(values) > 1, "translate('%s') = '%s' not '%s'" % (c1 + c2 + c3, t, ",".join(values))) elif t == "Z": self.assertEqual(values, set("EQ")) elif t == "B": self.assertEqual(values, set("DN")) elif t == "J": self.assertEqual(values, set("LI")) else: self.assertEqual(values, set(t)) # TODO - Use the Bio.Data.IUPACData module for the # ambiguous protein mappings? def test_init_typeerror(self): """Check Seq __init__ gives TypeError exceptions.""" # Only expect it to take strings and unicode - not Seq objects! self.assertRaises(TypeError, Seq, (1066)) self.assertRaises(TypeError, Seq, (Seq("ACGT", generic_dna))) # TODO - Addition... if __name__ == "__main__": runner = unittest.TextTestRunner(verbosity=2) unittest.main(testRunner=runner)