# coding: utf-8
# Tests for cutadapt.adapters: adapter matching, brace-expression parsing,
# linked adapters, info records and adapter statistics.
from __future__ import print_function, division, absolute_import

from nose.tools import raises, assert_raises

from cutadapt.seqio import Sequence
from cutadapt.adapters import (
    Adapter, Match, ColorspaceAdapter, FRONT, BACK,
    parse_braces, LinkedAdapter, AdapterStatistics)


def test_issue_52():
    # Regression test for issue 52: Match.wildcards() on a hand-built match
    # whose true alignment contains an indel.
    adapter = Adapter(
        sequence='GAACTCCAGTCACNNNNN',
        where=BACK,
        max_error_rate=0.12,
        min_overlap=5,
        read_wildcards=False,
        adapter_wildcards=True)
    read = Sequence(name="abc", sequence='CCCCAGAACTACAGTCCCGGC')
    am = Match(
        astart=0, astop=17, rstart=5, rstop=21, matches=15, errors=2,
        remove_before=False, adapter=adapter, read=read)
    assert am.wildcards() == 'GGC'
    # The result above should actually be 'CGGC' since the correct
    # alignment is this one:
    #
    # adapter         GAACTCCAGTCACNNNNN
    # mismatches           X     X
    # read       CCCCAGAACTACAGTC-CCGGC
    #
    # Since we do not keep the alignment, guessing 'GGC' is the best we
    # can currently do.


def test_issue_80():
    # This issue turned out to not be an actual issue with the alignment
    # algorithm. The following alignment is found because it has more matches
    # than the 'obvious' one:
    #
    # TCGTATGCCGTCTTC
    # =========X==XX=
    # TCGTATGCCCTC--C
    #
    # This is correct, albeit a little surprising, since an alignment without
    # indels would have only two errors.
    adapter = Adapter(
        sequence="TCGTATGCCGTCTTC",
        where=BACK,
        max_error_rate=0.2,
        min_overlap=3,
        read_wildcards=False,
        adapter_wildcards=False)
    read = Sequence(name="seq2", sequence="TCGTATGCCCTCC")
    result = adapter.match_to(read)
    # The match object is passed as the assertion message so that a failure
    # shows the alignment that was actually found.
    assert result.errors == 3, result
    assert result.astart == 0, result
    assert result.astop == 15, result


def test_str():
    # Smoke test: str() on an adapter, on the result of match_to() and on a
    # colorspace adapter must not raise. No output is checked.
    a = Adapter('ACGT', where=BACK, max_error_rate=0.1)
    str(a)
    str(a.match_to(Sequence(name='seq', sequence='TTACGT')))
    ca = ColorspaceAdapter('0123', where=BACK, max_error_rate=0.1)
    str(ca)


@raises(ValueError)
def test_color():
    # Constructing a ColorspaceAdapter with where=FRONT must raise ValueError.
    ColorspaceAdapter('0123', where=FRONT, max_error_rate=0.1)


def test_parse_braces():
    # 'X{n}' expands to n copies of the preceding character X.
    assert parse_braces('') == ''
    assert parse_braces('A') == 'A'
    assert parse_braces('A{0}') == ''
    assert parse_braces('A{1}') == 'A'
    assert parse_braces('A{2}') == 'AA'
    assert parse_braces('A{2}C') == 'AAC'
    assert parse_braces('ACGTN{3}TGACCC') == 'ACGTNNNTGACCC'
    assert parse_braces('ACGTN{10}TGACCC') == 'ACGTNNNNNNNNNNTGACCC'
    assert parse_braces('ACGTN{3}TGA{4}CCC') == 'ACGTNNNTGAAAACCC'
    assert parse_braces('ACGTN{0}TGA{4}CCC') == 'ACGTTGAAAACCC'


def test_parse_braces_fail():
    # Every one of these malformed expressions must be rejected with
    # ValueError.
    invalid_expressions = [
        '{', '}', '{}', '{5', '{1}', 'A{-7}', 'A{', 'A{1', 'N{7', 'AN{7',
        'A{4{}', 'A{4}{3}', 'A{b}', 'A{6X}', 'A{X6}',
    ]
    for expression in invalid_expressions:
        # Pass the callable and its argument directly rather than wrapping
        # them in a lambda that closes over the loop variable.
        assert_raises(ValueError, parse_braces, expression)


def test_linked_adapter():
    # min_overlap given to LinkedAdapter must reach both of its adapters, and
    # trimming must remove the front and back adapter from the read.
    linked_adapter = LinkedAdapter('AAAA', 'TTTT', min_overlap=4)
    assert linked_adapter.front_adapter.min_overlap == 4
    assert linked_adapter.back_adapter.min_overlap == 4

    sequence = Sequence(name='seq', sequence='AAAACCCCCTTTT')
    trimmed = linked_adapter.match_to(sequence).trimmed()
    assert trimmed.name == 'seq'
    assert trimmed.sequence == 'CCCCC'


def test_info_record():
    # Match.get_info_record() for a hand-built match against a named adapter.
    adapter = Adapter(
        sequence='GAACTCCAGTCACNNNNN',
        where=BACK,
        max_error_rate=0.12,
        min_overlap=5,
        read_wildcards=False,
        adapter_wildcards=True,
        name="Foo")
    read = Sequence(name="abc", sequence='CCCCAGAACTACAGTCCCGGC')
    am = Match(
        astart=0, astop=17, rstart=5, rstop=21, matches=15, errors=2,
        remove_before=False, adapter=adapter, read=read)
    assert am.get_info_record() == (
        "abc",
        2,
        5,
        21,
        'CCCCA',
        'GAACTACAGTCCCGGC',
        '',
        'Foo',
        '',
        '',
        ''
    )


def test_random_match_probabilities():
    # NOTE(review): the expected lists are compared with exact float
    # equality, so the products below presumably mirror the multiplication
    # order used by the implementation - confirm before rewriting them.
    a = Adapter('A', where=BACK, max_error_rate=0.1).create_statistics()
    assert a.back.random_match_probabilities(0.5) == [1, 0.25]
    assert a.back.random_match_probabilities(0.2) == [1, 0.4]

    for s in ('ACTG', 'XMWH'):
        a = Adapter(s, where=BACK, max_error_rate=0.1).create_statistics()
        assert a.back.random_match_probabilities(0.5) == [
            1, 0.25, 0.25**2, 0.25**3, 0.25**4]
        assert a.back.random_match_probabilities(0.2) == [
            1, 0.4, 0.4*0.1, 0.4*0.1*0.4, 0.4*0.1*0.4*0.1]

    a = Adapter('GTCA', where=FRONT, max_error_rate=0.1).create_statistics()
    assert a.front.random_match_probabilities(0.5) == [
        1, 0.25, 0.25**2, 0.25**3, 0.25**4]
    assert a.front.random_match_probabilities(0.2) == [
        1, 0.4, 0.4*0.1, 0.4*0.1*0.4, 0.4*0.1*0.4*0.1]


def test_add_adapter_statistics():
    # "stats += stats2" must add up adjacent_bases and the nested errors
    # counts entry by entry, including keys present on only one side.
    stats = Adapter(
        'A', name='name', where=BACK, max_error_rate=0.1).create_statistics()
    end_stats = stats.back
    end_stats.adjacent_bases['A'] = 7
    end_stats.adjacent_bases['C'] = 19
    end_stats.adjacent_bases['G'] = 23
    end_stats.adjacent_bases['T'] = 42
    end_stats.adjacent_bases[''] = 45

    end_stats.errors[10][0] = 100
    end_stats.errors[10][1] = 11
    end_stats.errors[10][2] = 3
    end_stats.errors[20][0] = 600
    end_stats.errors[20][1] = 66
    end_stats.errors[20][2] = 6

    stats2 = Adapter(
        'A', name='name', where=BACK, max_error_rate=0.1).create_statistics()
    end_stats2 = stats2.back
    end_stats2.adjacent_bases['A'] = 43
    end_stats2.adjacent_bases['C'] = 31
    end_stats2.adjacent_bases['G'] = 27
    end_stats2.adjacent_bases['T'] = 8
    end_stats2.adjacent_bases[''] = 5
    end_stats2.errors[10][0] = 234
    end_stats2.errors[10][1] = 14
    end_stats2.errors[10][3] = 5
    end_stats2.errors[15][0] = 90
    end_stats2.errors[15][1] = 17
    end_stats2.errors[15][2] = 2

    stats += stats2
    r = stats.back

    assert r.adjacent_bases == {'A': 50, 'C': 50, 'G': 50, 'T': 50, '': 50}
    assert r.errors == {
        10: {0: 334, 1: 25, 2: 3, 3: 5},
        15: {0: 90, 1: 17, 2: 2},
        20: {0: 600, 1: 66, 2: 6},
    }


def test_issue_265():
    """Crash when accessing the matches property of non-anchored linked adapters"""
    s = Sequence('name', 'AAAATTTT')
    la = LinkedAdapter('GGG', 'TTT', front_anchored=False, back_anchored=False)
    assert la.match_to(s).matches == 3