/* * BioJava development code * * This code may be freely distributed and modified under the * terms of the GNU Lesser General Public Licence. This should * be distributed with the code. If you do not have a copy, * see: * * http://www.gnu.org/copyleft/lesser.html * * Copyright for this code is held jointly by the individual * authors. These should be listed in @author doc comments. * * For more information on the BioJava project and its aims, * or to join the biojava-l mailing list, visit the home page * at: * * http://www.biojava.org/ * */ package org.biojava.bio.seq; import java.util.HashMap; import java.util.HashSet; import java.util.Iterator; import java.util.Map; import java.util.Set; import org.biojava.bio.BioError; import org.biojava.bio.BioException; import org.biojava.bio.SimpleAnnotation; import org.biojava.bio.seq.impl.SimpleSequenceFactory; import org.biojava.bio.seq.io.SymbolTokenization; import org.biojava.bio.symbol.Alphabet; import org.biojava.bio.symbol.AlphabetManager; import org.biojava.bio.symbol.AtomicSymbol; import org.biojava.bio.symbol.FiniteAlphabet; import org.biojava.bio.symbol.IllegalAlphabetException; import org.biojava.bio.symbol.IllegalSymbolException; import org.biojava.bio.symbol.ReversibleTranslationTable; import org.biojava.bio.symbol.SimpleSymbolList; import org.biojava.bio.symbol.Symbol; import org.biojava.bio.symbol.SymbolList; import org.biojava.bio.symbol.SymbolListViews; /** * Useful functionality for processing nucleotide sequences. * * @author Matthew Pocock * @author Keith James (docs) */ public final class NucleotideTools { private static final ReversibleTranslationTable complementTable; static private final FiniteAlphabet nucleotide; private static final SymbolTokenization nucleotideTokens; static private final AtomicSymbol a; static private final AtomicSymbol g; static private final AtomicSymbol c; static private final AtomicSymbol t; static private final AtomicSymbol u; static private final Symbol r; static private final Symbol y; static private final Symbol m; static private final Symbol k; static private final Symbol s; static private final Symbol w; static private final Symbol b; static private final Symbol d; static private final Symbol h; static private final Symbol v; static private final Symbol n; static private Map symbolToComplement; static { try { nucleotide = (FiniteAlphabet) AlphabetManager.alphabetForName("NUCLEOTIDE"); nucleotideTokens = nucleotide.getTokenization("token"); SymbolList syms = new SimpleSymbolList(nucleotideTokens, "agcturymkswbdhvn"); a = (AtomicSymbol) syms.symbolAt(1); g = (AtomicSymbol) syms.symbolAt(2); c = (AtomicSymbol) syms.symbolAt(3); t = (AtomicSymbol) syms.symbolAt(4); u = (AtomicSymbol) syms.symbolAt(5); r = syms.symbolAt(6); y = syms.symbolAt(7); m = syms.symbolAt(8); k = syms.symbolAt(9); s = syms.symbolAt(10); w = syms.symbolAt(11); b = syms.symbolAt(12); d = syms.symbolAt(13); h = syms.symbolAt(14); v = syms.symbolAt(15); n = syms.symbolAt(16); symbolToComplement = new HashMap(); // add the gap symbol Symbol gap = nucleotide.getGapSymbol(); symbolToComplement.put(gap, gap); // add all other ambiguity symbols for(Iterator i = AlphabetManager.getAllSymbols(nucleotide).iterator(); i.hasNext();) { Symbol as = (Symbol) i.next(); FiniteAlphabet matches = (FiniteAlphabet) as.getMatches(); if (matches.size() > 1) { // We've hit an ambiguous symbol. Set l = new HashSet(); for(Iterator j = matches.iterator(); j.hasNext(); ) { l.add(complement((Symbol) j.next())); } symbolToComplement.put(as, nucleotide.getAmbiguity(l)); } } complementTable = new NucleotideComplementTranslationTable(); } catch (Throwable t) { throw new BioError("Unable to initialize NucleotideTools",t); } } public static AtomicSymbol a() { return a; } public static AtomicSymbol g() { return g; } public static AtomicSymbol c() { return c; } public static AtomicSymbol t() { return t; } public static AtomicSymbol u() { return u; } public static Symbol r() { return r; } public static Symbol y() { return y; } public static Symbol m() { return m; } public static Symbol k() { return k; } public static Symbol s() { return s; } public static Symbol w() { return w; } public static Symbol b() { return b; } public static Symbol d() { return d; } public static Symbol h() { return h; } public static Symbol v() { return v; } public static Symbol n() { return n; } private NucleotideTools() { } /** * Return the Nucleotide alphabet. * * @return a flyweight version of the Nucleotide alphabet */ public static FiniteAlphabet getNucleotide() { return nucleotide; } /** * Return a new Nucleotide SymbolList for * nucleotide. * * @param nucleotide a String to parse into Nucleotide * @return a SymbolList created form * nucleotide * @throws IllegalSymbolException if nucleotide contains * any non-Nucleotide characters */ public static SymbolList createNucleotide(String nucleotide) throws IllegalSymbolException { try { SymbolTokenization p = getNucleotide().getTokenization("token"); return new SimpleSymbolList(p, nucleotide); } catch (BioException se) { throw new BioError("Something has gone badly wrong with Nucleotide",se); } } /** * Return a new Nucleotide Sequence for * nucleotide. * * @param nucleotide a String to parse into Nucleotide * @param name a String to use as the name * @return a Sequence created form * nucleotide * @throws IllegalSymbolException if nucleotide contains * any non-Nucleotide characters */ public static Sequence createNucleotideSequence(String nucleotide, String name) throws IllegalSymbolException { try { return new SimpleSequenceFactory().createSequence( createNucleotide(nucleotide), "", name, new SimpleAnnotation() ); } catch (BioException se) { throw new BioError("Something has gone badly wrong with Nucleotide",se); } } /** * Return an integer index for a symbol - compatible with * forIndex. * *

* The index for a symbol is stable accross virtual machines & * invocations. *

* * @param sym the Symbol to index * @return the index for that symbol * * @throws IllegalSymbolException if sym is not a member of the Nucleotide * alphabet */ public static int index(Symbol sym) throws IllegalSymbolException { if(sym == a) { return 0; } else if(sym == g) { return 1; } else if(sym == c) { return 2; } else if(sym == t) { return 3; } else if(sym == u) { return 4; } getNucleotide().validate(sym); throw new IllegalSymbolException("Really confused. Can't find index for " + sym.getName()); } /** * Return the symbol for an index - compatible with index. * *

* The index for a symbol is stable accross virtual machines & * invocations. *

* * @param index the index to look up * @return the symbol at that index * * @throws IndexOutOfBoundsException if index is not between 0 and 3 */ static public Symbol forIndex(int index) throws IndexOutOfBoundsException { if(index == 0) return a; else if(index == 1) return g; else if(index == 2) return c; else if(index == 3) return t; else if(index == 4) return u; else throw new IndexOutOfBoundsException("No symbol for index " + index); } /** * Complement the symbol. * * @param sym the symbol to complement * @return a Symbol that is the complement of sym * @throws IllegalSymbolException if sym is not a member of the Nucleotide alphabet */ static public Symbol complement(Symbol sym) throws IllegalSymbolException { if(sym == a) { return t; } else if(sym == g) { return c; } else if(sym == c) { return g; } else if(sym == t) { return a; } else if(sym == u) { return a; } Symbol s = (Symbol) symbolToComplement.get(sym); if(s != null) { return s; } else { getNucleotide().validate(sym); throw new BioError( "Really confused. Can't find symbol " + sym.getName() ); } } /** * Retrieve the symbol for a symbol. * * @param token the char to look up * @return the symbol for that char * @throws IllegalSymbolException if the char does not belong to {a, g, c, t, u} */ static public Symbol forSymbol(char token) throws IllegalSymbolException { if(token == 'a') { return a; } else if(token == 'g') { return g; } else if(token == 'c') { return c; } else if(token == 't') { return t; } else if(token == 'u') { return u; } throw new IllegalSymbolException("Unable to find symbol for token " + token); } /** * Retrieve a complement view of list. * * @param list the SymbolList to complement * @return a SymbolList that is the complement * @throws IllegalAlphabetException if list is not a complementable alphabet */ public static SymbolList complement(SymbolList list) throws IllegalAlphabetException { return SymbolListViews.translate(list, complementTable()); } /** * Retrieve a reverse-complement view of list. * * @param list the SymbolList to complement * @return a SymbolList that is the complement * @throws IllegalAlphabetException if list is not a complementable alphabet */ public static SymbolList reverseComplement(SymbolList list) throws IllegalAlphabetException { return SymbolListViews.translate(SymbolListViews.reverse(list), complementTable()); } /** * Get a translation table for complementing Nucleotide symbols. * * @since 1.1 */ public static ReversibleTranslationTable complementTable() { return complementTable; } /** * Get a single-character token for a Nucleotide symbol * * @throws IllegalSymbolException if sym is not a member of the Nucleotide alphabet */ public static char nucleotideToken(Symbol sym) throws IllegalSymbolException { return nucleotideTokens.tokenizeSymbol(sym).charAt(0); } /** * Sneaky class for complementing Nucleotide bases. */ private static class NucleotideComplementTranslationTable implements ReversibleTranslationTable { public Symbol translate(Symbol s) throws IllegalSymbolException { return NucleotideTools.complement(s); } public Symbol untranslate(Symbol s) throws IllegalSymbolException { return NucleotideTools.complement(s); } public Alphabet getSourceAlphabet() { return NucleotideTools.getNucleotide(); } public Alphabet getTargetAlphabet() { return NucleotideTools.getNucleotide(); } } }