/* * BioJava development code * * This code may be freely distributed and modified under the * terms of the GNU Lesser General Public Licence. This should * be distributed with the code. If you do not have a copy, * see: * * http://www.gnu.org/copyleft/lesser.html * * Copyright for this code is held jointly by the individual * authors. These should be listed in @author doc comments. * * For more information on the BioJava project and its aims, * or to join the biojava-l mailing list, visit the home page * at: * * http://www.biojava.org/ * */ package org.biojava.bio.seq; import java.util.HashMap; import java.util.HashSet; import java.util.Iterator; import java.util.Map; import java.util.Set; import org.biojava.bio.BioError; import org.biojava.bio.BioException; import org.biojava.bio.SimpleAnnotation; import org.biojava.bio.dist.Distribution; import org.biojava.bio.dist.PairDistribution; import org.biojava.bio.dist.SimpleDistribution; import org.biojava.bio.seq.impl.SimpleGappedSequence; import org.biojava.bio.seq.impl.SimpleSequenceFactory; import org.biojava.bio.seq.io.SymbolTokenization; import org.biojava.bio.symbol.Alphabet; import org.biojava.bio.symbol.AlphabetManager; import org.biojava.bio.symbol.AtomicSymbol; import org.biojava.bio.symbol.FiniteAlphabet; import org.biojava.bio.symbol.IllegalAlphabetException; import org.biojava.bio.symbol.IllegalSymbolException; import org.biojava.bio.symbol.ReversibleTranslationTable; import org.biojava.bio.symbol.SimpleReversibleTranslationTable; import org.biojava.bio.symbol.SimpleSymbolList; import org.biojava.bio.symbol.Symbol; import org.biojava.bio.symbol.SymbolList; import org.biojava.bio.symbol.SymbolListViews; import org.biojava.utils.ChangeVetoException; /** * Useful functionality for processing DNA sequences. * * @author Matthew Pocock * @author Keith James (docs) * @author Mark Schreiber * @author David Huen * @author Richard Holland */ public final class DNATools { private static final ReversibleTranslationTable complementTable; static private final FiniteAlphabet dna; private static final SymbolTokenization dnaTokens; static private final AtomicSymbol a; static private final AtomicSymbol g; static private final AtomicSymbol c; static private final AtomicSymbol t; static private final Symbol n; static private final Symbol m; static private final Symbol r; static private final Symbol w; static private final Symbol s; static private final Symbol y; static private final Symbol k; static private final Symbol v; static private final Symbol h; static private final Symbol d; static private final Symbol b; static private final SimpleReversibleTranslationTable transcriptionTable; static private Map symbolToComplement; static { try { dna = (FiniteAlphabet) AlphabetManager.alphabetForName("DNA"); dnaTokens = dna.getTokenization("token"); SymbolList syms = new SimpleSymbolList(dnaTokens, "agctnmrwsykvhdb"); a = (AtomicSymbol) syms.symbolAt(1); g = (AtomicSymbol) syms.symbolAt(2); c = (AtomicSymbol) syms.symbolAt(3); t = (AtomicSymbol) syms.symbolAt(4); n = syms.symbolAt(5); m = syms.symbolAt(6); r = syms.symbolAt(7); w = syms.symbolAt(8); s = syms.symbolAt(9); y = syms.symbolAt(10); k = syms.symbolAt(11); v = syms.symbolAt(12); h = syms.symbolAt(13); d = syms.symbolAt(14); b = syms.symbolAt(15); symbolToComplement = new HashMap(); // add the gap symbol Symbol gap = dna.getGapSymbol(); symbolToComplement.put(gap, gap); // add all other ambiguity symbols for(Iterator i = AlphabetManager.getAllSymbols(dna).iterator(); i.hasNext();) { Symbol as = (Symbol) i.next(); FiniteAlphabet matches = (FiniteAlphabet) as.getMatches(); if (matches.size() > 1) { // We've hit an ambiguous symbol. Set l = new HashSet(); for(Iterator j = matches.iterator(); j.hasNext(); ) { l.add(complement((Symbol) j.next())); } symbolToComplement.put(as, dna.getAmbiguity(l)); } } complementTable = new DNAComplementTranslationTable(); transcriptionTable = new SimpleReversibleTranslationTable(getDNA(), RNATools.getRNA()); transcriptionTable.setTranslation(a, RNATools.a()); transcriptionTable.setTranslation(c, RNATools.c()); transcriptionTable.setTranslation(g, RNATools.g()); transcriptionTable.setTranslation(t, RNATools.u()); } catch (Throwable th) { throw new BioError("Unable to initialize DNATools", th); } } public static AtomicSymbol a() { return a; } public static AtomicSymbol g() { return g; } public static AtomicSymbol c() { return c; } public static AtomicSymbol t() { return t; } public static Symbol n() { return n; } public static Symbol m() { return m; } public static Symbol r() { return r; } public static Symbol w() { return w; } public static Symbol s() { return s; } public static Symbol y() { return y; } public static Symbol k() { return k; } public static Symbol v() { return v; } public static Symbol h() { return h; } public static Symbol d() { return d; } public static Symbol b() { return b; } private DNATools() { } /** * Return the DNA alphabet. * * @return a flyweight version of the DNA alphabet */ public static FiniteAlphabet getDNA() { return dna; } /** * Gets the (DNA x DNA) Alphabet * @return a flyweight version of the (DNA x DNA) alphabet */ public static FiniteAlphabet getDNAxDNA(){ return (FiniteAlphabet)AlphabetManager.generateCrossProductAlphaFromName("(DNA x DNA)"); } /** * Gets the (DNA x DNA x DNA) Alphabet * @return a flyweight version of the (DNA x DNA x DNA) alphabet */ public static FiniteAlphabet getCodonAlphabet(){ return (FiniteAlphabet)AlphabetManager.generateCrossProductAlphaFromName("(DNA x DNA x DNA)"); } /** * Return a new DNA SymbolList for * dna. * * @param dna a String to parse into DNA * @return a SymbolList created form * dna * @throws IllegalSymbolException if dna contains * any non-DNA characters */ public static SymbolList createDNA(String dna) throws IllegalSymbolException { SymbolTokenization p = null; try { p = getDNA().getTokenization("token"); } catch (BioException e) { throw new BioError("Something has gone badly wrong with DNA", e); } return new SimpleSymbolList(p, dna); } /** * Return a new DNA Sequence for * dna. * * @param dna a String to parse into DNA * @param name a String to use as the name * @return a Sequence created form * dna * @throws IllegalSymbolException if dna contains * any non-DNA characters */ public static Sequence createDNASequence(String dna, String name) throws IllegalSymbolException { //should I be calling createGappedDNASequence? if(dna.indexOf('-') != -1 || dna.indexOf('~') != -1){//there is a gap return createGappedDNASequence(dna, name); } // No need to wrap in try-catch as throws IllegalSymbolException already. //try { return new SimpleSequenceFactory().createSequence( createDNA(dna), "", name, new SimpleAnnotation() ); //} catch (BioException se) { // throw new BioError("Something has gone badly wrong with DNA", se); //} } /** Get a new dna as a GappedSequence */ public static GappedSequence createGappedDNASequence(String dna, String name) throws IllegalSymbolException{ String dna1 = dna.replaceAll("-", ""); Sequence dnaSeq = createDNASequence(dna1, name); GappedSequence dnaSeq1 = new SimpleGappedSequence(dnaSeq); int pos = dna.indexOf('-', 0); while(pos!=-1){ dnaSeq1.addGapInView(pos+1); pos = dna.indexOf('-', pos+1); } return dnaSeq1; } /** * Return an integer index for a symbol - compatible with * forIndex. * *

* The index for a symbol is stable accross virtual machines & * invocations. *

* * @param sym the Symbol to index * @return the index for that symbol * * @throws IllegalSymbolException if sym is not a member of the DNA * alphabet */ public static int index(Symbol sym) throws IllegalSymbolException { if(sym == a) { return 0; } else if(sym == g) { return 1; } else if(sym == c) { return 2; } else if(sym == t) { return 3; } getDNA().validate(sym); throw new IllegalSymbolException("Really confused. Can't find index for " + sym.getName()); } /** * Return the symbol for an index - compatible with index. * *

* The index for a symbol is stable accross virtual machines & * invocations. *

* * @param index the index to look up * @return the symbol at that index * * @throws IndexOutOfBoundsException if index is not between 0 and 3 */ static public Symbol forIndex(int index) throws IndexOutOfBoundsException { if(index == 0) return a; else if(index == 1) return g; else if(index == 2) return c; else if(index == 3) return t; else throw new IndexOutOfBoundsException("No symbol for index " + index); } /** * Complement the symbol. * * @param sym the symbol to complement * @return a Symbol that is the complement of sym * @throws IllegalSymbolException if sym is not a member of the DNA alphabet */ static public Symbol complement(Symbol sym) throws IllegalSymbolException { if(sym == a) { return t; } else if(sym == g) { return c; } else if(sym == c) { return g; } else if(sym == t) { return a; } Symbol s = (Symbol) symbolToComplement.get(sym); if(s != null) { return s; } else { getDNA().validate(sym); throw new BioError( "Really confused. Can't find symbol " + sym.getName() ); } } /** * Retrieve the symbol for a symbol. * * @param token the char to look up * @return the symbol for that char * @throws IllegalSymbolException if the char is not a valid IUB dna code */ static public Symbol forSymbol(char token) throws IllegalSymbolException { String t = String.valueOf(token); SymbolTokenization toke; try{ toke = getDNA().getTokenization("token"); }catch(BioException e){ throw new BioError("Cannot find the 'token' Tokenization for DNA!?", e); } return toke.parseToken(t); } /** * Retrieve a complement view of list. * * @param list the SymbolList to complement * @return a SymbolList that is the complement * @throws IllegalAlphabetException if list is not a complementable alphabet */ public static SymbolList complement(SymbolList list) throws IllegalAlphabetException { return SymbolListViews.translate(list, complementTable()); } /** * Retrieve a reverse-complement view of list. * * @param list the SymbolList to complement * @return a SymbolList that is the complement * @throws IllegalAlphabetException if list is not a complementable alphabet */ public static SymbolList reverseComplement(SymbolList list) throws IllegalAlphabetException { return SymbolListViews.translate(SymbolListViews.reverse(list), complementTable()); } /** * Returns a SymbolList that is reverse complemented if the strand is * negative, and the origninal one if it is not. * * @param list the SymbolList to view * @param strand the Strand to use * @return the apropreate view of the SymbolList * @throws IllegalAlphabetException if list is not a complementable alphabet */ public static SymbolList flip(SymbolList list, StrandedFeature.Strand strand) throws IllegalAlphabetException { if(strand == StrandedFeature.NEGATIVE) { return reverseComplement(list); } else { return list; } } /** * Get a translation table for complementing DNA symbols. * * @since 1.1 */ public static ReversibleTranslationTable complementTable() { return complementTable; } /** * Get a single-character token for a DNA symbol * * @throws IllegalSymbolException if sym is not a member of the DNA alphabet */ public static char dnaToken(Symbol sym) throws IllegalSymbolException { return dnaTokens.tokenizeSymbol(sym).charAt(0); } /** * Sneaky class for complementing DNA bases. */ private static class DNAComplementTranslationTable implements ReversibleTranslationTable { public Symbol translate(Symbol s) throws IllegalSymbolException { return DNATools.complement(s); } public Symbol untranslate(Symbol s) throws IllegalSymbolException { return DNATools.complement(s); } public Alphabet getSourceAlphabet() { return DNATools.getDNA(); } public Alphabet getTargetAlphabet() { return DNATools.getDNA(); } } /** * return a SimpleDistribution of specified GC content. * @param fractionGC (G+C) content as a fraction. */ public static Distribution getDNADistribution(double fractionGC) { try { Distribution dist = new SimpleDistribution(DNATools.getDNA()); double gc = 0.5 * fractionGC; double at = 0.5 * (1.0 - fractionGC); dist.setWeight(DNATools.a(), at); dist.setWeight(DNATools.t(), at); dist.setWeight(DNATools.c(), gc); dist.setWeight(DNATools.g(), gc); return dist; } // these exceptions are just plain impossible!!! catch (IllegalSymbolException ise) { return null; } catch (ChangeVetoException cve) { return null; } } /** * return a (DNA x DNA) cross-product Distribution with specified * DNA contents in each component Alphabet. * @param fractionGC0 (G+C) content of first sequence as a fraction. * @param fractionGC1 (G+C) content of second sequence as a fraction. */ public static Distribution getDNAxDNADistribution( double fractionGC0, double fractionGC1 ) { return new PairDistribution(getDNADistribution(fractionGC0), getDNADistribution(fractionGC1)); } /** * Converts a SymbolList from the DNA Alphabet to the * RNA Alphabet. * @param syms the SymbolList to convert to RNA * @return a view on syms where Symbols have been converted to RNA. * Most significantly t's are now u's. The 5' to 3' order of the Symbols is conserved. * @since 1.4 * @throws IllegalAlphabetException if syms is not DNA. */ public static SymbolList toRNA(SymbolList syms)throws IllegalAlphabetException{ return SymbolListViews.translate(syms, transcriptionTable); } /** * Transcribes DNA to RNA. The method more closely represents the biological reality * than toRNA(SymbolList syms) does. The presented DNA SymbolList * is assumed to be the template strand in the 5' to 3' orientation. The resulting * RNA is transcribed from this template effectively a reverse complement in the RNA alphabet. * The method is equivalent to calling reverseComplement() and toRNA() in sequence. *

If you are dealing with cDNA sequences that you want converted to RNA you would be * better off calling toRNA(SymbolList syms) * @param syms the SymbolList to convert to RNA * @return a view on syms where Symbols have been converted to RNA. * @since 1.4 * @throws IllegalAlphabetException if syms is not DNA. */ public static SymbolList transcribeToRNA(SymbolList syms) throws IllegalAlphabetException{ syms = reverseComplement(syms); return toRNA(syms); } /** * Convenience method that directly converts a DNA sequence to RNA then to * protein. The translated protein is from the +1 reading frame of the * SymbolList. The whole SymbolList is translated * although up to 2 DNA residues may be truncated if full codons cannot be * formed. * @param syms the sequence to be translated. * @return the translated protein sequence. * @throws org.biojava.bio.symbol.IllegalAlphabetException if syms * is not from the DNA alphabet. * @since 1.5.1 */ public static SymbolList toProtein(final SymbolList syms) throws IllegalAlphabetException{ SymbolList symz = new SimpleSymbolList(syms); symz = toRNA(symz); //truncate to a length divisible by three. symz = symz.subList(1, symz.length() - (symz.length() %3)); return RNATools.translate(symz); } /** * Convenience method to translate a region of a DNA sequence directly into * protein. While the start and end can be specified if the length of the * specified region is not evenly divisible by three then the translated * region will be truncated until a full terminal codon can be formed. * @param syms the DNA sequence to be translated. * @param start the location to begin translation. * @param end the end of the translated region. * @return the translated protein sequence. * @throws org.biojava.bio.symbol.IllegalAlphabetException if syms * is not from the DNA alphabet. * @since 1.5.1 */ public static SymbolList toProtein(final SymbolList syms, int start, int end) throws IllegalAlphabetException{ SymbolList symz = new SimpleSymbolList(syms); symz = symz.subList(start, end); //return toProtein(syms); //changed to fix bug 2521 return toProtein(symz); } }