/* * BioJava development code * * This code may be freely distributed and modified under the * terms of the GNU Lesser General Public Licence. This should * be distributed with the code. If you do not have a copy, * see: * * http://www.gnu.org/copyleft/lesser.html * * Copyright for this code is held jointly by the individual * authors. These should be listed in @author doc comments. * * For more information on the BioJava project and its aims, * or to join the biojava-l mailing list, visit the home page * at: * * http://www.biojava.org/ * */ package org.biojava.bio.program.abi; import java.util.ArrayList; import java.util.Iterator; import java.util.List; import org.biojava.bio.BioError; import org.biojava.bio.seq.DNATools; import org.biojava.bio.seq.io.CharacterTokenization; import org.biojava.bio.seq.io.SymbolTokenization; import org.biojava.bio.symbol.AbstractAlphabet; import org.biojava.bio.symbol.Alignment; import org.biojava.bio.symbol.AlphabetManager; import org.biojava.bio.symbol.AtomicSymbol; import org.biojava.bio.symbol.FiniteAlphabet; import org.biojava.bio.symbol.IllegalAlphabetException; import org.biojava.bio.symbol.IntegerAlphabet; import org.biojava.bio.symbol.Symbol; import org.biojava.bio.symbol.SymbolList; import org.biojava.bio.symbol.SymbolListViews; import org.biojava.utils.ListTools; /** * Useful functionality for working with fasta files where the quality of the * DNA is encoded as upper and lower case DNA characters. * * @author Matthew Pocock */ public class ABITools { /** * The quality alphabet. This is equivalent to DNA x [0,1] where 0 represents * poorly supported (lower case) and 1 represents strongly supported (upper * case). */ public static final FiniteAlphabet QUALITY; /** * The poorly supported symbol. */ public static final AtomicSymbol _0; /** * The well supported symbol. */ public static final AtomicSymbol _1; /** * Alignment label for the DNA sequence row. */ public static final Object SEQUENCE = "SEQUENCE"; /** * Alignment label for the support row. */ public static final Object SUPPORT = "SUPPORT"; static { try { IntegerAlphabet.SubIntegerAlphabet _01 = IntegerAlphabet.getSubAlphabet(0, 1); _0 = _01.getSymbol(0); _1 = _01.getSymbol(1); List alphas = new ArrayList(); alphas.add(DNATools.getDNA()); alphas.add(_01); // naughty here - we know because we are insiders that the result of this // call will be an AbstractAlphabet impl AbstractAlphabet quality = (AbstractAlphabet) AlphabetManager.getCrossProductAlphabet(alphas); CharacterTokenization tok = new CharacterTokenization(quality, true); // all lower case characters go to sym,0 // all upper case characters go to sym,1 SymbolList sl = DNATools.createDNA("agctrymkswhbvdn"); ListTools.Doublet pair = new ListTools.Doublet(); SymbolTokenization dnaTok = DNATools.getDNA().getTokenization("token"); for(Iterator i = sl.iterator(); i.hasNext(); ) { pair.setA((Symbol) i.next()); String c = dnaTok.tokenizeSymbol((Symbol) pair.getA()); pair.setB(_1); tok.bindSymbol(quality.getSymbol(pair), c.toUpperCase().charAt(0)); pair.setB(_0); tok.bindSymbol(quality.getSymbol(pair), c.toLowerCase().charAt(0)); } quality.putTokenization("token", tok); QUALITY = quality; } catch (Exception e) { throw new BioError("Could not initialize ABI quality alphabet",e); } } /** *

* View a symbol list over the QUALITY alphabet as an alignment. *

* *

* The alignment will have labels of SEQUENCE and SUPPORT that retrieve the * DNA sequence and the binary support values respectively. *

* * @param abiSeq the SymbolList over the QUALITY alphabet to view * @return an Alignment view of abiSeq * @throws IllegalAlphabetException if abiSeq is not over QUALITY */ public static Alignment getAlignment(SymbolList abiSeq) throws IllegalAlphabetException { return SymbolListViews.alignment( new ListTools.Doublet(SEQUENCE, SUPPORT), abiSeq ); } }