/* * BioJava development code * * This code may be freely distributed and modified under the * terms of the GNU Lesser General Public Licence. This should * be distributed with the code. If you do not have a copy, * see: * * http://www.gnu.org/copyleft/lesser.html * * Copyright for this code is held jointly by the individual * authors. These should be listed in @author doc comments. * * For more information on the BioJava project and its aims, * or to join the biojava-l mailing list, visit the home page * at: * * http://www.biojava.org/ * */ package org.biojava.bio.symbol; import java.util.Iterator; import java.util.List; import java.util.Set; import org.biojava.bio.Annotation; import org.biojava.bio.BioError; import org.biojava.bio.BioException; import org.biojava.bio.seq.io.SeqIOListener; import org.biojava.bio.seq.io.StreamParser; import org.biojava.bio.seq.io.SymbolTokenization; import org.biojava.bio.symbol.IntegerAlphabet.IntegerSymbol; import org.biojava.utils.ChangeVetoException; import org.biojava.utils.ListTools; import org.biojava.utils.Unchangeable; /** * Soft masking is usually displayed by making the masked regions somehow * different from the non masked regions. Typically the masked regions are * lower case but other schemes could be invented. For example a softmasked * DNA sequence may look like this:
 *
 * <pre>
 * &gt;DNA_sequence
 * ATGGACGCTAGCATggtggtggtggtggtggtggtGCATAGCGAGCAAGTGGAGCGT
 * </pre>
 *
 * Where the lowercase regions are masked by low complexity.
 * <p>
 * <code>SoftMaskedAlphabet</code>s come with <code>SymbolTokenizers</code>
 * that understand how to read and write the softmasking. The interpretation
 * of what constitutes a masked region is governed by an implementation of
 * a <code>MaskingDetector</code>. The <code>DEFAULT</code> field of the
 * <code>MaskingDetector</code> interface defines lower case tokens as masked.
 * </p>
 *
 * <p>Copyright (c) 2004 Novartis Institute for Tropical Diseases</p>
 *
* @author Mark Schreiber
* @version 1.0
*/
public final class SoftMaskedAlphabet
    extends Unchangeable
    implements FiniteAlphabet {

  // Used to indicate masking: 0 indicates no mask, 1 indicates mask.
  private final IntegerAlphabet.SubIntegerAlphabet binary;
  // The alphabet that is being soft masked.
  private final FiniteAlphabet alpha;
  private final String name;
  // Cross product of alpha and binary; holds the symbols of this alphabet.
  private final FiniteAlphabet delegateAlpha;
  // Assigned by getInstance() right after construction.
  private MaskingDetector maskingDetector;

  /**
   * Builds the wrapper around <code>alpha</code> and the cross product
   * alphabet of <code>alpha</code> and <code>SUBINTEGER[0..1]</code>.
   *
   * @param alpha the alphabet to soft mask.
   * @param name the name this alphabet reports from <code>getName()</code>.
   * @throws IllegalAlphabetException if it cannot be constructed
   */
  private SoftMaskedAlphabet(FiniteAlphabet alpha, String name)
      throws IllegalAlphabetException {
    this.alpha = alpha;
    binary = IntegerAlphabet.getSubAlphabet(0, 1);
    this.name = name;
    delegateAlpha = (FiniteAlphabet) AlphabetManager.getCrossProductAlphabet(
        new ListTools.Doublet(alpha, binary));
  }

  /**
   * Generates a soft masked Alphabet where lowercase tokens are assumed to be
   * soft masked.
   *
   * @param alphaToMask for example the DNA alphabet.
   * @throws IllegalAlphabetException if it cannot be constructed
   * @return a reference to a singleton <code>SoftMaskedAlphabet</code>.
   */
  public static SoftMaskedAlphabet getInstance(FiniteAlphabet alphaToMask)
      throws IllegalAlphabetException {
    return getInstance(alphaToMask, MaskingDetector.DEFAULT);
  }

  /**
   * Creates a compound alphabet that is a hybrid of the alphabet that is to
   * be soft masked and a binary alphabet that indicates if any
   * <code>Symbol</code> is soft masked or not.
   * <p>
   * The alphabet is registered with the <code>AlphabetManager</code> under
   * the name <code>"Softmasked {"+alphaToMask.getName()+"}"</code>. If an
   * alphabet is already registered under that name it is returned as is and
   * the <code>maskingDetector</code> argument is not applied to it.
   * </p>
   *
   * @param alphaToMask for example the DNA alphabet.
   * @param maskingDetector to define masking behaviour
   * @throws IllegalAlphabetException if it cannot be constructed
   * @return a reference to a singleton <code>SoftMaskedAlphabet</code>.
   */
  public static SoftMaskedAlphabet getInstance(FiniteAlphabet alphaToMask,
      MaskingDetector maskingDetector) throws IllegalAlphabetException {
    String lookup = "Softmasked {" + alphaToMask.getName() + "}";
    if (AlphabetManager.registered(lookup)) {
      return (SoftMaskedAlphabet) AlphabetManager.alphabetForName(lookup);
    }

    SoftMaskedAlphabet sma = new SoftMaskedAlphabet(alphaToMask, lookup);
    // Set the detector before registering so that the registered instance is
    // never visible without one.
    sma.maskingDetector = maskingDetector;
    AlphabetManager.registerAlphabet(sma.getName(), sma);
    return sma;
  }

  /**
   * Gets the <code>Alphabet</code> upon which masking is being applied
   * @return A <code>FiniteAlphabet</code>
   */
  public FiniteAlphabet getMaskedAlphabet() {
    return alpha;
  }

  /**
   * The compound alpha that holds the symbols used by this wrapper
   * @return a <code>FiniteAlphabet</code>
   */
  protected FiniteAlphabet getDelegate() {
    return delegateAlpha;
  }

  /**
   * The SoftMaskedAlphabet has no annotation
   * @return Annotation.EMPTY_ANNOTATION
   */
  public Annotation getAnnotation() {
    return Annotation.EMPTY_ANNOTATION;
  }

  /**
   * The name of the Alphabet
   * @return a <code>String</code> in the form of
   * <code>"Softmasked {"+alphaToMask.getName()+"}"</code>
   */
  public String getName() {
    return name;
  }

  /**
   * Gets the components of the <code>Alphabet</code>.
   * @return a <code>List</code> with two members, the first is the wrapped
   * <code>Alphabet</code> the second is the binary
   * <code>SubIntegerAlphabet</code>.
   */
  public List getAlphabets() {
    return new ListTools.Doublet(alpha, binary);
  }

  /**
   * Gets the compound symbol composed of the <code>Symbols</code> in the List.
   * The <code>Symbols</code> in the <code>List</code> must be from
   * <code>alpha</code> (defined in the constructor) and
   * <code>SUBINTEGER[0..1]</code>
   * @return A <code>Symbol</code> from this alphabet.
   * @throws IllegalSymbolException if <code>l</code> is not as expected
   * (see above)
   * @param l a <code>List</code> of <code>Symbols</code>
   */
  public Symbol getSymbol(List l) throws IllegalSymbolException {
    return delegateAlpha.getSymbol(l);
  }

  /**
   * This is not supported. Ambiguity should be handled at the level of the
   * wrapped Alphabet. Use <code>getSymbol(List l)</code> instead and provide
   * it with an ambiguity and a masking symbol.
   * @param s a <code>Set</code> of <code>Symbols</code>
   * @see #getSymbol(List l)
   * @throws UnsupportedOperationException always
   */
  public Symbol getAmbiguity(Set s) throws UnsupportedOperationException {
    throw new UnsupportedOperationException(
        "Ambiguity should be handled at the level of the wrapped Alphabet");
  }

  /**
   * Gets the gap symbol that the <code>AlphabetManager</code> supplies for
   * the pair (wrapped alphabet, binary alphabet).
   * @return the gap <code>Symbol</code>
   */
  public Symbol getGapSymbol() {
    return AlphabetManager.getGapSymbol(new ListTools.Doublet(alpha, binary));
  }

  /**
   * Tests membership by asking the delegate cross product alphabet.
   * @param s the <code>Symbol</code> to test
   * @return true if the delegate alphabet contains <code>s</code>
   */
  public boolean contains(Symbol s) {
    return delegateAlpha.contains(s);
  }

  /**
   * Checks that <code>s</code> is contained in this alphabet.
   * @param s the <code>Symbol</code> to check
   * @throws IllegalSymbolException if <code>contains(s)</code> is false
   */
  public void validate(Symbol s) throws IllegalSymbolException {
    if (!contains(s)) {
      throw new IllegalSymbolException(
          s, s.getName() + " is not a valid part of " + getName());
    }
  }

  /**
   * Getter for the <code>MaskingDetector</code>
   * @return the <code>MaskingDetector</code>
   */
  public MaskingDetector getMaskingDetector() {
    return maskingDetector;
  }

  /**
   * Creates a new <code>CaseSensitiveTokenization</code> for this alphabet.
   * It delegates to the tokenization of the same <code>type</code> obtained
   * from the wrapped alphabet.
   * @param type the tokenization type requested from the wrapped alphabet
   * @return a new <code>CaseSensitiveTokenization</code>
   * @throws BioException if the tokenization cannot be created
   */
  public SymbolTokenization getTokenization(String type) throws BioException {
    return new CaseSensitiveTokenization(this, type);
  }

  /**
   * The size of the delegate cross product alphabet.
   * @return the number of symbols reported by the delegate
   */
  public int size() {
    return delegateAlpha.size();
  }

  /**
   * Iterates over the symbols of the delegate cross product alphabet.
   * @return the delegate's <code>Iterator</code>
   */
  public Iterator iterator() {
    return delegateAlpha.iterator();
  }

  /**
   * <code>SoftMaskedAlphabet</code>s cannot add new <code>Symbol</code>s. A
   * <code>ChangeVetoException</code> will be thrown.
   * @param s the <code>Symbol</code> to add.
   * @throws ChangeVetoException when called.
   */
  public void addSymbol(Symbol s) throws ChangeVetoException {
    throw new ChangeVetoException("SoftMaskedAlphabets cannot add new Symbols");
  }

  /**
   * <code>SoftMaskedAlphabet</code>s cannot remove <code>Symbol</code>s. A
   * <code>ChangeVetoException</code> will be thrown.
   * @param s the <code>Symbol</code> to remove.
   * @throws ChangeVetoException when called.
   */
  public void removeSymbol(Symbol s) throws ChangeVetoException {
    throw new ChangeVetoException("SoftMaskedAlphabets cannot remove Symbols");
  }

  /**
   * Determines if a <code>Symbol</code> is masked. The second component of
   * the symbol is read as an <code>IntegerSymbol</code>; a value of 1 means
   * masked.
   * @return true if <code>s</code> is masked.
   * @param s the <code>Symbol</code> to test.
   * @throws IllegalSymbolException if <code>s</code> is not part of this
   * alphabet
   */
  public boolean isMasked(BasisSymbol s) throws IllegalSymbolException {
    validate(s);

    IntegerSymbol b = (IntegerSymbol) s.getSymbols().get(1);
    return (b.intValue() == 1);
  }

  /**
   * Implementations will define how soft masking looks. The
   * <code>DEFAULT</code> implementation considers softmasking to be
   * represented by lower case characters.
   *
   * <p>Copyright (c) 2004 Novartis Institute for Tropical Diseases</p>
   * @author Mark Schreiber
   * @version 1.0
   */
  public interface MaskingDetector {

    /**
     * Decides whether a token is presented in its masked form.
     * @param token the <code>String</code> to check
     * @return true if the token is considered masked
     */
    public boolean isMasked(String token);

    /**
     * Present the token for a <code>Symbol</code> as it would appear if masked
     * @param token the <code>String</code> to mask.
     * @return the masked token
     */
    public String mask(String token);

    /**
     * Present the token for a <code>Symbol</code> as it would appear if
     * it wasn't softmasked
     * @param token the <code>String</code> to un-mask.
     * @return the un-masked token
     */
    public String unmask(String token);

    public static MaskingDetector DEFAULT = new DefaultMaskingDetector();

    class DefaultMaskingDetector implements MaskingDetector {

      /**
       * Default behaviour is that if the whole token is lower case it is
       * masked. More precisely, the token is reported as masked unless it
       * contains at least one upper case character, so an empty token or a
       * token without letters is reported as masked too.
       * @param token the <code>String</code> to check for masking
       * @return false if any character is upper case, otherwise true.
       */
      public boolean isMasked(String token) {
        for (int i = 0; i < token.length(); i++) {
          if (Character.isUpperCase(token.charAt(i))) {
            return false;
          }
        }
        return true;
      }

      /**
       * Masks a token by making it lowercase
       * @param token the <code>String</code> to mask
       * @return a lower case <code>String</code>
       */
      public String mask(String token) {
        // Fixed locale: the default-locale overload maps "I" to a dotless i
        // under e.g. a Turkish default locale, which would corrupt tokens.
        return token.toLowerCase(java.util.Locale.ENGLISH);
      }

      /**
       * Un-masks the token by making it upper case.
       * @param token the <code>String</code> to unmask
       * @return the upper case <code>String</code>
       */
      public String unmask(String token) {
        // Fixed locale for the same reason as in mask().
        return token.toUpperCase(java.util.Locale.ENGLISH);
      }
    }
  }

  /**
   * This <code>SymbolTokenizer</code> works with a delegate to softmask
   * symbol tokenization as appropriate. It should only be used in combination
   * with a <code>SoftMaskedAlphabet</code>. You will never instantiate one of
   * these yourself.
   *
   * <p>Copyright (c) 2004 Novartis Institute for Tropical Diseases</p>
   * @author Mark Schreiber
   * @version 1.0
   */
  public class CaseSensitiveTokenization
      extends Unchangeable
      implements SymbolTokenization {

    // Tokenization of the wrapped (unmasked) alphabet.
    private final SymbolTokenization delegate;
    private final SoftMaskedAlphabet alpha;

    /**
     * @param alpha the soft masked alphabet this tokenization reports
     * @param type the tokenization type requested from the wrapped alphabet
     * @throws BioException if the wrapped alphabet cannot supply the
     * tokenization
     */
    private CaseSensitiveTokenization(SoftMaskedAlphabet alpha, String type)
        throws BioException {
      this.alpha = alpha;
      this.delegate = alpha.getMaskedAlphabet().getTokenization(type);
    }

    /**
     * This tokenization has no annotation
     * @return Annotation.EMPTY_ANNOTATION
     */
    public Annotation getAnnotation() {
      return Annotation.EMPTY_ANNOTATION;
    }

    /**
     * @return the <code>SoftMaskedAlphabet</code> given at construction
     */
    public Alphabet getAlphabet() {
      return alpha;
    }

    /**
     * @return the token type of the delegate tokenization
     */
    public SymbolTokenization.TokenType getTokenType() {
      return delegate.getTokenType();
    }

    /**
     * Parses a token into a compound symbol. The delegate parses the token
     * into the wrapped alphabet's symbol and the
     * <code>MaskingDetector</code> decides whether the binary component is
     * 1 (masked) or 0 (not masked).
     * @param token the token to parse
     * @return the compound <code>Symbol</code>
     * @throws IllegalSymbolException if the token cannot be parsed or the
     * compound symbol cannot be obtained
     */
    public Symbol parseToken(String token) throws IllegalSymbolException {
      MaskingDetector md = alpha.getMaskingDetector();
      IntegerSymbol bin;
      Symbol component = delegate.parseToken(token);

      if (md.isMasked(token)) {
        bin = binary.getSymbol(1);
      } else {
        bin = binary.getSymbol(0);
      }

      return alpha.getSymbol(new ListTools.Doublet(component, bin));
    }

    /**
     * Concatenates the tokens of every symbol in the list, in order.
     * @param sl the <code>SymbolList</code> to tokenize
     * @return the concatenated tokens
     * @throws IllegalSymbolException if any symbol cannot be tokenized
     */
    public String tokenizeSymbolList(SymbolList sl)
        throws IllegalSymbolException {
      StringBuffer sb = new StringBuffer(sl.length());
      // symbolAt() is addressed from 1 up to and including length().
      for (int i = 1; i <= sl.length(); i++) {
        sb.append(tokenizeSymbol(sl.symbolAt(i)));
      }
      return sb.toString();
    }

    /**
     * The current implementation only supports character parsing. Word or
     * fixed width parsing is not yet supported.
     *
     * @param l the <code>SeqIOListener</code> to callback to.
     * @return a <code>StreamParser</code> that the <code>SeqIOListener</code>
     * talks to.
     */
    public StreamParser parseStream(SeqIOListener l) {
      return new CharStreamParser(l);
    }

    /**
     * Produces the token for a compound symbol: the delegate tokenizes the
     * first component and the enclosing alphabet's
     * <code>MaskingDetector</code> masks or un-masks the result according to
     * the symbol's masking state.
     * @param s the <code>Symbol</code> to tokenize
     * @return the masked or un-masked token
     * @throws IllegalSymbolException if <code>s</code> is not part of the
     * enclosing alphabet
     */
    public String tokenizeSymbol(Symbol s) throws IllegalSymbolException {
      validate(s);

      Symbol a = (Symbol) ((BasisSymbol) s).getSymbols().get(0);
      String token = delegate.tokenizeSymbol(a);

      if (alpha.isMasked((BasisSymbol) s)) {
        return maskingDetector.mask(token);
      }
      return maskingDetector.unmask(token);
    }

    /**
     * Parses a character stream one character per token and hands the
     * resulting symbols to the listener in batches of at most 256.
     */
    private class CharStreamParser implements StreamParser {
      private final SeqIOListener listener;
      private final Symbol[] buffer;

      public CharStreamParser(SeqIOListener l) {
        this.listener = l;
        buffer = new Symbol[256];
      }

      /**
       * @param data the characters to parse
       * @param start index of the first character to use
       * @param len number of characters to use
       * @throws IllegalSymbolException if a character cannot be parsed
       */
      public void characters(char[] data, int start, int len)
          throws IllegalSymbolException {
        int cnt = 0;
        while (cnt < len) {
          // Fill the buffer, then flush it to the listener.
          int bcnt = 0;
          while (cnt < len && bcnt < buffer.length) {
            buffer[bcnt++] = parseToken(String.valueOf(data[start + (cnt++)]));
          }
          try {
            listener.addSymbols(getAlphabet(), buffer, 0, bcnt);
          } catch (IllegalAlphabetException ex) {
            throw new BioError(
                "Assertion failed: can't add symbols.", ex);
          }
        }
      }

      /** Nothing to release. */
      public void close() {
      }
    }
  }
}