/* * BioJava development code * * This code may be freely distributed and modified under the * terms of the GNU Lesser General Public Licence. This should * be distributed with the code. If you do not have a copy, * see: * * http://www.gnu.org/copyleft/lesser.html * * Copyright for this code is held jointly by the individual * authors. These should be listed in @author doc comments. * * For more information on the BioJava project and its aims, * or to join the biojava-l mailing list, visit the home page * at: * * http://www.biojava.org/ * */ package org.biojava.bio.alignment; import java.util.HashMap; import java.util.LinkedList; import java.util.List; import java.util.Map; import java.util.NoSuchElementException; import org.biojava.bio.BioException; import org.biojava.bio.BioRuntimeException; import org.biojava.bio.SimpleAnnotation; import org.biojava.bio.seq.Sequence; import org.biojava.bio.seq.SequenceIterator; import org.biojava.bio.seq.db.SequenceDB; import org.biojava.bio.seq.impl.SimpleGappedSequence; import org.biojava.bio.seq.impl.SimpleSequence; import org.biojava.bio.seq.io.SymbolTokenization; import org.biojava.bio.symbol.Alignment; import org.biojava.bio.symbol.SimpleAlignment; import org.biojava.bio.symbol.SimpleSymbolList; /* * Created on 23.06.2005 */ import org.biojava.bio.symbol.SymbolList; /** * Needleman and Wunsch defined the problem of global sequence alignments, from * the first till the last symbol of a sequence. This class is able to perform * such global sequence comparisons efficiently by dynamic programming. If * inserts and deletes are equally expensive and as expensive as the extension * of a gap, the alignment method of this class does not use affine gap * penalties. Otherwise it does. Those costs need four times as much memory, * which has significant effects on the run time, if the computer needs to swap. * * @author Andreas Dräger * @author Gero Greiner * @author Mark Schreiber * @since 1.5 */ public class NeedlemanWunsch extends SequenceAlignment { /** * A matrix with the size length(sequence1) times length(sequence2) */ protected int[][] CostMatrix; /** * A matrix with the size length(alphabet) times length(alphabet) */ protected SubstitutionMatrix subMatrix; /** * The result of a successful alignment */ protected Alignment pairalign; /** * The result of a successful alignment as a simple String. */ protected String alignment; /** * Expenses for inserts. */ private short insert; /** * Expenses for deletes. */ private short delete; /** * Expenses for the extension of a gap. */ private short gapExt; /** * Expenses for matches. */ private short match; /** * Expenses for replaces. */ private short replace; /** * Constructs a new Object with the given parameters based on the * Needleman-Wunsch algorithm The alphabet of sequences to be aligned will be * taken from the given substitution matrix. * * @param match * This gives the costs for a match operation. It is only used, if * there is no entry for a certain match of two symbols in the * substitution matrix (default value). * @param replace * This is like the match parameter just the default, if there is no * entry in the substitution matrix object. * @param insert * The costs of a single insert operation. * @param delete * The expenses of a single delete operation. * @param gapExtend * The expenses of an extension of a existing gap (that is a previous * insert or delete. If the costs for insert and delete are equal and * also equal to gapExtend, no affine gap penalties will be used, * which saves a significant amount of memory. * @param subMat * The substitution matrix object which gives the costs for matches * and replaces. */ public NeedlemanWunsch(short match, short replace, short insert, short delete, short gapExtend, SubstitutionMatrix subMat) { this.subMatrix = subMat; this.insert = insert; this.delete = delete; this.gapExt = gapExtend; this.match = match; this.replace = replace; this.alignment = ""; } /** * Sets the substitution matrix to be used to the specified one. Afterwards it * is only possible to align sequences of the alphabet of this substitution * matrix. * * @param matrix * an instance of a substitution matrix. */ public void setSubstitutionMatrix(SubstitutionMatrix matrix) { this.subMatrix = matrix; } /** * Sets the penalty for an insert operation to the specified value. * * @param ins * costs for a single insert operation */ public void setInsert(short ins) { this.insert = ins; } /** * Sets the penalty for a delete operation to the specified value. * * @param del * costs for a single deletion operation */ public void setDelete(short del) { this.delete = del; } /** * Sets the penalty for an extension of any gap (insert or delete) to the * specified value. * * @param ge * costs for any gap extension */ public void setGapExt(short ge) { this.gapExt = ge; } /** * Sets the penalty for a match operation to the specified value. * * @param ma * costs for a single match operation */ public void setMatch(short ma) { this.match = ma; } /** * Sets the penalty for a replace operation to the specified value. * * @param rep * costs for a single replace operation */ public void setReplace(short rep) { this.replace = rep; } /** * Returns the current expenses of a single insert operation. * * @return insert */ public short getInsert() { return insert; } /** * Returns the current expenses of a single delete operation. * * @return delete */ public short getDelete() { return delete; } /** * Returns the current expenses of any extension of a gap operation. * * @return gapExt */ public short getGapExt() { return gapExt; } /** * Returns the current expenses of a single match operation. * * @return match */ public short getMatch() { return match; } /** * Returns the current expenses of a single replace operation. * * @return replace */ public short getReplace() { return replace; } /** * Prints a String representation of the CostMatrix for the given Alignment on * the screen. This can be used to get a better understanding of the * algorithm. There is no other purpose. This method also works for all * extensions of this class with all kinds of matrices. * * @param CostMatrix * The matrix that contains all expenses for swapping symbols. * @param queryChar * a character representation of the query sequence (mySequence.seqString().toCharArray()). * @param targetChar * a character representation of the target sequence. * @return a String representation of the matrix. */ public static String printCostMatrix(int[][] CostMatrix, char[] queryChar, char[] targetChar) { int line, col; String output = "\t"; for (col = 0; col <= targetChar.length; col++) if (col == 0) output += "[" + col + "]\t"; else output += "[" + targetChar[col - 1] + "]\t"; for (line = 0; line <= queryChar.length; line++) { if (line == 0) output += System.getProperty("line.separator") + "[" + line + "]\t"; else output += System.getProperty("line.separator") + "[" + queryChar[line - 1] + "]\t"; for (col = 0; col <= targetChar.length; col++) output += CostMatrix[line][col] + "\t"; } output += System.getProperty("line.separator") + "delta[Edit] = " + CostMatrix[line - 1][col - 1] + System.getProperty("line.separator"); return output; } /** * prints the alignment String on the screen (standard output). * * @param align * The parameter is typically given by the * {@link #getAlignmentString() getAlignmentString()} method. */ public static void printAlignment(String align) { System.out.print(align); } /** * This method is good if one wants to reuse the alignment calculated by this * class in another BioJava class. It just performs * {@link #pairwiseAlignment(SymbolList, SymbolList) pairwiseAlignment} and * returns an Alignment instance containing the two aligned * sequences. * * @return Alignment object containing the two gapped sequences constructed * from query and target. * @throws Exception */ public Alignment getAlignment(SymbolList query, SymbolList target) throws Exception { pairwiseAlignment(query, target); return pairalign; } /** * This gives the edit distance according to the given parameters of this * certain object. It returns just the last element of the internal cost * matrix (left side down). So if you extend this class, you can just do the * following: * int myDistanceValue = foo; this.CostMatrix = new int[1][1]; this.CostMatrix[0][0] = myDistanceValue; * * @return returns the edit_distance computed with the given parameters. */ public int getEditDistance() { return CostMatrix[CostMatrix.length - 1][CostMatrix[CostMatrix.length - 1].length - 1]; } /** * This just computes the minimum of three integer values. * * @param x * @param y * @param z * @return Gives the minimum of three integers */ protected static int min(int x, int y, int z) { if ((x < y) && (x < z)) return x; if (y < z) return y; return z; } /* * (non-Javadoc) * * @see toolbox.align.SequenceAlignment#getAlignment() */ public String getAlignmentString() throws BioException { return alignment; } /* * (non-Javadoc) * * @see toolbox.align.SequenceAlignment#alignAll(org.biojava.bio.seq.SequenceIterator, * org.biojava.bio.seq.db.SequenceDB) */ public List alignAll(SequenceIterator source, SequenceDB subjectDB) throws NoSuchElementException, BioException { List l = new LinkedList(); while (source.hasNext()) { Sequence query = source.nextSequence(); // compare all the sequences of both sets. SequenceIterator target = subjectDB.sequenceIterator(); while (target.hasNext()) try { l.add(getAlignment(query, target.nextSequence())); // pairwiseAlignment(query, target.nextSequence()); } catch (Exception exc) { exc.printStackTrace(); } } return l; } /** * Global pairwise sequence alignment of two BioJava-Sequence objects * according to the Needleman-Wunsch-algorithm. * * @see org.biojava.bio.alignment.SequenceAlignment#pairwiseAlignment(org.biojava.bio.symbol.SymbolList, * org.biojava.bio.symbol.SymbolList) */ public int pairwiseAlignment(SymbolList query, SymbolList subject) throws BioRuntimeException { Sequence squery = null; Sequence ssubject = null; if(query instanceof Sequence){ squery = (Sequence)query; }else{ //make it a sequence squery = new SimpleSequence(query, "", "query", new SimpleAnnotation()); } if(subject instanceof Sequence){ ssubject = (Sequence)subject; }else{ //make it a sequence ssubject = new SimpleSequence(subject, "", "subject", new SimpleAnnotation()); } if (squery.getAlphabet().equals(ssubject.getAlphabet()) && squery.getAlphabet().equals(subMatrix.getAlphabet())) { long time = System.currentTimeMillis(); int i, j; this.CostMatrix = new int[squery.length() + 1][ssubject.length() + 1]; // Matrix // CostMatrix /* * Variables for the traceback */ String[] align = new String[] {"", ""}; String path = ""; // construct the matrix: CostMatrix[0][0] = 0; /* * If we want to have affine gap penalties, we have to initialise * additional matrices: If this is not necessary, we won't do that * (because it's expensive). */ if ((gapExt != delete) || (gapExt != insert)) { int[][] E = new int[squery.length() + 1][ssubject.length() + 1]; // Inserts int[][] F = new int[squery.length() + 1][ssubject.length() + 1]; // Deletes E[0][0] = F[0][0] = Integer.MAX_VALUE; //Double.MAX_VALUE; for (i = 1; i <= squery.length(); i++) { // CostMatrix[i][0] = CostMatrix[i-1][0] + delete; E[i][0] = Integer.MAX_VALUE; //Double.POSITIVE_INFINITY; CostMatrix[i][0] = F[i][0] = delete + i * gapExt; } for (j = 1; j <= ssubject.length(); j++) { // CostMatrix[0][j] = CostMatrix[0][j - 1] + insert; F[0][j] = Integer.MAX_VALUE; //Double.POSITIVE_INFINITY; CostMatrix[0][j] = E[0][j] = insert + j * gapExt; } for (i = 1; i <= squery.length(); i++) for (j = 1; j <= ssubject.length(); j++) { E[i][j] = Math.min(E[i][j - 1], CostMatrix[i][j - 1] + insert) + gapExt; F[i][j] = Math.min(F[i - 1][j], CostMatrix[i - 1][j] + delete) + gapExt; CostMatrix[i][j] = min(E[i][j], F[i][j], CostMatrix[i - 1][j - 1] - matchReplace(squery, ssubject, i, j)); } /* * Traceback for affine gap penalties. */ try { boolean[] gap_extend = {false, false}; j = this.CostMatrix[CostMatrix.length - 1].length - 1; SymbolTokenization st = subMatrix.getAlphabet().getTokenization( "default"); for (i = this.CostMatrix.length - 1; i > 0;) { do { // only Insert. if (i == 0) { align[0] = '~' + align[0]; align[1] = st.tokenizeSymbol(ssubject.symbolAt(j--)) + align[1]; path = ' ' + path; // only Delete. } else if (j == 0) { align[0] = st.tokenizeSymbol(squery.symbolAt(i--)) + align[0]; align[1] = '~' + align[1]; path = ' ' + path; // Match/Replace } else if ((CostMatrix[i][j] == CostMatrix[i - 1][j - 1] - matchReplace(squery, ssubject, i, j)) && !(gap_extend[0] || gap_extend[1])) { if (squery.symbolAt(i) == ssubject.symbolAt(j)) path = '|' + path; else path = ' ' + path; align[0] = st.tokenizeSymbol(squery.symbolAt(i--)) + align[0]; align[1] = st.tokenizeSymbol(ssubject.symbolAt(j--)) + align[1]; // Insert || finish gap if extended gap is opened } else if (CostMatrix[i][j] == E[i][j] || gap_extend[0]) { // check if gap has been extended or freshly opened gap_extend[0] = (E[i][j] != CostMatrix[i][j - 1] + insert + gapExt); align[0] = '-' + align[0]; align[1] = st.tokenizeSymbol(ssubject.symbolAt(j--)) + align[1]; path = ' ' + path; // Delete || finish gap if extended gap is opened } else { // check if gap has been extended or freshly opened gap_extend[1] = (F[i][j] != CostMatrix[i - 1][j] + delete + gapExt); align[0] = st.tokenizeSymbol(squery.symbolAt(i--)) + align[0]; align[1] = '-' + align[1]; path = ' ' + path; } } while (j > 0); } } catch (BioException exc) { throw new BioRuntimeException(exc); } /* * No affine gap penalties, constant gap penalties, which is much faster * and needs less memory. */ } else { for (i = 1; i <= squery.length(); i++) CostMatrix[i][0] = CostMatrix[i - 1][0] + delete; for (j = 1; j <= ssubject.length(); j++) CostMatrix[0][j] = CostMatrix[0][j - 1] + insert; for (i = 1; i <= squery.length(); i++) for (j = 1; j <= ssubject.length(); j++) { CostMatrix[i][j] = min(CostMatrix[i - 1][j] + delete, CostMatrix[i][j - 1] + insert, CostMatrix[i - 1][j - 1] - matchReplace(squery, ssubject, i, j)); } /* * Traceback for constant gap penalties. */ try { j = this.CostMatrix[CostMatrix.length - 1].length - 1; SymbolTokenization st = subMatrix.getAlphabet().getTokenization( "default"); // System.out.println(printCostMatrix(CostMatrix, // query.seqString().toCharArray(), // subject.seqString().toCharArray())); for (i = this.CostMatrix.length - 1; i > 0;) { do { // only Insert. if (i == 0) { align[0] = '~' + align[0]; align[1] = st.tokenizeSymbol(ssubject.symbolAt(j--)) + align[1]; path = ' ' + path; // only Delete. } else if (j == 0) { align[0] = st.tokenizeSymbol(squery.symbolAt(i--)) + align[0]; align[1] = '~' + align[1]; path = ' ' + path; // Match/Replace } else if (CostMatrix[i][j] == CostMatrix[i - 1][j - 1] - matchReplace(squery, ssubject, i, j)) { if (squery.symbolAt(i) == ssubject.symbolAt(j)) path = '|' + path; else path = ' ' + path; align[0] = st.tokenizeSymbol(squery.symbolAt(i--)) + align[0]; align[1] = st.tokenizeSymbol(ssubject.symbolAt(j--)) + align[1]; // Insert } else if (CostMatrix[i][j] == CostMatrix[i][j - 1] + insert) { align[0] = '-' + align[0]; align[1] = st.tokenizeSymbol(ssubject.symbolAt(j--)) + align[1]; path = ' ' + path; // Delete } else { align[0] = st.tokenizeSymbol(squery.symbolAt(i--)) + align[0]; align[1] = '-' + align[1]; path = ' ' + path; } } while (j > 0); } } catch (BioException exc) { throw new BioRuntimeException(exc); } } /* * From here both cases are equal again. */ try { squery = new SimpleGappedSequence(new SimpleSequence( new SimpleSymbolList(squery.getAlphabet().getTokenization("token"), align[0]), squery.getURN(), squery.getName(), squery .getAnnotation())); ssubject = new SimpleGappedSequence(new SimpleSequence( new SimpleSymbolList( ssubject.getAlphabet().getTokenization("token"), align[1]), ssubject.getURN(), ssubject.getName(), ssubject.getAnnotation())); Map m = new HashMap(); m.put(squery.getName(), squery); m.put(ssubject.getName(), ssubject); pairalign = new SimpleAlignment(m); // this.printCostMatrix(queryChar, targetChar); // only for tests // important this.alignment = formatOutput(squery.getName(), // name of the query // sequence ssubject.getName(), // name of the target sequence align, // the String representation of the alignment path, // String match/missmatch representation 0, // Start position of the alignment in the query sequence CostMatrix.length - 1, // End position of the alignment in the // query sequence CostMatrix.length - 1, // length of the query sequence 0, // Start position of the alignment in the target sequence CostMatrix[0].length - 1, // End position of the alignment in the // target sequence CostMatrix[0].length - 1, // length of the target sequence getEditDistance(), // the edit distance System.currentTimeMillis() - time) + System.getProperty("line.separator"); // time consumption // System.out.println(printCostMatrix(CostMatrix, // query.seqString().toCharArray(), subject.seqString().toCharArray())); return getEditDistance(); } catch (BioException exc) { throw new BioRuntimeException(exc); } } else throw new BioRuntimeException( "Alphabet missmatch occured: sequences with different alphabet cannot be aligned."); } /** * This method computes the scores for the substitution of the i-th symbol of * query by the j-th symbol of subject. * * @param query * The query sequence * @param subject * The target sequence * @param i * The position of the symbol under consideration within the query * sequence (starting from one) * @param j * The position of the symbol under consideration within the target * sequence * @return The score for the given substitution. */ private int matchReplace(Sequence query, Sequence subject, int i, int j) { try { return subMatrix.getValueAt(query.symbolAt(i), subject.symbolAt(j)); } catch (Exception exc) { if (query.symbolAt(i).getMatches().contains(subject.symbolAt(j)) || subject.symbolAt(j).getMatches().contains(query.symbolAt(i))) return -match; return -replace; } } }