/*
* BioJava development code
*
* This code may be freely distributed and modified under the
* terms of the GNU Lesser General Public Licence. This should
* be distributed with the code. If you do not have a copy,
* see:
*
* http://www.gnu.org/copyleft/lesser.html
*
* Copyright for this code is held jointly by the individual
* authors. These should be listed in @author doc comments.
*
* For more information on the BioJava project and its aims,
* or to join the biojava-l mailing list, visit the home page
* at:
*
* http://www.biojava.org/
*
*/
package org.biojava.bio.alignment;
import java.util.HashMap;
import java.util.Map;
import org.biojava.bio.BioException;
import org.biojava.bio.BioRuntimeException;
import org.biojava.bio.SimpleAnnotation;
import org.biojava.bio.seq.Sequence;
import org.biojava.bio.seq.impl.SimpleGappedSequence;
import org.biojava.bio.seq.impl.SimpleSequence;
import org.biojava.bio.seq.io.SymbolTokenization;
import org.biojava.bio.symbol.SimpleAlignment;
import org.biojava.bio.symbol.SimpleSymbolList;
/*
* Created on 05.09.2005
*/
import org.biojava.bio.symbol.SymbolList;
/**
* Smith and Waterman developed an efficient dynamic programming algorithm to
* perform local sequence alignments, which returns the most conserved region of
* two sequences (longest common substring with modifications). This algorithm is
* performed by the method pairwiseAlignment
of this class. It
* uses affine gap penalties if and only if the expenses of a delete or insert
* operation are unequal to the expenses of gap extension. This uses
* significantly more memory (four times as much) and increases the runtime if
* swapping is performed.
*
* @author Andreas Dräger
* @author Gero Greiner
* @author Mark Schreiber
* @since 1.5
*/
public class SmithWaterman extends NeedlemanWunsch {
private short match, replace, insert, delete, gapExt;
/**
* Constructs the new SmithWaterman alignment object. Alignments are only
* performed, if the alphabet of the given SubstitutionMatrix
* equals the alphabet of both the query and the target Sequence
.
* The alignment parameters here are expenses and not scores as they are in
* the NeedlemanWunsch
object. scores are just given by
* multiplying the expenses with (-1)
. For example you could
* use parameters like "-2, 5, 3, 3, 0". If the expenses for gap extension are
* equal to the cost of starting a gap (delete or insert), no affine gap
* penalties are used, which saves memory.
*
* @param match
* expenses for a match
* @param replace
* expenses for a replace operation
* @param insert
* expenses for a gap opening in the query sequence
* @param delete
* expenses for a gap opening in the target sequence
* @param gapExtend
* expenses for the extension of a gap which was started earlier.
* @param matrix
* the SubstitutionMatrix
object to use.
*/
public SmithWaterman(short match, short replace, short insert, short delete,
short gapExtend, SubstitutionMatrix matrix) {
super(insert, delete, gapExtend, match, replace, matrix);
this.match = (short) -match;
this.replace = (short) -replace;
this.insert = (short) -insert;
this.delete = (short) -delete;
this.gapExt = (short) -gapExtend;
this.subMatrix = matrix;
this.alignment = "";
}
/**
* Overrides the method inherited from the NeedlemanWunsch and sets the
* penalty for an insert operation to the specified value. Reason: internally
* scores are used instead of penalties so that the value is muliplyed with
* -1.
*
* @param ins
* costs for a single insert operation
*/
@Override
public void setInsert(short ins) {
this.insert = (short) -ins;
}
/**
* Overrides the method inherited from the NeedlemanWunsch and sets the
* penalty for a delete operation to the specified value. Reason: internally
* scores are used instead of penalties so that the value is muliplyed with
* -1.
*
* @param del
* costs for a single deletion operation
*/
@Override
public void setDelete(short del) {
this.delete = (short) -del;
}
/**
* Overrides the method inherited from the NeedlemanWunsch and sets the
* penalty for an extension of any gap (insert or delete) to the specified
* value. Reason: internally scores are used instead of penalties so that the
* value is muliplyed with -1.
*
* @param ge
* costs for any gap extension
*/
@Override
public void setGapExt(short ge) {
this.gapExt = (short) -ge;
}
/**
* Overrides the method inherited from the NeedlemanWunsch and sets the
* penalty for a match operation to the specified value. Reason: internally
* scores are used instead of penalties so that the value is muliplyed with
* -1.
*
* @param ma
* costs for a single match operation
*/
@Override
public void setMatch(short ma) {
this.match = (short) -ma;
}
/**
* Overrides the method inherited from the NeedlemanWunsch and sets the
* penalty for a replace operation to the specified value. Reason: internally
* scores are used instead of penalties so that the value is muliplyed with
* -1.
*
* @param rep
* costs for a single replace operation
*/
@Override
public void setReplace(short rep) {
this.replace = (short) -rep;
}
/**
* Overrides the method inherited from the NeedlemanWunsch and performs only a
* local alignment. It finds only the longest common subsequence. This is good
* for the beginning, but it might be better to have a system to find more
* than only one hit within the score matrix. Therefore, one should only define
* the k-th best hit, where k is somehow related to the number of hits.
*
* @see SequenceAlignment#pairwiseAlignment(org.biojava.bio.symbol.SymbolList,
* org.biojava.bio.symbol.SymbolList)
*/
@Override
public int pairwiseAlignment(SymbolList query, SymbolList subject)
throws BioRuntimeException {
int[][] scoreMatrix;
Sequence squery = null;
Sequence ssubject = null;
if (query instanceof Sequence) {
squery = (Sequence) query;
} else {
// make it a sequence
squery = new SimpleSequence(query, "", "query", new SimpleAnnotation());
}
if (subject instanceof Sequence) {
ssubject = (Sequence) subject;
} else {
// make it a sequence
ssubject = new SimpleSequence(subject, "", "subject",
new SimpleAnnotation());
}
if (squery.getAlphabet().equals(ssubject.getAlphabet())
&& squery.getAlphabet().equals(subMatrix.getAlphabet())) {
long time = System.currentTimeMillis();
int i, j, maxI = 0, maxJ = 0, queryStart = 0, targetStart = 0;
scoreMatrix = new int[squery.length() + 1][ssubject.length() + 1];
/*
* Variables needed for traceback
*/
String[] align = new String[] {"", ""};
String path = "";
SymbolTokenization st;
try {
st = squery.getAlphabet().getTokenization("default");
} catch (BioException exc) {
throw new BioRuntimeException(exc);
}
/*
* Use affine gap panalties.
*/
if ((gapExt != delete) || (gapExt != insert)) {
int[][] E = new int[squery.length() + 1][ssubject.length() + 1]; // Inserts
int[][] F = new int[squery.length() + 1][ssubject.length() + 1]; // Deletes
scoreMatrix[0][0] = 0;
E[0][0] = F[0][0] = Integer.MIN_VALUE; // Double.NEGATIVE_INFINITY;
for (i = 1; i <= squery.length(); i++) {
scoreMatrix[i][0] = F[i][0] = 0;
E[i][0] = Integer.MIN_VALUE; // Double.NEGATIVE_INFINITY;
}
for (j = 1; j <= ssubject.length(); j++) {
scoreMatrix[0][j] = E[0][j] = 0;
F[0][j] = Integer.MIN_VALUE; // Double.NEGATIVE_INFINITY;
}
for (i = 1; i <= squery.length(); i++)
for (j = 1; j <= ssubject.length(); j++) {
E[i][j] = Math.max(E[i][j - 1], scoreMatrix[i][j - 1] + insert)
+ gapExt;
F[i][j] = Math.max(F[i - 1][j], scoreMatrix[i - 1][j] + delete)
+ gapExt;
scoreMatrix[i][j] = max(0, E[i][j], F[i][j],
scoreMatrix[i - 1][j - 1]
+ matchReplace(squery, ssubject, i, j));
if (scoreMatrix[i][j] > scoreMatrix[maxI][maxJ]) {
maxI = i;
maxJ = j;
}
}
// System.out.println(printCostMatrix(G,
// query.seqString().toCharArray(), subject.seqString().toCharArray()));
/*
* Here starts the traceback for affine gap penalities
*/
try {
boolean[] gap_extend = {false, false};
j = maxJ;
for (i = maxI; i > 0;) {
do {
// only Deletes or Inserts or Replaces possible. That's not what
// we want to have.
if (scoreMatrix[i][j] == 0) {
queryStart = i;
targetStart = j;
i = j = 0;
// Match/Replace
} else if ((scoreMatrix[i][j] == scoreMatrix[i - 1][j - 1]
+ matchReplace(squery, ssubject, i, j))
&& !(gap_extend[0] || gap_extend[1])) {
if (squery.symbolAt(i) == ssubject.symbolAt(j))
path = '|' + path;
else path = ' ' + path;
align[0] = st.tokenizeSymbol(squery.symbolAt(i--)) + align[0];
align[1] = st.tokenizeSymbol(ssubject.symbolAt(j--)) + align[1];
// Insert || finish gap if extended gap is opened
} else if (scoreMatrix[i][j] == E[i][j] || gap_extend[0]) {
// check if gap has been extended or freshly opened
gap_extend[0] = (E[i][j] != scoreMatrix[i][j - 1] + insert
+ gapExt);
align[0] = '-' + align[0];
align[1] = st.tokenizeSymbol(ssubject.symbolAt(j--)) + align[1];
path = ' ' + path;
// Delete || finish gap if extended gap is opened
} else {
// check if gap has been extended or freshly opened
gap_extend[1] = (F[i][j] != scoreMatrix[i - 1][j] + delete
+ gapExt);
align[0] = st.tokenizeSymbol(squery.symbolAt(i--)) + align[0];
align[1] = '-' + align[1];
path = ' ' + path;
}
} while (j > 0);
}
} catch (BioException exc) {
throw new BioRuntimeException(exc);
}
/*
* No affine gap penalties to save memory.
*/
} else {
for (i = 0; i <= squery.length(); i++)
scoreMatrix[i][0] = 0;
for (j = 0; j <= ssubject.length(); j++)
scoreMatrix[0][j] = 0;
for (i = 1; i <= squery.length(); i++)
for (j = 1; j <= ssubject.length(); j++) {
scoreMatrix[i][j] = max(0, scoreMatrix[i - 1][j] + delete,
scoreMatrix[i][j - 1] + insert, scoreMatrix[i - 1][j - 1]
+ matchReplace(squery, ssubject, i, j));
if (scoreMatrix[i][j] > scoreMatrix[maxI][maxJ]) {
maxI = i;
maxJ = j;
}
}
/*
* Here starts the traceback for non-affine gap penalities
*/
try {
j = maxJ;
for (i = maxI; i > 0;) {
do {
// only Deletes or Inserts or Replaces possible. That's not what
// we want to have.
if (scoreMatrix[i][j] == 0) {
queryStart = i;
targetStart = j;
i = j = 0;
// Match/Replace
} else if (scoreMatrix[i][j] == scoreMatrix[i - 1][j - 1]
+ matchReplace(squery, ssubject, i, j)) {
if (squery.symbolAt(i) == ssubject.symbolAt(j))
path = '|' + path;
else path = ' ' + path;
align[0] = st.tokenizeSymbol(squery.symbolAt(i--)) + align[0];
align[1] = st.tokenizeSymbol(ssubject.symbolAt(j--)) + align[1];
// Insert
} else if (scoreMatrix[i][j] == scoreMatrix[i][j - 1] + insert) {
align[0] = '-' + align[0];
align[1] = st.tokenizeSymbol(ssubject.symbolAt(j--)) + align[1];
path = ' ' + path;
// Delete
} else {
align[0] = st.tokenizeSymbol(squery.symbolAt(i--)) + align[0];
align[1] = '-' + align[1];
path = ' ' + path;
}
} while (j > 0);
}
} catch (BioException exc) {
throw new BioRuntimeException(exc);
}
}
/*
* From here both cases are equal again.
*/
// System.out.println(printCostMatrix(scoreMatrix,
// query.seqString().toCharArray(), subject.seqString().toCharArray()));
try {
// this is necessary to have a value for the getEditDistance method.
this.CostMatrix = new int[1][1];
CostMatrix[0][0] = -scoreMatrix[maxI][maxJ];
squery = new SimpleGappedSequence(new SimpleSequence(
new SimpleSymbolList(squery.getAlphabet().getTokenization("token"),
align[0]), squery.getURN(), squery.getName(), squery
.getAnnotation()));
ssubject = new SimpleGappedSequence(new SimpleSequence(
new SimpleSymbolList(ssubject.getAlphabet()
.getTokenization("token"), align[1]), ssubject.getURN(),
ssubject.getName(), ssubject.getAnnotation()));
Map m = new HashMap();
m.put(squery.getName(), squery);
m.put(ssubject.getName(), ssubject);
pairalign = new SimpleAlignment(m);
/*
* Construct the output with only 60 symbols in each line.
*/
this.alignment = formatOutput(squery.getName(), // name of the query
// sequence
ssubject.getName(), // name of the target sequence
align, // the String representation of the alignment
path, // String match/missmatch representation
queryStart, // Start position of the alignment in the query sequence
maxI, // End position of the alignment in the query sequence
scoreMatrix.length - 1, // length of the query sequence
targetStart, // Start position of the alignment in the target
// sequence
maxJ, // End position of the alignment in the target sequence
scoreMatrix[0].length - 1, // length of the target sequence
getEditDistance(), System.currentTimeMillis() - time)
+ System.getProperty("line.separator"); // time consumption
// Don't waste any memory.
int value = scoreMatrix[maxI][maxJ];
scoreMatrix = null;
// Runtime.getRuntime().gc();
return value;
} catch (BioException exc) {
throw new BioRuntimeException(exc);
}
} else throw new BioRuntimeException(
"The alphabets of the sequences and the substitution matrix have to be equal.");
}
/**
* This just computes the maximum of four integers.
*
* @param w
* @param x
* @param y
* @param z
* @return the maximum of four int
s.
*/
private int max(int w, int x, int y, int z) {
if ((w > x) && (w > y) && (w > z)) return w;
if ((x > y) && (x > z)) return x;
if ((y > z)) return y;
return z;
}
/**
* This method computes the scores for the substitution of the i-th symbol of
* query by the j-th symbol of subject.
*
* @param query
* The query sequence
* @param subject
* The target sequence
* @param i
* The position of the symbol under consideration within the query
* sequence (starting from one)
* @param j
* The position of the symbol under consideration within the target
* sequence
* @return The score for the given substitution.
*/
private short matchReplace(Sequence query, Sequence subject, int i, int j) {
try {
return subMatrix.getValueAt(query.symbolAt(i), subject.symbolAt(j));
} catch (Exception exc) {
if (query.symbolAt(i).getMatches().contains(subject.symbolAt(j))
|| subject.symbolAt(j).getMatches().contains(query.symbolAt(i)))
return match;
return replace;
}
}
}