/*
* BioJava development code
*
* This code may be freely distributed and modified under the
* terms of the GNU Lesser General Public Licence. This should
* be distributed with the code. If you do not have a copy,
* see:
*
* http://www.gnu.org/copyleft/lesser.html
*
* Copyright for this code is held jointly by the individual
* authors. These should be listed in @author doc comments.
*
* For more information on the BioJava project and its aims,
* or to join the biojava-l mailing list, visit the home page
* at:
*
* http://www.biojava.org/
*
*/
/*
* Created on 2005-08-01
*/
package org.biojava.bio.alignment;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileReader;
import java.io.IOException;
import java.io.Reader;
import java.io.StringReader;
import java.util.HashMap;
import java.util.Iterator;
import java.util.Map;
import java.util.NoSuchElementException;
import java.util.StringTokenizer;
import org.biojava.bio.BioException;
import org.biojava.bio.seq.io.SymbolTokenization;
import org.biojava.bio.symbol.AlphabetManager;
import org.biojava.bio.symbol.FiniteAlphabet;
import org.biojava.bio.symbol.IllegalSymbolException;
import org.biojava.bio.symbol.Symbol;
/**
*
* This object is able to read a substitution matrix file and constructs a short
* matrix in memory. Every single element of the matrix can be accessed by the
* method getValueAt
with the parameters being two BioJava
* symbols. This is why it is not necessary to access the matrix directly. If
* there is no value for the two specified Symbol
s an
* Exception
is thrown.
*
*
* Substitution matrix files, are available at the NCBI FTP directory.
*
*
* @author Andreas Dräger
*/
public class SubstitutionMatrix {
protected Map rowSymbols, colSymbols;
protected short[][] matrix;
protected short min, max;
protected FiniteAlphabet alphabet;
protected String description, name;
private static final String newLine = System
.getProperty("line.separator");
/**
* This constructs a SubstitutionMatrix
object that contains
* two Map
data structures having BioJava symbols as keys and
* the value being the index of the matrix containing the substitution score.
*
* @param alpha
* the alphabet of the matrix (e.g., DNA, RNA or PROTEIN, or
* PROTEIN-TERM)
* @param matrixFile
* the file containing the substitution matrix. Lines starting with '#
'
* are comments. The line starting with a white space, is the table
* head. Every line has to start with the one letter representation
* of the Symbol and then the values for the exchange.
* @throws IOException
* @throws BioException
* @throws NumberFormatException
*/
public SubstitutionMatrix(FiniteAlphabet alpha, File matrixFile)
throws BioException, NumberFormatException, IOException {
this.alphabet = alpha;
this.description = "";
this.name = matrixFile.getName();
this.rowSymbols = new HashMap();
this.colSymbols = new HashMap();
this.matrix = this.parseMatrix(matrixFile);
}
/**
* With this constructor it is possible to construct a SubstitutionMatrix
* object from a substitution matrix file. The given String contains a number
* of lines separated by System.getProperty("line.separator")
.
* Everything else is the same than for the constructor above.
*
* @param alpha
* The FiniteAlphabet
to use
* @param matrixString
* @param name
* of the matrix.
* @throws BioException
* @throws IOException
* @throws NumberFormatException
*/
public SubstitutionMatrix(FiniteAlphabet alpha, String matrixString,
String name) throws BioException, NumberFormatException, IOException {
this.alphabet = alpha;
this.description = "";
this.name = name;
this.rowSymbols = new HashMap();
this.colSymbols = new HashMap();
this.matrix = this.parseMatrix(matrixString);
// this.printMatrix();
}
/**
* Constructs a SubstitutionMatrix with every Match and every Replace having
* the same expenses given by the parameters. Ambiguous symbols are not
* considered because there might be to many of them (for proteins).
*
* @param alpha
* @param match
* @param replace
*/
public SubstitutionMatrix(FiniteAlphabet alpha, short match, short replace) {
int i = 0, j = 0;
this.alphabet = alpha;
this.description = "Identity matrix. All replaces and all matches are treated equally.";
this.name = "IDENTITY_" + match + "_" + replace;
this.rowSymbols = new HashMap();
this.colSymbols = new HashMap();
this.matrix = new short[alpha.size()][alpha.size()];
Symbol[] sym = new Symbol[alpha.size()];
Iterator iter = alpha.iterator();
for (i = 0; iter.hasNext(); i++) {
sym[i] = iter.next();
rowSymbols.put(sym[i], new Integer(i));
colSymbols.put(sym[i], new Integer(i));
}
for (i = 0; i < alphabet.size(); i++)
for (j = 0; j < alphabet.size(); j++)
if (sym[i].getMatches().contains(sym[j]))
matrix[i][j] = match;
else matrix[i][j] = replace;
// this.printMatrix();
}
/**
* This constructor can be used to guess the alphabet of this substitution
* matrix. However, it is recommended to apply another constructor if the
* alphabet is known.
*
* @param file
* A file containing a substitution matrix.
* @throws NumberFormatException
* @throws NoSuchElementException
* @throws BioException
* @throws IOException
*/
public SubstitutionMatrix(File file) throws NumberFormatException,
NoSuchElementException, BioException, IOException {
this(guessAlphabet(file), file);
}
/**
* This constructor can be used to guess the alphabet of this substitution
* matrix. However, it is recommended to apply another constructor if the
* alphabet is known.
*
* @param reader
* @throws NumberFormatException
* @throws BioException
* @throws IOException
*/
public static SubstitutionMatrix getSubstitutionMatrix(BufferedReader reader)
throws NumberFormatException, BioException, IOException {
StringBuffer stringMatrix = new StringBuffer("");
while (reader.ready()) {
stringMatrix.append(reader.readLine());
stringMatrix.append(newLine);
}
reader.close();
String mat = stringMatrix.toString();
FiniteAlphabet alpha = guessAlphabet(new BufferedReader(new StringReader(mat)));
SubstitutionMatrix matrix = new SubstitutionMatrix(alpha, mat, "unknown");
return matrix;
}
/**
* This method tries to identify the alphabet within a matrix file. This is
* necessary in cases where we do not know if this is a matrix for DNA, RNA or
* PROTEIN/PROTEIN-TERM.
*
* @param file
* @return
* @throws IOException
* @throws BioException
* @throws NoSuchElementException
* @throws BioException
*/
private static FiniteAlphabet guessAlphabet(File file) throws IOException,
NoSuchElementException, BioException {
String fileName = file.getName().toLowerCase();
if (fileName.contains("pam") || fileName.contains("blosum"))
return (FiniteAlphabet) AlphabetManager.alphabetForName("PROTEIN-TERM");
return guessAlphabet(new BufferedReader(new FileReader(file)));
}
/**
* This method guesses the alphabet of the given substituttion matrix which is
* required for the parser.
*
* @param reader
* @return
* @throws IOException
* @throws BioException
*/
private static FiniteAlphabet guessAlphabet(BufferedReader reader)
throws IOException, BioException {
String line, trim;
FiniteAlphabet alphabet = null;
while (reader.ready()) {
line = reader.readLine();
if (line == null) break;
trim = line.trim();
if (trim.charAt(0) == '#')
continue;
else if ((line.charAt(0) == ' ') || (line.charAt(0) == '\t')) {
String alphabets[] = new String[] {"DNA", "RNA", "PROTEIN",
"PROTEIN-TERM"};
SymbolTokenization symtok;
for (int i = 0; i < alphabets.length; i++) {
alphabet = (FiniteAlphabet) AlphabetManager
.alphabetForName(alphabets[i]);
symtok = alphabet.getTokenization("token");
StringTokenizer st = new StringTokenizer(trim);
boolean noError = true;
for (int j = 0; st.hasMoreElements(); j++)
try {
symtok.parseToken(st.nextElement().toString());
} catch (IllegalSymbolException exc) {
noError = false;
break;
}
if (noError) return alphabet;
}
}
}
throw new BioException("Unknow alphabet used in this substitution matrix");
}
/**
* Reads a String representing the contents of a substitution matrix file.
*
* @param matrixObj
* @return matrix
* @throws BioException
* @throws IOException
* @throws NumberFormatException
*/
private short[][] parseMatrix(Object matrixObj) throws BioException,
NumberFormatException, IOException {
int j = 0, rows = 0, cols = 0;
SymbolTokenization symtok = alphabet.getTokenization("token");
StringTokenizer st;
String line, trim;
this.min = Short.MAX_VALUE;
this.max = Short.MIN_VALUE;
/*
* First: count how many elements are in the matrix fill lines and rows
*/
Reader reader;
if (matrixObj instanceof File)
reader = new FileReader((File) matrixObj);
else if (matrixObj instanceof String)
reader = new StringReader(matrixObj.toString());
else return null;
BufferedReader br = new BufferedReader(reader);
while (br.ready()) {
line = br.readLine();
if (line == null) break;
trim = line.trim();
if (trim.charAt(0) == '#') {
description += line.substring(1);
continue;
} else if (!trim.startsWith(newLine)) {
if ((line.charAt(0) == ' ') || (line.charAt(0) == '\t')) {
st = new StringTokenizer(trim);
for (j = 0; st.hasMoreElements(); j++) {
colSymbols.put(symtok.parseToken(st.nextElement().toString()),
Integer.valueOf(j));
}
cols = j;
} else {
// the matrix.
st = new StringTokenizer(line);
if (st.hasMoreElements())
rowSymbols.put(symtok.parseToken(st.nextElement().toString()),
Integer.valueOf(rows++));
}
}
}
br.close();
short[][] matrix = new short[rows][cols];
rows = 0;
if (matrixObj instanceof File)
reader = new FileReader((File) matrixObj);
else if (matrixObj instanceof String)
reader = new StringReader(matrixObj.toString());
else return null;
br = new BufferedReader(reader);
/*
* Second reading. Fill the matrix.
*/
while (br.ready()) {
line = br.readLine();
if (line == null) break;
trim = line.trim();
if (trim.charAt(0) == '#')
continue;
else if ((line.charAt(0) == ' ') || (line.charAt(0) == '\t'))
continue;
else if (!trim.startsWith(newLine)) { // lines:
st = new StringTokenizer(trim);
if (st.hasMoreElements()) st.nextElement(); // throw away Symbol at
// beginning.
for (j = 0; st.hasMoreElements(); j++) {// cols:
matrix[rows][j] = (short) Math.round(Double.parseDouble(st
.nextElement().toString()));
if (matrix[rows][j] > max) max = matrix[rows][j]; // maximum.
if (matrix[rows][j] < min) min = matrix[rows][j]; // minimum.
}
rows++;
}
}
br.close();
return matrix;
}
/**
* There are some substitution matrices containing more columns than lines.
* This has to do with the ambiguous symbols. Lines are always good, columns
* might not contain the whole information. The matrix is supposed to be
* symmetric anyway, so you can always set the ambiguous symbol to be the
* first argument.
*
* @param row
* Symbol of the line
* @param col
* Symbol of the column
* @return expenses for the exchange of symbol row and symbol column.
* @throws BioException
*/
public short getValueAt(Symbol row, Symbol col) throws BioException {
if ((!rowSymbols.containsKey(row)) || (!colSymbols.containsKey(col)))
throw new BioException("No entry for the sybols " + row.getName()
+ " and " + col.getName());
return matrix[rowSymbols.get(row).intValue()][colSymbols.get(col)
.intValue()];
}
/**
* This gives you the description of this matrix if there is one. Normally
* substitution matrix files like BLOSUM contain some lines of description.
*
* @return the comment of the matrix
*/
public String getDescription() {
return description;
}
/**
* Every substitution matrix has a name like "BLOSUM30" or "PAM160". This will
* be returned by this method.
*
* @return the name of the matrix.
*/
public String getName() {
return name;
}
/**
* The minimum score of this matrix.
*
* @return minimum of the matrix.
*/
public short getMin() {
return min;
}
/**
* The maximum score in this matrix.
*
* @return maximum of the matrix.
*/
public short getMax() {
return max;
}
/**
* Sets the description to the given value.
*
* @param desc
* a description. This doesn't have to start with '#'.
*/
public void setDescription(String desc) {
this.description = desc;
}
/**
* Gives the alphabet used by this matrix.
*
* @return the alphabet of this matrix.
*/
public FiniteAlphabet getAlphabet() {
return alphabet;
}
/**
* Creates a String
representation of this matrix.
*
* @return a string representation of this matrix without the description.
*/
public String stringnifyMatrix() {
int i = 0;
StringBuffer matrixString = new StringBuffer();
Symbol[] colSyms = new Symbol[this.colSymbols.keySet().size()];
try {
SymbolTokenization symtok = alphabet.getTokenization("default");
matrixString.append(" ");
Iterator colKeys = colSymbols.keySet().iterator();
while (colKeys.hasNext()) {
colSyms[i] = colKeys.next();
matrixString.append(symtok.tokenizeSymbol(colSyms[i++]).toUpperCase());
matrixString.append(' ');
}
matrixString.append(newLine);
Iterator rowKeys = rowSymbols.keySet().iterator();
while (rowKeys.hasNext()) {
Symbol rowSym = rowKeys.next();
matrixString.append(symtok.tokenizeSymbol(rowSym).toUpperCase());
matrixString.append(' ');
for (i = 0; i < colSyms.length; i++) {
matrixString.append(getValueAt(rowSym, colSyms[i]));
matrixString.append(' ');
}
matrixString.append(newLine);
}
} catch (BioException exc) {
exc.printStackTrace();
}
return matrixString.toString();
}
/**
* Converts the description of the matrix to a String.
*
* @return Gives a description with approximately 60 letters on every line
* separated by System.getProperty("line.separator")
.
* Every line starts with #
.
*/
public String stringnifyDescription() {
StringBuffer desc = new StringBuffer(), line = new StringBuffer();
line.append("# ");
StringTokenizer st = new StringTokenizer(description, " ");
while (st.hasMoreElements()) {
line.append(st.nextElement().toString());
line.append(' ');
if (line.length() >= 60) {
desc.append(line);
desc.append(newLine);
if (st.hasMoreElements()) {
line = new StringBuffer();
line.append("# ");
}
} else if (!st.hasMoreElements()) {
desc.append(line);
desc.append(newLine);
}
}
return desc.toString();
}
/**
* Overrides the inherited method.
*
* @return Gives a string representation of the SubstitutionMatrix. This is a
* valid input for the constructor which needs a matrix string. This
* String also contains the description of the matrix if there is one.
*/
@Override
public String toString() {
StringBuffer desc = new StringBuffer(), line = new StringBuffer();
line.append("# ");
StringTokenizer st = new StringTokenizer(description);
while (st.hasMoreElements()) {
line.append(st.nextElement().toString());
line.append(' ');
if (line.length() >= 60) {
desc.append(line);
desc.append(newLine);
if (st.hasMoreElements()) {
line = new StringBuffer();
line.append("# ");
}
} else if (!st.hasMoreElements()) {
desc.append(line);
desc.append(newLine);
}
}
desc.append(stringnifyMatrix());
return desc.toString();
}
/**
* Just to perform some test. It prints the matrix on the screen.
*/
public void printMatrix() {
// Test output:
Iterator rowKeys = rowSymbols.keySet().iterator();
while (rowKeys.hasNext()) {
Iterator colKeys = colSymbols.keySet().iterator();
Symbol rowSym = rowKeys.next();
System.out.print(rowSym.getName() + "\t");
while (colKeys.hasNext()) {
Symbol colSym = colKeys.next();
int x = rowSymbols.get(rowSym).intValue();
int y = colSymbols.get(colSym).intValue();
System.out.print(colSym.getName() + " " + " " + x + " " + y + " "
+ matrix[x][y] + "\t");
}
System.out.println(newLine);
}
System.out.println(toString());
}
/**
* With this method you can get a “normalized”
* SubstitutionMatrix
object; however, since this
* implementation uses an short matrix, the normalized matrix will be scaled
* by ten. If you need values between zero and one, you have to divide every
* value returned by getValueAt
by ten.
*
* @return a new and normalized SubstitutionMatrix
object given
* by this substitution matrix. Because this uses an
* short
matrix, all values are scaled by 10.
* @throws BioException
* @throws IOException
* @throws NumberFormatException
*/
public SubstitutionMatrix normalizeMatrix() throws BioException,
NumberFormatException, IOException {
int i, j;
short min = getMin(), newMax = Short.MIN_VALUE;
short[][] mat = new short[matrix.length][matrix[matrix.length - 1].length];
String name = getName() + "_normalized";
String matString = stringnifyDescription() + " ";
FiniteAlphabet alphabet = getAlphabet();
Map rowMap = this.rowSymbols;
Map colMap = this.colSymbols;
SymbolTokenization symtok = alphabet.getTokenization("default");
for (i = 0; i < matrix.length; i++)
for (j = 0; j < matrix[matrix.length - 1].length; j++) {
mat[i][j] = (short) (matrix[i][j] - min);
if (mat[i][j] > newMax) newMax = mat[i][j];
}
for (i = 0; i < mat.length; i++)
for (j = 0; j < mat[mat.length - 1].length; j++)
mat[i][j] = (short) (mat[i][j] * 10 / newMax);
Object[] rows = rowSymbols.keySet().toArray();
Object[] cols = colSymbols.keySet().toArray();
for (i = 0; i < cols.length; i++)
matString += symtok.tokenizeSymbol((Symbol) cols[i]) + " ";
for (i = 0; i < rows.length; i++) {
matString += newLine + symtok.tokenizeSymbol((Symbol) rows[i]) + " ";
for (j = 0; j < cols.length; j++) {
matString += mat[rowMap.get((Symbol) rows[i]).intValue()][colMap.get(
(Symbol) cols[j]).intValue()]
+ " ";
}
}
matString += newLine;
return new SubstitutionMatrix(alphabet, matString, name);
}
}