/* * BioJava development code * * This code may be freely distributed and modified under the * terms of the GNU Lesser General Public Licence. This should * be distributed with the code. If you do not have a copy, * see: * * http://www.gnu.org/copyleft/lesser.html * * Copyright for this code is held jointly by the individual * authors. These should be listed in @author doc comments. * * For more information on the BioJava project and its aims, * or to join the biojava-l mailing list, visit the home page * at: * * http://www.biojava.org/ * */ /* * Created on 2005-08-01 */ package org.biojava.bio.alignment; import java.io.BufferedReader; import java.io.File; import java.io.FileReader; import java.io.IOException; import java.io.Reader; import java.io.StringReader; import java.util.HashMap; import java.util.Iterator; import java.util.Map; import java.util.NoSuchElementException; import java.util.StringTokenizer; import org.biojava.bio.BioException; import org.biojava.bio.seq.io.SymbolTokenization; import org.biojava.bio.symbol.AlphabetManager; import org.biojava.bio.symbol.FiniteAlphabet; import org.biojava.bio.symbol.IllegalSymbolException; import org.biojava.bio.symbol.Symbol; /** *

* This object is able to read a substitution matrix file and constructs a short * matrix in memory. Every single element of the matrix can be accessed by the * method getValueAt with the parameters being two BioJava * symbols. This is why it is not necessary to access the matrix directly. If * there is no value for the two specified Symbols an * Exception is thrown. *

*

* Substitution matrix files, are available at the NCBI FTP directory. *

* * @author Andreas Dräger */ public class SubstitutionMatrix { protected Map rowSymbols, colSymbols; protected short[][] matrix; protected short min, max; protected FiniteAlphabet alphabet; protected String description, name; private static final String newLine = System .getProperty("line.separator"); /** * This constructs a SubstitutionMatrix object that contains * two Map data structures having BioJava symbols as keys and * the value being the index of the matrix containing the substitution score. * * @param alpha * the alphabet of the matrix (e.g., DNA, RNA or PROTEIN, or * PROTEIN-TERM) * @param matrixFile * the file containing the substitution matrix. Lines starting with '#' * are comments. The line starting with a white space, is the table * head. Every line has to start with the one letter representation * of the Symbol and then the values for the exchange. * @throws IOException * @throws BioException * @throws NumberFormatException */ public SubstitutionMatrix(FiniteAlphabet alpha, File matrixFile) throws BioException, NumberFormatException, IOException { this.alphabet = alpha; this.description = ""; this.name = matrixFile.getName(); this.rowSymbols = new HashMap(); this.colSymbols = new HashMap(); this.matrix = this.parseMatrix(matrixFile); } /** * With this constructor it is possible to construct a SubstitutionMatrix * object from a substitution matrix file. The given String contains a number * of lines separated by System.getProperty("line.separator"). * Everything else is the same than for the constructor above. * * @param alpha * The FiniteAlphabet to use * @param matrixString * @param name * of the matrix. * @throws BioException * @throws IOException * @throws NumberFormatException */ public SubstitutionMatrix(FiniteAlphabet alpha, String matrixString, String name) throws BioException, NumberFormatException, IOException { this.alphabet = alpha; this.description = ""; this.name = name; this.rowSymbols = new HashMap(); this.colSymbols = new HashMap(); this.matrix = this.parseMatrix(matrixString); // this.printMatrix(); } /** * Constructs a SubstitutionMatrix with every Match and every Replace having * the same expenses given by the parameters. Ambiguous symbols are not * considered because there might be to many of them (for proteins). * * @param alpha * @param match * @param replace */ public SubstitutionMatrix(FiniteAlphabet alpha, short match, short replace) { int i = 0, j = 0; this.alphabet = alpha; this.description = "Identity matrix. All replaces and all matches are treated equally."; this.name = "IDENTITY_" + match + "_" + replace; this.rowSymbols = new HashMap(); this.colSymbols = new HashMap(); this.matrix = new short[alpha.size()][alpha.size()]; Symbol[] sym = new Symbol[alpha.size()]; Iterator iter = alpha.iterator(); for (i = 0; iter.hasNext(); i++) { sym[i] = iter.next(); rowSymbols.put(sym[i], new Integer(i)); colSymbols.put(sym[i], new Integer(i)); } for (i = 0; i < alphabet.size(); i++) for (j = 0; j < alphabet.size(); j++) if (sym[i].getMatches().contains(sym[j])) matrix[i][j] = match; else matrix[i][j] = replace; // this.printMatrix(); } /** * This constructor can be used to guess the alphabet of this substitution * matrix. However, it is recommended to apply another constructor if the * alphabet is known. * * @param file * A file containing a substitution matrix. * @throws NumberFormatException * @throws NoSuchElementException * @throws BioException * @throws IOException */ public SubstitutionMatrix(File file) throws NumberFormatException, NoSuchElementException, BioException, IOException { this(guessAlphabet(file), file); } /** * This constructor can be used to guess the alphabet of this substitution * matrix. However, it is recommended to apply another constructor if the * alphabet is known. * * @param reader * @throws NumberFormatException * @throws BioException * @throws IOException */ public static SubstitutionMatrix getSubstitutionMatrix(BufferedReader reader) throws NumberFormatException, BioException, IOException { StringBuffer stringMatrix = new StringBuffer(""); while (reader.ready()) { stringMatrix.append(reader.readLine()); stringMatrix.append(newLine); } reader.close(); String mat = stringMatrix.toString(); FiniteAlphabet alpha = guessAlphabet(new BufferedReader(new StringReader(mat))); SubstitutionMatrix matrix = new SubstitutionMatrix(alpha, mat, "unknown"); return matrix; } /** * This method tries to identify the alphabet within a matrix file. This is * necessary in cases where we do not know if this is a matrix for DNA, RNA or * PROTEIN/PROTEIN-TERM. * * @param file * @return * @throws IOException * @throws BioException * @throws NoSuchElementException * @throws BioException */ private static FiniteAlphabet guessAlphabet(File file) throws IOException, NoSuchElementException, BioException { String fileName = file.getName().toLowerCase(); if (fileName.contains("pam") || fileName.contains("blosum")) return (FiniteAlphabet) AlphabetManager.alphabetForName("PROTEIN-TERM"); return guessAlphabet(new BufferedReader(new FileReader(file))); } /** * This method guesses the alphabet of the given substituttion matrix which is * required for the parser. * * @param reader * @return * @throws IOException * @throws BioException */ private static FiniteAlphabet guessAlphabet(BufferedReader reader) throws IOException, BioException { String line, trim; FiniteAlphabet alphabet = null; while (reader.ready()) { line = reader.readLine(); if (line == null) break; trim = line.trim(); if (trim.charAt(0) == '#') continue; else if ((line.charAt(0) == ' ') || (line.charAt(0) == '\t')) { String alphabets[] = new String[] {"DNA", "RNA", "PROTEIN", "PROTEIN-TERM"}; SymbolTokenization symtok; for (int i = 0; i < alphabets.length; i++) { alphabet = (FiniteAlphabet) AlphabetManager .alphabetForName(alphabets[i]); symtok = alphabet.getTokenization("token"); StringTokenizer st = new StringTokenizer(trim); boolean noError = true; for (int j = 0; st.hasMoreElements(); j++) try { symtok.parseToken(st.nextElement().toString()); } catch (IllegalSymbolException exc) { noError = false; break; } if (noError) return alphabet; } } } throw new BioException("Unknow alphabet used in this substitution matrix"); } /** * Reads a String representing the contents of a substitution matrix file. * * @param matrixObj * @return matrix * @throws BioException * @throws IOException * @throws NumberFormatException */ private short[][] parseMatrix(Object matrixObj) throws BioException, NumberFormatException, IOException { int j = 0, rows = 0, cols = 0; SymbolTokenization symtok = alphabet.getTokenization("token"); StringTokenizer st; String line, trim; this.min = Short.MAX_VALUE; this.max = Short.MIN_VALUE; /* * First: count how many elements are in the matrix fill lines and rows */ Reader reader; if (matrixObj instanceof File) reader = new FileReader((File) matrixObj); else if (matrixObj instanceof String) reader = new StringReader(matrixObj.toString()); else return null; BufferedReader br = new BufferedReader(reader); while (br.ready()) { line = br.readLine(); if (line == null) break; trim = line.trim(); if (trim.charAt(0) == '#') { description += line.substring(1); continue; } else if (!trim.startsWith(newLine)) { if ((line.charAt(0) == ' ') || (line.charAt(0) == '\t')) { st = new StringTokenizer(trim); for (j = 0; st.hasMoreElements(); j++) { colSymbols.put(symtok.parseToken(st.nextElement().toString()), Integer.valueOf(j)); } cols = j; } else { // the matrix. st = new StringTokenizer(line); if (st.hasMoreElements()) rowSymbols.put(symtok.parseToken(st.nextElement().toString()), Integer.valueOf(rows++)); } } } br.close(); short[][] matrix = new short[rows][cols]; rows = 0; if (matrixObj instanceof File) reader = new FileReader((File) matrixObj); else if (matrixObj instanceof String) reader = new StringReader(matrixObj.toString()); else return null; br = new BufferedReader(reader); /* * Second reading. Fill the matrix. */ while (br.ready()) { line = br.readLine(); if (line == null) break; trim = line.trim(); if (trim.charAt(0) == '#') continue; else if ((line.charAt(0) == ' ') || (line.charAt(0) == '\t')) continue; else if (!trim.startsWith(newLine)) { // lines: st = new StringTokenizer(trim); if (st.hasMoreElements()) st.nextElement(); // throw away Symbol at // beginning. for (j = 0; st.hasMoreElements(); j++) {// cols: matrix[rows][j] = (short) Math.round(Double.parseDouble(st .nextElement().toString())); if (matrix[rows][j] > max) max = matrix[rows][j]; // maximum. if (matrix[rows][j] < min) min = matrix[rows][j]; // minimum. } rows++; } } br.close(); return matrix; } /** * There are some substitution matrices containing more columns than lines. * This has to do with the ambiguous symbols. Lines are always good, columns * might not contain the whole information. The matrix is supposed to be * symmetric anyway, so you can always set the ambiguous symbol to be the * first argument. * * @param row * Symbol of the line * @param col * Symbol of the column * @return expenses for the exchange of symbol row and symbol column. * @throws BioException */ public short getValueAt(Symbol row, Symbol col) throws BioException { if ((!rowSymbols.containsKey(row)) || (!colSymbols.containsKey(col))) throw new BioException("No entry for the sybols " + row.getName() + " and " + col.getName()); return matrix[rowSymbols.get(row).intValue()][colSymbols.get(col) .intValue()]; } /** * This gives you the description of this matrix if there is one. Normally * substitution matrix files like BLOSUM contain some lines of description. * * @return the comment of the matrix */ public String getDescription() { return description; } /** * Every substitution matrix has a name like "BLOSUM30" or "PAM160". This will * be returned by this method. * * @return the name of the matrix. */ public String getName() { return name; } /** * The minimum score of this matrix. * * @return minimum of the matrix. */ public short getMin() { return min; } /** * The maximum score in this matrix. * * @return maximum of the matrix. */ public short getMax() { return max; } /** * Sets the description to the given value. * * @param desc * a description. This doesn't have to start with '#'. */ public void setDescription(String desc) { this.description = desc; } /** * Gives the alphabet used by this matrix. * * @return the alphabet of this matrix. */ public FiniteAlphabet getAlphabet() { return alphabet; } /** * Creates a String representation of this matrix. * * @return a string representation of this matrix without the description. */ public String stringnifyMatrix() { int i = 0; StringBuffer matrixString = new StringBuffer(); Symbol[] colSyms = new Symbol[this.colSymbols.keySet().size()]; try { SymbolTokenization symtok = alphabet.getTokenization("default"); matrixString.append(" "); Iterator colKeys = colSymbols.keySet().iterator(); while (colKeys.hasNext()) { colSyms[i] = colKeys.next(); matrixString.append(symtok.tokenizeSymbol(colSyms[i++]).toUpperCase()); matrixString.append(' '); } matrixString.append(newLine); Iterator rowKeys = rowSymbols.keySet().iterator(); while (rowKeys.hasNext()) { Symbol rowSym = rowKeys.next(); matrixString.append(symtok.tokenizeSymbol(rowSym).toUpperCase()); matrixString.append(' '); for (i = 0; i < colSyms.length; i++) { matrixString.append(getValueAt(rowSym, colSyms[i])); matrixString.append(' '); } matrixString.append(newLine); } } catch (BioException exc) { exc.printStackTrace(); } return matrixString.toString(); } /** * Converts the description of the matrix to a String. * * @return Gives a description with approximately 60 letters on every line * separated by System.getProperty("line.separator"). * Every line starts with #. */ public String stringnifyDescription() { StringBuffer desc = new StringBuffer(), line = new StringBuffer(); line.append("# "); StringTokenizer st = new StringTokenizer(description, " "); while (st.hasMoreElements()) { line.append(st.nextElement().toString()); line.append(' '); if (line.length() >= 60) { desc.append(line); desc.append(newLine); if (st.hasMoreElements()) { line = new StringBuffer(); line.append("# "); } } else if (!st.hasMoreElements()) { desc.append(line); desc.append(newLine); } } return desc.toString(); } /** * Overrides the inherited method. * * @return Gives a string representation of the SubstitutionMatrix. This is a * valid input for the constructor which needs a matrix string. This * String also contains the description of the matrix if there is one. */ @Override public String toString() { StringBuffer desc = new StringBuffer(), line = new StringBuffer(); line.append("# "); StringTokenizer st = new StringTokenizer(description); while (st.hasMoreElements()) { line.append(st.nextElement().toString()); line.append(' '); if (line.length() >= 60) { desc.append(line); desc.append(newLine); if (st.hasMoreElements()) { line = new StringBuffer(); line.append("# "); } } else if (!st.hasMoreElements()) { desc.append(line); desc.append(newLine); } } desc.append(stringnifyMatrix()); return desc.toString(); } /** * Just to perform some test. It prints the matrix on the screen. */ public void printMatrix() { // Test output: Iterator rowKeys = rowSymbols.keySet().iterator(); while (rowKeys.hasNext()) { Iterator colKeys = colSymbols.keySet().iterator(); Symbol rowSym = rowKeys.next(); System.out.print(rowSym.getName() + "\t"); while (colKeys.hasNext()) { Symbol colSym = colKeys.next(); int x = rowSymbols.get(rowSym).intValue(); int y = colSymbols.get(colSym).intValue(); System.out.print(colSym.getName() + " " + " " + x + " " + y + " " + matrix[x][y] + "\t"); } System.out.println(newLine); } System.out.println(toString()); } /** * With this method you can get a “normalized” * SubstitutionMatrix object; however, since this * implementation uses an short matrix, the normalized matrix will be scaled * by ten. If you need values between zero and one, you have to divide every * value returned by getValueAt by ten. * * @return a new and normalized SubstitutionMatrix object given * by this substitution matrix. Because this uses an * short matrix, all values are scaled by 10. * @throws BioException * @throws IOException * @throws NumberFormatException */ public SubstitutionMatrix normalizeMatrix() throws BioException, NumberFormatException, IOException { int i, j; short min = getMin(), newMax = Short.MIN_VALUE; short[][] mat = new short[matrix.length][matrix[matrix.length - 1].length]; String name = getName() + "_normalized"; String matString = stringnifyDescription() + " "; FiniteAlphabet alphabet = getAlphabet(); Map rowMap = this.rowSymbols; Map colMap = this.colSymbols; SymbolTokenization symtok = alphabet.getTokenization("default"); for (i = 0; i < matrix.length; i++) for (j = 0; j < matrix[matrix.length - 1].length; j++) { mat[i][j] = (short) (matrix[i][j] - min); if (mat[i][j] > newMax) newMax = mat[i][j]; } for (i = 0; i < mat.length; i++) for (j = 0; j < mat[mat.length - 1].length; j++) mat[i][j] = (short) (mat[i][j] * 10 / newMax); Object[] rows = rowSymbols.keySet().toArray(); Object[] cols = colSymbols.keySet().toArray(); for (i = 0; i < cols.length; i++) matString += symtok.tokenizeSymbol((Symbol) cols[i]) + " "; for (i = 0; i < rows.length; i++) { matString += newLine + symtok.tokenizeSymbol((Symbol) rows[i]) + " "; for (j = 0; j < cols.length; j++) { matString += mat[rowMap.get((Symbol) rows[i]).intValue()][colMap.get( (Symbol) cols[j]).intValue()] + " "; } } matString += newLine; return new SubstitutionMatrix(alphabet, matString, name); } }