/*
* BioJava development code
*
* This code may be freely distributed and modified under the
* terms of the GNU Lesser General Public Licence. This should
* be distributed with the code. If you do not have a copy,
* see:
*
* http://www.gnu.org/copyleft/lesser.html
*
* Copyright for this code is held jointly by the individual
* authors. These should be listed in @author doc comments.
* For more information on the BioJava project and its aims,
* or to join the biojava-l mailing list, visit the home page
* at:
*
* http://www.biojava.org/
*
*/
package org.biojava.bio.seq;
import java.io.InputStream;
import java.util.HashMap;
import java.util.Iterator;
import java.util.Map;
import java.util.MissingResourceException;
import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;
import org.biojava.bio.BioError;
import org.biojava.bio.BioException;
import org.biojava.bio.SimpleAnnotation;
import org.biojava.bio.seq.impl.SimpleGappedSequence;
import org.biojava.bio.seq.impl.SimpleSequenceFactory;
import org.biojava.bio.seq.io.SymbolTokenization;
import org.biojava.bio.symbol.AlphabetManager;
import org.biojava.bio.symbol.AtomicSymbol;
import org.biojava.bio.symbol.FiniteAlphabet;
import org.biojava.bio.symbol.IllegalSymbolException;
import org.biojava.bio.symbol.SimpleSymbolList;
import org.biojava.bio.symbol.SimpleSymbolPropertyTable;
import org.biojava.bio.symbol.Symbol;
import org.biojava.bio.symbol.SymbolList;
import org.biojava.bio.symbol.SymbolPropertyTable;
import org.biojava.utils.ClassTools;
import org.w3c.dom.Document;
import org.w3c.dom.Element;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;
import org.xml.sax.InputSource;
/**
* The central port-of-call for all information and functionality specific to
* SymbolLists over the protein alphabet.
*
* @author Matthew Pocock
* @author Greg Cox
* @author Thomas Down
* @author MarkSchreiber
* @author Jonathan Warren
* @author gwaldon (pyrrolysine, pKs)
*/
public class ProteinTools {
private static final FiniteAlphabet proteinAlpha;
private static final FiniteAlphabet proteinTAlpha;
private static final Map tokenToSymbol = new HashMap();
private static final Map propertyTableMap = new HashMap();
static {
try {
proteinAlpha = (FiniteAlphabet) AlphabetManager.alphabetForName("PROTEIN");
proteinTAlpha = (FiniteAlphabet) AlphabetManager.alphabetForName("PROTEIN-TERM");
SymbolTokenization st = proteinTAlpha.getTokenization("token");
for (Iterator i = proteinTAlpha.iterator(); i.hasNext(); ) {
AtomicSymbol s = (AtomicSymbol)i.next();
tokenToSymbol.put(st.tokenizeSymbol(s), s);
}
} catch (Exception e) {
throw new BioError(" Could not initialize ProteinTools", e);
}
}
static {
Document doc = null;
/* try {
URL proteaseManagerURL = ProteinTools.class.getClassLoader().getResource(
"org/biojava/bio/symbol/ResidueProperties.xml"
);
//If I try and do this here on compile it says "An exception can't be thrown by an initializer"
InputSource is = Resolver.createInputSource(proteaseManagerURL, true);
doc = XmlDocument.createXmlDocument(is, true);*/
try {
InputStream tablesStream = ClassTools.getClassLoader(ProteinTools.class).getResourceAsStream(
"org/biojava/bio/symbol/ResidueProperties.xml"
);
if(tablesStream == null ) {
throw new BioError("Couldn't locate ResidueProperties.xml.");
}
InputSource is = new InputSource(tablesStream);
DocumentBuilder parser = DocumentBuilderFactory.newInstance().newDocumentBuilder();
doc = parser.parse(is);
}catch (MissingResourceException mre) {
System.err.println(mre.getMessage());
}catch(Exception e){//err
e.printStackTrace();
}
try {
SimpleSymbolPropertyTable monoMassPropertyTable = new SimpleSymbolPropertyTable(
getAlphabet(),
SymbolPropertyTable.MONO_MASS
);
SimpleSymbolPropertyTable avgMassPropertyTable = new SimpleSymbolPropertyTable(
getAlphabet(),
SymbolPropertyTable.AVG_MASS
);
SimpleSymbolPropertyTable pK_NtermPropertyTable = new SimpleSymbolPropertyTable(
getAlphabet(),
SymbolPropertyTable.PK_Nterm
);
SimpleSymbolPropertyTable pKPropertyTable = new SimpleSymbolPropertyTable(
getAlphabet(),
SymbolPropertyTable.PK
);
SimpleSymbolPropertyTable pK_CtermPropertyTable = new SimpleSymbolPropertyTable(
getAlphabet(),
SymbolPropertyTable.PK_Cterm
);
SimpleSymbolPropertyTable HydropathicityTable = new SimpleSymbolPropertyTable(
getAlphabet(),
SymbolPropertyTable.HYDROPATHICITY
);
SymbolTokenization tokens = getAlphabet().getTokenization("token");
NodeList children = doc.getDocumentElement().getChildNodes();
for(int i = 0; i < children.getLength(); i++) {
Node cnode = (Node) children.item(i);
if(! (cnode instanceof Element)) {
continue;
}
Element child = (Element) cnode;
if(child.getNodeName().equals("residue")) {
String token = child.getAttribute("token");
Symbol s = tokens.parseToken(token);
NodeList properyNodes = child.getChildNodes();
for(int j = 0; j < properyNodes.getLength(); j++) {
cnode = (Node) properyNodes.item(j);
if(! (cnode instanceof Element)) {
continue;
}
Element el = (Element) cnode;
String name = el.getAttribute("name");
if(name.equals(SymbolPropertyTable.MONO_MASS)) {
String value = el.getAttribute("value");
monoMassPropertyTable.setDoubleProperty(s, value);
} else if (name.equals(SymbolPropertyTable.AVG_MASS)) {
String value = el.getAttribute("value");
avgMassPropertyTable.setDoubleProperty(s, value);
} else if (name.equals(SymbolPropertyTable.PK_Nterm)) {
String value = el.getAttribute("value");
pK_NtermPropertyTable.setDoubleProperty(s, value);
} else if (name.equals(SymbolPropertyTable.PK)) {
String value = el.getAttribute("value");
pKPropertyTable.setDoubleProperty(s, value);
} else if (name.equals(SymbolPropertyTable.PK_Cterm)) {
String value = el.getAttribute("value");
pK_CtermPropertyTable.setDoubleProperty(s, value);
}else if (name.equals(SymbolPropertyTable.HYDROPATHICITY)) {
String value = el.getAttribute("value");
HydropathicityTable.setDoubleProperty(s, value);
}
}
}
}
propertyTableMap.put(SymbolPropertyTable.MONO_MASS, (SymbolPropertyTable) monoMassPropertyTable);
propertyTableMap.put(SymbolPropertyTable.AVG_MASS, (SymbolPropertyTable) avgMassPropertyTable);
propertyTableMap.put(SymbolPropertyTable.PK_Nterm, (SymbolPropertyTable) pK_NtermPropertyTable);
propertyTableMap.put(SymbolPropertyTable.PK, (SymbolPropertyTable) pKPropertyTable);
propertyTableMap.put(SymbolPropertyTable.PK_Cterm, (SymbolPropertyTable) pK_CtermPropertyTable);
propertyTableMap.put(SymbolPropertyTable.HYDROPATHICITY, (SymbolPropertyTable) HydropathicityTable);
} catch (Exception e) {
throw new BioError(" Could not initialize ProteinTools", e);
}
}
private ProteinTools() {
}
/**
*Gets the protein alphabet
*/
public static final FiniteAlphabet getAlphabet() {
return proteinAlpha;
}
/**
*Gets the protein alphabet including the translation termination symbols
*/
public static final FiniteAlphabet getTAlphabet() {
return proteinTAlpha;
}
public static final SymbolPropertyTable getSymbolPropertyTable(String name)
{
return (SymbolPropertyTable)propertyTableMap.get(name);
}
/**
* Return a new Protein SymbolList for protein.
*
* @param theProtein a String to parse into Protein
* @return a SymbolList created form Protein
* @throws IllegalSymbolException if dna contains
* any non-Amino Acid characters.
*/
public static SymbolList createProtein(String theProtein)
throws IllegalSymbolException
{
SymbolTokenization p = null;
try {
p = getTAlphabet().getTokenization("token");
} catch (BioException e) {
throw new BioError("Something has gone badly wrong with Protein", e);
}
return new SimpleSymbolList(p, theProtein);
}
/** Get a new protein as a GappedSequence */
public static GappedSequence createGappedProteinSequence(String theProtein, String name) throws IllegalSymbolException{
String theProtein1 = theProtein.replaceAll("-", "");
Sequence protein = createProteinSequence(theProtein1, name);
GappedSequence protein1 = new SimpleGappedSequence(protein);
int pos = theProtein.indexOf('-', 0);
while(pos!=-1){
protein1.addGapInView(pos+1);
pos = theProtein.indexOf('-', pos+1);
}
return protein1;
}
/**
* Return a new PROTEIN Sequence for
* protein.
*
* @param protein a String to parse into PROTEIN
* @param name a String to use as the name
* @return a Sequence created form
* protein
* @throws IllegalSymbolException if protein contains
* any non-PROTEIN characters
*/
public static Sequence createProteinSequence(String protein, String name)
throws IllegalSymbolException {
try {
return new SimpleSequenceFactory().createSequence(
createProtein(protein),
"", name, new SimpleAnnotation()
);
} catch (BioException se) {
throw new BioError("Something has gone badly wrong with ProteinTAlpha", se);
}
}
/**
* Returns the AtomicSymbol
for the amino acid Alanine
* (A)
*/
public static AtomicSymbol ala() {
return (AtomicSymbol) tokenToSymbol.get("A");
}
/**
* Returns the AtomicSymbol
for the amino acid
* Alanine
*/
public static AtomicSymbol a() {
return ala();
}
/**
* Returns the AtomicSymbol
for the amino acid
* Arginine (R)
*/
public static AtomicSymbol arg() {
return (AtomicSymbol) tokenToSymbol.get("R");
}
/**
* Returns the AtomicSymbol
for the amino acid
* Arginine
*/
public static AtomicSymbol r() {
return arg();
}
/**
* Returns the AtomicSymbol
for the amino acid
* Asparagine (N)
*/
public static AtomicSymbol asn() {
return (AtomicSymbol) tokenToSymbol.get("N");
}
/**
* Returns the AtomicSymbol
for the amino acid
* Asparagine
*/
public static AtomicSymbol n() {
return asn();
}
/**
* Returns the AtomicSymbol
for the amino acid
* Aspartic Acid (D)
*/
public static AtomicSymbol asp() {
return (AtomicSymbol) tokenToSymbol.get("D");
}
/**
* Returns the AtomicSymbol
for the amino acid
* Aspartic Acid
*/
public static AtomicSymbol d() {
return asp();
}
/**
* Returns the AtomicSymbol
for the amino acid
* Cysteine (C)
*/
public static AtomicSymbol cys() {
return (AtomicSymbol) tokenToSymbol.get("C");
}
/**
* Returns the AtomicSymbol
for the amino acid
* Cysteine
*/
public static AtomicSymbol c() {
return cys();
}
/**
* Returns the AtomicSymbol
for the amino acid
* Glutamine (Q)
*/
public static AtomicSymbol gln() {
return (AtomicSymbol) tokenToSymbol.get("Q");
}
/**
* Returns the AtomicSymbol
for the amino acid
* Glutamine
*/
public static AtomicSymbol q() {
return gln();
}
/**
* Returns the AtomicSymbol
for the amino acid
* Glutamic Acid (E)
*/
public static AtomicSymbol glu() {
return (AtomicSymbol) tokenToSymbol.get("E");
}
/**
* Returns the AtomicSymbol
for the amino acid
* Glutamic Acid
*/
public static AtomicSymbol e() {
return glu();
}
/**
* Returns the AtomicSymbol
for the amino acid
* Glycine (G)
*/
public static AtomicSymbol gly() {
return (AtomicSymbol) tokenToSymbol.get("G");
}
/**
* Returns the AtomicSymbol
for the amino acid
* Glycine
*/
public static AtomicSymbol g() {
return gly();
}
/**
* Returns the AtomicSymbol
for the amino acid
* Histidine (H)
*/
public static AtomicSymbol his() {
return (AtomicSymbol) tokenToSymbol.get("H");
}
/**
* Returns the AtomicSymbol
for the amino acid
* Histidine
*/
public static AtomicSymbol h() {
return his();
}
/**
* Returns the AtomicSymbol
for the amino acid
* Isoleucine (I)
*/
public static AtomicSymbol ile() {
return (AtomicSymbol) tokenToSymbol.get("I");
}
/**
* Returns the AtomicSymbol
for the amino acid
* Isoleucine
*/
public static AtomicSymbol i() {
return ile();
}
/**
* Returns the AtomicSymbol
for the amino acid
* Leucine (L)
*/
public static AtomicSymbol leu() {
return (AtomicSymbol) tokenToSymbol.get("L");
}
/**
* Returns the AtomicSymbol
for the amino acid
* Leucine
*/
public static AtomicSymbol l() {
return leu();
}
/**
* Returns the AtomicSymbol
for the amino acid
* Lysine (K)
*/
public static AtomicSymbol lys() {
return (AtomicSymbol) tokenToSymbol.get("K");
}
/**
* Returns the AtomicSymbol
for the amino acid
* Lysine
*/
public static AtomicSymbol k() {
return lys();
}
/**
* Returns the AtomicSymbol
for the amino acid
* Methionine (M)
*/
public static AtomicSymbol met() {
return (AtomicSymbol) tokenToSymbol.get("M");
}
/**
* Returns the AtomicSymbol
for the amino acid
* Methionine
*/
public static AtomicSymbol m() {
return met();
}
/**
* Returns the AtomicSymbol
for the amino acid
* Phenylalanine (F)
*/
public static AtomicSymbol phe() {
return (AtomicSymbol) tokenToSymbol.get("F");
}
/**
* Returns the AtomicSymbol
for the amino acid
* Phenylalanine
*/
public static AtomicSymbol f() {
return phe();
}
/**
* Returns the AtomicSymbol
for the amino acid
* Proline (P)
*/
public static AtomicSymbol pro() {
return (AtomicSymbol) tokenToSymbol.get("P");
}
/**
* Returns the AtomicSymbol
for the amino acid
* Proline
*/
public static AtomicSymbol p() {
return pro();
}
/**
* Returns the AtomicSymbol
for the amino acid
* Pyrrolysine (O)
*/
public static AtomicSymbol pyl() {
return (AtomicSymbol) tokenToSymbol.get("O");
}
/**
* Returns the AtomicSymbol
for the amino acid
* Pyrrolysine
*/
public static AtomicSymbol o() {
return pyl();
}
/**
* Returns the AtomicSymbol
for the amino acid
* Selenocysteine (U)
*/
public static AtomicSymbol sec() {
return (AtomicSymbol) tokenToSymbol.get("U");
}
/**
* Returns the AtomicSymbol
for the amino acid
* Selenocysteine
*/
public static AtomicSymbol u(){
return sec();
}
/**
* Returns the AtomicSymbol
for the amino acid
* Serine (S)
*/
public static AtomicSymbol ser() {
return (AtomicSymbol) tokenToSymbol.get("S");
}
/**
* Returns the AtomicSymbol
for the amino acid
* Serine
*/
public static AtomicSymbol s() {
return ser();
}
/**
* Returns the AtomicSymbol
for the amino acid
* Threonine (T)
*/
public static AtomicSymbol thr() {
return (AtomicSymbol) tokenToSymbol.get("T");
}
/**
* Returns the AtomicSymbol
for the amino acid
* Threonine
*/
public static AtomicSymbol t() {
return thr();
}
/**
* Returns the AtomicSymbol
for the amino acid
* Tryptophan (W)
*/
public static AtomicSymbol trp() {
return (AtomicSymbol) tokenToSymbol.get("W");
}
/**
* Returns the AtomicSymbol
for the amino acid
* Tryptophan
*/
public static AtomicSymbol w() {
return trp();
}
/**
* Returns the AtomicSymbol
for the amino acid
* Tyrosine (Y)
*/
public static AtomicSymbol tyr() {
return (AtomicSymbol) tokenToSymbol.get("Y");
}
/**
* Returns the AtomicSymbol
for the amino acid
* Tyrosine
*/
public static AtomicSymbol y() {
return tyr();
}
/**
* Returns the AtomicSymbol
for the amino acid Valine (V)
*/
public static AtomicSymbol val() {
return (AtomicSymbol) tokenToSymbol.get("V");
}
/**
* Returns the AtomicSymbol
for the amino acid
* Valine
*/
public static AtomicSymbol v() {
return val();
}
/**
* Returns the AtomicSymbol
for the termination (*)
* placeholder
*/
public static AtomicSymbol ter() {
return (AtomicSymbol) tokenToSymbol.get("*");
}
}