/* * BioJava development code * * This code may be freely distributed and modified under the * terms of the GNU Lesser General Public Licence. This should * be distributed with the code. If you do not have a copy, * see: * * http://www.gnu.org/copyleft/lesser.html * * Copyright for this code is held jointly by the individual * authors. These should be listed in @author doc comments. * * For more information on the BioJava project and its aims, * or to join the biojava-l mailing list, visit the home page * at: * * http://www.biojava.org/ * */ package org.biojavax.bio.seq.io; import java.util.ArrayList; import java.util.Iterator; import java.util.List; import org.biojava.bio.seq.io.ParseException; import org.biojavax.Comment; /** * * @author Richard Holland * @since 1.5 */ public class UniProtCommentParser { /** * Creates a new instance of UniProtCommentParser. */ public UniProtCommentParser() { this.interactions = new ArrayList(); this.isoforms = new ArrayList(); this.events = new ArrayList(); this.KMs = new ArrayList(); this.VMaxes = new ArrayList(); this.seqCautions = new ArrayList(); } // the prefix for comments private static final String PREFIX = "-!-"; /** * A name for a comment type. */ public static final String BIOPHYSICOCHEMICAL_PROPERTIES = "BIOPHYSICOCHEMICAL PROPERTIES"; /** * A name for a comment type. */ public static final String DATABASE = "DATABASE"; /** * A name for a comment type. */ public static final String MASS_SPECTROMETRY = "MASS SPECTROMETRY"; /** * A name for a comment type. */ public static final String ALTERNATIVE_PRODUCTS = "ALTERNATIVE PRODUCTS"; /** * A name for a comment type. */ public static final String INTERACTION = "INTERACTION"; /** * A name for a comment type. */ public static final String PTM = "PTM"; /** * A name for a comment type. */ public static final String SEQUENCE_CAUTION = "SEQUENCE CAUTION"; /** * Parses the comment string from the given comment and populates * the internal fields appropriately. If the comment is not a * UniProt comment (does not start with -!-) then an exception is * thrown. * @param c the comment to parse. * @throws ParseException if the comment was not parseable. */ public void parseComment(Comment c) throws ParseException { this.parseComment(c.getComment()); } /** * Parses the comment string from the given comment and populates * the internal fields appropriately. If the comment is not a * UniProt comment (does not start with -!-) then an exception is * thrown. * @param c the comment to parse. * @throws ParseException if the comment was not parseable. */ public void parseComment(String c) throws ParseException { if (!isParseable(c)) throw new ParseException("Comment is not a UniProt structured comment. Comment was "+c); String comment = new String(c); //keep the original just in case... // do the parsing here. try{ c = c.replaceAll("\\s+", " ").trim(); // replace all multi-spaces and newlines with single spaces // our comment is now one long string, -!- TYPE: [prefix: key=value; | key=value; | text] c = c.substring(PREFIX.length()+1); // chomp "-!- " String type = c.substring(0,c.indexOf(':')); // find type this.setCommentType(type); // remember type c = c.substring(c.indexOf(':')+1); // chomp type and colon if (c.endsWith(".")) c=c.substring(0,c.length()-1); // chomp trailing full stop // what we have left is the [prefix: key=value; | key=value; | text.] section if (this.getCommentType().equalsIgnoreCase(BIOPHYSICOCHEMICAL_PROPERTIES)) { /* CC -!- BIOPHYSICOCHEMICAL PROPERTIES: CC Absorption: CC Abs(max)=xx nm; CC Note=free_text; CC Kinetic parameters: CC KM=xx unit for substrate [(free_text)]; CC Vmax=xx unit enzyme [free_text]; CC Note=free_text; CC pH dependence: CC free_text; CC Redox potential: CC free_text; CC Temperature dependence: CC free_text; */ do { String[] parts = c.split(";"); if (parts.length==1) { // we are one of the last three options on the list int firstColon = parts[0].indexOf(':'); String key = parts[0].substring(0,firstColon).trim(); String value = parts[0].substring(firstColon+1).trim(); if (key.equalsIgnoreCase("pH dependence")) this.setPHDependence(value); else if (key.equalsIgnoreCase("Redox potential")) this.setRedoxPotential(value); else if (key.equalsIgnoreCase("Temperature dependence")) this.setTemperatureDependence(value); // skip to next chunk c = c.substring(c.indexOf(";")+1); } else { // we are one of the first two options on the list int skippos = -1; String key = parts[0].split(":")[0].trim(); if (key.equalsIgnoreCase("Absorption")) { String[] subparts = parts[0].split(":")[1].split("="); this.setAbsorptionMax(subparts[1].trim()); subparts = parts[1].split("="); this.setAbsorptionNote(subparts[1].trim()); skippos = 2; } else if (key.equalsIgnoreCase("Kinetic parameters")) { int partCount = 0; String[] subparts = parts[partCount].split(":")[1].split("="); key = subparts[0].trim(); String value = subparts[1].trim(); while (!key.equalsIgnoreCase("Note")) { if (key.equalsIgnoreCase("KM")) this.getKMs().add(value); else if (key.equalsIgnoreCase("VMax")) this.getVMaxes().add(value); subparts = parts[++partCount].split("="); key = subparts[0].trim(); value = subparts[1].trim(); } this.setKineticsNote(value); } // skip to next chunk int chunkpos = c.indexOf(parts[skippos]); c = c.substring(chunkpos); } c = c.trim(); } while (c.length()>0); } else if (this.getCommentType().equalsIgnoreCase(DATABASE)) { /* CC -!- DATABASE: NAME=Text[; NOTE=Text][; WWW="Address"][; FTP="Address"]. */ c = c.substring(0,c.length()-1); // chomp trailing dot String[] parts = c.split(";"); for (int i = 0; i < parts.length; i++) { String[] subparts = parts[i].split("="); String key = subparts[0].trim(); String value = subparts[1].trim(); if (key.equalsIgnoreCase("NAME")) this.setDatabaseName(value); else if (key.equalsIgnoreCase("NOTE")) this.setNote(value); else if (key.equalsIgnoreCase("WWW") || key.equalsIgnoreCase("FTP")) this.setUri(value); } } else if (this.getCommentType().equalsIgnoreCase(MASS_SPECTROMETRY)) { /* CC -!- MASS SPECTROMETRY: MW=XXX[; MW_ERR=XX]; METHOD=XX; RANGE=XX-XX[ (Name)]; NOTE={Free text (Ref.n)|Ref.n}. */ c = c.substring(0,c.length()-1); // chomp trailing dot String[] parts = c.split(";"); for (int i = 0; i < parts.length; i++) { String[] subparts = parts[i].split("="); String key = subparts[0].trim(); String value = subparts[1].trim(); if (key.equalsIgnoreCase("MW")) this.setMolecularWeight(Integer.parseInt(value)); else if (key.equalsIgnoreCase("MW_ERR")) this.setMolWeightError(new Integer(value)); else if (key.equalsIgnoreCase("METHOD")) this.setMolWeightMethod(value); else if (key.equalsIgnoreCase("RANGE")) { if (value.indexOf(' ')>-1) value = value.substring(0, value.indexOf(' ')); // drop name String[] locs = value.split("-"); this.setMolWeightRangeStart(Integer.parseInt(locs[0])); this.setMolWeightRangeEnd(Integer.parseInt(locs[1])); } else if (key.equalsIgnoreCase("NOTE")) this.setNote(value); } } else if (this.getCommentType().equalsIgnoreCase(INTERACTION)) { /* CC -!- INTERACTION: CC {{SP_Ac:identifier[ (xeno)]}|Self}; NbExp=n; IntAct=IntAct_Protein_Ac, IntAct_Protein_Ac; */ String[] parts = c.split(";"); Interaction interact = null; for (int i = 0; i < parts.length; i++) { String[] subparts = parts[i].split("="); String key = subparts[0].trim(); String value = null; if (key.equalsIgnoreCase("Self")) { // start new self-self interaction interact = new Interaction(); interact.setID("Self"); interact.setOrganismsDiffer(false); this.getInteractions().add(interact); } else if (subparts.length==1) { // start new protein-protein interaction subparts = key.split(":"); boolean differ = false; if (subparts[1].indexOf("(xeno)")>-1) { differ = true; subparts[1] = subparts[1].substring(0,subparts[1].indexOf("(xeno)")); } interact = new Interaction(); interact.setID(subparts[0].trim()); interact.setLabel(subparts[1].trim()); interact.setOrganismsDiffer(differ); this.getInteractions().add(interact); } else { value = subparts[1].trim(); // continue existing interaction if (key.equalsIgnoreCase("NbExp")) interact.setNumberExperiments(Integer.parseInt(value)); else if (key.equalsIgnoreCase("IntAct")) { subparts = value.split(","); interact.setFirstIntActID(subparts[0].trim()); interact.setSecondIntActID(subparts[1].trim()); } } } } else if (this.getCommentType().equalsIgnoreCase(ALTERNATIVE_PRODUCTS)) { /* CC -!- ALTERNATIVE PRODUCTS: CC Event=Alternative promoter; CC Comment=Free text; CC Event=Alternative splicing; Named isoforms=n; CC Comment=Optional free text; CC Name=Isoform_1; Synonyms=Synonym_1[, Synonym_n]; CC IsoId=Isoform_identifier_1[, Isoform_identifier_n]; Sequence=Displayed; CC Note=Free text; CC Name=Isoform_n; Synonyms=Synonym_1[, Synonym_n]; CC IsoId=Isoform_identifier_1[, Isoform_identifier_n]; Sequence=VSP_identifier_1 [, VSP_identifier_n]; CC Note=Free text; CC Event=Alternative initiation; CC Comment=Free text; */ Event event = null; Isoform isoform = null; String[] parts = c.split(";"); for (int i = 0; i < parts.length; i++) { String[] subparts = parts[i].split("="); if (subparts.length<2) continue; String key = subparts[0].trim(); String value = subparts[1].trim(); if (key.equalsIgnoreCase("Event")) { // new event event = new Event(); this.getEvents().add(event); event.setType(value); } else if (key.equalsIgnoreCase("Name")) { // new isoform isoform = new Isoform(); this.getIsoforms().add(isoform); isoform.getNames().add(value); } else if (key.equalsIgnoreCase("Synonyms")) { subparts = value.split(","); for (int j = 0; j < subparts.length; j++) isoform.getNames().add(subparts[j].trim()); } else if (key.equalsIgnoreCase("IsoId")) { subparts = value.split(","); for (int j = 0; j < subparts.length; j++) isoform.getIsoIDs().add(subparts[j].trim()); } else if (key.equalsIgnoreCase("Sequence")) { if (value.equalsIgnoreCase("Displayed")) isoform.setSequenceType("Displayed"); else if (value.equalsIgnoreCase("Not described")) isoform.setSequenceType("Not described"); else if (value.equalsIgnoreCase("External")) isoform.setSequenceType("External"); else { isoform.setSequenceType("Described"); isoform.setSequenceRef(value); } } else if (key.equalsIgnoreCase("Note")) { isoform.setNote(value); } else if (key.equalsIgnoreCase("Named isoforms")) { event.setNamedIsoforms(Integer.parseInt(value)); } else if (key.equalsIgnoreCase("Comment")) { event.setComment(value); } } } else if (this.getCommentType().equalsIgnoreCase(SEQUENCE_CAUTION)) { /* CC -!- SEQUENCE_CAUTION: Sequence=Sequence; Type=Type;[ Positions=Positions;][ Note=Note;] */ SeqCaution seqc = null; c = c.substring(0,c.length()-1); // chomp trailing dot String[] parts = c.split(";"); for (int i = 0; i < parts.length; i++) { String[] subparts = parts[i].split("="); String key = subparts[0].trim(); String value = subparts[1].trim(); if (key.equalsIgnoreCase("SEQUENCE")) { seqc = new SeqCaution(); this.getSeqCautions().add(seqc); seqc.setSequence(value); } else if (key.equalsIgnoreCase("TYPE")) seqc.setType(value); else if (key.equalsIgnoreCase("POSITIONS")) seqc.setPositions(value); else if (key.equalsIgnoreCase("NOTE")) seqc.setNote(value); } } else { // all others are just free text. this.setText(c); } }catch(RuntimeException ex){ throw new ParseException(ex, "Cannot parse the comment: "+comment); } // all done } /** * Returns true if the comment may be parseable (starts with -!-). * @param c the comment to check. * @return true if it starts with -!-, false otherwise. */ public static boolean isParseable(Comment c) { return isParseable(c.getComment()); } /** * Returns true if the comment may be parseable (starts with -!-). * @param c the comment to check. * @return true if it starts with -!-, false otherwise. */ public static boolean isParseable(String c) { return c.trim().startsWith(PREFIX); } /** * Generates a comment string based on the current values of the * internal fields. * @return the comment string representing the current settings. * @throws ParseException if the current settings do not allow the * creation of a correct comment string. */ public String generate() throws ParseException { StringBuffer sb = new StringBuffer(); sb.append(PREFIX); sb.append(" "); sb.append(this.getCommentType()); sb.append(": "); // output the specifics if (this.getCommentType().equalsIgnoreCase(BIOPHYSICOCHEMICAL_PROPERTIES)) { /* CC -!- BIOPHYSICOCHEMICAL PROPERTIES: CC Absorption: CC Abs(max)=xx nm; CC Note=free_text; CC Kinetic parameters: CC KM=xx unit for substrate [(free_text)]; CC Vmax=xx unit enzyme [free_text]; CC Note=free_text; CC pH dependence: CC free_text; CC Redox potential: CC free_text; CC Temperature dependence: CC free_text; */ if (this.getAbsorptionNote()!=null) { // we have an absorption line! sb.append("\nAbsorption:\n Abs(max)="); sb.append(this.getAbsorptionMax()); sb.append(";\n Note="); sb.append(this.getAbsorptionNote()); sb.append(";"); } if (this.getKineticsNote()!=null) { // we have a kinetics note! sb.append("\nKinetic parameters:\n"); for (Iterator j = this.getKMs().iterator(); j.hasNext(); ) { sb.append(" KM="); sb.append((String)j.next()); sb.append(";\n"); } for (Iterator j = this.getVMaxes().iterator(); j.hasNext(); ) { sb.append(" VMax="); sb.append((String)j.next()); sb.append(";\n"); } sb.append(" Note="); sb.append(this.getKineticsNote()); sb.append(";"); } if (this.getPHDependence()!=null) { sb.append("\npH dependence:\n "); sb.append(this.getPHDependence()); sb.append(";"); } if (this.getRedoxPotential()!=null) { sb.append("\nRedox potential:\n "); sb.append(this.getRedoxPotential()); sb.append(";"); } if (this.getTemperatureDependence()!=null) { sb.append("\nTemperature dependence:\n "); sb.append(this.getTemperatureDependence()); sb.append(";"); } } else if (this.getCommentType().equalsIgnoreCase(DATABASE)) { if (this.getDatabaseName()==null) throw new ParseException("Database name is missing"); /* CC -!- DATABASE: NAME=Text[; NOTE=Text][; WWW="Address"][; FTP="Address"]. */ sb.append("NAME="); sb.append(this.getDatabaseName()); if (this.getNote()!=null) { sb.append("; NOTE="); sb.append(this.getNote()); } if (this.getUri()!=null) { sb.append("; "); if (this.getUri().startsWith("ftp")) sb.append(" FTP="); else sb.append(" WWW="); sb.append(this.getUri()); } sb.append("."); } else if (this.getCommentType().equalsIgnoreCase(MASS_SPECTROMETRY)) { /* CC -!- MASS SPECTROMETRY: MW=XXX[; MW_ERR=XX]; METHOD=XX; RANGE=XX-XX[ (Name)]; NOTE={Free text (Ref.n)|Ref.n}. */ sb.append("MW="); sb.append(""+this.getMolecularWeight()); if (this.getMolWeightError()!=null) { sb.append("; MW_ERR="); sb.append(""+this.getMolWeightError()); } sb.append("; METHOD="); sb.append(this.getMolWeightMethod()); sb.append("; RANGE="); sb.append(""+this.getMolWeightRangeStart()); sb.append("-"); sb.append(""+this.getMolWeightRangeEnd()); sb.append("; NOTE="); sb.append(this.getNote()); sb.append("."); } else if (this.getCommentType().equalsIgnoreCase(INTERACTION)) { /* CC -!- INTERACTION: CC {{SP_Ac:identifier[ (xeno)]}|Self}; NbExp=n; IntAct=IntAct_Protein_Ac, IntAct_Protein_Ac; */ for (Iterator i = this.getInteractions().iterator(); i.hasNext(); ) { Interaction interact = (Interaction)i.next(); sb.append("\n"); // each interaction starts on a new line if (interact.getID().equals("Self")) { sb.append("Self; "); } else { sb.append(interact.getID()); sb.append(":"); sb.append(interact.getLabel()); if (interact.isOrganismsDiffer()) sb.append(" (xeno)"); sb.append("; "); } sb.append("NbExp="); sb.append(""+interact.getNumberExperiments()); sb.append("; "); sb.append("IntAct="); sb.append(interact.getFirstIntActID()); sb.append(", "); sb.append(interact.getSecondIntActID()); sb.append(";"); } } else if (this.getCommentType().equalsIgnoreCase(ALTERNATIVE_PRODUCTS)) { /* CC -!- ALTERNATIVE PRODUCTS: CC Event=Alternative promoter; CC Comment=Free text; CC Event=Alternative splicing; Named isoforms=n; CC Comment=Optional free text; CC Name=Isoform_1; Synonyms=Synonym_1[, Synonym_n]; CC IsoId=Isoform_identifier_1[, Isoform_identifier_n]; Sequence=Displayed; CC Note=Free text; CC Name=Isoform_n; Synonyms=Synonym_1[, Synonym_n]; CC IsoId=Isoform_identifier_1[, Isoform_identifier_n]; Sequence=VSP_identifier_1 [, VSP_identifier_n]; CC Note=Free text; CC Event=Alternative initiation; CC Comment=Free text; */ for (Iterator i = this.getEvents().iterator(); i.hasNext(); ) { Event event = (Event)i.next(); sb.append("\n"); // each one starts on a new line sb.append("Event="); sb.append(event.getType()); if (event.getType().equals("Alternative splicing")) { sb.append("; Named isoforms="); sb.append(""+event.getNamedIsoforms()); } sb.append(";\n Comment="); // comment is indented two on next line sb.append(event.getComment()); sb.append(";"); if (event.getType().equals("Alternative splicing")) { for (Iterator j = this.getIsoforms().iterator(); j.hasNext(); ) { Isoform isoform = (Isoform)j.next(); sb.append("\nName="); // each isoform on a new line sb.append(isoform.getNames().get(0)); sb.append("; Synonyms="); for (int k =1 ; k < isoform.getNames().size(); k++) { sb.append(isoform.getNames().get(k)); if (k