/* * BioJava development code * * This code may be freely distributed and modified under the * terms of the GNU Lesser General Public Licence. This should * be distributed with the code. If you do not have a copy, * see: * * http://www.gnu.org/copyleft/lesser.html * * Copyright for this code is held jointly by the individual * authors. These should be listed in @author doc comments. * * For more information on the BioJava project and its aims, * or to join the biojava-l mailing list, visit the home page * at: * * http://www.biojava.org/ * */ package org.biojava.bio.program.sax.blastxml; import org.biojava.bio.seq.io.game.ElementRecognizer; import org.biojava.utils.stax.DelegationManager; import org.biojava.utils.stax.StAXContentHandler; import org.biojava.utils.stax.StringElementHandlerBase; import org.xml.sax.Attributes; import org.xml.sax.ContentHandler; import org.xml.sax.SAXException; import org.xml.sax.helpers.AttributesImpl; /** * Class to parse NCBI Blast-XML output. *

* WARNING: when Blast is supplied with multiple sequences in a single
 * FASTA file, it generates horribly-formed XML code. This occurs because
 * it generates a complete XML document per query sequence, including the
 * &lt;?xml&gt; and &lt;!DOCTYPE&gt; lines, and concatenates all of these
 * into one output file. The resulting document is not well-formed and
 * cannot be parsed by standard XML parsers.
 *

* You have various options to solve this. You can split the output file
 * externally, separating the component documents and feeding each one
 * individually to this parser. Alternatively, you can strip out the
 * &lt;?xml&gt; and &lt;!DOCTYPE&gt; lines and wrap the entire output in a
 * fake &lt;blast_aggregator&gt; element. This can be done in Linux (and
 * other Unixen) with:
 *


 * #!/bin/sh
 * # Converts a Blast XML output to something vaguely well-formed
 * # for parsing.
 * # Use: blast_aggregate INPUT_XML OUTPUT_XML
 * # strips all <?xml> and <!DOCTYPE> tags
 * # encapsulates the multiple <BlastOutput> elements into <blast_aggregator>
 * sed '/<?xml/d' $1 | sed '/<!DOCTYPE/d' | sed '1i\
 * <blast_aggregator>
 * $a\
 * </blast_aggregator>' > $2

* The resultant file can then be parsed with the BlastAggregator object.
 *
 * @author David Huen
 */
class BlastOutputHandler extends StAXFeatureHandler {

    /**
     * Factory that builds a new {@code BlastOutputHandler} from the
     * {@code StAXFeatureHandler} it is given.
     */
    public static final StAXHandlerFactory BLASTOUTPUT_HANDLER_FACTORY =
        new StAXHandlerFactory() {
            public StAXContentHandler getHandler(StAXFeatureHandler staxenv) {
                return new BlastOutputHandler(staxenv);
            }
        };

    // Values collected from the child elements of the Blast output.
    // They are buffered here and replayed to the listener later on.
    private String program;
    private String version;
    private String databaseId;
    private String queryId;
    private String queryDef;

    /**
     * If set, the output is wrapped in an enclosing element.
     * If not, it is left unwrapped.
     * Default is true.
     * <p>
     * NOTE(review): the element names were lost from the original comment,
     * and this flag is not read anywhere in this class - confirm its use
     * against the other handlers in the package.
     */
    boolean wrap = true;

    /**
     * Creates the handler and registers the delegates for the child
     * elements it understands.
     *
     * @param staxenv handler passed on to the superclass constructor
     */
    public BlastOutputHandler(StAXFeatureHandler staxenv) {
        super(staxenv);
        initDelegation();
    }

    /**
     * Sends a start event immediately followed by the matching end event
     * for one element. The qualified name is built as {@code uri:localName}.
     */
    private static void emitEmptyElement(
            ContentHandler out, String uri, String localName, Attributes attrs)
            throws SAXException {
        String qualifiedName = uri + ":" + localName;
        out.startElement(uri, localName, qualifiedName, attrs);
        out.endElement(uri, localName, qualifiedName);
    }

    /**
     * Registers one delegate per recognised child element.
     * <p>
     * No delegates are registered for BlastOutput_param or
     * BlastOutput_reference.
     */
    private void initDelegation() {

        // <BlastOutput_program>: remember the program and derive the
        // sequence types from it.
        super.addHandler(
            new ElementRecognizer.ByLocalName("BlastOutput_program"),
            new StAXHandlerFactory() {
                public StAXContentHandler getHandler(StAXFeatureHandler staxenv) {
                    return new StringElementHandlerBase() {
                        public void setStringValue(String s) throws SAXException {
                            program = s.trim();

                            // The sequence type recorded here is the form in
                            // which blast DISPLAYS its output, not the type of
                            // the query or of the target:
                            //  - blastx:  nucleotide query translated in all
                            //             frames against a protein database.
                            //  - tblastn: protein query against a dna database
                            //             in all frames; only the hit frame is
                            //             displayed.
                            //  - tblastx: dna query and dna database, both
                            //             translated in all frames.
                            boolean shownAsDna = program.equals("blastn");
                            boolean shownAsProtein =
                                program.equals("blastp")
                                || program.equals("blastx")
                                || program.equals("tblastn")
                                || program.equals("tblastx");

                            if (!shownAsDna && !shownAsProtein) {
                                throw new SAXException("unknown BLAST program.");
                            }

                            String displayType = shownAsDna ? "dna" : "protein";
                            querySequenceType = displayType;
                            hitSequenceType = displayType;
                        }
                    };
                }
            }
        );

        // <BlastOutput_version>: remember the version, then open the
        // BlastLikeDataSet and its Header on the listener.
        super.addHandler(
            new ElementRecognizer.ByLocalName("BlastOutput_version"),
            new StAXHandlerFactory() {
                public StAXContentHandler getHandler(StAXFeatureHandler staxenv) {
                    return new StringElementHandlerBase() {
                        public void setStringValue(String s) throws SAXException {
                            version = s.trim();
                        }

                        public void endElement(
                                String nsURI,
                                String localName,
                                String qName,
                                StAXContentHandler handler)
                                throws SAXException {
                            // staxenv is not final, so its listener cannot be
                            // reached directly from this inner class.
                            ContentHandler out = getListener();

                            super.endElement(nsURI, localName, qName, handler);

                            // The data set only carries program/version
                            // attributes when both values are known.
                            AttributesImpl dataSetAttrs = new AttributesImpl();
                            if (program != null && version != null) {
                                dataSetAttrs.addAttribute(
                                    biojavaUri, "program", "program", CDATA, program);
                                dataSetAttrs.addAttribute(
                                    biojavaUri, "version", "version", CDATA, version);
                            }

                            // start of <biojava:BlastLikeDataSet>
                            out.startElement(
                                biojavaUri,
                                "BlastLikeDataSet",
                                biojavaUri + ":BlastLikeDataSet",
                                dataSetAttrs);

                            // start of <biojava:Header>
                            out.startElement(
                                biojavaUri,
                                "Header",
                                biojavaUri + ":Header",
                                new AttributesImpl());

                            // There is no raw output, but the element is
                            // compulsory, so an empty one is emitted.
                            BlastOutputHandler.emitEmptyElement(
                                out, biojavaUri, "RawOutput", new AttributesImpl());
                        }
                    };
                }
            }
        );

        // <BlastOutput_db>: remember the database identifier.
        super.addHandler(
            new ElementRecognizer.ByLocalName("BlastOutput_db"),
            new StAXHandlerFactory() {
                public StAXContentHandler getHandler(StAXFeatureHandler staxenv) {
                    return new StringElementHandlerBase() {
                        public void setStringValue(String s) throws SAXException {
                            databaseId = s.trim();
                        }
                    };
                }
            }
        );

        // <BlastOutput_query-ID>: remember the query identifier.
        super.addHandler(
            new ElementRecognizer.ByLocalName("BlastOutput_query-ID"),
            new StAXHandlerFactory() {
                public StAXContentHandler getHandler(StAXFeatureHandler staxenv) {
                    return new StringElementHandlerBase() {
                        public void setStringValue(String s) throws SAXException {
                            queryId = s.trim();
                        }
                    };
                }
            }
        );

        // <BlastOutput_query-def>: remember the query description.
        super.addHandler(
            new ElementRecognizer.ByLocalName("BlastOutput_query-def"),
            new StAXHandlerFactory() {
                public StAXContentHandler getHandler(StAXFeatureHandler staxenv) {
                    return new StringElementHandlerBase() {
                        public void setStringValue(String s) throws SAXException {
                            queryDef = s.trim();
                        }
                    };
                }
            }
        );

        // <BlastOutput_iterations>: before the iterations are processed,
        // flush the buffered header content and close the Header.
        super.addHandler(
            new ElementRecognizer.ByLocalName("BlastOutput_iterations"),
            new StAXHandlerFactory() {
                public StAXContentHandler getHandler(StAXFeatureHandler staxenv) {
                    return new BlastOutputIterationsHandler(staxenv) {
                        public void startElementHandler(
                                String nsURI,
                                String localName,
                                String qName,
                                Attributes attrs)
                                throws SAXException {
                            // staxenv is not final, so its listener cannot be
                            // reached directly from this inner class.
                            ContentHandler out = getListener();

                            // The original author notes that DatabaseId and
                            // QueryId are needed in the reverse of the order in
                            // which they are obtained, which is why they are
                            // buffered rather than emitted on the fly.

                            // <biojava:QueryId>, if known
                            if (queryId != null) {
                                AttributesImpl queryIdAttrs = new AttributesImpl();
                                queryIdAttrs.addAttribute(
                                    biojavaUri, "id", "id", CDATA, queryId);
                                queryIdAttrs.addAttribute(
                                    biojavaUri, "metadata", "metadata", CDATA, "none");
                                BlastOutputHandler.emitEmptyElement(
                                    out, biojavaUri, "QueryId", queryIdAttrs);
                            }

                            // <biojava:QueryDescription>, if known
                            if (queryDef != null) {
                                out.startElement(
                                    biojavaUri,
                                    "QueryDescription",
                                    biojavaUri + ":QueryDescription",
                                    new AttributesImpl());
                                out.characters(queryDef.toCharArray(), 0, queryDef.length());
                                out.endElement(
                                    biojavaUri,
                                    "QueryDescription",
                                    biojavaUri + ":QueryDescription");
                            }

                            // <biojava:DatabaseId>, if known
                            if (databaseId != null) {
                                AttributesImpl databaseIdAttrs = new AttributesImpl();
                                databaseIdAttrs.addAttribute(
                                    biojavaUri, "id", "id", CDATA, databaseId);
                                databaseIdAttrs.addAttribute(
                                    biojavaUri, "metadata", "metadata", CDATA, "none");
                                BlastOutputHandler.emitEmptyElement(
                                    out, biojavaUri, "DatabaseId", databaseIdAttrs);
                            }

                            // end of <biojava:Header>
                            out.endElement(biojavaUri, "Header", biojavaUri + ":Header");

                            // Finally let the superclass process the start
                            // of this element.
                            super.startElementHandler(nsURI, localName, qName, attrs);
                        }
                    };
                }
            }
        );
    }

    /**
     * Closes the BlastLikeDataSet element on the listener once this
     * handler's element ends.
     */
    void endElementHandler(
            String nsURI,
            String localName,
            String qName,
            StAXContentHandler handler)
            throws SAXException {
        // end of <biojava:BlastLikeDataSet>
        staxenv.listener.endElement(
            biojavaUri, "BlastLikeDataSet", biojavaUri + ":BlastLikeDataSet");
    }
}