/* * BioJava development code * * This code may be freely distributed and modified under the * terms of the GNU Lesser General Public Licence. This should * be distributed with the code. If you do not have a copy, * see: * * http://www.gnu.org/copyleft/lesser.html * * Copyright for this code is held jointly by the individual * authors. These should be listed in @author doc comments. * * For more information on the BioJava project and its aims, * or to join the biojava-l mailing list, visit the home page * at: * * http://www.biojava.org/ * */ package org.biojava.bio.program.sax; import java.io.BufferedReader; import java.io.IOException; import org.xml.sax.Attributes; import org.xml.sax.InputSource; import org.xml.sax.SAXException; import org.xml.sax.helpers.AttributesImpl; /** * A facade class allowing for direct SAX2-like parsing of the native * output from Blast-like bioinformatics software. Because the parser is SAX2 * compliant, application writers can simply pass XML ContentHandlers * to the parser in order to receive notifcation of SAX2 events. *
* The SAX2 events produced are as if the input to the parser was * an XML file validating against the biojava BlastLikeDataSetCollection DTD. * There is no requirement for an intermediate conversion of native output to * XML format. An application of the parsing framework, however, is to * create XML format files from native output files. *
* The biojava Blast-like parsing framework is designed to use minimal * memory, so that in principle, extremely large native outputs can be * parsed and XML ContentHandlers can listen only for small amounts of * information. *
* The framework currently supports parsing of native output from * the following bioinformatics programs. Please note that if * you are using different versions of NCBI or WU Blast to those * listed below, it is worth considering setting the parsing * mode to Lazy, which means parsing will be attempted if the program * is recognised, regardless of version. *
*
* Notes to SAX driver writers *
* The framework that this parser is built on is designed to be * extensible with support for both different pieces of software * (i.e. not just software that produces Blast-like output), * and multiple versions of programs. *
* This class inherits from the * org.biojava.bio.program.sax.AbstractNativeAppSAXParser * abstract base class. The abstract base class is a good place to * start looking if you want to write new native application SAX2 parsers. * This and related classes have only package-level visibility. * Typically, application writers are expected to provide a facade class * in this package (similar to the current class) to allow * users access to functionality. *
* NB Support for InputSource is not complete due to the fact * that URLs are not resolved and cannot, therefore, be used * as an InputSource. System pathnames, ByteStreams and CharacterStreams, * however, are all supported. *
* * Copyright © 2000 Cambridge Antibody Technology. * *
* Primary author -
parse
initiates the parsing operation.
*
* @param poSource an InputSource
.
* @exception IOException if an error occurs.
* @exception SAXException if an error occurs.
*/
/**
 * Initiates the parsing operation: reads the native output supplied by
 * <code>poSource</code> line by line and reports it to the registered
 * ContentHandler as SAX2 events.
 * <p>
 * The handler receives <code>startDocument()</code> before any line is
 * read. Once the input is exhausted the
 * <code>BlastLikeDataSetCollection</code> element is closed and
 * <code>endDocument()</code> is reported. The underlying reader is always
 * closed, including when parsing is abandoned because of an exception.
 *
 * @param poSource an <code>InputSource</code>.
 * @exception IOException if the input cannot be read or closed.
 * @exception SAXException if the format of the input is not recognised
 *            as one supported by the framework, or if an error is raised
 *            while interpreting a line.
 */
public void parse(InputSource poSource)
    throws IOException, SAXException {
    this.changeState(STARTUP);

    // Use method from superclass to obtain a reader over the source.
    BufferedReader oContents = this.getContentStream(poSource);

    try {
        // This sets contentHandler document for XSLT
        this.getContentHandler().startDocument();

        // Loop over the input.
        String oLine = oContents.readLine();
        while (oLine != null) {
            // Interpret line and send messages accordingly.
            this.interpret(oContents, oLine);

            if (iState == INSIDE_FILE) {
                // A data set has been parsed: carry on from the line
                // (if any) that was stored when the data set ended,
                // instead of reading a fresh line from the reader.
                oLine = oStoredLine;
                if (oStoredLine != null) {
                    this.interpret(oContents, oLine);
                }
            } else {
                oLine = oContents.readLine();
            }
        }
    } finally {
        // Release the reader on every exit path. A read failure is no
        // longer swallowed; it propagates as the declared IOException.
        oContents.close();
    }

    // At end of input...
    if (!tValidFormat) {
        throw (new SAXException("Could not recognise the format " +
                                "of this file as one supported by the framework."));
    }

    this.endElement(new QName(this,
                              this.prefix("BlastLikeDataSetCollection")));

    // SAX2 requires endDocument() to be the last event of a parse.
    this.getContentHandler().endDocument();
}
/**
 * Switches version checking to strict mode. This is the default:
 * parsing will be attempted only if both the program, e.g. NCBI BlastP,
 * and its particular version are recognised as being supported.
 */
public void setModeStrict() {
    this.oVersion.setMode(BlastLikeVersionSupport.STRICT);
}
/**
 * Switches version checking to lazy mode. In lazy mode parsing will be
 * attempted whenever the program is recognised, e.g. WU-TBlastX, even
 * if the particular version is not. Using this option is more likely
 * to result in erroneous parsing than if the strict mode is used.
 */
public void setModeLazy() {
    this.oVersion.setMode(BlastLikeVersionSupport.LAZY);
}
/**
 * Deal with line according to state parser is in.
 * <p>
 * In the <code>STARTUP</code> state, a line that marks the start of a
 * data set causes the program and version to be identified, the
 * <code>BlastLikeDataSetCollection</code> element to be opened, and the
 * data set to be parsed. In the <code>INSIDE_FILE</code> state, such a
 * line causes a further data set to be parsed. Any other line is ignored.
 *
 * @param poContents the reader the line was taken from; it is read
 *                   further when a data set is parsed.
 * @param poLine A line of Blast output
 * @exception SAXException if the program/version is not supported, if
 *            the line following a GCG header cannot be read, or if
 *            parsing of the data set fails.
 */
private void interpret(BufferedReader poContents, String poLine)
    throws SAXException {

    // For a brand new collection, check for the start of a new BlastDataSet.
    if (iState == STARTUP) {
        // Look for characteristic of start of dataset.
        if (oVersion.isStartOfDataSet(poLine)) {
            // For GCG, oVersion is set as an indicator to get
            // program info from the second line.
            if (oVersion.getProgram() == BlastLikeVersionSupport.GCG) {
                // If GCG, skip to next line to get program info.
                try {
                    poLine = poContents.readLine();
                } catch (java.io.IOException x) {
                    // Keep the underlying cause instead of printing it.
                    throw new SAXException("Error parsing GCG File", x);
                }
                if (poLine == null) {
                    // Input ended straight after the GCG header line, so
                    // there is no program line to identify.
                    throw new SAXException("Error parsing GCG File");
                }
            }

            tValidFormat = oVersion.assignProgramAndVersion(poLine);

            if (!oVersion.isSupported()) {
                throw (new SAXException("Program "
                                        + oVersion.getProgramString()
                                        + " Version "
                                        + oVersion.getVersionString()
                                        + " is not supported by the biojava blast-like "
                                        + "parsing framework"));
            }

            // Namespace declarations for the root element, as
            // {qualified name, value} pairs.
            final String[][] oNamespaceDecls = {
                {"xmlns", ""},
                {"xmlns:biojava", "http://www.biojava.org"}
            };

            oAtts.clear();
            for (int i = 0; i < oNamespaceDecls.length; i++) {
                oAttQName.setQName(oNamespaceDecls[i][0]);
                // Check if namespace configuration means attribute
                // should not be reported.
                if (!oAttQName.getLocalName().equals("")) {
                    oAtts.addAttribute(oAttQName.getURI(),
                                       oAttQName.getLocalName(),
                                       oAttQName.getQName(),
                                       "CDATA", oNamespaceDecls[i][1]);
                }
            }

            this.startElement(new QName(this,
                                        this.prefix("BlastLikeDataSetCollection")),
                              (Attributes)oAtts);
            this.onNewDataSet(poContents, poLine);
            return;
        }
    } // End check for the start of the first BlastDataSet

    if (iState == INSIDE_FILE) {
        // Look for characteristic of start of dataset.
        if (oVersion.isStartOfDataSet(poLine)) {
            tValidFormat = oVersion.assignProgramAndVersion(poLine);
            this.onNewDataSet(poContents, poLine);
            return;
        }
    } // End check for the start of a further BlastDataSet
}
/**
 * Parses a data set by delegating to a new <code>BlastSAXParser</code>.
 * <p>
 * When this method is called, the line will look something like:
 * <p>
 * BLASTN 2.0.11 [Jan-20-2000]
 * <p>
 * The above would be parsed to program blastn, and version number.
 * <p>
 * On return the parser is in the <code>INSIDE_FILE</code> state and the
 * stored line is <code>null</code>.
 *
 * @param poContents the reader handed to the <code>BlastSAXParser</code>.
 * @param poLine the line that marked the start of the data set.
 * @exception SAXException if the delegated parse fails.
 */
private void onNewDataSet(BufferedReader poContents, String poLine)
    throws SAXException {

    // Choose according to version...
    oBlast = new BlastSAXParser(oVersion, this.getNamespacePrefix());
    oBlast.setContentHandler(oHandler);

    // Keep invoking the sub-parser until it returns null.
    // NOTE(review): the same poLine is passed on every iteration; whether
    // each call consumes a further data set from poContents depends on
    // BlastSAXParser.parse, which is not visible here -- confirm.
    String oLine;
    do {
        oLine = oBlast.parse(poContents, poLine);
    } while (oLine != null);

    this.changeState(INSIDE_FILE);

    // The loop above exits only once the sub-parser has returned null, so
    // there is never a line left over for the top-level parse method.
    oStoredLine = null;
}
}