/* * BioJava development code * * This code may be freely distributed and modified under the * terms of the GNU Lesser General Public Licence. This should * be distributed with the code. If you do not have a copy, * see: * * http://www.gnu.org/copyleft/lesser.html * * Copyright for this code is held jointly by the individual * authors. These should be listed in @author doc comments. * * For more information on the BioJava project and its aims, * or to join the biojava-l mailing list, visit the home page * at: * * http://www.biojava.org/ * */ package org.biojava.bio.program.sax; import java.io.BufferedReader; import java.io.IOException; import java.util.HashSet; import java.util.Set; import java.util.StringTokenizer; import org.biojava.bio.BioException; import org.biojava.bio.search.SearchContentHandler; import org.biojava.utils.ParserException; /** *

 * <code>FastaSearchParser</code> objects provide Fasta search
 * parsing functionality for the '-m 10' output format (see the Fasta
 * documentation). Data are passed to a <code>SearchContentHandler</code>,
 * which coordinates their interpretation and the creation of objects
 * representing the result.

* *

 * If the search output contains no hits, only the header data will
 * be sent to the handler before the dataset ends. The handler is
 * responsible for dealing with this state.

* *

 * This class was originally used outside of the BioJava SAX
 * framework, but is now only used to provide functionality for
 * <code>FastaSearchSAXParser</code>.

*
* @author Keith James
* @author Greg Cox
* @since 1.1
*/
class FastaSearchParser
{
    // Parser states. The state is kept in an instance field, so it
    // survives between calls to parseSearch on the same stream.
    private static final int NODATA    = 0;
    private static final int INHEADER  = 1;
    private static final int INHIT     = 2;
    private static final int INQUERY   = 3;
    private static final int INSUBJECT = 4;
    private static final int INALIGN   = 5;

    // Valid line identifiers for result annotation
    private static final Set resultAnnoTokens =
        fillSet(new String []
            {
                "mp_name",
                "mp_ver",
                "mp_argv",
                "mp_extrap",
                "mp_stats",
                "mp_KS",
                "pg_name",
                "pg_ver",
                "pg_optcut",
                "pg_cgap"
            },
                new HashSet());

    // Valid line identifiers for search parameters
    private static final Set resultSearchParmTokens =
        fillSet(new String []
            {
                "pg_matrix",
                "pg_ktup",
                "pg_gap-pen"
            },
                new HashSet());

    // Valid line identifiers for hit annotation
    private static final Set hitAnnoTokens =
        fillSet(new String []
            {
                "fa_frame",
                "fa_initn",
                "fa_init1",
                "fa_opt",
                "fa_bits",
                "sw_score",
                "sw_ident",
                "sw_gident",
                "sw_overlap",
                "fa_ident",
                "fa_gident",
                "fa_overlap",
                "fa_score"
            },
                new HashSet());

    // Valid line identifiers for hit data
    private static final Set hitDataTokens =
        fillSet(new String []
            {
                "fa_expect",
                "fa_z-score"
            },
                new HashSet());

    private int     searchStatus = NODATA;
    private boolean searchParsed = false;

    private SearchContentHandler handler;
    private String               line;
    private int                  lineNumber;

    // Accumulators for the sequence/consensus lines of the current
    // sub-hit; they are emptied each time a new hit starts.
    StringBuffer querySeqTokens   = new StringBuffer(1024);
    StringBuffer subjectSeqTokens = new StringBuffer(1024);
    StringBuffer matchTokens      = new StringBuffer(1024);

    /**
     * The <code>parseSearch</code> method performs the core parsing
     * operations. It parses one result from the stream before
     * returning. The handler is informed (via
     * <code>setMoreSearches</code>) whether or not there are further
     * searches in the stream.
     *
     * @param reader a <code>BufferedReader</code> to read from.
     * @param handler a <code>SearchContentHandler</code> to notify
     * of events.
     *
     * @exception IOException if the BufferedReader fails.
     * @exception BioException if the parser (via the registered
     * SearchContentHandler) fails to resolve a query sequence and
     * target database.
     * @exception ParserException if the parser fails to parse a
     * line.
     */
    public void parseSearch(BufferedReader       reader,
                            SearchContentHandler handler)
        throws IOException, BioException, ParserException
    {
        lineNumber   = 0;
        this.handler = handler;

        while ((line = reader.readLine()) != null)
        {
            lineNumber++;

            // This token indicates the end of the formatted search
            // data. It is checked before the state switch because
            // some outputs don't have any alignment consensus
            // lines, so it can arrive in any state.
            if (line.startsWith(">>><<<"))
            {
                if (searchStatus == INHEADER)
                {
                    // Still in the header: the search had no hits
                    handler.endHeader();
                }
                else
                {
                    // Pass final data to handler
                    finishSubHit();
                }

                handler.endSearch();
                searchParsed = true;
                searchStatus = NODATA;

                // Keep reading to look for the start of another
                // search, so that the handler can be told whether
                // to expect more.
                continue;
            }

            switch (searchStatus)
            {
                case NODATA:
                    // '>>>' marks the line describing the query
                    // sequence file and database searched. It is
                    // followed by header lines containing data
                    // about the search. Anything else is skipped.
                    if (! line.startsWith(">>>"))
                        continue;

                    searchStatus = INHEADER;

                    handler.setQueryID(parseQueryID(line));
                    handler.setDatabaseID(parseDatabaseID(line));
                    handler.startSearch();
                    handler.startHeader();

                    // If we already saw an end of search token then
                    // this is the start of another dataset. We have
                    // parsed one result, so report that the stream
                    // is not empty and return; the state fields let
                    // the next call carry on from the header.
                    if (searchParsed)
                    {
                        searchParsed = false;
                        handler.setMoreSearches(true);
                        return;
                    }
                    break;

                case INHEADER:
                    // '>>' marks the line describing a hit. It is
                    // followed by lines containing data about the
                    // hit.
                    if (line.startsWith(">>"))
                    {
                        searchStatus = INHIT;
                        handler.endHeader();
                        beginHit(line);
                    }
                    else if (! parseHeaderLine(line, resultAnnoTokens)
                             && ! parseHeaderLine(line, resultSearchParmTokens))
                    {
                        throw new ParserException("Fasta parser failed to recognise line type",
                                                  null,
                                                  lineNumber,
                                                  line);
                    }
                    break;

                case INHIT:
                    // '>' marks the line describing the query
                    // sequence.
                    if (line.startsWith(">"))
                    {
                        searchStatus = INQUERY;
                        handler.endHit();
                        handler.startSubHit();
                    }
                    else if (! parseHitLine(line, hitAnnoTokens)
                             && ! parseHitLine(line, hitDataTokens))
                    {
                        throw new ParserException("Fasta parser failed to recognise line type",
                                                  null,
                                                  lineNumber,
                                                  line);
                    }
                    break;

                case INQUERY:
                    // '>' marks the line describing the subject
                    // sequence.
                    if (line.startsWith(">"))
                        searchStatus = INSUBJECT;
                    else
                        parseQuerySequence(line);
                    break;

                case INSUBJECT:
                    if (line.startsWith("; al_cons:"))
                    {
                        // Start of the lines containing the
                        // consensus symbols from the alignment
                        searchStatus = INALIGN;
                    }
                    else if (line.startsWith(">>"))
                    {
                        // Next hit follows without consensus lines
                        searchStatus = INHIT;
                        finishSubHit();
                        beginHit(line);
                    }
                    else
                    {
                        parseSubjectSequence(line);
                    }
                    break;

                case INALIGN:
                    // End-of-search lines never reach this point;
                    // they are consumed before the switch.
                    if (line.startsWith(">>"))
                    {
                        searchStatus = INHIT;
                        finishSubHit();
                        beginHit(line);
                    }
                    else
                    {
                        matchTokens.append(line);
                    }
                    break;

                default:
                    break;
            } // end switch
        } // end while

        // The stream is exhausted, so there are no more searches
        handler.setMoreSearches(false);
    }

    /**
     * Passes the accumulated query, subject and match token buffers
     * to the handler as sub-hit properties and ends the sub-hit.
     */
    private void finishSubHit() throws BioException, ParserException
    {
        handler.addSubHitProperty("querySeqTokens",   querySeqTokens.toString());
        handler.addSubHitProperty("subjectSeqTokens", subjectSeqTokens.toString());
        handler.addSubHitProperty("matchTokens",      matchTokens.toString());
        handler.endSubHit();
    }

    /**
     * Starts a new hit on the handler, empties the token buffers and
     * reports the hit's "id" and "desc" properties.
     *
     * @param hitLine the '>>' line describing the hit.
     */
    private void beginHit(String hitLine) throws BioException, ParserException
    {
        handler.startHit();

        querySeqTokens.setLength(0);
        subjectSeqTokens.setLength(0);
        matchTokens.setLength(0);

        handler.addHitProperty("id",   parseID(hitLine));
        handler.addHitProperty("desc", parseDesc(hitLine));
    }

    /**
     * The <code>fillSet</code> method populates a <code>Set</code>
     * with the elements of an array.
     *
     * @param tokenArray a <code>String []</code> array.
     * @param set a <code>Set</code> to fill.
     *
     * @return the same <code>Set</code>, after filling.
     */
    private static Set fillSet(String [] tokenArray, Set set)
    {
        for (int i = 0; i < tokenArray.length; i++)
            set.add(tokenArray[i]);

        return set;
    }

    /**
     * The <code>parseID</code> method parses sequence IDs from
     * lines starting with '&gt;' and '&gt;&gt;'. The ID is the text
     * between the marker and the first space of the trimmed line (or
     * the end of the line if there is no space).
     *
     * @param line a <code>String</code> to be parsed.
     *
     * @return a <code>String</code> containing the ID.
     *
     * @exception ParserException if the line consists of the marker
     * only.
     */
    private String parseID(String line)
        throws ParserException
    {
        String trimmed = line.trim();
        int firstSpace = trimmed.indexOf(' ');

        // Hit header lines start with '>>', sub-hit header lines
        // with '>'
        int markerLength = trimmed.startsWith(">>") ? 2 : 1;

        if (trimmed.length() == markerLength)
            throw new ParserException("Fasta parser encountered a sequence with no Id",
                                      null,
                                      lineNumber,
                                      line);

        if (firstSpace == -1)
            return trimmed.substring(markerLength);

        return trimmed.substring(markerLength, firstSpace);
    }

    /**
     * The <code>parseDesc</code> method parses the sequence
     * description from subject header lines: everything after the
     * first space of the trimmed line.
     *
     * @param line a <code>String</code> to be parsed.
     *
     * @return a <code>String</code> containing the description, or
     * "No description" if the line contains no space.
     */
    private String parseDesc(String line)
    {
        String trimmed = line.trim();
        int firstSpace = trimmed.indexOf(' ');

        if (firstSpace == -1)
            return "No description";

        return trimmed.substring(firstSpace + 1);
    }

    /**
     * The <code>parseQueryID</code> method parses the query ID from
     * a '&gt;&gt;&gt;' line: the trimmed text between the three-character
     * marker and the first comma.
     *
     * @param line a <code>String</code> to be parsed.
     *
     * @return a <code>String</code>.
     *
     * @exception ParserException if the line contains no comma.
     */
    private String parseQueryID(String line)
        throws ParserException
    {
        int firstComma = line.indexOf(",");

        if (firstComma == -1)
            throw new ParserException("Fasta parser failed to parse a query ID",
                                      null,
                                      lineNumber,
                                      line);

        // return string between >>> and ,
        return line.substring(3, firstComma).trim();
    }

    /**
     * The <code>parseDatabaseID</code> method parses a database
     * filename from the relevant output line. The value returned is
     * the last-but-one whitespace-separated token of the line.
     *
     * @param line a <code>String</code> to be parsed.
     *
     * @return a <code>String</code>.
     *
     * @exception ParserException if the line has fewer than two
     * tokens.
     */
    private String parseDatabaseID(String line)
        throws ParserException
    {
        StringTokenizer st = new StringTokenizer(line);
        String id = null;

        // Consume all but the final token; the last one consumed is
        // the ID
        int count = st.countTokens();
        for (int i = 0; i < count - 1; i++)
        {
            id = st.nextToken();
        }

        if (id == null)
            throw new ParserException("Fasta parser failed to parse a database ID",
                                      null,
                                      lineNumber,
                                      line);
        return id;
    }

    /**
     * Parses a header line and, if its identifier is in the given
     * set, passes the key/value pair to the handler as a search
     * property.
     *
     * @param line a <code>String</code> to be parsed.
     * @param tokenSet the <code>Set</code> of accepted identifiers.
     *
     * @return true if the line was recognised.
     */
    private boolean parseHeaderLine(String line, Set tokenSet)
        throws ParserException
    {
        String [] data = parseLine(line, tokenSet);

        if (data.length > 0)
        {
            handler.addSearchProperty(data[0], data[1]);
            return true;
        }

        return false;
    }

    /**
     * Parses a hit line and, if its identifier is in the given set,
     * passes the key/value pair to the handler as a hit property.
     *
     * @param line a <code>String</code> to be parsed.
     * @param tokenSet the <code>Set</code> of accepted identifiers.
     *
     * @return true if the line was recognised.
     */
    private boolean parseHitLine(String line, Set tokenSet)
        throws ParserException
    {
        String [] data = parseLine(line, tokenSet);

        if (data.length > 0)
        {
            handler.addHitProperty(data[0], data[1]);
            return true;
        }

        return false;
    }

    /**
     * Splits a line of the form "; key: value" into its trimmed key
     * and value.
     *
     * @param line a <code>String</code> to be parsed.
     * @param tokenSet the <code>Set</code> of accepted keys.
     *
     * @return a two element array { key, value } if the key is in
     * <code>tokenSet</code>, otherwise an empty array.
     */
    private String [] parseLine(String line, Set tokenSet)
        throws ParserException
    {
        int idTokenStart = line.indexOf(';');
        int idTokenEnd   = line.indexOf(':', idTokenStart + 1);

        // A line with no ':' after the ';' (e.g. a blank line) is
        // not a key/value line. Report it as unrecognised so that
        // the caller raises a ParserException, rather than letting
        // substring() fail with StringIndexOutOfBoundsException.
        if (idTokenEnd == -1)
            return new String [0];

        String idToken = line.substring(idTokenStart + 1, idTokenEnd).trim();
        String idValue = line.substring(idTokenEnd + 1).trim();

        if (tokenSet.contains(idToken))
            return new String [] { idToken, idValue };

        return new String [0];
    }

    /**
     * Handles a line read while in the query part of a sub-hit:
     * recognised annotation lines are passed to the handler with
     * their key prefixed by "query"; any other line is appended to
     * the query sequence tokens.
     *
     * @param line a <code>String</code> to be parsed.
     */
    private void parseQuerySequence(String line)
    {
        String [] data = parseSequence(line);

        if (data.length > 0)
        {
            // We have a key/value pair
            handler.addSubHitProperty("query" + data[0], data[1]);
        }
        else
        {
            // We have a line of sequence tokens
            querySeqTokens.append(line);
        }
    }

    /**
     * Handles a line read while in the subject part of a sub-hit:
     * recognised annotation lines are passed to the handler with
     * their key prefixed by "subject"; any other line is appended
     * to the subject sequence tokens.
     *
     * @param line a <code>String</code> to be parsed.
     */
    private void parseSubjectSequence(String line)
    {
        String [] data = parseSequence(line);

        if (data.length > 0)
        {
            // We have a key/value pair
            handler.addSubHitProperty("subject" + data[0], data[1]);
        }
        else
        {
            // We have a line of sequence tokens
            subjectSeqTokens.append(line);
        }
    }

    /**
     * Recognises the sequence annotation lines (sequence type,
     * alignment coordinates, length and offset).
     *
     * @param line a <code>String</code> to be parsed.
     *
     * @return a two element array { key suffix, value } for a
     * recognised annotation line, otherwise an empty array.
     */
    private String [] parseSequence(String line)
    {
        if (! line.startsWith(";"))
            return new String [0];

        // Check the sequence type given by the report
        if (line.equals("; sq_type: p"))
            return new String [] { "_sq_type", "protein" };

        if (line.equals("; sq_type: D"))
            return new String [] { "_sq_type", "dna" };

        // Record the coordinates and offset of the alignment
        if (line.startsWith("; al_start:"))
            return new String [] { "_al_start", parseCoord(line) };

        if (line.startsWith("; al_stop:"))
            return new String [] { "_al_stop", parseCoord(line) };

        if (line.startsWith("; al_display_start:"))
            return new String [] { "_al_display_start", parseCoord(line) };

        if (line.startsWith("; sq_len:"))
            return new String [] { "_sq_len", parseCoord(line) };

        if (line.startsWith("; sq_offset:"))
            return new String [] { "_sq_offset", parseCoord(line) };

        return new String [0];
    }

    /**
     * The <code>parseCoord</code> method extracts coordinates from
     * Fasta output lines: the trimmed text after the last ':'.
     *
     * @param line a <code>String</code> to parse.
     *
     * @return a <code>String</code> coordinate.
     */
    private String parseCoord(String line)
    {
        int sepIndex = line.lastIndexOf(":");
        return line.substring(sepIndex + 1).trim();
    }
}