/* * BioJava development code * * This code may be freely distributed and modified under the * terms of the GNU Lesser General Public Licence. This should * be distributed with the code. If you do not have a copy, * see: * * http://www.gnu.org/copyleft/lesser.html * * Copyright for this code is held jointly by the individual * authors. These should be listed in @author doc comments. * * For more information on the BioJava project and its aims, * or to join the biojava-l mailing list, visit the home page * at: * * http://www.biojava.org/ * */ package org.biojava.bio.program.ssbind; import java.util.ArrayList; import java.util.Arrays; import java.util.HashMap; import java.util.List; import java.util.Map; import org.biojava.bio.Annotation; import org.biojava.bio.BioException; import org.biojava.bio.search.SearchBuilder; import org.biojava.bio.search.SeqSimilaritySearchHit; import org.biojava.bio.search.SeqSimilaritySearchResult; import org.biojava.bio.search.SeqSimilaritySearchSubHit; import org.biojava.bio.search.SimpleSeqSimilaritySearchHit; import org.biojava.bio.search.SimpleSeqSimilaritySearchResult; import org.biojava.bio.search.SimpleSeqSimilaritySearchSubHit; import org.biojava.bio.seq.Sequence; import org.biojava.bio.seq.StrandedFeature; import org.biojava.bio.seq.StrandedFeature.Strand; import org.biojava.bio.seq.db.SequenceDB; import org.biojava.bio.seq.db.SequenceDBInstallation; import org.biojava.bio.seq.io.SymbolTokenization; import org.biojava.bio.symbol.FiniteAlphabet; import org.biojava.bio.symbol.SimpleAlignment; import org.biojava.bio.symbol.SimpleSymbolList; import org.biojava.utils.SmallMap; /** *

* <p><code>BlastLikeSearchBuilder</code> will create
* <code>SeqSimilaritySearchResult</code>s from SAX events via a
* <code>SeqSimilarityAdapter</code>. The SAX events should describe
* elements conforming to the BioJava BlastLikeDataSetCollection
* DTD. Suitable sources are <code>BlastLikeSAXParser</code> or
* <code>FastaSearchSAXParser</code>. The result objects are placed in
* the <code>List</code> supplied to the constructor.</p>

* *

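* <p>The start/end/strand of <code>SeqSimilaritySearchHit</code>s are
* calculated from their constituent
* <code>SeqSimilaritySearchSubHit</code>s as follows:</p>
*
* <ul>
* <li>The query (respectively subject) start is the lowest query
* (subject) start of any sub-hit, and the end is the highest query
* (subject) end of any sub-hit.</li>
* <li>The query (respectively subject) strand is the strand shared by
* all sub-hits. If the sub-hits disagree, the strand is
* <code>StrandedFeature.UNKNOWN</code>. If any sub-hit has no strand
* (protein sequences), the strand is <code>null</code>.</li>
* </ul>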

* * * *

* <p>This class has special meanings for particular keys: if you want to
* adapt this class for another parser, you will need to be aware of
* this. These originate from and are fully described in the
* BlastLikeDataSetCollection DTD.</p>

* <table border="1">
* <tr><th>Key</th><th>Meaning</th></tr>
* <tr><td>program</td><td>Either this value or the subjectSequenceType
* value must be set. This can take values acceptable to
* AlphabetResolver. These are BLASTN, BLASTP, BLASTX, TBLASTN,
* TBLASTX, DNA and PROTEIN.</td></tr>
* <tr><td>databaseId</td><td>Identifier of database searched (in
* SequenceDBInstallation).</td></tr>
* <tr><td>subjectSequenceType</td><td>Type of sequence that hit is. Can
* be DNA or PROTEIN.</td></tr>
* <tr><td>subjectId</td><td>Id of sequence that is hit.</td></tr>
* <tr><td>subjectDescription</td><td>Description of sequence that is
* hit.</td></tr>
* <tr><td>queryStrand</td><td>Strandedness of query in alignment. Takes
* values of "plus" and "minus".</td></tr>
* <tr><td>subjectStrand</td><td>Strandedness of subject in alignment.
* Takes values of "plus" and "minus".</td></tr>
* <tr><td>queryFrame</td><td>self-evident</td></tr>
* <tr><td>subjectFrame</td><td>self-evident</td></tr>
* <tr><td>querySequenceStart</td><td>self-evident</td></tr>
* <tr><td>querySequenceEnd</td><td>self-evident</td></tr>
* <tr><td>subjectSequenceStart</td><td>self-evident</td></tr>
* <tr><td>subjectSequenceEnd</td><td>self-evident</td></tr>
* <tr><td>score</td><td>self-evident</td></tr>
* <tr><td>expectValue</td><td>self-evident</td></tr>
* <tr><td>pValue</td><td>self-evident</td></tr>
* </table>
*
* @author Keith James
* @author Greg Cox
* @since 1.2
*/
public class BlastLikeSearchBuilder implements SearchBuilder {
    // Supplier of instances of searched databases
    private SequenceDBInstallation subjectDBs;
    // Holder for all query sequences
    private SequenceDB querySeqHolder;
    // The ID of the database searched
    private String databaseID;
    // The ID of the query sequence
    private String queryID;

    // Hit and Result annotation
    private Annotation resultAnnotation;

    // Data holders for search result properties
    private Map resultPreAnnotation;
    private Map searchParameters;
    private Map hitData;
    private Map subHitData;

    // Tokenization used to parse aligned sequences; resolved lazily
    // by makeSubHit and discarded at the start of every search
    private SymbolTokenization tokenParser;

    private List hits;
    private List subHits;

    // Flag indicating whether there are more results in the stream
    private boolean moreSearchesAvailable = false;

    // List to accept all results in the stream
    private List target;

    /**
     * Creates a new <code>BlastLikeSearchBuilder</code> which will
     * instantiate results into the <code>List</code> target.
     *
     * @param target a <code>List</code>.
     */
    public BlastLikeSearchBuilder(List target) {
        this.target = target;

        resultPreAnnotation = new HashMap();
        searchParameters = new HashMap();
        hitData = new HashMap();
        subHitData = new HashMap();
    }

    /**
     * Creates a new <code>BlastLikeSearchBuilder</code> which will
     * instantiate results into the <code>List</code> target.
     *
     * @param target a <code>List</code>.
     * @param querySeqHolder a <code>SequenceDB</code> of query
     * sequences.
     * @param subjectDBs a <code>SequenceDBInstallation</code> of
     * databases searched.
     */
    public BlastLikeSearchBuilder(List target,
                                  SequenceDB querySeqHolder,
                                  SequenceDBInstallation subjectDBs) {
        this(target);

        this.querySeqHolder = querySeqHolder;
        this.subjectDBs = subjectDBs;
    }

    /**
     * <code>makeSearchResult</code> builds a result from the hits
     * collected since the last <code>startSearch</code>, the current
     * query ID and the current database ID.
     *
     * @return a <code>SeqSimilaritySearchResult</code>.
     *
     * @exception BioException if the query sequence holder or the
     * subject database installation has not been set, or if the query
     * sequence or the subject database cannot be retrieved by ID.
     */
    public SeqSimilaritySearchResult makeSearchResult()
        throws BioException {
        if (querySeqHolder == null) {
            throw new BioException("Running BlastLikeSearchBuilder with null query SequenceDB");
        }

        if (subjectDBs == null) {
            throw new BioException("Running BlastLikeSearchBuilder with null subject SequenceDB installation");
        }

        Sequence query = querySeqHolder.getSequence(queryID);
        if (query == null) {
            throw new BioException("Failed to retrieve query sequence from queryDB using ID '"
                                   + queryID
                                   + "' (sequence was null)");
        }

        SequenceDB subjectDB = (SequenceDB) subjectDBs.getSequenceDB(databaseID);
        if (subjectDB == null) {
            throw new BioException("Failed to retrieve database from installation using ID '"
                                   + databaseID
                                   + "' (database was null)");
        }

        return new SimpleSeqSimilaritySearchResult(query,
                                                   subjectDB,
                                                   searchParameters,
                                                   hits,
                                                   resultAnnotation);
    }

    /**
     * <code>setQuerySeqHolder</code> sets the query sequence holder
     * to a specific database.
     *
     * @param querySeqHolder a <code>SequenceDB</code> containing the
     * query sequence(s).
     */
    public void setQuerySeqHolder(SequenceDB querySeqHolder) {
        this.querySeqHolder = querySeqHolder;
    }

    /**
     * <code>setSubjectDBInstallation</code> sets the subject database
     * holder to a specific installation.
     *
     * @param subjectDBs a <code>SequenceDBInstallation</code>
     * containing the subject database(s)
     */
    public void setSubjectDBInstallation(SequenceDBInstallation subjectDBs) {
        this.subjectDBs = subjectDBs;
    }

    /**
     * Sets the ID used to look up the query sequence and records it
     * as the search property "queryId".
     *
     * @param queryID a <code>String</code> ID.
     */
    public void setQueryID(String queryID) {
        this.queryID = queryID;
        addSearchProperty("queryId", queryID);
    }

    /**
     * Sets the ID used to look up the searched database and records
     * it as the search property "databaseId".
     *
     * @param databaseID a <code>String</code> ID.
     */
    public void setDatabaseID(String databaseID) {
        this.databaseID = databaseID;
        addSearchProperty("databaseId", databaseID);
    }

    /**
     * Returns the flag last set by <code>setMoreSearches</code>.
     *
     * @return a <code>boolean</code>.
     */
    public boolean getMoreSearches() {
        return moreSearchesAvailable;
    }

    /**
     * Sets the flag indicating whether there are more results in the
     * stream.
     *
     * @param value a <code>boolean</code>.
     */
    public void setMoreSearches(boolean value) {
        moreSearchesAvailable = value;
    }

    /**
     * Begins a new search: starts a fresh list of hits and discards
     * the cached tokenization, so that the sequence type is resolved
     * again from this search's own properties rather than reused
     * from an earlier search in the same stream.
     */
    public void startSearch() {
        hits = new ArrayList();
        tokenParser = null;
    }

    /**
     * Ends the current search: builds the result and adds it to the
     * target list. A failure to build the result is reported on
     * <code>System.err</code> and nothing is added to the target.
     */
    public void endSearch() {
        try {
            resultAnnotation = AnnotationFactory.makeAnnotation(resultPreAnnotation);
            target.add(makeSearchResult());
        } catch (BioException be) {
            System.err.println("Failed to build SeqSimilaritySearchResult:");
            be.printStackTrace();
        }
    }

    /**
     * Begins a header: clears the search properties and search
     * parameters collected so far.
     */
    public void startHeader() {
        resultPreAnnotation.clear();
        searchParameters.clear();
    }

    /**
     * Ends a header. Nothing needs to be done here.
     */
    public void endHeader() { }

    /**
     * Begins a hit: clears the hit properties and starts a fresh
     * list of sub-hits.
     */
    public void startHit() {
        hitData.clear();
        subHits = new ArrayList();
    }

    /**
     * Ends the current hit: builds the hit and adds it to the hits
     * of the current search. A hit which has no sub-hits (for
     * example because every one of its sub-hits failed to build)
     * cannot be constructed; this is reported on
     * <code>System.err</code> and the hit is omitted.
     */
    public void endHit() {
        try {
            hits.add(makeHit());
        } catch (BioException be) {
            System.err.println("Failed to build SeqSimilaritySearchHit:");
            be.printStackTrace();
        }
    }

    /**
     * Begins a sub-hit: clears the sub-hit properties.
     */
    public void startSubHit() {
        subHitData.clear();
    }

    /**
     * Ends the current sub-hit: builds the sub-hit and adds it to
     * the sub-hits of the current hit. A failure to build the
     * sub-hit is reported via its stack trace and the sub-hit is
     * omitted.
     */
    public void endSubHit() {
        try {
            subHits.add(makeSubHit());
        } catch (BioException be) {
            be.printStackTrace();
        }
    }

    /**
     * Records a property of the current search. These properties
     * become the annotation of the search result.
     *
     * @param key an <code>Object</code> key.
     * @param value an <code>Object</code> value.
     */
    public void addSearchProperty(Object key, Object value) {
        resultPreAnnotation.put(key, value);
    }

    /**
     * Records a property of the current hit.
     *
     * @param key an <code>Object</code> key.
     * @param value an <code>Object</code> value.
     */
    public void addHitProperty(Object key, Object value) {
        hitData.put(key, value);
    }

    /**
     * Records a property of the current sub-hit.
     *
     * @param key an <code>Object</code> key.
     * @param value an <code>Object</code> value.
     */
    public void addSubHitProperty(Object key, Object value) {
        subHitData.put(key, value);
    }

    /**
     * <code>makeHit</code> creates a new hit from the sub-hits
     * collected since <code>startHit</code>. The hit's score, E-value
     * and P-value are those of the sub-hit which sorts last under
     * <code>SeqSimilaritySearchSubHit.byScore</code> (taken here to
     * be the highest-scoring one). The hit's start/end are the
     * extent of all its sub-hits. The hit's strand is the strand
     * shared by all sub-hits, <code>StrandedFeature.UNKNOWN</code> if
     * they differ, or <code>null</code> if any sub-hit has a null
     * strand.
     *
     * @return a <code>SeqSimilaritySearchHit</code>.
     *
     * @exception BioException if there are no sub-hits from which to
     * build the hit.
     */
    private SeqSimilaritySearchHit makeHit() throws BioException {
        if (subHits == null || subHits.isEmpty()) {
            throw new BioException("Failed to build hit for subject '"
                                   + hitData.get("subjectId")
                                   + "' (hit has no sub-hits)");
        }

        SeqSimilaritySearchSubHit [] subs = (SeqSimilaritySearchSubHit [])
            subHits.toArray(new SeqSimilaritySearchSubHit [subHits.size()]);

        // Sort so that the highest score is the last element
        Arrays.sort(subs, SeqSimilaritySearchSubHit.byScore);

        SeqSimilaritySearchSubHit best = subs[subs.length - 1];
        double sc = best.getScore();
        double ev = best.getEValue();
        double pv = best.getPValue();

        // Start with index 0 value (arbitrarily)
        Strand qStrand = subs[0].getQueryStrand();
        Strand sStrand = subs[0].getSubjectStrand();
        int qStart = subs[0].getQueryStart();
        int qEnd = subs[0].getQueryEnd();
        int sStart = subs[0].getSubjectStart();
        int sEnd = subs[0].getSubjectEnd();

        // Check for any mixed or null strands
        boolean mixQueryStrand = false;
        boolean mixSubjectStrand = false;
        boolean nullQueryStrand = (qStrand == null);
        boolean nullSubjectStrand = (sStrand == null);

        // Compare all other values
        for (int i = 1; i < subs.length; i++) {
            Strand qS = subs[i].getQueryStrand();
            Strand sS = subs[i].getSubjectStrand();

            if (qS == null) {
                nullQueryStrand = true;
            }
            if (sS == null) {
                nullSubjectStrand = true;
            }

            if (qS != qStrand) {
                mixQueryStrand = true;
            }
            if (sS != sStrand) {
                mixSubjectStrand = true;
            }

            qStart = Math.min(qStart, subs[i].getQueryStart());
            qEnd = Math.max(qEnd, subs[i].getQueryEnd());

            sStart = Math.min(sStart, subs[i].getSubjectStart());
            sEnd = Math.max(sEnd, subs[i].getSubjectEnd());
        }

        // Note any mixed strand hits as unknown strand
        if (mixQueryStrand) {
            qStrand = StrandedFeature.UNKNOWN;
        }
        if (mixSubjectStrand) {
            sStrand = StrandedFeature.UNKNOWN;
        }

        // Any null strands from protein sequences
        if (nullQueryStrand) {
            qStrand = null;
        }
        if (nullSubjectStrand) {
            sStrand = null;
        }

        String subjectID = (String) hitData.get("subjectId");

        return new SimpleSeqSimilaritySearchHit(sc, ev, pv,
                                                qStart,
                                                qEnd,
                                                qStrand,
                                                sStart,
                                                sEnd,
                                                sStrand,
                                                subjectID,
                                                AnnotationFactory.makeAnnotation(hitData),
                                                subHits);
    }

    /**
     * <code>makeSubHit</code> creates a new sub-hit from the sub-hit
     * properties collected since <code>startSubHit</code>. The raw
     * "querySequence" and "subjectSequence" properties are removed
     * from the sub-hit properties and become the sub-hit's alignment;
     * the remaining properties become its annotation.
     *
     * @return a <code>SeqSimilaritySearchSubHit</code>.
     *
     * @exception BioException if the sequence type cannot be
     * determined, if either aligned sequence is missing, or if the
     * aligned sequences cannot be parsed.
     */
    private SeqSimilaritySearchSubHit makeSubHit() throws BioException {
        // Try to get a valid TokenParser
        if (tokenParser == null) {
            String identifier;
            if (subHitData.containsKey("subjectSequenceType")) {
                // Try explicit sequence type first
                identifier = (String) subHitData.get("subjectSequenceType");
            } else if (resultPreAnnotation.containsKey("program")) {
                // Otherwise try to resolve from the program name
                // (only works for Blast)
                identifier = (String) resultPreAnnotation.get("program");
            } else {
                throw new BioException("Failed to determine sequence type");
            }

            FiniteAlphabet alpha = AlphabetResolver.resolveAlphabet(identifier);
            tokenParser = alpha.getTokenization("token");
        }

        // BLASTP output has the strands set null (protein sequences)
        Strand qStrand = null;
        Strand sStrand = null;

        // Override where an explicit strand is given (FASTA DNA,
        // BLASTN)
        if (subHitData.containsKey("queryStrand")) {
            if (subHitData.get("queryStrand").equals("plus")) {
                qStrand = StrandedFeature.POSITIVE;
            } else {
                qStrand = StrandedFeature.NEGATIVE;
            }
        }

        if (subHitData.containsKey("subjectStrand")) {
            if (subHitData.get("subjectStrand").equals("plus")) {
                sStrand = StrandedFeature.POSITIVE;
            } else {
                sStrand = StrandedFeature.NEGATIVE;
            }
        }

        // Override where a frame is given as this contains strand
        // information (BLASTX for query, TBLASTN for hit, TBLASTX for
        // both)
        if (subHitData.containsKey("queryFrame")) {
            if (((String) subHitData.get("queryFrame")).startsWith("plus")) {
                qStrand = StrandedFeature.POSITIVE;
            } else {
                qStrand = StrandedFeature.NEGATIVE;
            }
        }

        if (subHitData.containsKey("subjectFrame")) {
            if (((String) subHitData.get("subjectFrame")).startsWith("plus")) {
                sStrand = StrandedFeature.POSITIVE;
            } else {
                sStrand = StrandedFeature.NEGATIVE;
            }
        }

        // Get start/end
        int qStart = Integer.parseInt((String) subHitData.get("querySequenceStart"));
        int qEnd = Integer.parseInt((String) subHitData.get("querySequenceEnd"));
        int sStart = Integer.parseInt((String) subHitData.get("subjectSequenceStart"));
        int sEnd = Integer.parseInt((String) subHitData.get("subjectSequenceEnd"));

        // The start/end coordinates from BioJava XML don't follow the
        // BioJava paradigm of start < end, with orientation given by
        // the strand property. Rather, they present start/end as
        // displayed in BLAST output, with the coordinates being
        // inverted on the reverse strand. We account for this here.
        if (qStrand == StrandedFeature.NEGATIVE) {
            int swap = qStart;
            qStart = qEnd;
            qEnd = swap;
        }

        if (sStrand == StrandedFeature.NEGATIVE) {
            int swap = sStart;
            sStart = sEnd;
            sEnd = swap;
        }

        // Get scores
        double sc = Double.NaN;
        double ev = Double.NaN;
        double pv = Double.NaN;

        if (subHitData.containsKey("score")) {
            sc = Double.parseDouble((String) subHitData.get("score"));
        }

        if (subHitData.containsKey("expectValue")) {
            String val = (String) subHitData.get("expectValue");
            // Blast sometimes uses invalid formatting such as 'e-156'
            // rather than '1e-156'
            if (val.startsWith("e")) {
                ev = Double.parseDouble("1" + val);
            } else {
                ev = Double.parseDouble(val);
            }
        }

        if (subHitData.containsKey("pValue")) {
            pv = Double.parseDouble((String) subHitData.get("pValue"));
        }

        Map labelMap = new SmallMap();

        // Note that the following is removing the raw sequences
        String querySeq = takeSequence("querySequence");
        labelMap.put(SeqSimilaritySearchSubHit.QUERY_LABEL,
                     new SimpleSymbolList(tokenParser, querySeq));

        String subjectSeq = takeSequence("subjectSequence");
        labelMap.put(hitData.get("subjectId"),
                     new SimpleSymbolList(tokenParser, subjectSeq));

        return new SimpleSeqSimilaritySearchSubHit(sc, ev, pv,
                                                   qStart,
                                                   qEnd,
                                                   qStrand,
                                                   sStart,
                                                   sEnd,
                                                   sStrand,
                                                   new SimpleAlignment(labelMap),
                                                   AnnotationFactory.makeAnnotation(subHitData));
    }

    /**
     * <code>takeSequence</code> removes a raw aligned sequence from
     * the sub-hit properties and returns it. A missing sequence is
     * rejected here so that it is never turned into the text "null"
     * and parsed as if it were sequence data.
     *
     * @param key the property key of the sequence.
     *
     * @return the sequence <code>String</code>.
     *
     * @exception BioException if there is no value for the key.
     */
    private String takeSequence(String key) throws BioException {
        Object seq = subHitData.remove(key);
        if (seq == null) {
            throw new BioException("Failed to build sub-hit: property '"
                                   + key
                                   + "' was missing");
        }

        return (String) seq;
    }
}