/*
 *                    BioJava development code
 *
 * This code may be freely distributed and modified under the
 * terms of the GNU Lesser General Public Licence.  This should
 * be distributed with the code.  If you do not have a copy,
 * see:
 *
 *      http://www.gnu.org/copyleft/lesser.html
 *
 * Copyright for this code is held jointly by the individual
 * authors.  These should be listed in @author doc comments.
 *
 * For more information on the BioJava project and its aims,
 * or to join the biojava-l mailing list, visit the home page
 * at:
 *
 *      http://www.biojava.org/
 *
 */

package org.biojava.bio.seq.io;

import java.util.ArrayList;
import java.util.List;
import java.util.Map;

import org.biojava.bio.BioError;
import org.biojava.bio.BioException;
import org.biojava.bio.seq.Feature;
import org.biojava.bio.seq.RemoteFeature;
import org.biojava.bio.seq.StrandedFeature;
import org.biojava.bio.symbol.BetweenLocation;
import org.biojava.bio.symbol.FuzzyLocation;
import org.biojava.bio.symbol.FuzzyPointLocation;
import org.biojava.bio.symbol.Location;
import org.biojava.bio.symbol.LocationTools;
import org.biojava.bio.symbol.PointLocation;
import org.biojava.bio.symbol.RangeLocation;
import org.biojava.utils.ChangeVetoException;
import org.biojava.utils.SmallMap;

/**
 * <code>EmblLikeLocationParser</code> parses EMBL/Genbank style
 * locations. Supported location forms:
 *
 * <pre>
 *   123
 *  &lt;123 or &gt;123
 *  (123.567)
 *  (123.567)..789
 *   123..(567.789)
 *  (123.345)..(567.789)
 *   123..456
 *  &lt;123..567 or 123..&gt;567 or &lt;123..&gt;567
 *   123^567
 *   AL123465:(123..567)
 * </pre>
 *
 * <p>All parsing state is kept in instance fields which are mutated
 * without synchronisation, so an instance must not be shared between
 * threads while a parse is in progress. An instance may be reused for
 * successive calls to {@link #parseLocation}; the working state is
 * reset at the start of every call.</p>
 *
 * @author Keith James
 * @author Greg Cox
 * @since 1.2
 * @deprecated Use org.biojavax.bio.seq.io framework instead
 */
public class EmblLikeLocationParser
{
    // For the LocationLexer inner class
    private String location;
    private LocationLexer lexer;
    private int nextCharIndex;
    private Object thisToken;

    // seq ID of the sequence we are parsing features for
    private String parentSeqID;

    // Stores join/order/complement instructions
    private List instructStack = new ArrayList();

    // joinType is a hack to store if a compound location is a
    // join(...) or an order(...) location. If this isn't sufficient
    // for your needs, feel free to fix it. If you do, please make
    // sure the AbstractGenEmblFileFormer class is still able to
    // format join and order correctly. The joinType is stored in the
    // Feature Annotation under the internal data key
    // Feature.PROPERTY_DATA_KEY which means that it won't get printed
    // in flatfile dumps.
    private String joinType = null;

    // List of sublocations. Used for compound locations on the current
    // sequence
    private List subLocations = new ArrayList();

    // List of subRegions. Used to store remote regions
    private List subRegions = new ArrayList();

    // These hold working data for each (sub)location and are cleared
    // by resetCoordState(), which processCoords() calls once a
    // (sub)location has been built
    private String mRegionSeqID;
    private List startCoords = new ArrayList();
    private List endCoords = new ArrayList();
    private boolean isPointLoc = true;
    private boolean fuzzyCoord = false;
    private boolean unboundMin = false;
    private boolean unboundMax = false;
    private boolean isBetweenLocation = false;

    // Currently set per Feature; this is a deficiency in the current
    // parser. Features are assumed to be on the positive strand until
    // complemented.
    // No features have a strand type of UNKNOWN
    private StrandedFeature.Strand mStrandType = StrandedFeature.POSITIVE;

    /**
     * Creates a parser for locations of features on the given sequence.
     *
     * @param parentSeqID the ID recorded on the regions created for
     * sub-locations that carry no accession prefix of their own.
     */
    EmblLikeLocationParser(String parentSeqID)
    {
        this.lexer = new LocationLexer();
        this.parentSeqID = parentSeqID;
    }

    /**
     * <code>parseLocation</code> creates a <code>Location</code> from
     * the String and stores it in the template. If the template is a
     * <code>StrandedFeature.Template</code> its strand is set as well.
     * When the location refers to other sequences a new
     * <code>RemoteFeature.Template</code> wrapping the supplied one is
     * returned instead of the supplied template.
     *
     * @param location a location <code>String</code>.
     * @param theTemplate the template to be filled with the parsed out location
     * information.
     *
     * @return the filled template; either <code>theTemplate</code>
     * itself or a <code>RemoteFeature.Template</code> built from it.
     *
     * @exception BioException if an error occurs.
     */
    public Feature.Template parseLocation(String location, Feature.Template theTemplate)
        throws BioException
    {
        this.location = location;

        // NOTE: a balanced-parentheses check used to live here; it was
        // removed (fixme: mrp) because it may have been hurting
        // performance.

        nextCharIndex = 0;

        instructStack.clear();
        subLocations.clear();
        subRegions.clear();
        // A previous parse that ended with an exception never reached
        // the clean-up at the end of processCoords(); start from a
        // known state so its leftovers cannot leak into this parse.
        resetCoordState();
        // 'join' vs. 'order'
        joinType = null;

        thisToken = nextToken();
        while (thisToken != null)
        {
            if (thisToken instanceof String)
            {
                String toke = (String) thisToken;
                if (toke.equals(".."))
                {
                    // This token indicates that this isn't a point
                    isPointLoc = false;
                }
                else
                {
                    instructStack.add(thisToken);
                }
            }
            else if (thisToken instanceof Integer)
            {
                // Coordinates seen before ".." belong to the start,
                // those after it to the end
                if (isPointLoc)
                    startCoords.add(thisToken);
                else
                    endCoords.add(thisToken);
            }
            else if (thisToken instanceof Character)
            {
                char toke = ((Character) thisToken).charValue();

                switch (toke)
                {
                    case '(':
                        break;

                    case ':':
                        processInstructs();
                        break;

                    case '^':
                        isBetweenLocation = true;
                        break;

                    case '<':
                        unboundMin = true;
                        break;

                    case '>':
                        unboundMax = true;
                        break;

                    case '.':
                        // Catch range: (123.567)
                        fuzzyCoord = true;
                        break;

                    case ',':
                        processCoords();
                        break;

                    case ')':
                        // Catch the end of range: (123.567)
                        if (fuzzyCoord)
                        {
                            fuzzyCoord = false;
                        }
                        else
                        {
                            processCoords();
                            processInstructs();
                        }
                        break;

                    default:
                        throw new BioException("Unknown character '"
                                               + toke
                                               + "' within location: "
                                               + location);
                }
            }
            thisToken = nextToken();
        }
        // Flush whatever coordinates are still pending
        processCoords();

        // The location has been processed, and now the template gets filled
        if (subLocations.size() == 1)
        {
            theTemplate.location = (Location) subLocations.get(0);
        }
        else
        {
            // EMBL ordering is in reverse on the complementary strand
            // but LocationTools sorts them anyway
            theTemplate.location = LocationTools.union(subLocations);
        }

        if (theTemplate instanceof StrandedFeature.Template)
        {
            ((StrandedFeature.Template) theTemplate).strand = mStrandType;
        }

        // Every sub-location adds a region, but only local ones add a
        // location, so more regions than locations means at least one
        // region is remote
        if (subRegions.size() > subLocations.size())
        {
            // This is a remote feature, so a new template has to be made
            RemoteFeature.Template newTemplate = new RemoteFeature.Template(theTemplate);
            newTemplate.regions = new ArrayList(subRegions);
            // FIXME:
            // I don't know how to create an appropriate resolver, so I'm leaving it
            // blank.  No doubt this will break things.
            // -- Gcox
            newTemplate.resolver = null;
            theTemplate = newTemplate;
        }

        if (joinType != null)
        {
            try
            {
                // Feature.PROPERTY_DATA_KEY signifies internal (not
                // for flatfile) data
                // NOTE(review): assumes theTemplate.annotation is
                // non-null here - confirm against callers
                Map dat = new SmallMap();
                dat.put("JoinType", joinType);
                theTemplate.annotation.setProperty(Feature.PROPERTY_DATA_KEY, dat);
            }
            catch (ChangeVetoException cve)
            {
                throw new BioError(cve);
            }
        }
        return theTemplate;
    }

    /**
     * Fetches the next token from the lexer, converting the unchecked
     * <code>NumberFormatException</code> raised for a coordinate that
     * does not fit in an <code>int</code> into a
     * <code>BioException</code>.
     *
     * @return the next token, or null when the input is exhausted.
     *
     * @exception BioException if a coordinate cannot be parsed.
     */
    private Object nextToken()
        throws BioException
    {
        try
        {
            return lexer.getNextToken();
        }
        catch (NumberFormatException nfe)
        {
            throw new BioException(nfe);
        }
    }

    /**
     * Returns the per-(sub)location working data to its initial state.
     */
    private void resetCoordState()
    {
        startCoords.clear();
        endCoords.clear();
        mRegionSeqID = null;
        isPointLoc = true;
        unboundMin = false;
        unboundMax = false;
        fuzzyCoord = false;
        isBetweenLocation = false;
        mStrandType = StrandedFeature.POSITIVE;
    }

    /**
     * <code>processCoords</code> uses the coordinate data in the
     * start/endCoords Lists to create a Location and add to the
     * subLocations List. As this code will require further
     * modification to support fuzzy point locations, please keep any
     * changes well-commented.
     *
     * <p>A location without a sequence ID prefix is added to both
     * subLocations and subRegions; one with a prefix is added to
     * subRegions only, flagged as remote. The working data is reset
     * afterwards.</p>
     *
     * @exception BioException if an error occurs.
     */
    private void processCoords()
        throws BioException
    {
        int outerMin, innerMin, innerMax, outerMax;

        Location createdLocation = null;

        // This is expected where two calls to processCoords() are
        // made sequentially e.g. where two levels of parens are
        // closed. The second call will have no data to process.
        if (startCoords.isEmpty() && endCoords.isEmpty())
            return;

        // Range of form 5^6 or 5^7
        if (isBetweenLocation)
        {
            // Both coordinates arrive in startCoords because no ".."
            // separates them; anything less is malformed input
            if (startCoords.size() < 2)
            {
                throw new BioException("Between location requires two coordinates: "
                                       + location);
            }

            // Create a ranged location, and wrap it in a between location
            int minCoord = ((Integer) startCoords.get(0)).intValue();
            int maxCoord = ((Integer) startCoords.get(1)).intValue();
            try
            {
                createdLocation = new BetweenLocation(new RangeLocation(minCoord, maxCoord));
            }
            catch (IndexOutOfBoundsException ioe)
            {
                // Same handling as for the plain range form below
                throw new BioException(ioe);
            }
        }
        // Range of form: 123
        else if (startCoords.size() == 1 && endCoords.isEmpty())
        {
            innerMin = outerMin = ((Integer) startCoords.get(0)).intValue();
            innerMax = outerMax = innerMin;

            // This looks like a point, but is actually a range which
            // lies entirely outside the current entry
            if (unboundMin || unboundMax)
            {
                createdLocation = new FuzzyPointLocation(unboundMin ? Integer.MIN_VALUE : innerMin,
                                                         unboundMax ? Integer.MAX_VALUE : innerMax,
                                                         FuzzyPointLocation.RESOLVE_AVERAGE);
            }
            else if (isPointLoc)
            {
                createdLocation = new PointLocation(innerMin);
            }
            else
            {
                throw new BioException("Internal error in location parsing; parser became confused: "
                                       + location);
            }
        }
        // Range of form: (123.567)
        else if (startCoords.size() == 2 && endCoords.isEmpty())
        {
            innerMin = outerMin = ((Integer) startCoords.get(0)).intValue();
            innerMax = outerMax = ((Integer) startCoords.get(1)).intValue();

            createdLocation = new FuzzyPointLocation(innerMin,
                                                     innerMax,
                                                     FuzzyPointLocation.RESOLVE_AVERAGE);
        }
        // Range of form: 123..567 or <123..567 or 123..>567 or <123..>567
        else if (startCoords.size() == 1 && endCoords.size() == 1)
        {
            innerMin = outerMin = ((Integer) startCoords.get(0)).intValue();
            innerMax = outerMax = ((Integer) endCoords.get(0)).intValue();

            if (unboundMin || unboundMax)
            {
                createdLocation = new FuzzyLocation(unboundMin ? Integer.MIN_VALUE : outerMin,
                                                    unboundMax ? Integer.MAX_VALUE : outerMax,
                                                    innerMin, innerMax,
                                                    FuzzyLocation.RESOLVE_INNER);
            }
            else
            {
                try
                {
                    createdLocation = new RangeLocation(outerMin, outerMax);
                }
                catch (IndexOutOfBoundsException ioe)
                {
                    throw new BioException(ioe);
                }
            }
        }
        // Range of form: (123.567)..789
        else if (startCoords.size() == 2 && endCoords.size() == 1)
        {
            outerMin = ((Integer) startCoords.get(0)).intValue();
            innerMin = ((Integer) startCoords.get(1)).intValue();
            innerMax = outerMax = ((Integer) endCoords.get(0)).intValue();

            createdLocation = new FuzzyLocation(outerMin, outerMax,
                                                innerMin, innerMax,
                                                FuzzyLocation.RESOLVE_INNER);
        }
        // Range of form: 123..(567.789)
        else if (startCoords.size() == 1 && endCoords.size() == 2)
        {
            outerMin = innerMin = ((Integer) startCoords.get(0)).intValue();
            innerMax = ((Integer) endCoords.get(0)).intValue();
            outerMax = ((Integer) endCoords.get(1)).intValue();

            createdLocation = new FuzzyLocation(outerMin, outerMax,
                                                innerMin, innerMax,
                                                FuzzyLocation.RESOLVE_INNER);
        }
        // Range of form: (123.345)..(567.789)
        else if (startCoords.size() == 2 && endCoords.size() == 2)
        {
            outerMin = ((Integer) startCoords.get(0)).intValue();
            innerMin = ((Integer) startCoords.get(1)).intValue();
            innerMax = ((Integer) endCoords.get(0)).intValue();
            outerMax = ((Integer) endCoords.get(1)).intValue();

            createdLocation = new FuzzyLocation(outerMin, outerMax,
                                                innerMin, innerMax,
                                                FuzzyLocation.RESOLVE_INNER);
        }
        else
        {
            throw new BioException("Internal error in location parsing; parser became confused; "
                                   + location);
        }

        if (mRegionSeqID == null)
        {
            // Local: contributes to the feature's own location
            subLocations.add(createdLocation);
            subRegions.add(new RemoteFeature.Region(createdLocation, parentSeqID, false));
        }
        else
        {
            // Prefixed with a sequence ID: recorded as a remote region only
            subRegions.add(new RemoteFeature.Region(createdLocation, mRegionSeqID, true));
        }

        resetCoordState();
    }

    /**
     * <code>processInstructs</code> pops an instruction off the stack
     * and applies it to the sub(locations). <code>join</code> and
     * <code>order</code> set the join type, <code>complement</code>
     * sets the strand to negative, and any other text is taken to be
     * the sequence ID of the region that follows.
     *
     * <p>An empty stack is not an error: it occurs when a closing
     * parenthesis merely groups coordinates, e.g. the one in
     * <code>AL123465:(123..567)</code>, where the accession has
     * already been consumed at the ':'.</p>
     *
     * @exception BioException if an unsupported instruction is found.
     */
    private void processInstructs()
        throws BioException
    {
        if (instructStack.isEmpty())
            return;

        String instruct = (String) instructStack.remove(instructStack.size() - 1);

        if (instruct.equals("join") || instruct.equals("order"))
        {
            joinType = instruct;
        }
        else if (instruct.equals("complement"))
        {
            // This should only set the strand for a single range
            // within a feature. However, BioJava Locations have no
            // concept of strand and therefore are unable to support
            // construction of Features where some ranges are on
            // different strands. As a result the mStrandType
            // flag currently sets the strand for the whole feature.
            mStrandType = StrandedFeature.NEGATIVE;
        }
        else
        {
            // This is a primary accession number
            // e.g. J00194:(100..202)
            mRegionSeqID = instruct;
        }
    }

    /**
     * <code>LocationLexer</code> is based on the
     * <code>LocationLexer</code> class in the Artemis source code by
     * Kim Rutherford.
     *
     * <p>It reads the enclosing parser's <code>location</code> and
     * advances its <code>nextCharIndex</code>.</p>
     *
     * @author Kim Rutherford
     * @author Keith James
     * @author Greg Cox
     * @since 1.2
     */
    private class LocationLexer
    {
        private StringBuffer intString = new StringBuffer();
        private StringBuffer textString = new StringBuffer();

        /**
         * <code>getNextToken</code> returns the next token. A null
         * indicates no more tokens. Tokens are a
         * <code>Character</code> for a single punctuation mark, the
         * <code>String</code> ".." for the range operator, an
         * <code>Integer</code> for a coordinate, or a
         * <code>String</code> for any other text.
         *
         * @return an <code>Object</code> value.
         *
         * @exception NumberFormatException if a run of digits does
         * not fit in an <code>int</code>.
         */
        Object getNextToken()
        {
            while (true)
            {
                if (nextCharIndex == location.length())
                    return null;

                char thisChar = location.charAt(nextCharIndex);

                switch (thisChar)
                {
                    case ' ' : case '\t' :
                        // Whitespace is skipped
                        nextCharIndex++;
                        continue;

                    case ':' : case '^' : case ',' :
                    case '(' : case ')' :
                    case '<' : case '>' :
                        nextCharIndex++;
                        return Character.valueOf(thisChar);

                    case '.' :
                        // Look ahead for "..", taking care not to read
                        // past the end when '.' is the last character
                        if (nextCharIndex + 1 < location.length()
                            && location.charAt(nextCharIndex + 1) == '.')
                        {
                            nextCharIndex += 2;
                            return "..";
                        }
                        nextCharIndex++;
                        return Character.valueOf('.');

                    case '0' : case '1' : case '2' : case '3' : case '4' :
                    case '5' : case '6' : case '7' : case '8' : case '9' :
                        return followInteger();

                    default :
                        String text = followText();
                        if (text.equals(""))
                        {
                            // Not a letter: hand the character back as
                            // a one-character String
                            nextCharIndex++;
                            return String.valueOf(thisChar);
                        }
                        return text;
                }
            }
        }

        /**
         * <code>followInteger</code> returns single sequence
         * coordinate, consuming the run of digits at the current
         * position.
         *
         * @return an <code>Integer</code> value.
         */
        private Integer followInteger()
        {
            intString.setLength(0);
            char thisChar = location.charAt(nextCharIndex);

            while (Character.isDigit(thisChar))
            {
                intString.append(thisChar);
                nextCharIndex++;

                if (nextCharIndex >= location.length())
                    break;

                thisChar = location.charAt(nextCharIndex);
            }
            return Integer.valueOf(intString.toString());
        }

        /**
         * <code>followText</code> returns a single text string. The
         * text must start with a letter and continues over letters,
         * digits and '.' characters.
         *
         * @return a <code>String</code> value; empty if the current
         * character is not a letter.
         */
        private String followText()
        {
            textString.setLength(0);
            char thisChar = location.charAt(nextCharIndex);

            // First character must be a letter
            if (! Character.isLetter(thisChar))
                return "";

            while (Character.isLetterOrDigit(thisChar) ||
                   thisChar == '.')
            {
                textString.append(thisChar);
                nextCharIndex++;

                if (nextCharIndex >= location.length())
                    break;

                thisChar = location.charAt(nextCharIndex);
            }
            return textString.toString();
        }
    }
}