/* * BioJava development code * * This code may be freely distributed and modified under the * terms of the GNU Lesser General Public Licence. This should * be distributed with the code. If you do not have a copy, * see: * * http://www.gnu.org/copyleft/lesser.html * * Copyright for this code is held jointly by the individual * authors. These should be listed in @author doc comments. * * For more information on the BioJava project and its aims, * or to join the biojava-l mailing list, visit the home page * at: * * http://www.biojava.org/ * */ package org.biojava.bio.seq.io; import java.io.BufferedReader; import java.io.IOException; import java.io.PrintStream; import java.io.Serializable; import java.util.Vector; import org.biojava.bio.seq.Sequence; import org.biojava.bio.symbol.IllegalSymbolException; import org.biojava.utils.ParseErrorEvent; import org.biojava.utils.ParseErrorListener; /** * Format reader for GenBank files. Converted from the old style io to * the new by working from EmblLikeFormat. * * @author Thomas Down * @author Thad Welch * Added GenBank header info to the sequence annotation. The ACCESSION header * tag is not included. Stored in sequence.getName(). * @author Greg Cox * @author Keith James * @author Matthew Pocock * @author Ron Kuhn * @deprecated Use org.biojavax.bio.seq.io.GenbankFormat */ public class GenbankFormat implements SequenceFormat, Serializable, org.biojava.utils.ParseErrorListener, org.biojava.utils.ParseErrorSource { public static final String DEFAULT = "GENBANK"; protected static final String LOCUS_TAG = "LOCUS"; protected static final String SIZE_TAG = "SIZE"; protected static final String STRAND_NUMBER_TAG = "STRANDS"; protected static final String TYPE_TAG = "TYPE"; protected static final String CIRCULAR_TAG = "CIRCULAR"; protected static final String DIVISION_TAG = "DIVISION"; protected static final String DATE_TAG = "MDAT"; protected static final String ACCESSION_TAG = "ACCESSION"; protected static final String VERSION_TAG = "VERSION"; protected static final String GI_TAG = "GI"; protected static final String KEYWORDS_TAG = "KW"; protected static final String DEFINITION_TAG = "DEFINITION"; protected static final String SOURCE_TAG = "SOURCE"; protected static final String ORGANISM_TAG = "ORGANISM"; protected static final String REFERENCE_TAG = "REFERENCE"; protected static final String COORDINATE_TAG = "COORDINATE"; protected static final String REF_ACCESSION_TAG = ""; protected static final String AUTHORS_TAG = "AUTHORS"; protected static final String TITLE_TAG = "TITLE"; protected static final String JOURNAL_TAG = "JOURNAL"; protected static final String PUBMED_TAG = "PUBMED"; protected static final String MEDLINE_TAG = "MEDLINE"; protected static final String COMMENT_TAG = "COMMENT"; protected static final String FEATURE_TAG = "FEATURES"; protected static final String BASE_COUNT_TAG = "BASE"; protected static final String FEATURE_FLAG = "FT"; protected static final String START_SEQUENCE_TAG = "ORIGIN"; protected static final String END_SEQUENCE_TAG = "//"; protected static final String FEATURE_LINE_PREFIX = " "; private Vector mListeners = new Vector(); private boolean elideSymbols = false; /** * Reads a sequence from the specified reader using the Symbol * parser and Sequence Factory provided. The sequence read in must * be in Genbank format. * * @return boolean True if there is another sequence in the file; false * otherwise */ public boolean readSequence(BufferedReader reader, SymbolTokenization symParser, SeqIOListener listener) throws IllegalSymbolException, IOException, ParseException { String line; boolean hasAnotherSequence = true; boolean hasInternalWhitespace = false; GenbankContext ctx = new GenbankContext(symParser, listener); ctx.addParseErrorListener(this); ctx.setElideSymbols(this.getElideSymbols()); listener.startSequence(); while ((line = reader.readLine()) != null) { if (line.startsWith(END_SEQUENCE_TAG)) { // To close the StreamParser encapsulated in the // GenbankContext object ctx.processLine(line); // Allows us to tolerate trailing whitespace without // thinking that there is another Sequence to follow while (true) { reader.mark(1); int c = reader.read(); if (c == -1) { hasAnotherSequence = false; break; } if (Character.isWhitespace((char) c)) { hasInternalWhitespace = true; continue; } if (hasInternalWhitespace) System.err.println("Warning: whitespace found between sequence entries"); reader.reset(); break; } listener.endSequence(); return hasAnotherSequence; } ctx.processLine(line); } throw new IOException("Premature end of stream for GENBANK"); } public void writeSequence(Sequence seq, PrintStream os) throws IOException { writeSequence(seq, getDefaultFormat(), os); } /** * writeSequence writes a sequence to the specified * PrintStream, using the specified format. * * @param seq a Sequence to write out. * @param format a String indicating which sub-format * of those available from a particular * SequenceFormat implemention to use when * writing. * @param os a PrintStream object. * * @exception IOException if an error occurs. * @deprecated use writeSequence(Sequence seq, PrintStream os) */ public void writeSequence(Sequence seq, String format, PrintStream os) throws IOException { SeqFileFormer former; if (format.equalsIgnoreCase("GENBANK")) former = new GenbankFileFormer(); else if (format.equalsIgnoreCase("GENPEPT")) former = new GenpeptFileFormer(); else if (format.equalsIgnoreCase("REFSEQ:PROTEIN")) former = new ProteinRefSeqFileFormer(); else throw new IllegalArgumentException("Unknown format '" + format + "'"); former.setPrintStream(os); SeqIOEventEmitter emitter = new SeqIOEventEmitter(GenEmblPropertyComparator.INSTANCE, GenEmblFeatureComparator.INSTANCE); emitter.getSeqIOEvents(seq, former); } /** * getDefaultFormat returns the String identifier for * the default format. * * @return a String. * @deprecated */ public String getDefaultFormat() { return DEFAULT; } /** * Adds a parse error listener to the list of listeners if it isn't already * included. * * @param theListener Listener to be added. */ public synchronized void addParseErrorListener(ParseErrorListener theListener) { if (mListeners.contains(theListener) == false) { mListeners.addElement(theListener); } } /** * Removes a parse error listener from the list of listeners if it is * included. * * @param theListener Listener to be removed. */ public synchronized void removeParseErrorListener( ParseErrorListener theListener) { if (mListeners.contains(theListener) == true) { mListeners.removeElement(theListener); } } /** * This method determines the behaviour when a bad line is processed. * Some options are to log the error, throw an exception, ignore it * completely, or pass the event through. *

* This method should be overwritten when different behavior is desired. * * @param theEvent The event that contains the bad line and token. */ public void BadLineParsed(org.biojava.utils.ParseErrorEvent theEvent) { notifyParseErrorEvent(theEvent); } // Protected methods /** * Passes the event on to all the listeners registered for ParseErrorEvents. * * @param theEvent The event to be handed to the listeners. */ protected void notifyParseErrorEvent(ParseErrorEvent theEvent) { Vector listeners; synchronized(this) { listeners = (Vector)mListeners.clone(); } int lnrCount = listeners.size(); for (int index = 0; index < lnrCount; index++) { ParseErrorListener client = (ParseErrorListener)listeners.elementAt(index); client.BadLineParsed(theEvent); } } public boolean getElideSymbols() { return elideSymbols; } /** * Use this method to toggle reading of sequence data. If you're only * interested in header data set to true. * @param elideSymbols set to true if you don't want the sequence data. */ public void setElideSymbols(boolean elideSymbols) { this.elideSymbols = elideSymbols; } }