/*
 *                    BioJava development code
 *
 * This code may be freely distributed and modified under the
 * terms of the GNU Lesser General Public Licence.  This should
 * be distributed with the code.  If you do not have a copy,
 * see:
 *
 *      http://www.gnu.org/copyleft/lesser.html
 *
 * Copyright for this code is held jointly by the individual
 * authors.  These should be listed in @author doc comments.
 *
 * For more information on the BioJava project and its aims,
 * or to join the biojava-l mailing list, visit the home page
 * at:
 *
 *      http://www.biojava.org/
 *
 */

package org.biojavax.bio.seq.io;

import java.io.BufferedReader;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.util.NoSuchElementException;

import org.biojava.bio.BioException;
import org.biojava.bio.seq.Sequence;
import org.biojava.bio.seq.io.SymbolTokenization;
import org.biojavax.Namespace;
import org.biojavax.bio.BioEntry;
import org.biojavax.bio.seq.RichSequence;
import org.biojavax.bio.seq.RichSequenceIterator;

/**
 * Parses a stream into sequences.
 * <p>
 * This object implements RichSequenceIterator, so you can loop over each
 * sequence produced. It consumes a stream, and uses a RichSequenceFormat to
 * extract each sequence from the stream.
 * <p>
 * It is assumed that the stream contains sequences that can all be handled by
 * the one format, and that they are not separated other than by delimiters
 * that the format can handle.
 * <p>
 * Sequences are instantiated when they are requested by one of the
 * <code>next...</code> methods, not before, and this reader keeps no reference
 * to the sequences it has returned. It is therefore suitable for
 * sequence-by-sequence processing of very large files (for example a gigabyte
 * fasta file) without RichStreamReader requiring earlier sequences to stay in
 * memory.
 *
 * @author Matthew Pocock
 * @author Thomas Down
 * @author Richard Holland
 * @since 1.5
 */
public class RichStreamReader implements RichSequenceIterator {

    /**
     * The namespace handed to the format on every read.
     */
    private final Namespace ns;

    /**
     * The symbol parser.
     */
    private final SymbolTokenization symParser;

    /**
     * The sequence format.
     */
    private final RichSequenceFormat format;

    /**
     * The sequence-builder factory.
     */
    private final RichSequenceBuilderFactory sf;

    /**
     * The stream of data to parse.
     */
    private final BufferedReader reader;

    /**
     * Flag indicating if more sequences are available. Starts out as
     * <code>true</code> and is afterwards overwritten with whatever the
     * format reports at the end of each successful read.
     */
    private boolean moreSequenceAvailable = true;

    /**
     * Creates a new stream reader on the given input stream, which will attempt to read
     * sequences in the given format, having symbols from the given tokenization, and
     * pass them to the given factory to be transformed into RichSequence objects in
     * the given namespace.
     * <p>
     * The stream is wrapped in an InputStreamReader created without an explicit
     * charset, and then in a BufferedReader.
     *
     * @param is the input stream to read from
     * @param format the input file format
     * @param symParser the tokenizer that understands the sequence symbols in the file
     * @param sf the factory that will build the sequences
     * @param ns the namespace the sequences will be loaded into.
     */
    public RichStreamReader(InputStream is,
                            RichSequenceFormat format,
                            SymbolTokenization symParser,
                            RichSequenceBuilderFactory sf,
                            Namespace ns) {
        this(new BufferedReader(new InputStreamReader(is)), format, symParser, sf, ns);
    }

    /**
     * Creates a new stream reader on the given reader, which will attempt to read
     * sequences in the given format, having symbols from the given tokenization, and
     * pass them to the given factory to be transformed into RichSequence objects in
     * the given namespace.
     *
     * @param reader the reader to read from
     * @param format the input file format
     * @param symParser the tokenizer that understands the sequence symbols in the file
     * @param sf the factory that will build the sequences
     * @param ns the namespace the sequences will be loaded into.
     */
    public RichStreamReader(BufferedReader reader,
                            RichSequenceFormat format,
                            SymbolTokenization symParser,
                            RichSequenceBuilderFactory sf,
                            Namespace ns) {
        this.ns = ns;
        this.sf = sf;
        this.symParser = symParser;
        this.format = format;
        this.reader = reader;
    }

    /**
     * {@inheritDoc}
     * <p>
     * Reports the value of the internal "more sequences" flag. The flag is
     * <code>true</code> until a read has completed and the format has reported
     * that nothing further is available.
     */
    public boolean hasNext() {
        return this.moreSequenceAvailable;
    }

    /**
     * {@inheritDoc}
     * <p>
     * Delegates to {@link #nextRichSequence()}.
     */
    public Sequence nextSequence() throws NoSuchElementException, BioException {
        return this.nextRichSequence();
    }

    /**
     * {@inheritDoc}
     * <p>
     * Delegates to {@link #nextRichSequence()}.
     */
    public BioEntry nextBioEntry() throws NoSuchElementException, BioException {
        return this.nextRichSequence();
    }

    /**
     * {@inheritDoc}
     * <p>
     * Obtains a fresh builder from the factory, lets the format read one
     * sequence from the underlying reader into it, records whether the format
     * reported more data, and returns the sequence made by the builder.
     *
     * @throws NoSuchElementException if a previous read reported that no more
     *         sequences are available
     * @throws BioException wrapping any exception raised while creating the
     *         builder, reading from the stream, or building the sequence
     */
    public RichSequence nextRichSequence() throws NoSuchElementException, BioException {
        // Checked outside the try block so that this exception is not wrapped.
        if (!this.moreSequenceAvailable) {
            throw new NoSuchElementException("Stream is empty");
        }

        final RichSequence parsed;
        try {
            final RichSequenceBuilder seqBuilder =
                    (RichSequenceBuilder) this.sf.makeSequenceBuilder();
            // The format's return value tells us whether another read is worthwhile.
            this.moreSequenceAvailable =
                    this.format.readRichSequence(this.reader, this.symParser, seqBuilder, this.ns);
            parsed = seqBuilder.makeRichSequence();
        } catch (Exception e) {
            throw new BioException("Could not read sequence", e);
        }
        return parsed;
    }
}