/*
 * BioJava development code
 *
 * This code may be freely distributed and modified under the
 * terms of the GNU Lesser General Public Licence. This should
 * be distributed with the code. If you do not have a copy,
 * see:
 *
 * http://www.gnu.org/copyleft/lesser.html
 *
 * Copyright for this code is held jointly by the individual
 * authors. These should be listed in @author doc comments.
 *
 * For more information on the BioJava project and its aims,
 * or to join the biojava-l mailing list, visit the home page
 * at:
 *
 * http://www.biojava.org/
 *
 */
package org.biojava.bio.seq.io;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.PrintStream;
import java.io.Serializable;
import java.util.Vector;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.biojava.bio.Annotation;
import org.biojava.bio.seq.Sequence;
import org.biojava.bio.symbol.IllegalSymbolException;
import org.biojava.utils.ParseErrorEvent;
import org.biojava.utils.ParseErrorListener;

/**
 * Format object representing FASTA files. These files are almost pure
 * sequence data. The only `sequence property' reported by this parser
 * is PROPERTY_DESCRIPTIONLINE, which is the contents of the
 * sequence's description line (the line starting with a '>'
 * character). Normally, the first word of this is a sequence ID. If
 * you wish it to be interpreted as such, you should use
 * FastaDescriptionLineParser as a SeqIO filter.
 *
 * If you pass it a RichSeqIOListener, you'll get RichSequence objects
 * in return. Likewise, if you write RichSequence objects, you'll get
 * absolutely correct FASTA formatted output.
 *
 * @author Thomas Down
 * @author Matthew Pocock
 * @author Greg Cox
 * @author Lukas Kall
 * @author Richard Holland
 * @author Mark Schreiber
 * @deprecated Use org.biojavax.bio.seq.io.FastaFormat
 */
public class FastaFormat implements SequenceFormat,
                                    Serializable,
                                    org.biojava.utils.ParseErrorListener,
                                    org.biojava.utils.ParseErrorSource {

    /**
     * Identifier of the default format; this is the value returned by
     * getDefaultFormat().
     */
    public static final String DEFAULT = "FASTA";

    /**
     * Constant string which is the property key used to notify
     * listeners of the description lines of FASTA sequences.
     */
    public final static String PROPERTY_DESCRIPTIONLINE = "description_line";

    /**
     * The registered ParseErrorListeners, in registration order.
     */
    protected Vector mListeners = new Vector();

    /**
     * The line width for output. Defaults to 60.
     */
    protected int lineWidth = 60;

    /**
     * Retrieve the current line width.
     *
     * @return the line width
     */
    public int getLineWidth() {
        return lineWidth;
    }

    /**
     * Set the line width.
     * <p>
     * When writing, the lines of sequence will never be longer than the line
     * width. The value is stored as given, without validation; it is used as
     * the step between output lines when writing, so it should be positive.
     *
     * @param width the new line width
     */
    public void setLineWidth(int width) {
        this.lineWidth = width;
    }
/**
 * Reads one FASTA record from a flatfile and reports it to a
 * <code>SeqIOListener</code>, using a <code>SymbolTokenization</code>
 * to convert sequence strings to <code>Symbol</code> objects.
 * <p>
 * Empty lines in front of the header are skipped. The header line must
 * start with '&gt;'; the rest of it, trimmed, is the description. The
 * sequence name is the description up to (not including) its first
 * whitespace character. The header is validated before the listener is
 * told that a sequence has started, so a malformed header never leaves
 * the listener with a started but unfinished sequence.
 * <p>
 * Listener callbacks are issued in this order: <code>startSequence</code>,
 * <code>setName</code>, <code>addSequenceProperty</code> (with
 * PROPERTY_DESCRIPTIONLINE and the full description), the sequence data,
 * and finally <code>endSequence</code>.
 *
 * @param reader The reader that is the source of the information
 * @param symParser converts text sequence to biojava objects
 * @param siol The listener that listens for event callbacks from this class.
 * @throws org.biojava.bio.symbol.IllegalSymbolException if symParser
 * doesn't know how to convert the text sequence into biojava
 * <code>Symbol</code>s
 * @throws java.io.IOException if there is a problem reading, if the stream
 * ends before a header line is found, if the first non-empty line does not
 * start with '&gt;', or if the header carries no text after the '&gt;'
 * @throws org.biojava.bio.seq.io.ParseException not raised directly by this
 * method (malformed input is reported as IOException); presumably
 * propagated from the listener callbacks
 * @return true if reading the sequence data stopped at another header
 * rather than at the end of the stream, i.e. there is another unread
 * sequence in the source.
 */
public boolean readSequence(
    BufferedReader reader,
    SymbolTokenization symParser,
    SeqIOListener siol
) throws
    IllegalSymbolException,
    IOException,
    ParseException {
    // Skip empty lines in front of the header.
    String line = reader.readLine();
    while (line != null && line.length() == 0) {
        line = reader.readLine();
    }
    if (line == null) {
        throw new IOException("Premature stream end");
    }
    if (!line.startsWith(">")) {
        throw new IOException("Stream does not appear to contain FASTA formatted data: " + line);
    }

    String description = line.substring(1).trim();
    if (description.length() == 0) {
        // A bare ">" carries no name to report.
        throw new IOException("Stream does not appear to contain FASTA formatted data: " + line);
    }

    // The name is the leading run of non-whitespace characters. A plain
    // scan is linear in the header length and, unlike a '.'-based regular
    // expression, does not trip over U+0085, U+2028 or U+2029 in the text
    // that follows the name.
    int nameEnd = 0;
    while (nameEnd < description.length()
           && !isHeaderWhitespace(description.charAt(nameEnd))) {
        ++nameEnd;
    }
    String name = description.substring(0, nameEnd);

    siol.startSequence();
    siol.setName(name);
    siol.addSequenceProperty(PROPERTY_DESCRIPTIONLINE, description);
    boolean seenEOF = readSequenceData(reader, symParser, siol);
    siol.endSequence();
    return !seenEOF;
}

/**
 * Tells whether a character separates the sequence name from the rest of
 * the description line. The set is the one matched by the regular
 * expression class \s: space, tab, line feed, vertical tab, form feed and
 * carriage return.
 */
private static boolean isHeaderWhitespace(char c) {
    return c == ' ' || c == '\t' || c == '\n'
        || c == 0x0B || c == '\f' || c == '\r';
}
/**
 * Feeds the sequence data of the current record to the stream parser
 * obtained from <code>parser.parseStream(listener)</code>, stopping at the
 * next record header or at the end of the stream.
 * <p>
 * The reader is consumed in chunks of up to 512 characters. Within a
 * chunk, text is split on '\n' and '\r' and each piece is handed to the
 * stream parser; a line that straddles two chunks is therefore delivered
 * in two calls. When a '&gt;' is found at the start of a line, the reader
 * is reset to the start of the chunk and advanced again so that the
 * '&gt;' is the next character to be read.
 * <p>
 * Note that the scan position is 0 at the start of every chunk, so a
 * '&gt;' that happens to be the first character of a chunk is treated as
 * a header even if it does not follow a line break.
 *
 * @param r the source; positioned just after the header line on entry
 * @param parser supplies the stream parser that receives the characters
 * @param listener passed through to <code>parser.parseStream</code>
 * @return true if the end of the stream was reached, false if reading
 * stopped at the header of a following record
 * @throws IOException if reading fails or the reader cannot be
 * repositioned at the next header
 * @throws IllegalSymbolException declared for the stream parser calls
 */
private boolean readSequenceData(
    BufferedReader r,
    SymbolTokenization parser,
    SeqIOListener listener
) throws
    IOException,
    IllegalSymbolException {
    char[] cache = new char[512];
    boolean reachedEnd = false, seenEOF = false;
    StreamParser sparser = parser.parseStream(listener);

    while (!reachedEnd) {
        // Remember the start of this chunk so we can rewind to it if the
        // chunk turns out to contain the next record's header.
        r.mark(cache.length + 1);
        int bytesRead = r.read(cache, 0, cache.length);
        if (bytesRead < 0) {
            reachedEnd = seenEOF = true;
        } else {
            int parseStart = 0;
            while (parseStart < bytesRead && cache[parseStart] != '>') {
                // Find the end of the current line (or of the chunk).
                int parseEnd = parseStart;
                while (parseEnd < bytesRead &&
                       cache[parseEnd] != '\n' &&
                       cache[parseEnd] != '\r'
                ) {
                    ++parseEnd;
                }

                sparser.characters(cache, parseStart, parseEnd - parseStart);

                // Step over the line break and any further break characters
                // so parseStart lands on the first character of the next line.
                parseStart = parseEnd + 1;
                while (parseStart < bytesRead &&
                       (cache[parseStart] == '\n' ||
                        cache[parseStart] == '\r') ) {
                    ++parseStart;
                }
            }

            if (parseStart < bytesRead && cache[parseStart] == '>') {
                // Give back everything from the '>' onwards: rewind to the
                // start of the chunk, then skip what has been consumed.
                try {
                    r.reset();
                } catch (IOException ioe) {
                    IOException resetFailure = new IOException(
                        "Can't reset: " +
                        ioe.getMessage() +
                        " parseStart=" + parseStart +
                        " bytesRead=" + bytesRead
                    );
                    // Keep the original failure and its stack trace attached.
                    resetFailure.initCause(ioe);
                    throw resetFailure;
                }
                if (r.skip(parseStart) != parseStart) {
                    throw new IOException("Couldn't reset to start of next sequence");
                }
                reachedEnd = true;
            }
        }
    }

    sparser.close();
    return seenEOF;
}
/**
 * Builds the text that follows the '&gt;' on a sequence's header line.
 * When the sequence's annotation holds a PROPERTY_DESCRIPTIONLINE entry,
 * that value is returned as is (it is cast to String); when it does not,
 * the sequence's name is returned.
 *
 * @param seq the sequence to describe
 * @return the annotated description line, or else the sequence name
 */
protected String describeSequence(Sequence seq) {
    Annotation annotation = seq.getAnnotation();
    if (!annotation.containsProperty(PROPERTY_DESCRIPTIONLINE)) {
        return seq.getName();
    }
    return (String) annotation.getProperty(PROPERTY_DESCRIPTIONLINE);
}
/**
 * Writes a <code>Sequence</code> to a <code>PrintStream</code> in FASTA
 * format.
 * <p>
 * The header line is '&gt;' followed by the result of
 * <code>describeSequence(seq)</code>. The sequence text follows, broken
 * into lines of at most <code>lineWidth</code> symbols; a sequence of
 * length zero produces the header line only. Positions passed to
 * <code>seq.subStr</code> are 1-based and inclusive.
 * <p>
 * The line arithmetic is overflow-safe, so a very large line width such
 * as <code>Integer.MAX_VALUE</code> can be used to write each sequence on
 * a single line.
 *
 * @param seq the sequence to format
 * @param os the stream to write the sequence to. To print to screen use
 * <code>System.out</code>
 * @throws java.io.IOException declared for callers; the
 * <code>PrintStream</code> methods used here do not raise it themselves
 * @throws IllegalStateException if the sequence is not empty and the
 * line width is smaller than 1, since no line could then be written
 */
public void writeSequence(Sequence seq, PrintStream os)
throws IOException {
    os.print(">");
    os.println(describeSequence(seq));

    int length = seq.length();
    if (length > 0 && lineWidth < 1) {
        // A non-positive width can never advance through the sequence.
        throw new IllegalStateException(
            "Line width must be at least 1 but is " + lineWidth);
    }

    int pos = 1;
    while (pos <= length) {
        // 1 <= pos <= length here, so neither operand nor the sum can
        // overflow, however large lineWidth is.
        int end = pos + Math.min(lineWidth - 1, length - pos);
        os.println(seq.subStr(pos, end));
        if (end == length) {
            // Done; leaving here also keeps end + 1 from overflowing
            // when length is Integer.MAX_VALUE.
            break;
        }
        pos = end + 1;
    }
}
/**
 * <code>writeSequence</code> writes a sequence to the specified
 * <code>PrintStream</code>, using the specified format.
 * <p>
 * The only format accepted is the one returned by
 * <code>getDefaultFormat()</code>, compared ignoring case; the call is
 * then delegated to <code>writeSequence(Sequence, PrintStream)</code>.
 *
 * @param seq a <code>Sequence</code> to write out.
 * @param format a <code>String</code> indicating which sub-format
 * of those available from a particular
 * <code>SequenceFormat</code> implementation to use when
 * writing.
 * @param os a <code>PrintStream</code> object.
 *
 * @exception IOException if an error occurs.
 * @exception IllegalArgumentException if <code>format</code> is null or
 * does not name the default format.
 * @deprecated use writeSequence(Sequence seq, PrintStream os)
 */
public void writeSequence(Sequence seq, String format, PrintStream os)
throws IOException {
    // A null format is reported like any other unknown format instead of
    // failing with a NullPointerException.
    if (format == null || !format.equalsIgnoreCase(getDefaultFormat())) {
        throw new IllegalArgumentException("Unknown format '"
                                           + format
                                           + "'");
    }
    writeSequence(seq, os);
}
/**
 * <code>getDefaultFormat</code> returns the String identifier for
 * the default format, which is the value of the <code>DEFAULT</code>
 * constant.
 *
 * @return a <code>String</code>.
 * @deprecated
 */
public String getDefaultFormat() {
    return FastaFormat.DEFAULT;
}
/**
 * Registers a parse error listener. A listener that is already
 * registered is not added a second time.
 *
 * @param theListener Listener to be added.
 */
public synchronized void addParseErrorListener(ParseErrorListener theListener) {
    if (mListeners.contains(theListener)) {
        return;
    }
    mListeners.addElement(theListener);
}
/**
 * Unregisters a parse error listener. Nothing happens when the listener
 * is not currently registered.
 *
 * @param theListener Listener to be removed.
 */
public synchronized void removeParseErrorListener(ParseErrorListener theListener) {
    // removeElement drops the first matching entry and leaves the list
    // untouched when there is none, so no separate membership test is needed.
    mListeners.removeElement(theListener);
}
/**
* This method determines the behaviour when a bad line is processed.
* Some options are to log the error, throw an exception, ignore it
* completely, or pass the event through.
*
* This method should be overwritten when different behavior is desired. * * @param theEvent The event that contains the bad line and token. */ public void BadLineParsed(org.biojava.utils.ParseErrorEvent theEvent) { notifyParseErrorEvent(theEvent); } // Protected methods /** * Passes the event on to all the listeners registered for ParseErrorEvents. * * @param theEvent The event to be handed to the listeners. */ protected void notifyParseErrorEvent(ParseErrorEvent theEvent) { Vector listeners; synchronized(this) { listeners = (Vector)mListeners.clone(); } for (int index = 0; index < listeners.size(); index++) { ParseErrorListener client = (ParseErrorListener)listeners.elementAt(index); client.BadLineParsed(theEvent); } } }