/*
* BioJava development code
*
* This code may be freely distributed and modified under the
* terms of the GNU Lesser General Public Licence. This should
* be distributed with the code. If you do not have a copy,
* see:
*
* http://www.gnu.org/copyleft/lesser.html
*
* Copyright for this code is held jointly by the individual
* authors. These should be listed in @author doc comments.
*
* For more information on the BioJava project and its aims,
* or to join the biojava-l mailing list, visit the home page
* at:
*
* http://www.biojava.org/
*
*/
package org.biojava.bio.seq.io;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.io.PrintStream;
import java.util.regex.Pattern;
import java.util.regex.PatternSyntaxException;
import org.biojava.bio.BioError;
import org.biojava.bio.BioException;
import org.biojava.bio.seq.DNATools;
import org.biojava.bio.seq.NucleotideTools;
import org.biojava.bio.seq.ProteinTools;
import org.biojava.bio.seq.RNATools;
import org.biojava.bio.seq.Sequence;
import org.biojava.bio.seq.SequenceIterator;
import org.biojava.bio.seq.db.HashSequenceDB;
import org.biojava.bio.seq.db.IDMaker;
import org.biojava.bio.seq.db.SequenceDB;
import org.biojava.bio.symbol.Alignment;
import org.biojava.bio.symbol.Alphabet;
import org.biojava.bio.symbol.FiniteAlphabet;
import org.biojava.bio.symbol.IllegalSymbolException;
import org.biojava.utils.AssertionFailure;
import org.biojava.utils.ChangeVetoException;
/**
* A set of convenience methods for handling common file formats.
*
* @author Thomas Down
* @author Mark Schreiber
* @author Nimesh Singh
* @author Matthew Pocock
* @author Keith James
* @since 1.1
* @deprecated use org.biojavax.bio.seq.RichSequence.IOTools
*/
public final class SeqIOTools {
// Cached builder factories, one per supported format. Each starts out null
// and is populated on first use by the matching get...BuilderFactory()
// accessor of this class.
private static SequenceBuilderFactory _emblBuilderFactory;
private static SequenceBuilderFactory _genbankBuilderFactory;
private static SequenceBuilderFactory _genpeptBuilderFactory;
private static SequenceBuilderFactory _swissprotBuilderFactory;
private static SequenceBuilderFactory _fastaBuilderFactory;
/**
* Private constructor: this class only exposes static helpers and
* can't be instantiated.
*/
private SeqIOTools() {
// intentionally empty - no instances are ever created
}
/**
 * Returns the shared SequenceBuilderFactory used for EMBL files,
 * creating and caching it on first use.
 *
 * @return an {@code EmblProcessor.Factory} wrapping
 *         {@code SmartSequenceBuilder.FACTORY}
 */
public static SequenceBuilderFactory getEmblBuilderFactory() {
    SequenceBuilderFactory cached = _emblBuilderFactory;
    if (cached != null) {
        return cached;
    }
    cached = new EmblProcessor.Factory(SmartSequenceBuilder.FACTORY);
    _emblBuilderFactory = cached;
    return cached;
}
/**
 * Iterates over the sequences in an EMBL-format stream, reading the
 * symbols with the DNA tokenization.
 *
 * @param br a reader for the EMBL source or file
 * @return a {@code SequenceIterator} that iterates over each
 *         {@code Sequence} in the file
 */
public static SequenceIterator readEmbl(BufferedReader br) {
    EmblLikeFormat emblFormat = new EmblLikeFormat();
    SymbolTokenization dnaTokens = getDNAParser();
    SequenceBuilderFactory builders = getEmblBuilderFactory();
    return new StreamReader(br, emblFormat, dnaTokens, builders);
}
/**
 * Iterates over the sequences in an EMBL-format stream, reading the
 * symbols with the RNA tokenization.
 *
 * @param br a reader for the EMBL source or file
 * @return a {@code SequenceIterator} that iterates over each
 *         {@code Sequence} in the file
 */
public static SequenceIterator readEmblRNA(BufferedReader br) {
    EmblLikeFormat emblFormat = new EmblLikeFormat();
    SymbolTokenization rnaTokens = getRNAParser();
    SequenceBuilderFactory builders = getEmblBuilderFactory();
    return new StreamReader(br, emblFormat, rnaTokens, builders);
}
/**
 * Iterates over the sequences in an EMBL-format stream, reading the
 * symbols with the generic nucleotide tokenization
 * ({@code NucleotideTools.getNucleotide()}).
 *
 * @param br a reader for the EMBL source or file
 * @return a {@code SequenceIterator} that iterates over each
 *         {@code Sequence} in the file
 */
public static SequenceIterator readEmblNucleotide(BufferedReader br) {
    EmblLikeFormat emblFormat = new EmblLikeFormat();
    SymbolTokenization nucleotideTokens = getNucleotideParser();
    SequenceBuilderFactory builders = getEmblBuilderFactory();
    return new StreamReader(br, emblFormat, nucleotideTokens, builders);
}
/**
 * Returns the shared SequenceBuilderFactory used for GenBank files,
 * creating and caching it on first use.
 *
 * @return a {@code GenbankProcessor.Factory} wrapping
 *         {@code SmartSequenceBuilder.FACTORY}
 */
public static SequenceBuilderFactory getGenbankBuilderFactory() {
    SequenceBuilderFactory cached = _genbankBuilderFactory;
    if (cached != null) {
        return cached;
    }
    cached = new GenbankProcessor.Factory(SmartSequenceBuilder.FACTORY);
    _genbankBuilderFactory = cached;
    return cached;
}
/**
 * Iterates over the sequences in a Genbank-format stream, reading the
 * symbols with the DNA tokenization.
 *
 * @param br a reader for the Genbank source or file
 * @return a {@code SequenceIterator} that iterates over each
 *         {@code Sequence} in the file
 */
public static SequenceIterator readGenbank(BufferedReader br) {
    GenbankFormat genbankFormat = new GenbankFormat();
    SymbolTokenization dnaTokens = getDNAParser();
    SequenceBuilderFactory builders = getGenbankBuilderFactory();
    return new StreamReader(br, genbankFormat, dnaTokens, builders);
}
/**
 * Iterates over the sequences in a GenbankXML-format stream, reading
 * the symbols with the DNA tokenization and building sequences with the
 * Genbank builder factory.
 *
 * @param br a reader for the GenbankXML source or file
 * @return a {@code SequenceIterator} that iterates over each
 *         {@code Sequence} in the file
 */
public static SequenceIterator readGenbankXml(BufferedReader br) {
    GenbankXmlFormat xmlFormat = new GenbankXmlFormat();
    SymbolTokenization dnaTokens = getDNAParser();
    SequenceBuilderFactory builders = getGenbankBuilderFactory();
    return new StreamReader(br, xmlFormat, dnaTokens, builders);
}
/**
 * Returns the shared SequenceBuilderFactory used for Genpept files,
 * creating and caching it on first use. Genpept records are processed
 * with the same {@code GenbankProcessor} as Genbank records, but the
 * factory instance is cached separately.
 *
 * @return a {@code GenbankProcessor.Factory} wrapping
 *         {@code SmartSequenceBuilder.FACTORY}
 */
public static SequenceBuilderFactory getGenpeptBuilderFactory() {
    SequenceBuilderFactory cached = _genpeptBuilderFactory;
    if (cached != null) {
        return cached;
    }
    cached = new GenbankProcessor.Factory(SmartSequenceBuilder.FACTORY);
    _genpeptBuilderFactory = cached;
    return cached;
}
/**
 * Iterates over the sequences in a Genpept-format stream. The stream is
 * parsed as {@code GenbankFormat} with the protein tokenization.
 *
 * @param br a reader for the Genpept source or file
 * @return a {@code SequenceIterator} that iterates over each
 *         {@code Sequence} in the file
 */
public static SequenceIterator readGenpept(BufferedReader br) {
    GenbankFormat genbankFormat = new GenbankFormat();
    SymbolTokenization proteinTokens = getProteinParser();
    SequenceBuilderFactory builders = getGenpeptBuilderFactory();
    return new StreamReader(br, genbankFormat, proteinTokens, builders);
}
/**
 * Returns the shared SequenceBuilderFactory used for Swissprot files,
 * creating and caching it on first use.
 *
 * @return a {@code SwissprotProcessor.Factory} wrapping
 *         {@code SmartSequenceBuilder.FACTORY}
 */
public static SequenceBuilderFactory getSwissprotBuilderFactory() {
    SequenceBuilderFactory cached = _swissprotBuilderFactory;
    if (cached != null) {
        return cached;
    }
    cached = new SwissprotProcessor.Factory(SmartSequenceBuilder.FACTORY);
    _swissprotBuilderFactory = cached;
    return cached;
}
/**
 * Iterates over the sequences in a Swissprot-format stream. The stream
 * is parsed as {@code EmblLikeFormat} with the protein tokenization.
 *
 * @param br a reader for the Swissprot source or file
 * @return a {@code SequenceIterator} that iterates over each
 *         {@code Sequence} in the file
 */
public static SequenceIterator readSwissprot(BufferedReader br) {
    EmblLikeFormat emblLikeFormat = new EmblLikeFormat();
    SymbolTokenization proteinTokens = getProteinParser();
    SequenceBuilderFactory builders = getSwissprotBuilderFactory();
    return new StreamReader(br, emblLikeFormat, proteinTokens, builders);
}
/**
 * Returns the shared SequenceBuilderFactory used for FASTA files,
 * creating and caching it on first use.
 *
 * @return a {@code FastaDescriptionLineParser.Factory} wrapping
 *         {@code SmartSequenceBuilder.FACTORY}
 */
public static SequenceBuilderFactory getFastaBuilderFactory() {
    SequenceBuilderFactory cached = _fastaBuilderFactory;
    if (cached != null) {
        return cached;
    }
    cached = new FastaDescriptionLineParser.Factory(
            SmartSequenceBuilder.FACTORY);
    _fastaBuilderFactory = cached;
    return cached;
}
/**
 * Reads a fasta file using a caller-supplied tokenization and the
 * default FASTA builder factory.
 *
 * @param br the BufferedReader to read data from
 * @param sTok a SymbolTokenization that understands the sequences
 * @return a SequenceIterator over each sequence in the fasta file
 */
public static SequenceIterator readFasta(
        BufferedReader br, SymbolTokenization sTok) {
    FastaFormat fastaFormat = new FastaFormat();
    SequenceBuilderFactory builders = getFastaBuilderFactory();
    return new StreamReader(br, fastaFormat, sTok, builders);
}
/**
 * Reads a fasta file using a custom type of SymbolList. For example,
 * use {@code SmartSequenceBuilder.FACTORY} to emulate
 * {@code readFasta(BufferedReader, SymbolTokenization)} and
 * {@code SmartSequenceBuilder.BIT_PACKED} to force all symbols to be
 * encoded using bit-packing. The supplied factory is wrapped in a new
 * {@code FastaDescriptionLineParser.Factory} on every call.
 *
 * @param br the BufferedReader to read data from
 * @param sTok a SymbolTokenization that understands the sequences
 * @param seqFactory a factory used to build a SymbolList
 * @return a {@code SequenceIterator} that iterates over each
 *         {@code Sequence} in the file
 */
public static SequenceIterator readFasta(
        BufferedReader br,
        SymbolTokenization sTok,
        SequenceBuilderFactory seqFactory) {
    FastaFormat fastaFormat = new FastaFormat();
    FastaDescriptionLineParser.Factory descriptionParsing =
            new FastaDescriptionLineParser.Factory(seqFactory);
    return new StreamReader(br, fastaFormat, sTok, descriptionParsing);
}
/**
 * Iterates over the sequences in a FASTA-format stream of DNA
 * sequences.
 *
 * @param br the BufferedReader to read data from
 * @return a {@code SequenceIterator} that iterates over each
 *         {@code Sequence} in the file
 */
public static SequenceIterator readFastaDNA(BufferedReader br) {
    FastaFormat fastaFormat = new FastaFormat();
    SymbolTokenization dnaTokens = getDNAParser();
    SequenceBuilderFactory builders = getFastaBuilderFactory();
    return new StreamReader(br, fastaFormat, dnaTokens, builders);
}
/**
 * Iterates over the sequences in a FASTA-format stream of RNA
 * sequences.
 *
 * @param br the BufferedReader to read data from
 * @return a {@code SequenceIterator} that iterates over each
 *         {@code Sequence} in the file
 */
public static SequenceIterator readFastaRNA(BufferedReader br) {
    FastaFormat fastaFormat = new FastaFormat();
    SymbolTokenization rnaTokens = getRNAParser();
    SequenceBuilderFactory builders = getFastaBuilderFactory();
    return new StreamReader(br, fastaFormat, rnaTokens, builders);
}
/**
 * Iterates over the sequences in a FASTA-format stream of Protein
 * sequences.
 *
 * @param br the BufferedReader to read data from
 * @return a {@code SequenceIterator} that iterates over each
 *         {@code Sequence} in the file
 */
public static SequenceIterator readFastaProtein(BufferedReader br) {
    FastaFormat fastaFormat = new FastaFormat();
    SymbolTokenization proteinTokens = getProteinParser();
    SequenceBuilderFactory builders = getFastaBuilderFactory();
    return new StreamReader(br, fastaFormat, proteinTokens, builders);
}
/**
 * Creates a sequence database from a fasta file provided as an input
 * stream. This somewhat duplicates the functionality of the
 * readFastaDNA and readFastaProtein methods, but takes a stream rather
 * than a reader and returns a {@code SequenceDB} (keyed by sequence
 * name) rather than a {@code SequenceIterator}. Every sequence is held
 * in the returned database, so prefer the iterator-based methods when
 * the input is likely to be large.
 *
 * @param seqFile the stream containing the fasta formatted sequences
 * @param alpha the {@code Alphabet} of the sequences, ie DNA, RNA etc;
 *        its "token" tokenization is used for parsing
 * @return a {@code SequenceDB} containing all the {@code Sequence}s in
 *         the stream
 * @throws BioException if problems occur during reading of the stream
 * @since 1.2
 */
public static SequenceDB readFasta(InputStream seqFile, Alphabet alpha)
        throws BioException {
    HashSequenceDB database = new HashSequenceDB(IDMaker.byName);
    SequenceBuilderFactory builders =
            new FastaDescriptionLineParser.Factory(SmartSequenceBuilder.FACTORY);
    FastaFormat fastaFormat = new FastaFormat();
    SequenceIterator sequences = new StreamReader(
            seqFile, fastaFormat, alpha.getTokenization("token"), builders);

    while (sequences.hasNext()) {
        Sequence current = sequences.nextSequence();
        try {
            database.addSequence(current);
        } catch (ChangeVetoException cve) {
            // A freshly created database should never veto an addition.
            throw new AssertionFailure(
                    "Could not successfully add sequence "
                    + current.getName()
                    + " to sequence database",
                    cve);
        }
    }
    return database;
}
/**
 * Writes every sequence of a sequenceDB to an output stream in fasta
 * format.
 *
 * @param os the stream to write the fasta formatted data to
 * @param db the database of {@code Sequence}s to write
 * @throws IOException if there was an error while writing
 * @since 1.2
 */
public static void writeFasta(OutputStream os, SequenceDB db)
        throws IOException {
    new StreamWriter(os, new FastaFormat()).writeStream(db.sequenceIterator());
}
/**
 * Writes sequences from a SequenceIterator to an OutputStream in Fasta
 * format. This makes for a useful format filter where the iterator
 * produced by a StreamReader is handed straight to a StreamWriter.
 *
 * @param os the stream to write fasta formatted data to
 * @param in the source of input Sequences
 * @throws IOException if there was an error while writing
 * @since 1.2
 */
public static void writeFasta(OutputStream os, SequenceIterator in)
        throws IOException {
    new StreamWriter(os, new FastaFormat()).writeStream(in);
}
/**
 * Writes a single Sequence to an OutputStream in Fasta format by
 * wrapping it in a one-element iterator.
 *
 * @param os the OutputStream
 * @param seq the Sequence
 * @throws IOException if there was an error while writing
 */
public static void writeFasta(OutputStream os, Sequence seq)
        throws IOException {
    SequenceIterator single = new SingleSeqIterator(seq);
    writeFasta(os, single);
}
/**
 * Writes a stream of Sequences to an OutputStream in EMBL format.
 *
 * @param os the OutputStream
 * @param in a SequenceIterator
 * @throws IOException if there was an error while writing
 */
public static void writeEmbl(OutputStream os, SequenceIterator in)
        throws IOException {
    new StreamWriter(os, new EmblLikeFormat()).writeStream(in);
}
/**
 * Writes a single Sequence to an OutputStream in EMBL format by
 * wrapping it in a one-element iterator.
 *
 * @param os the OutputStream
 * @param seq the Sequence
 * @throws IOException if there was an error while writing
 */
public static void writeEmbl(OutputStream os, Sequence seq)
        throws IOException {
    SequenceIterator single = new SingleSeqIterator(seq);
    writeEmbl(os, single);
}
/**
 * Writes a stream of Sequences to an OutputStream in SwissProt format
 * (using {@code EmblLikeFormat}).
 *
 * <p>The sequences are written through a {@code PrintStream}, which
 * never throws on a failed write. Once all sequences have been written
 * the stream is flushed and its error flag is checked, so that a
 * failure of the underlying stream is reported as an
 * {@code IOException} instead of being silently lost.
 *
 * @param os the OutputStream
 * @param in a SequenceIterator
 * @throws org.biojava.bio.BioException if the {@code Sequence} cannot
 *         be converted to SwissProt format
 * @throws IOException if there was an error while writing
 */
public static void writeSwissprot(OutputStream os, SequenceIterator in)
        throws IOException, BioException {
    SequenceFormat former = new EmblLikeFormat();
    PrintStream ps = new PrintStream(os);
    while (in.hasNext()) {
        former.writeSequence(in.nextSequence(), ps);
    }
    // checkError() flushes the stream and reports any swallowed failure.
    if (ps.checkError()) {
        throw new IOException("Error while writing SwissProt formatted output");
    }
}
/**
 * Writes a single Sequence to an OutputStream in SwissProt format by
 * wrapping it in a one-element iterator.
 *
 * @param os the OutputStream
 * @param seq the Sequence
 * @throws org.biojava.bio.BioException if the {@code Sequence} cannot
 *         be written to SwissProt format
 * @throws IOException if there was an error while writing
 */
public static void writeSwissprot(OutputStream os, Sequence seq)
        throws IOException, BioException {
    SequenceIterator single = new SingleSeqIterator(seq);
    writeSwissprot(os, single);
}
/**
 * Writes a stream of Sequences to an OutputStream in Genpept format
 * (using {@code GenpeptFormat}).
 *
 * <p>The sequences are written through a {@code PrintStream}, which
 * never throws on a failed write. Once all sequences have been written
 * the stream is flushed and its error flag is checked, so that a
 * failure of the underlying stream is reported as an
 * {@code IOException} instead of being silently lost.
 *
 * @param os the OutputStream
 * @param in a SequenceIterator
 * @throws org.biojava.bio.BioException if the {@code Sequence} cannot
 *         be written to Genpept format
 * @throws IOException if there was an error while writing
 */
public static void writeGenpept(OutputStream os, SequenceIterator in)
        throws IOException, BioException {
    SequenceFormat former = new GenpeptFormat();
    PrintStream ps = new PrintStream(os);
    while (in.hasNext()) {
        former.writeSequence(in.nextSequence(), ps);
    }
    // checkError() flushes the stream and reports any swallowed failure.
    if (ps.checkError()) {
        throw new IOException("Error while writing Genpept formatted output");
    }
}
/**
 * Writes a single Sequence to an OutputStream in Genpept format by
 * wrapping it in a one-element iterator.
 *
 * @param os the OutputStream
 * @param seq the Sequence
 * @throws org.biojava.bio.BioException if the {@code Sequence} cannot
 *         be written to Genpept format
 * @throws IOException if there was an error while writing
 */
public static void writeGenpept(OutputStream os, Sequence seq)
        throws IOException, BioException {
    SequenceIterator single = new SingleSeqIterator(seq);
    writeGenpept(os, single);
}
/**
 * Writes a stream of Sequences to an OutputStream in Genbank format.
 *
 * @param os the OutputStream
 * @param in a SequenceIterator
 * @throws IOException if there was an error while writing
 */
public static void writeGenbank(OutputStream os, SequenceIterator in)
        throws IOException {
    new StreamWriter(os, new GenbankFormat()).writeStream(in);
}
/**
 * Writes a single Sequence to an OutputStream in Genbank format by
 * wrapping it in a one-element iterator.
 *
 * @param os the OutputStream
 * @param seq the Sequence
 * @throws IOException if there was an error while writing
 */
public static void writeGenbank(OutputStream os, Sequence seq)
        throws IOException {
    SequenceIterator single = new SingleSeqIterator(seq);
    writeGenbank(os, single);
}
/**
 * {@code identifyFormat} performs a case-insensitive mapping of a pair
 * of common sequence format name (such as 'embl', 'genbank' or 'fasta')
 * and alphabet name (such as 'dna', 'rna', 'protein', 'aa') to an
 * integer. The value returned will be one of the public static final
 * fields in {@code SeqIOConstants} (or {@code AlignIOConstants} for
 * 'clustal' and 'msf'), or a bitwise-or combination of a format and an
 * alphabet constant.
 *
 * <p>Formats tied to one alphabet are checked strictly: 'swissprot'
 * (or 'swiss'), 'genpept' and 'pdb' require 'aa'/'protein', and 'phred'
 * requires 'dna'; any other alphabet name is rejected with an
 * {@code IllegalArgumentException}. For all other formats,
 * {@code SeqIOConstants.UNKNOWN} is returned when either the format or
 * the alphabet name is not recognised.
 *
 * @param formatName a {@code String}; must not be null
 * @param alphabetName a {@code String}; must not be null
 * @return an {@code int} identifying the format/alphabet combination
 */
public static int identifyFormat(String formatName, String alphabetName) {
    // Formats bound to a single alphabet are resolved (or rejected) at once.
    if (formatName.equalsIgnoreCase("swissprot")
            || formatName.equalsIgnoreCase("swiss")) {
        return proteinOnlyFormat(SeqIOConstants.SWISSPROT, formatName, alphabetName);
    }
    if (formatName.equalsIgnoreCase("genpept")) {
        return proteinOnlyFormat(SeqIOConstants.GENPEPT, formatName, alphabetName);
    }
    if (formatName.equalsIgnoreCase("pdb")) {
        return proteinOnlyFormat(SeqIOConstants.PDB, formatName, alphabetName);
    }
    if (formatName.equalsIgnoreCase("phred")) {
        if (!alphabetName.equalsIgnoreCase("dna")) {
            throw illegalFormatAlphabet(formatName, alphabetName);
        }
        return SeqIOConstants.PHRED;
    }

    // Remaining formats may be combined with any recognised alphabet.
    int format;
    if (formatName.equalsIgnoreCase("raw")) {
        format = SeqIOConstants.RAW;
    } else if (formatName.equalsIgnoreCase("fasta")) {
        format = SeqIOConstants.FASTA;
    } else if (formatName.equalsIgnoreCase("nbrf")) {
        format = SeqIOConstants.NBRF;
    } else if (formatName.equalsIgnoreCase("ig")) {
        format = SeqIOConstants.IG;
    } else if (formatName.equalsIgnoreCase("embl")) {
        format = SeqIOConstants.EMBL;
    } else if (formatName.equalsIgnoreCase("genbank")) {
        format = SeqIOConstants.GENBANK;
    } else if (formatName.equalsIgnoreCase("refseq")) {
        format = SeqIOConstants.REFSEQ;
    } else if (formatName.equalsIgnoreCase("gcg")) {
        format = SeqIOConstants.GCG;
    } else if (formatName.equalsIgnoreCase("gff")) {
        format = SeqIOConstants.GFF;
    } else if (formatName.equalsIgnoreCase("clustal")) {
        format = AlignIOConstants.CLUSTAL;
    } else if (formatName.equalsIgnoreCase("msf")) {
        format = AlignIOConstants.MSF;
    } else {
        return SeqIOConstants.UNKNOWN;
    }

    int alpha;
    if (alphabetName.equalsIgnoreCase("dna")) {
        alpha = SeqIOConstants.DNA;
    } else if (alphabetName.equalsIgnoreCase("rna")) {
        alpha = SeqIOConstants.RNA;
    } else if (isProteinAlphabetName(alphabetName)) {
        alpha = SeqIOConstants.AA;
    } else {
        return SeqIOConstants.UNKNOWN;
    }
    return format | alpha;
}

/**
 * Tells whether an alphabet name denotes protein ('aa' or 'protein',
 * ignoring case).
 */
private static boolean isProteinAlphabetName(String alphabetName) {
    return alphabetName.equalsIgnoreCase("aa")
            || alphabetName.equalsIgnoreCase("protein");
}

/**
 * Returns {@code code} when the alphabet name denotes protein, and
 * otherwise rejects the format/alphabet pair.
 */
private static int proteinOnlyFormat(int code, String formatName,
                                     String alphabetName) {
    if (isProteinAlphabetName(alphabetName)) {
        return code;
    }
    throw illegalFormatAlphabet(formatName, alphabetName);
}

/**
 * Builds the exception reported for an illegal format/alphabet pair.
 */
private static IllegalArgumentException illegalFormatAlphabet(
        String formatName, String alphabetName) {
    return new IllegalArgumentException("Illegal format and alphabet "
            + "combination "
            + formatName
            + " + "
            + alphabetName);
}
/**
 * {@code getSequenceFormat} accepts a value which represents a
 * sequence format and returns the relevant {@code SequenceFormat}
 * object. A new format object is created on every call.
 *
 * @param identifier an {@code int} which represents a binary value
 *        with bits set according to the scheme described in
 *        {@code SeqIOConstants}
 * @return a {@code SequenceFormat}
 * @throws IllegalArgumentException if the identifier has no alphabet
 *         bits (upper 16 bits) or no format bits (lower 16 bits) set
 * @throws BioException if no SequenceFormat is available for the
 *         identifier
 */
public static SequenceFormat getSequenceFormat(int identifier)
        throws BioException {
    // The upper 16 bits hold the alphabet, the lower 16 bits the format.
    if ((identifier & 0xffff0000) == 0) {
        throw new IllegalArgumentException("No alphabet was set in the identifier");
    }
    if ((identifier & 0x0000ffff) == 0) {
        throw new IllegalArgumentException("No format was set in the identifier");
    }

    SequenceFormat format;
    switch (identifier) {
        case SeqIOConstants.FASTA_DNA:
        case SeqIOConstants.FASTA_RNA:
        case SeqIOConstants.FASTA_AA:
            format = new FastaFormat();
            break;
        case SeqIOConstants.EMBL_DNA:
        case SeqIOConstants.EMBL_RNA:
        case SeqIOConstants.SWISSPROT:
            // Swissprot is read and written with the EMBL-like format.
            format = new EmblLikeFormat();
            break;
        case SeqIOConstants.GENBANK_DNA:
        case SeqIOConstants.GENBANK_RNA:
            format = new GenbankFormat();
            break;
        default:
            throw new BioException("No SequenceFormat available for "
                    + "format/alphabet identifier '"
                    + identifier
                    + "'");
    }
    return format;
}
/**
 * {@code getBuilderFactory} accepts a value which represents a
 * sequence format and returns the relevant
 * {@code SequenceBuilderFactory} object (one of the cached factories
 * exposed by the get...BuilderFactory() methods).
 *
 * @param identifier an {@code int} which represents a binary value
 *        with bits set according to the scheme described in
 *        {@code SeqIOConstants}
 * @return a {@code SequenceBuilderFactory}
 * @throws IllegalArgumentException if the identifier has no alphabet
 *         bits (upper 16 bits) or no format bits (lower 16 bits) set
 * @throws BioException if no SequenceBuilderFactory is available for
 *         the identifier
 */
public static SequenceBuilderFactory getBuilderFactory(int identifier)
        throws BioException {
    // The upper 16 bits hold the alphabet, the lower 16 bits the format.
    if ((identifier & 0xffff0000) == 0) {
        throw new IllegalArgumentException("No alphabet was set in the identifier");
    }
    if ((identifier & 0x0000ffff) == 0) {
        throw new IllegalArgumentException("No format was set in the identifier");
    }

    SequenceBuilderFactory factory;
    switch (identifier) {
        case SeqIOConstants.FASTA_DNA:
        case SeqIOConstants.FASTA_RNA:
        case SeqIOConstants.FASTA_AA:
            factory = getFastaBuilderFactory();
            break;
        case SeqIOConstants.EMBL_DNA:
            factory = getEmblBuilderFactory();
            break;
        case SeqIOConstants.GENBANK_DNA:
            factory = getGenbankBuilderFactory();
            break;
        case SeqIOConstants.SWISSPROT:
            factory = getSwissprotBuilderFactory();
            break;
        case SeqIOConstants.GENPEPT:
            factory = getGenpeptBuilderFactory();
            break;
        default:
            throw new BioException("No SequenceBuilderFactory available for "
                    + "format/alphabet identifier '"
                    + identifier
                    + "'");
    }
    return factory;
}
/**
 * {@code getAlphabet} accepts a value which represents a sequence
 * format and returns the relevant {@code FiniteAlphabet} object. Only
 * the alphabet bits (upper 16 bits) of the identifier are inspected.
 *
 * @param identifier an {@code int} which represents a binary value
 *        with bits set according to the scheme described in
 *        {@code SeqIOConstants}
 * @return a {@code FiniteAlphabet}: DNA, RNA, or the protein alphabet
 *         returned by {@code ProteinTools.getTAlphabet()}
 * @throws IllegalArgumentException if no alphabet bits are set
 * @throws BioException if the alphabet bits do not denote DNA, RNA or
 *         protein
 */
public static FiniteAlphabet getAlphabet(int identifier)
        throws BioException {
    // Drop the format bits; only the alphabet part matters here.
    final int alphaType = identifier & 0xffff0000;
    if (alphaType == 0) {
        throw new IllegalArgumentException("No alphabet was set in the identifier");
    }

    if (alphaType == SeqIOConstants.DNA) {
        return DNATools.getDNA();
    }
    if (alphaType == SeqIOConstants.RNA) {
        return RNATools.getRNA();
    }
    if (alphaType == SeqIOConstants.AA) {
        return ProteinTools.getTAlphabet();
    }
    throw new BioException("No FiniteAlphabet available for "
            + "alphabet identifier '"
            + identifier
            + "'");
}
//
// The following methods provide an alternate interface for
// reading and writing sequences and alignments. (Nimesh Singh).
//
//
/**
 * Attempts to guess the filetype of a file given the name. For use
 * with the functions below that take an int fileType as a parameter.
 * EMBL and Genbank files are assumed to contain DNA sequence.
 *
 * <p>The file name is examined first (a '.em', '.dat', '.gb', '.sp',
 * '.gp', '.fa' or '.msf' fragment anywhere in the name). If the name
 * gives no hint, the first line of the file is inspected instead. An
 * empty file yields {@code SeqIOConstants.UNKNOWN}.
 *
 * @deprecated because there is no standard file naming convention
 * and guessing by file name is inherently error prone and bad.
 * @param seqFile the {@code File} to read from
 * @throws java.io.IOException if {@code seqFile} cannot be read
 * @throws java.io.FileNotFoundException if {@code seqFile} cannot be
 *         found
 * @return a value that describes the file type
 */
public static int guessFileType(File seqFile)
        throws IOException, FileNotFoundException {
    // First tries by matching an extension
    String fileName = seqFile.getName();
    try {
        if (Pattern.matches(".*\\u002eem.*", fileName)) {
            return SeqIOConstants.EMBL_DNA;
        }
        else if (Pattern.matches(".*\\u002edat.*", fileName)) {
            return SeqIOConstants.EMBL_DNA;
        }
        else if (Pattern.matches(".*\\u002egb.*", fileName)) {
            return SeqIOConstants.GENBANK_DNA;
        }
        else if (Pattern.matches(".*\\u002esp.*", fileName)) {
            return SeqIOConstants.SWISSPROT;
        }
        else if (Pattern.matches(".*\\u002egp.*", fileName)) {
            return SeqIOConstants.GENPEPT;
        }
        else if (Pattern.matches(".*\\u002efa.*", fileName)) {
            return guessFastaType(seqFile);
        }
        else if (Pattern.matches(".*\\u002emsf.*", fileName)) {
            return guessMsfType(seqFile);
        }
    } catch (PatternSyntaxException e) {
        throw new BioError("Internal error in SeqIOTools", e);
    }

    // Reads the first line of the file to guess based on content.
    // The reader is closed even if reading fails.
    String line1;
    BufferedReader br = new BufferedReader(new FileReader(seqFile));
    try {
        line1 = br.readLine();
    } finally {
        br.close();
    }
    if (line1 == null) {
        // Empty file: nothing to base a guess on.
        return SeqIOConstants.UNKNOWN;
    }

    if (line1.startsWith(">")) {
        return guessFastaType(seqFile);
    }
    else if (line1.startsWith("PileUp")) {
        return guessMsfType(seqFile);
    }
    else if (line1.startsWith("!!AA_MULTIPLE_ALIGNMENT")) {
        return AlignIOConstants.MSF_AA;
    }
    else if (line1.startsWith("!!NA_MULTIPLE_ALIGNMENT")) {
        return AlignIOConstants.MSF_DNA;
    }
    else if (line1.startsWith("ID")) {
        // A case-insensitive "PRT" anywhere in the ID line marks Swissprot.
        // regionMatches is bounds-safe, unlike charAt(i + 1) / charAt(i + 2).
        for (int i = 0; i + 3 <= line1.length(); i++) {
            if (line1.regionMatches(true, i, "PRT", 0, 3)) {
                return SeqIOConstants.SWISSPROT;
            }
        }
        return SeqIOConstants.EMBL_DNA;
    }
    else if (line1.toUpperCase().startsWith("LOCUS")) {
        // A case-insensitive "AA" anywhere in the LOCUS line marks Genpept.
        for (int i = 0; i + 2 <= line1.length(); i++) {
            if (line1.regionMatches(true, i, "AA", 0, 2)) {
                return SeqIOConstants.GENPEPT;
            }
        }
        return SeqIOConstants.GENBANK_DNA;
    }
    else if (line1.length() >= 45 &&
             line1.substring(19, 45).equalsIgnoreCase("GENETIC SEQUENCE DATA BANK")) {
        // Pass the full path: the bare name would be resolved against the
        // working directory rather than the file's own directory.
        return guessGenType(seqFile.getPath());
    }
    else {
        return SeqIOConstants.UNKNOWN;
    }
}
/**
 * Attempts to retrieve the most appropriate
 * {@code SequenceBuilderFactory} for some combination of
 * {@code Alphabet} and {@code SequenceFormat}. Alphabets are compared
 * by identity against {@code DNATools.getDNA()} and
 * {@code ProteinTools.getAlphabet()}.
 *
 * @param format currently supports {@code FastaFormat},
 *        {@code GenbankFormat}, {@code EmblLikeFormat}
 * @param alpha currently only supports the DNA and Protein alphabets
 * @return the {@code SequenceBuilderFactory}
 * @throws BioException if the combination of alpha and format is
 *         unrecognized
 * @deprecated as this essentially duplicates the operation available
 * in the method {@code getBuilderFactory(int)}.
 */
public static SequenceBuilderFactory formatToFactory(SequenceFormat format,
                                                     Alphabet alpha)
        throws BioException {
    if (format instanceof FastaFormat) {
        if (alpha == DNATools.getDNA()
                || alpha == ProteinTools.getAlphabet()) {
            return getFastaBuilderFactory();
        }
    }
    if (format instanceof GenbankFormat) {
        if (alpha == DNATools.getDNA()) {
            return getGenbankBuilderFactory();
        }
        if (alpha == ProteinTools.getAlphabet()) {
            return getGenpeptBuilderFactory();
        }
    }
    if (format instanceof EmblLikeFormat) {
        if (alpha == DNATools.getDNA()) {
            return getEmblBuilderFactory();
        }
        if (alpha == ProteinTools.getAlphabet()) {
            return getSwissprotBuilderFactory();
        }
    }
    throw new BioException("Unknown combination of"
            + " Alphabet and Format");
}
/**
 * Reads a file with the specified format and alphabet. The two names
 * are translated with {@code identifyFormat} and the result is handed
 * to {@code fileToBiojava(int, BufferedReader)}.
 *
 * @param formatName the name of the format eg genbank or swissprot
 *        (case insensitive)
 * @param alphabetName the name of the alphabet eg dna or rna or
 *        protein (case insensitive)
 * @param br a BufferedReader for the input
 * @return either an Alignment object or a SequenceIterator (depending
 *         on the format read)
 * @throws BioException if an error occurs while reading or an
 *         unrecognized format, alphabet combination is used
 * @since 1.3
 */
public static Object fileToBiojava(String formatName,
                                   String alphabetName,
                                   BufferedReader br)
        throws BioException {
    return fileToBiojava(identifyFormat(formatName, alphabetName), br);
}
/**
 * Reads a file and returns the corresponding Biojava object. You need
 * to cast it as an Alignment or a SequenceIterator as appropriate.
 *
 * @param fileType a value that describes the file type
 * @param br the reader for the input
 * @return either a {@code SequenceIterator} if the file type is a
 *         sequence file, or an {@code Alignment} if the file is a
 *         sequence alignment
 * @throws IllegalArgumentException if {@code fileType} has no alphabet
 *         bits (upper 16 bits) or no format bits (lower 16 bits) set
 * @throws org.biojava.bio.BioException if the file type is unknown or
 *         the file cannot be parsed
 */
public static Object fileToBiojava(int fileType, BufferedReader br)
        throws BioException {
    // The upper 16 bits hold the alphabet, the lower 16 bits the format.
    if ((fileType & 0xffff0000) == 0) {
        throw new IllegalArgumentException("No alphabet was set in the identifier");
    }
    if ((fileType & 0x0000ffff) == 0) {
        throw new IllegalArgumentException("No format was set in the identifier");
    }

    Object parsed;
    switch (fileType) {
        // alignment formats
        case AlignIOConstants.MSF_DNA:
        case AlignIOConstants.MSF_AA:
        case AlignIOConstants.FASTA_DNA:
        case AlignIOConstants.FASTA_AA:
            parsed = fileToAlign(fileType, br);
            break;
        // sequence formats
        case SeqIOConstants.FASTA_DNA:
        case SeqIOConstants.FASTA_AA:
        case SeqIOConstants.EMBL_DNA:
        case SeqIOConstants.GENBANK_DNA:
        case SeqIOConstants.SWISSPROT:
        case SeqIOConstants.GENPEPT:
            parsed = fileToSeq(fileType, br);
            break;
        default:
            throw new BioException("Unknown file type '"
                    + fileType
                    + "'");
    }
    return parsed;
}
/**
 * Writes a Biojava {@code SequenceIterator}, {@code SequenceDB},
 * {@code Sequence} or {@code Alignment} to an {@code OutputStream}.
 * The two names are translated with {@code identifyFormat} and the
 * result is handed to {@code biojavaToFile(int, OutputStream, Object)}.
 *
 * @param formatName eg fasta, GenBank (case insensitive)
 * @param alphabetName eg DNA, RNA (case insensitive)
 * @param os where to write to
 * @param biojava the object to write
 * @throws BioException problems getting data from the biojava object
 * @throws IOException if there are IO problems
 * @throws IllegalSymbolException a Symbol cannot be parsed
 */
public static void biojavaToFile(String formatName, String alphabetName,
                                 OutputStream os, Object biojava)
        throws BioException, IOException, IllegalSymbolException {
    biojavaToFile(identifyFormat(formatName, alphabetName), os, biojava);
}
/**
 * Converts a Biojava object to the given filetype.
 *
 * <p>For alignment file types {@code biojava} is cast to
 * {@code Alignment}. For sequence file types a {@code SequenceDB} is
 * written via its sequence iterator, a {@code Sequence} via a
 * one-element iterator, and anything else is cast to
 * {@code SequenceIterator}.
 *
 * @param fileType a value that describes the type of sequence file
 * @param os the stream to write the formatted results to
 * @param biojava a {@code SequenceIterator}, {@code SequenceDB},
 *        {@code Sequence}, or {@code Alignment}
 * @throws org.biojava.bio.BioException if {@code fileType} is unknown
 *         or {@code biojava} cannot be converted to that format
 * @throws java.io.IOException if the output cannot be written to
 *         {@code os}
 * @throws org.biojava.bio.symbol.IllegalSymbolException if
 *         {@code biojava} contains a {@code Symbol} that cannot be
 *         understood by the parser
 */
public static void biojavaToFile(int fileType, OutputStream os,
                                 Object biojava)
        throws BioException, IOException, IllegalSymbolException {
    switch (fileType) {
        case AlignIOConstants.MSF_DNA:
        case AlignIOConstants.MSF_AA:
        case AlignIOConstants.FASTA_DNA:
        case AlignIOConstants.FASTA_AA:
            alignToFile(fileType, os, (Alignment) biojava);
            return;
        case SeqIOConstants.FASTA_DNA:
        case SeqIOConstants.FASTA_AA:
        case SeqIOConstants.EMBL_DNA:
        case SeqIOConstants.GENBANK_DNA:
        case SeqIOConstants.SWISSPROT:
        case SeqIOConstants.GENPEPT: {
            // Normalise whatever was supplied to a SequenceIterator.
            SequenceIterator sequences;
            if (biojava instanceof SequenceDB) {
                sequences = ((SequenceDB) biojava).sequenceIterator();
            } else if (biojava instanceof Sequence) {
                sequences = new SingleSeqIterator((Sequence) biojava);
            } else {
                sequences = (SequenceIterator) biojava;
            }
            seqToFile(fileType, os, sequences);
            return;
        }
        default:
            throw new BioException("Unknown file type '"
                    + fileType
                    + "'");
    }
}
/**
 * Helper function for guessFileType. Decides between protein and DNA
 * fasta by looking at the second line of the file (the first line is
 * assumed to be the description line and is skipped): if it contains
 * any of the letters F, L, I, P, Q or E (ignoring case) the file is
 * taken to be protein, otherwise DNA. A file with fewer than two lines
 * is treated like one with an empty second line, i.e. as DNA.
 *
 * @param seqFile the fasta file to inspect
 * @return {@code SeqIOConstants.FASTA_AA} or
 *         {@code SeqIOConstants.FASTA_DNA}
 * @throws IOException if the file cannot be read
 * @throws FileNotFoundException if the file cannot be found
 */
private static int guessFastaType(File seqFile)
        throws IOException, FileNotFoundException {
    String line;
    BufferedReader br = new BufferedReader(new FileReader(seqFile));
    try {
        br.readLine();          // skip the description line
        line = br.readLine();   // first line of sequence data, if any
    } finally {
        // Close the reader even when reading fails.
        br.close();
    }
    if (line == null) {
        // No sequence line at all: nothing hints at protein.
        return SeqIOConstants.FASTA_DNA;
    }
    for (int i = 0; i < line.length(); i++) {
        char residue = Character.toUpperCase(line.charAt(i));
        if ("FLIPQE".indexOf(residue) != -1) {
            return SeqIOConstants.FASTA_AA;
        }
    }
    return SeqIOConstants.FASTA_DNA;
}
/**
 * Returns the "token" tokenization of the DNA alphabet.
 *
 * @return the DNA SymbolTokenization
 * @throws BioError if the tokenization cannot be obtained
 */
private static SymbolTokenization getDNAParser() {
    SymbolTokenization tokens;
    try {
        tokens = DNATools.getDNA().getTokenization("token");
    } catch (BioException ex) {
        throw new BioError("Assertion failing:"
                + " Couldn't get DNA token parser", ex);
    }
    return tokens;
}
/**
 * Returns the "token" tokenization of the RNA alphabet.
 *
 * @return the RNA SymbolTokenization
 * @throws BioError if the tokenization cannot be obtained
 */
private static SymbolTokenization getRNAParser() {
    SymbolTokenization tokens;
    try {
        tokens = RNATools.getRNA().getTokenization("token");
    } catch (BioException ex) {
        throw new BioError("Assertion failing:"
                + " Couldn't get RNA token parser", ex);
    }
    return tokens;
}
/**
 * Returns the "token" tokenization of the nucleotide alphabet.
 *
 * @return the nucleotide SymbolTokenization
 * @throws BioError if the tokenization cannot be obtained
 */
private static SymbolTokenization getNucleotideParser() {
    SymbolTokenization tokens;
    try {
        tokens = NucleotideTools.getNucleotide().getTokenization("token");
    } catch (BioException ex) {
        throw new BioError("Assertion failing:"
                + " Couldn't get nucleotide token parser", ex);
    }
    return tokens;
}
/**
 * Returns the "token" tokenization of the protein alphabet obtained
 * from {@code ProteinTools.getTAlphabet()}.
 *
 * @return the protein SymbolTokenization
 * @throws BioError if the tokenization cannot be obtained
 */
private static SymbolTokenization getProteinParser() {
    SymbolTokenization tokens;
    try {
        tokens = ProteinTools.getTAlphabet().getTokenization("token");
    } catch (BioException ex) {
        throw new BioError("Assertion failing:"
                + " Couldn't get PROTEIN token parser", ex);
    }
    return tokens;
}
/**
 * Helper function for guessFileType. Decides whether an MSF file holds
 * a DNA or a protein alignment. The first line is checked for the
 * "!!NA_MULTIPLE_ALIGNMENT" / "!!AA_MULTIPLE_ALIGNMENT" markers;
 * failing that, the first line containing "Type: " is located and the
 * character following it decides ('N' for DNA, 'P' for protein).
 *
 * @param seqFile the MSF file to inspect
 * @return {@code AlignIOConstants.MSF_DNA},
 *         {@code AlignIOConstants.MSF_AA}, or
 *         {@code AlignIOConstants.UNKNOWN} when the file is empty, has
 *         no "Type: " line, or the type is neither 'N' nor 'P'
 * @throws IOException if the file cannot be read
 * @throws FileNotFoundException if the file cannot be found
 */
private static int guessMsfType(File seqFile)
        throws IOException, FileNotFoundException {
    BufferedReader br = new BufferedReader(new FileReader(seqFile));
    try {
        String line = br.readLine();
        if (line == null) {
            // Empty file.
            return AlignIOConstants.UNKNOWN;
        }
        if (line.startsWith("!!NA_MULTIPLE_ALIGNMENT")) {
            return AlignIOConstants.MSF_DNA;
        }
        if (line.startsWith("!!AA_MULTIPLE_ALIGNMENT")) {
            return AlignIOConstants.MSF_AA;
        }

        // Scan forward to the header line carrying the "Type: " field.
        int typeLabel = line.indexOf("Type: ");
        while (typeLabel == -1) {
            line = br.readLine();
            if (line == null) {
                // Reached end of file without finding a type field.
                return AlignIOConstants.UNKNOWN;
            }
            typeLabel = line.indexOf("Type: ");
        }

        String type = line.substring(typeLabel + 6);
        if (type.startsWith("N")) {
            return AlignIOConstants.MSF_DNA;
        }
        if (type.startsWith("P")) {
            return AlignIOConstants.MSF_AA;
        }
        return AlignIOConstants.UNKNOWN;
    } finally {
        // Close the reader on every path, including the early returns.
        br.close();
    }
}
/**
 * Helper function for guessFileType. Locates the first line containing
 * "LOCUS" and reports Genpept if that line contains "AA" (ignoring
 * case), Genbank DNA otherwise.
 *
 * @param fileName the name (path) of the file to inspect
 * @return {@code SeqIOConstants.GENPEPT},
 *         {@code SeqIOConstants.GENBANK_DNA}, or
 *         {@code SeqIOConstants.UNKNOWN} when the file has no LOCUS
 *         line
 * @throws IOException if the file cannot be read
 * @throws FileNotFoundException if the file cannot be found
 */
private static int guessGenType(String fileName)
        throws IOException, FileNotFoundException {
    String locusLine;
    BufferedReader br = new BufferedReader(new FileReader(fileName));
    try {
        locusLine = br.readLine();
        while (locusLine != null && locusLine.indexOf("LOCUS") == -1) {
            locusLine = br.readLine();
        }
    } finally {
        // Close the reader even when reading fails.
        br.close();
    }
    if (locusLine == null) {
        // End of file reached without a LOCUS line.
        return SeqIOConstants.UNKNOWN;
    }
    // regionMatches is bounds-safe, unlike comparing charAt(i + 1).
    for (int i = 0; i + 2 <= locusLine.length(); i++) {
        if (locusLine.regionMatches(true, i, "AA", 0, 2)) {
            return SeqIOConstants.GENPEPT;
        }
    }
    return SeqIOConstants.GENBANK_DNA;
}
/**
 * Converts a file to a Biojava alignment, using
 * {@code MSFAlignmentFormat} for the MSF file types and
 * {@code FastaAlignmentFormat} for the fasta alignment file types.
 *
 * @param fileType one of the MSF_* or FASTA_* values of
 *        {@code AlignIOConstants}
 * @param br the reader for the input
 * @return the alignment that was read
 * @throws BioException if the file type is not a supported alignment
 *         type
 */
private static Alignment fileToAlign(int fileType, BufferedReader br)
        throws BioException {
    if (fileType == AlignIOConstants.MSF_DNA
            || fileType == AlignIOConstants.MSF_AA) {
        return (new MSFAlignmentFormat()).read(br);
    }
    if (fileType == AlignIOConstants.FASTA_DNA
            || fileType == AlignIOConstants.FASTA_AA) {
        return (new FastaAlignmentFormat()).read(br);
    }
    throw new BioException("Unknown file type '"
            + fileType
            + "'");
}
/**
 * Converts a file to a Biojava sequence iterator by dispatching to the
 * read method that matches the file type.
 *
 * @param fileType one of the sequence file type values of
 *        {@code SeqIOConstants}
 * @param br the reader for the input
 * @return an iterator over the sequences in the input
 * @throws BioException if the file type is not a supported sequence
 *         type
 */
private static SequenceIterator fileToSeq(int fileType,
                                          BufferedReader br)
        throws BioException {
    if (fileType == SeqIOConstants.FASTA_DNA) {
        return readFastaDNA(br);
    }
    if (fileType == SeqIOConstants.FASTA_AA) {
        return readFastaProtein(br);
    }
    if (fileType == SeqIOConstants.EMBL_DNA) {
        return readEmbl(br);
    }
    if (fileType == SeqIOConstants.GENBANK_DNA) {
        return readGenbank(br);
    }
    if (fileType == SeqIOConstants.SWISSPROT) {
        return readSwissprot(br);
    }
    if (fileType == SeqIOConstants.GENPEPT) {
        return readGenpept(br);
    }
    throw new BioException("Unknown file type '"
            + fileType
            + "'");
}
/**
 * Converts a Biojava alignment to the given filetype, writing it as
 * DNA or protein in MSF or fasta alignment format.
 *
 * @param fileType one of the MSF_* or FASTA_* values of
 *        {@code AlignIOConstants}
 * @param os the stream to write to
 * @param align the alignment to write
 * @throws BioException if the file type is not a supported alignment
 *         type
 * @throws IllegalSymbolException declared for the underlying writers
 */
private static void alignToFile(int fileType, OutputStream os,
                                Alignment align)
        throws BioException, IllegalSymbolException {
    if (fileType == AlignIOConstants.MSF_DNA) {
        (new MSFAlignmentFormat()).writeDna(os, align);
    } else if (fileType == AlignIOConstants.MSF_AA) {
        (new MSFAlignmentFormat()).writeProtein(os, align);
    } else if (fileType == AlignIOConstants.FASTA_DNA) {
        (new FastaAlignmentFormat()).writeDna(os, align);
    } else if (fileType == AlignIOConstants.FASTA_AA) {
        (new FastaAlignmentFormat()).writeProtein(os, align);
    } else {
        throw new BioException("Unknown file type '"
                + fileType
                + "'");
    }
}
/**
 * Converts Biojava sequences to the given filetype by dispatching to
 * the write method that matches the file type.
 *
 * @param fileType one of the sequence file type values of
 *        {@code SeqIOConstants}
 * @param os the stream to write to
 * @param seq the sequences to write
 * @throws IOException if there was an error while writing
 * @throws BioException if the file type is not a supported sequence
 *         type, or the sequences cannot be written in that format
 */
private static void seqToFile(int fileType, OutputStream os,
                              SequenceIterator seq)
        throws IOException, BioException {
    if (fileType == SeqIOConstants.FASTA_DNA
            || fileType == SeqIOConstants.FASTA_AA) {
        writeFasta(os, seq);
    } else if (fileType == SeqIOConstants.EMBL_DNA) {
        writeEmbl(os, seq);
    } else if (fileType == SeqIOConstants.SWISSPROT) {
        writeSwissprot(os, seq);
    } else if (fileType == SeqIOConstants.GENBANK_DNA) {
        writeGenbank(os, seq);
    } else if (fileType == SeqIOConstants.GENPEPT) {
        writeGenpept(os, seq);
    } else {
        throw new BioException("Unknown file type '"
                + fileType
                + "'");
    }
}
/**
 * A SequenceIterator over at most one Sequence. The sequence is handed
 * out once by {@code nextSequence()}, after which the iterator is
 * exhausted. An iterator created with a null sequence is empty from the
 * start.
 */
private static final class SingleSeqIterator
        implements SequenceIterator {
    /** The sequence still to be returned, or null once consumed. */
    private Sequence seq;

    /**
     * @param seq the single sequence to iterate over
     */
    SingleSeqIterator(Sequence seq) {
        this.seq = seq;
    }

    /**
     * @return true until the sequence has been returned
     */
    public boolean hasNext() {
        return seq != null;
    }

    /**
     * Returns the sequence and marks the iterator as exhausted.
     *
     * @return the wrapped sequence
     * @throws java.util.NoSuchElementException if the sequence has
     *         already been returned (or none was supplied)
     */
    public Sequence nextSequence() {
        if (seq == null) {
            // Fail loudly instead of silently handing out null.
            throw new java.util.NoSuchElementException(
                    "No more sequences in this iterator");
        }
        Sequence result = seq;
        seq = null;
        return result;
    }
}
}