/* * BioJava development code * * This code may be freely distributed and modified under the * terms of the GNU Lesser General Public Licence. This should * be distributed with the code. If you do not have a copy, * see: * * http://www.gnu.org/copyleft/lesser.html * * Copyright for this code is held jointly by the individual * authors. These should be listed in @author doc comments. * * For more information on the BioJava project and its aims, * or to join the biojava-l mailing list, visit the home page * at: * * http://www.biojava.org/ * */ package org.biojava.bio.seq.db.emblcd; import java.io.IOException; import java.io.InputStream; /** *

EmblCDROMIndexReader is an abstract class whose * concrete subclasses read EMBL CD-ROM format indices from an * underlying InputStream. This format is used by the * EMBOSS package for database indexing (see programs dbiblast, * dbifasta, dbiflat and dbigcg). Indexing produces four binary files * with a simple format:

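 * <ul>
 *   <li>division.lkp : master index</li>
 *   <li>entrynam.idx : sequence ID index</li>
 *   <li>acnum.trg : accession number index</li>
 *   <li>acnum.hit : accession number auxiliary index</li>
 * </ul>
 *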

 * Internally EMBOSS checks for big-endian architectures and
 * switches the byte order to little-endian. This means trouble if you
 * try to read the file using DataInputStream, but at
 * least the binaries are consistent across architectures. This class
 * carries out the necessary conversion.

* *

The EMBL CD-ROM format stores the date in 4 bytes. One byte is * unused (the first one), leaving one byte for the day, one for the * month and one (!) for the year.

* *

For further information see the EMBOSS documentation, or for a * full description, the source code of the dbi programs and the Ajax * library.

*
 * @author Keith James
 * @since 1.2
 */
public abstract class EmblCDROMIndexReader
{
    /**
     * Number of padding bytes following the parsed header fields
     * (4 + 4 + 2 + 20 + 10 + 4 = 44 bytes of fields, 300 bytes of
     * header in total).
     */
    private static final int HEADER_PADDING = 256;

    protected InputStream  input;
    protected StringBuffer sb;
    protected RecordParser recParser;

    // Header fields (scratch buffers reused while parsing the header)
    private final byte []      int4 = new byte [4];
    private final byte []      int2 = new byte [2];
    private final byte []    dbName = new byte [20];
    private final byte [] dbRelease = new byte [10];
    private final byte []    dbDate = new byte [4];

    // Record field; allocated once the record length is known and
    // then reused for every record read
    private byte [] record;

    // Values parsed from the header and cached
    private long fileLength;
    private long recordCount;
    private int  recordLength;

    private String name;
    private String release;
    private String date;

    /**
     * Creates a new <code>EmblCDROMIndexReader</code> instance and
     * immediately parses the index header from the stream. A
     * <code>BufferedInputStream</code> is probably the most
     * suitable.
     *
     * @param input an <code>InputStream</code> positioned at the
     * start of the index file.
     *
     * @exception IOException if an error occurs, including the
     * stream ending before the complete header has been read.
     */
    public EmblCDROMIndexReader(InputStream input)
        throws IOException
    {
        this.input = input;

        sb        = new StringBuffer(512);
        recParser = new RecordParser();

        parseHeader();
    }

    /**
     * <code>readFileLength</code> returns the file length in bytes
     * (stored within the file's header by the indexing program). This
     * may be called more than once as the value is cached.
     *
     * @return a <code>long</code>.
     */
    public long readFileLength()
    {
        return fileLength;
    }

    /**
     * <code>readRecordCount</code> returns the number of records in
     * the file. This may be called more than once as the value is
     * cached.
     *
     * @return a <code>long</code>.
     */
    public long readRecordCount()
    {
        return recordCount;
    }

    /**
     * <code>readRecordLength</code> returns the record length
     * (bytes). This may be called more than once as the value is
     * cached.
     *
     * @return an <code>int</code>.
     */
    public int readRecordLength()
    {
        return recordLength;
    }

    /**
     * <code>readDBName</code> returns the database name from the
     * index header. This may be called more than once as the value is
     * cached.
     *
     * @return a <code>String</code>.
     */
    public String readDBName()
    {
        return name;
    }

    /**
     * <code>readDBRelease</code> returns the database release from
     * the index header. This may be called more than once as the
     * value is cached.
     *
     * @return a <code>String</code>.
     */
    public String readDBRelease()
    {
        return release;
    }

    /**
     * <code>readDBDate</code> returns the date from the index
     * header. The date is stored in 4 bytes: 0, unused; 1, year; 2,
     * month; 3, day. With a 1 byte year it's not very much use and
     * I'm not sure that the EMBOSS programs set the value correctly
     * anyway. This may be called more than once as the value is
     * cached.
     *
     * @return a <code>String</code>.
     */
    public String readDBDate()
    {
        return date;
    }

    /**
     * <code>readRecord</code> returns an array of objects parsed from
     * a single record. Its content will depend on the type of index
     * file. Concrete subclasses must provide an implementation of
     * this method.
     *
     * @return an <code>Object []</code> array.
     *
     * @exception IOException if an error occurs.
     */
    public abstract Object [] readRecord() throws IOException;

    /**
     * <code>readRawRecord</code> returns the raw bytes of a single
     * record from the index. The same internal array is returned by
     * every call and is overwritten by the next read, so callers
     * which need to keep the bytes must copy them.
     *
     * @return a <code>byte []</code> array of length
     * <code>readRecordLength()</code>.
     *
     * @exception IOException if an error occurs. If the stream ends
     * before a complete record has been read the stream is closed and
     * an <code>EOFException</code> is thrown rather than returning
     * stale or partially overwritten data.
     */
    public byte [] readRawRecord() throws IOException
    {
        readFully(record);

        return record;
    }

    /**
     * <code>close</code> closes the underlying
     * <code>InputStream</code>.
     *
     * @exception IOException if an error occurs.
     */
    public void close() throws IOException
    {
        input.close();
    }

    /**
     * <code>parseHeader</code> carries out a full parse of the 300
     * byte header (common to all the index types) when first
     * encountered.
     *
     * @exception IOException if an error occurs, including the
     * stream ending before the complete header has been consumed.
     */
    private void parseHeader() throws IOException
    {
        readFully(int4);
        fileLength = recParser.parseInt4(int4);

        readFully(int4);
        recordCount = recParser.parseInt4(int4);

        readFully(int2);
        recordLength = recParser.parseInt2(int2);

        // Set up array for reading records now that we know their
        // length
        record = new byte [recordLength];

        readFully(dbName);
        sb.setLength(0);
        name = recParser.parseString(sb, dbName);

        readFully(dbRelease);
        sb.setLength(0);
        release = recParser.parseString(sb, dbRelease);

        readFully(dbDate);
        sb.setLength(0);
        date = recParser.parseDate(sb, dbDate);

        // Skip the remainder of the header (padding)
        skipFully(HEADER_PADDING);
    }

    /**
     * <code>readFully</code> fills the whole of the buffer from the
     * underlying stream. A single <code>InputStream.read</code> call
     * may return fewer bytes than requested, so reading is repeated
     * until the buffer is full.
     *
     * @param buffer a <code>byte []</code> array to fill.
     *
     * @exception IOException if an error occurs; an
     * <code>EOFException</code> if the stream ends first (the stream
     * is closed before this is thrown).
     */
    private void readFully(byte [] buffer) throws IOException
    {
        int filled = 0;

        while (filled < buffer.length)
        {
            int count = input.read(buffer, filled, buffer.length - filled);

            if (count == -1)
            {
                input.close();
                throw new java.io.EOFException("Unexpected end of EMBL CD-ROM index: expected "
                                               + buffer.length
                                               + " bytes but only "
                                               + filled
                                               + " were available");
            }

            filled += count;
        }
    }

    /**
     * <code>skipFully</code> discards exactly the requested number
     * of bytes from the underlying stream. A single
     * <code>InputStream.skip</code> call may skip fewer bytes than
     * requested (possibly none), so skipping is repeated until done.
     *
     * @param count a <code>long</code> number of bytes to discard.
     *
     * @exception IOException if an error occurs; an
     * <code>EOFException</code> if the stream ends first (the stream
     * is closed before this is thrown).
     */
    private void skipFully(long count) throws IOException
    {
        long remaining = count;

        while (remaining > 0)
        {
            long skipped = input.skip(remaining);

            if (skipped > 0)
            {
                remaining -= skipped;
                continue;
            }

            // skip() returning 0 does not by itself mean end of
            // stream; a single byte read tells the two cases apart
            if (input.read() == -1)
            {
                input.close();
                throw new java.io.EOFException("Unexpected end of EMBL CD-ROM index: "
                                               + remaining
                                               + " bytes of header padding missing");
            }

            remaining--;
        }
    }
}