/*
 *                    BioJava development code
 *
 * This code may be freely distributed and modified under the
 * terms of the GNU Lesser General Public Licence. This should
 * be distributed with the code. If you do not have a copy,
 * see:
 *
 *      http://www.gnu.org/copyleft/lesser.html
 *
 * Copyright for this code is held jointly by the individual
 * authors. These should be listed in @author doc comments.
 *
 * For more information on the BioJava project and its aims,
 * or to join the biojava-l mailing list, visit the home page
 * at:
 *
 *      http://www.biojava.org/
 *
 */

package org.biojava.bio.program.abi;

import java.io.DataInput;
import java.io.DataInputStream;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStream;
import java.util.Collections;
import java.util.HashMap;
import java.util.Iterator;
import java.util.Map;

import org.biojava.bio.seq.DNATools;
import org.biojava.bio.symbol.IllegalSymbolException;
import org.biojava.bio.symbol.Symbol;
import org.biojava.utils.io.CachingInputStream;
import org.biojava.utils.io.Seekable;

/**
 * A general base parser for files produced by ABI software. This includes
 * chromatograms derived from ABI sequencers and potentially other data files
 * as well. The format was described by Clark Tibbetts in his paper "Raw Data
 * File Formats, and the Digital and Analog Raw Data Streams of the ABI PRISM
 * 377 DNA Sequencer." Available online at
 * http://www-2.cs.cmu.edu/afs/cs/project/genome/WWW/Papers/clark.html
 * <p>
 * Briefly, the format consists of a set of named fixed-length "tagged data
 * records" which may contain data themselves, or pointers to data elsewhere
 * in the file. This class reads these records and exposes them to subclasses
 * through the {@link #getDataRecord} method. The attributes of the records as
 * described in Tibbetts' paper are exposed through public (final) fields of
 * {@link TaggedDataRecord} instances.
 * </p>
 * <p>
 * If a record only contains a pointer to the desired data (see
 * {@link TaggedDataRecord#hasOffsetData}), subclasses may get
 * at the raw data by using {@link TaggedDataRecord#offsetData}.
 * </p>
 * <p>
 * This parser provides methods and classes for dealing with the files as
 * streams or local files (local files being more memory-efficient).
 * </p>
 *
 * @author Rhett Sutphin (UI CBCB)
 * @author Richard Holland
 */
public class ABIFParser {
    /** Byte offset in the file of the 32-bit count of directory records. */
    private static final int RECORD_COUNT_OFFSET = 18;
    /** Byte offset in the file of the 32-bit pointer to the first directory record. */
    private static final int RECORD_OFFSET_OFFSET = 26;

    private ABIFParser.DataAccess din;
    private boolean parsed = false;
    /** Maps "tag name + tag number" strings to {@link TaggedDataRecord} instances. */
    private Map records;

    /**
     * Creates a new ABIFParser for a file.
     *
     * @param f the file to parse
     * @throws IOException if the file cannot be opened or read
     */
    public ABIFParser(File f) throws IOException {
        this(new ABIFParser.RandomAccessFile(f));
    }

    /**
     * Creates a new ABIFParser for an input stream. Note that the stream
     * will be wrapped in a {@link CachingInputStream} if it isn't one already.
     * If it is, it will be seeked to 0.
     *
     * @param in the stream to parse
     * @throws IOException if the stream cannot be read
     */
    public ABIFParser(InputStream in) throws IOException {
        this(new ABIFParser.DataStream(in));
    }

    /**
     * Creates a new ABIFParser for the specified {@link DataAccess} object.
     * If you need to read from something other than a file or a stream, you'll
     * have to implement a {@link DataAccess}-implementing class wrapping your
     * source and then pass an instance to this constructor.
     *
     * @param toParse the source of the raw data to be parsed
     * @throws IOException if the source cannot be read
     */
    public ABIFParser(ABIFParser.DataAccess toParse) throws IOException {
        din = toParse;
        readDataRecords();
    }

    /**
     * Returns the accessor for the raw data being parsed by this parser.
     *
     * @return the {@link DataAccess} this parser was constructed with
     */
    public final ABIFParser.DataAccess getDataAccess() {
        return din;
    }

    /**
     * Reads the record directory and then the out-of-line data of every
     * record that has any. {@link DataAccess#finishedReading()} is invoked
     * when reading ends, whether it succeeded or failed, so that a source
     * opened by this parser is not left open after a parse error.
     *
     * @throws IOException if the source cannot be read
     */
    private final void readDataRecords() throws IOException {
        parsed = false;
        boolean completed = false;
        try {
            // The mask must be a long literal: "0xffffffff" is the int -1, so
            // masking with it would leave the value sign-extended.
            din.seek(RECORD_COUNT_OFFSET);
            long recordCount = 0xffffffffL & din.readInt();
            din.seek(RECORD_OFFSET_OFFSET);
            long recordOffset = 0xffffffffL & din.readInt();

            din.seek(recordOffset);
            records = new HashMap();
            for (long i = 0; i < recordCount; i++) {
                TaggedDataRecord tdr = new TaggedDataRecord(din);
                records.put(new String(tdr.tagName) + tdr.tagNumber, tdr);
            }

            // Second pass: fetch the data that lives outside the directory.
            for (Iterator i = records.values().iterator(); i.hasNext(); ) {
                TaggedDataRecord record = (TaggedDataRecord) i.next();
                if (record.hasOffsetData) {
                    din.seek(record.dataRecord);
                    din.readFully(record.offsetData);
                }
            }
            completed = true;
        } finally {
            if (!completed) {
                try {
                    din.finishedReading();
                } catch (IOException ignored) {
                    // The failure that aborted parsing is the one worth reporting.
                }
            }
        }
        parsed = true;
        din.finishedReading();
    }

    /**
     * Decodes a character into a {@link Symbol} in the DNA alphabet.
     * Uses a definition of characters that is compatible with the ABI format.
     *
     * @param token the character to decode
     * @return the DNA symbol (or gap symbol) corresponding to token
     * @throws IllegalSymbolException when token isn't in
     *         { a, A, c, C, g, G, t, T, n, N, - }
     */
    public static Symbol decodeDNAToken(char token) throws IllegalSymbolException {
        switch (token) {
            case 'a': case 'A':
                return DNATools.a();
            case 'c': case 'C':
                return DNATools.c();
            case 'g': case 'G':
                return DNATools.g();
            case 't': case 'T':
                return DNATools.t();
            case 'n': case 'N':
                return DNATools.n();
            case '-':
                return DNATools.getDNA().getGapSymbol();
            default:
                throw new IllegalSymbolException("Can't decode token " + token + " into DNA");
        }
    }

    /**
     * Get the entry from the file TOC with the given name and tag number.
     *
     * @param tagName the four-character string name of the desired data record
     * @param tagNumber which one of the tags with this name to return (must be positive)
     * @throws IllegalArgumentException if tagName is null or the wrong length,
     *         or tagNumber is 0 or negative
     * @throws IllegalStateException if the initial parsing is not complete
     * @return the requested data record, or null if no such record exists
     */
    public ABIFParser.TaggedDataRecord getDataRecord(String tagName, int tagNumber)
            throws IllegalArgumentException, IllegalStateException {
        if (!parsed) {
            throw new IllegalStateException("parsing is not complete");
        }
        if (tagNumber < 1) {
            throw new IllegalArgumentException("tagNumber must be positive");
        }
        if (tagName == null || tagName.length() != 4) {
            throw new IllegalArgumentException("tagName must be 4 characters long");
        }
        return (ABIFParser.TaggedDataRecord) records.get(tagName + tagNumber);
    }

    /**
     * Obtain all data records. Keys of the map are strings consisting of
     * tag names with tag numbers concatenated immediately afterwards. Values
     * are TaggedDataRecord objects. The map has no particular order and so
     * cannot be relied on to iterate over records in the same order they
     * were read from the file.
     *
     * @return an unmodifiable view of the map of all data records.
     */
    public Map getAllDataRecords() {
        return Collections.unmodifiableMap(records);
    }

    /**
     * An aggregate type for an ABIF tagged data record. All fields are final
     * and are populated once, by the constructor. See the Tibbetts paper
     * (referenced in the javadoc for {@link ABIFParser}) for more information.
     */
    public static class TaggedDataRecord {
        public static final int DATA_TYPE_ASCII_ARRAY = 2;
        public static final int DATA_TYPE_INTEGER = 4;
        public static final int DATA_TYPE_FLOAT = 7;
        public static final int DATA_TYPE_DATE = 10;
        public static final int DATA_TYPE_TIME = 11;
        public static final int DATA_TYPE_PSTRING = 18;

        public final char[] tagName;
        public final long tagNumber;
        public final int dataType;
        public final int elementLength;
        public final long numberOfElements;
        public final long recordLength;
        public final long dataRecord;
        public final long crypticVariable;
        public final boolean hasOffsetData;
        public final byte[] offsetData;

        /**
         * Creates a new TaggedDataRecord from the next 28 bytes of
         * <code>din</code>. The 32-bit fields are stored as unsigned values.
         * When the record length exceeds four bytes, {@link #offsetData} is
         * allocated to that length but left for the caller to fill in.
         *
         * @param din the source of the raw data to be parsed
         * @throws IOException if there's a problem with <code>din</code>, or
         *         if the record declares more data than a byte array can hold
         */
        public TaggedDataRecord(ABIFParser.DataAccess din) throws IOException {
            tagName = new char[4];
            tagName[0] = (char) din.readByte();
            tagName[1] = (char) din.readByte();
            tagName[2] = (char) din.readByte();
            tagName[3] = (char) din.readByte();

            // Long masks are required to obtain unsigned 32-bit values; an
            // int mask of 0xffffffff is -1 and would keep the sign.
            tagNumber = 0xffffffffL & din.readInt();
            dataType = 0xffff & din.readShort();
            elementLength = 0xffff & din.readShort();
            numberOfElements = 0xffffffffL & din.readInt();
            recordLength = 0xffffffffL & din.readInt();
            dataRecord = 0xffffffffL & din.readInt();
            crypticVariable = 0xffffffffL & din.readInt();

            hasOffsetData = recordLength > 4L;
            if (recordLength > Integer.MAX_VALUE) {
                throw new IOException("Record " + new String(tagName) + tagNumber
                        + " declares " + recordLength
                        + " bytes of data, which is too large to load");
            }
            if (hasOffsetData) {
                offsetData = new byte[(int) recordLength];
            } else {
                offsetData = new byte[0];
            }
        }

        /**
         * A very verbose <code>toString</code> that dumps all of the
         * data in this record in a human-readable format, one field per line.
         *
         * @return the multi-line description of this record
         */
        public String toString() {
            StringBuffer sb = new StringBuffer(super.toString()).append("[\n");
            sb.append(" tagName = ").append(tagName).append('\n');
            sb.append(" tagNumber = ").append(tagNumber).append('\n');
            sb.append(" dataType = ");
            switch (dataType) {
                case DATA_TYPE_ASCII_ARRAY: sb.append("ASCII"); break;
                case DATA_TYPE_INTEGER: sb.append("INTEGER"); break;
                case DATA_TYPE_FLOAT: sb.append("FLOAT"); break;
                case DATA_TYPE_DATE: sb.append("DATE"); break;
                case DATA_TYPE_TIME: sb.append("TIME"); break;
                case DATA_TYPE_PSTRING: sb.append("PSTRING"); break;
                default: sb.append(dataType);
            }
            sb.append('\n');
            sb.append(" elementLength = ").append(elementLength).append('\n');
            sb.append(" numberOfElements= ").append(numberOfElements).append('\n');
            sb.append(" recordLength = ").append(recordLength).append('\n');
            sb.append(" dataRecord = ");
            if (recordLength <= 4) {
                // The data is stored inline in the dataRecord field itself.
                switch (dataType) {
                    case DATA_TYPE_ASCII_ARRAY:
                        if (recordLength > 3) sb.append((char) ((dataRecord >>> 24) & 0xFF));
                        if (recordLength > 2) sb.append((char) ((dataRecord >>> 16) & 0xFF));
                        if (recordLength > 1) sb.append((char) ((dataRecord >>> 8) & 0xFF));
                        sb.append((char) (dataRecord & 0xFF));
                        break;
                    case DATA_TYPE_DATE:
                        sb.append((dataRecord >>> 16) & 0xffff).append('/');
                        sb.append((dataRecord >>> 8) & 0xff).append('/');
                        sb.append(dataRecord & 0xff);
                        break;
                    case DATA_TYPE_TIME:
                        sb.append((dataRecord >>> 24) & 0xff).append(':');
                        sb.append((dataRecord >>> 16) & 0xff).append(':');
                        sb.append((dataRecord >>> 8) & 0xff);
                        break;
                    case DATA_TYPE_INTEGER:
                        // Values shorter than four bytes occupy the high-order bytes.
                        sb.append(dataRecord >>> ((4 - recordLength) * 8));
                        break;
                    default:
                        hexStringify((int) dataRecord, sb);
                }
            } else {
                // The field is a pointer to the data; show it as an address.
                hexStringify((int) dataRecord, sb);
            }
            sb.append('\n');
            sb.append(" hasOffsetData = ").append(hasOffsetData).append('\n');
            sb.append(" crypticVariable = ").append(crypticVariable).append('\n');
            sb.append(']');
            return sb.toString();
        }

        /**
         * Appends <code>l</code> to <code>sb</code> as "0x" followed by
         * eight upper-case hexadecimal digits, zero-padded on the left.
         */
        private void hexStringify(int l, StringBuffer sb) {
            sb.append("0x");
            String hex = Integer.toHexString(l).toUpperCase();
            for (int i = 8; i > hex.length(); i--) {
                sb.append('0');
            }
            sb.append(hex);
        }
    }

    /**
     * Concatenation of the {@link Seekable} and {@link DataInput} interfaces.
     */
    public static interface DataAccess extends Seekable, DataInput {
        /**
         * Called when the parser has finished reading, either because all
         * records were read or because reading failed. The access
         * may choose to close itself at this point, e.g. if it is
         * using a RandomAccessFile.
         *
         * @throws IOException if it could not do what it needs to.
         */
        public void finishedReading() throws IOException;
    }

    /** Implements DataAccess on top of a read-only {@link java.io.RandomAccessFile}. */
    private static class RandomAccessFile extends java.io.RandomAccessFile implements DataAccess {
        public RandomAccessFile(File f) throws FileNotFoundException {
            super(f, "r");
        }

        /** Closes the underlying file. */
        public void finishedReading() throws IOException {
            this.close();
        }
    }

    /**
     * Implements DataAccess by delegating to a CachingInputStream and a
     * DataInputStream.
     */
    private static class DataStream implements DataAccess {
        private final CachingInputStream cin;
        private final DataInputStream din;

        public DataStream(InputStream src) throws IOException {
            if (src instanceof CachingInputStream) {
                cin = (CachingInputStream) src;
            } else {
                cin = new CachingInputStream(src);
            }
            cin.seek(0);
            din = new DataInputStream(cin);
        }

        public DataStream(CachingInputStream cin) throws IOException {
            this((InputStream) cin);
        }

        public void finishedReading() throws IOException {
            // Nothing to do: the stream is not closed here.
        }

        public boolean readBoolean() throws IOException { return din.readBoolean(); }
        public byte readByte() throws IOException { return din.readByte(); }
        public char readChar() throws IOException { return din.readChar(); }
        public short readShort() throws IOException { return din.readShort(); }
        public int readInt() throws IOException { return din.readInt(); }
        public long readLong() throws IOException { return din.readLong(); }
        public float readFloat() throws IOException { return din.readFloat(); }
        public double readDouble() throws IOException { return din.readDouble(); }
        public String readUTF() throws IOException { return din.readUTF(); }
        public int readUnsignedByte() throws IOException { return din.readUnsignedByte(); }
        public int readUnsignedShort() throws IOException { return din.readUnsignedShort(); }

        public void readFully(byte[] values) throws IOException {
            din.readFully(values);
        }

        public void readFully(byte[] values, int start, int len) throws IOException {
            din.readFully(values, start, len);
        }

        public String readLine() throws IOException {
            throw new UnsupportedOperationException("DataInputStream#readLine is deprecated. Use readUTF instead");
        }

        public int skipBytes(int count) throws IOException {
            return din.skipBytes(count);
        }

        public void seek(long pos) throws IOException {
            cin.seek(pos);
        }
    }
}