/*
* BioJava development code
*
* This code may be freely distributed and modified under the
* terms of the GNU Lesser General Public Licence. This should
* be distributed with the code. If you do not have a copy,
* see:
*
* http://www.gnu.org/copyleft/lesser.html
*
* Copyright for this code is held jointly by the individual
* authors. These should be listed in @author doc comments.
*
* For more information on the BioJava project and its aims,
* or to join the biojava-l mailing list, visit the home page
* at:
*
* http://www.biojava.org/
*
*/
package org.biojava.bio.program.scf;
import java.io.BufferedReader;
import java.io.ByteArrayInputStream;
import java.io.DataInputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Properties;
import java.util.TreeMap;
import org.biojava.bio.BioError;
import org.biojava.bio.chromatogram.AbstractChromatogram;
import org.biojava.bio.chromatogram.Chromatogram;
import org.biojava.bio.chromatogram.UnsupportedChromatogramFormatException;
import org.biojava.bio.seq.DNATools;
import org.biojava.bio.symbol.IllegalAlphabetException;
import org.biojava.bio.symbol.IllegalSymbolException;
import org.biojava.bio.symbol.IntegerAlphabet;
import org.biojava.bio.symbol.Symbol;
import org.biojava.bio.symbol.SymbolList;
import org.biojava.bio.symbol.SymbolListViews;
import org.biojava.utils.SmallMap;
/**
* A {@link org.biojava.bio.chromatogram.Chromatogram} as loaded from an
* SCF v2 or v3 file. Also loads and exposes the SCF format's "private data"
* and "comments" sections. The quality values from the SCF are stored as
* additional sequences on the base call alignment. The labels are the
* PROB_
* constants in this class.
* The values are {@link org.biojava.bio.symbol.IntegerAlphabet.IntegerSymbol}
* objects in the range 0 to 255.
*
*
* @author Rhett Sutphin (UI CBCB)
*/
public class SCF extends AbstractChromatogram {
private byte[] privateData;
private Properties comments;
private static final IntegerAlphabet.SubIntegerAlphabet
PROBABILITY_ALPHABET =
IntegerAlphabet.getSubAlphabet(0, 255);
/** Represents the maximum unsigned value
* of a byte for wrapping purposes */
public static final int BYTE_MAX_VALUE =
256;
/** Represents the maximum unsigned value
* of a short for wrapping purposes */
public static final int SHORT_MAX_VALUE =
65536;
/** Base call alignment sequence label for the probability that call
* should be A. */
public static final Object PROB_NUC_A = "quality-a";
/** Base call alignment sequence label for the probability that call
* should be C. */
public static final Object PROB_NUC_C = "quality-c";
/** Base call alignment sequence label for the probability that call
* should be G. */
public static final Object PROB_NUC_G = "quality-g";
/** Base call alignment sequence label for the probability that call
* should be T. */
public static final Object PROB_NUC_T = "quality-t";
/**
* Base call alignment sequence label for the substitution
* probability. In versions of the SCF spec before 3.10, this is called
* spareQual[0].
*/
public static final Object PROB_SUBSTITUTION =
"substitution-probability";
/**
* Base call alignment sequence label for the overcall probability.
* In versions of the SCF spec before 3.10, this is called
* spareQual[1].
*/
public static final Object PROB_OVERCALL = "overcall-probability";
/**
* Base call alignment sequence label for the undercall probability.
* In versions of the SCF spec before 3.10, this is called
* spareQual[2].
*/
public static final Object PROB_UNDERCALL =
"undercall-probability";
/** Creates a new, completely empty SCF. */
protected SCF() {
super();
comments = new Properties();
}
public static SCF create(File f)
throws IOException, UnsupportedChromatogramFormatException {
SCF out = new SCF();
out.load(f);
return out;
}
public static SCF create(InputStream in, long alreadyRead)
throws IOException, UnsupportedChromatogramFormatException {
SCF out = new SCF();
out.load(in, alreadyRead);
return out;
}
protected void load(File f) throws IOException,
UnsupportedChromatogramFormatException {
FileInputStream fin = new FileInputStream(f);
try {
load(fin, 0);
} finally {
fin.close();
}
}
protected void load(InputStream in, long initOffset)
throws IOException, UnsupportedChromatogramFormatException {
SCF.ParserFactory.parse(in, this, initOffset);
}
/**
* Returns the comments fields as a {@link Properties} mapping.
*/
public Properties getComments() { return comments; }
protected AbstractChromatogram reverseComplementInstance() { return
new SCF(); }
public static IntegerAlphabet.SubIntegerAlphabet
getProbabilityAlphabet() { return PROBABILITY_ALPHABET; }
/**
* Overrides {@link
* AbstractChromatogram#reverseComplementBaseCallList} to
* support the 7 quality values from the SCF. These are handled thus:
*
* PROB_SUBSTITUTION
, PROB_OVERCALL
, and
* PROB_UNDERCALL
are just reversed &returned.
* PROB_NUC_
* returns the reverse of the quality
* sequence for the complement base.
*
*/
protected SymbolList reverseComplementBaseCallList(Object label) {
if (label == PROB_SUBSTITUTION ||
label == PROB_OVERCALL ||
label == PROB_UNDERCALL) {
return
SymbolListViews.reverse(this.getBaseCalls().symbolListForLabel(label));
} else if (label == PROB_NUC_A) {
return
SymbolListViews.reverse(this.getBaseCalls().symbolListForLabel(PROB_NUC_T));
} else if (label == PROB_NUC_C) {
return
SymbolListViews.reverse(this.getBaseCalls().symbolListForLabel(PROB_NUC_G));
} else if (label == PROB_NUC_G) {
return
SymbolListViews.reverse(this.getBaseCalls().symbolListForLabel(PROB_NUC_C));
} else if (label == PROB_NUC_T) {
return
SymbolListViews.reverse(this.getBaseCalls().symbolListForLabel(PROB_NUC_A));
} else {
return super.reverseComplementBaseCallList(label);
}
}
/*** INNER CLASSES FOR PARSER ***/
/** Factory class to create the appropriate parser for the given
* stream. This decision is based on the version field in the file's header. */
private static class ParserFactory {
public static void parse(InputStream in, SCF out, long initOffset)
throws IOException, UnsupportedChromatogramFormatException {
DataInputStream din = new DataInputStream(in);
SCF.Parser parser = createParser(din, out, initOffset);
parser.parse();
}
public static SCF.Parser createParser(DataInputStream din, SCF
out, long initOffset)
throws UnsupportedChromatogramFormatException, IOException {
// read the header to find out the version
long offset = initOffset;
SCF.Parser.HeaderStruct header =
SCF.Parser.HeaderStruct.create(din, initOffset);
offset = SCF.Parser.HeaderStruct.HEADER_LENGTH;
out.setBits((int)header.sample_size * 8);
SCF.Parser parser;
float version;
try {
version = Float.parseFloat(new String(header.version));
} catch (NumberFormatException e) {
throw new UnsupportedChromatogramFormatException(
"The SCF's version (" + new String(header.version) +
") is not a number");
}
if (version < 3.0f && version >= 2.0f) {
parser = new SCF.V2Parser(din, out, header, offset);
} else if (version >= 3.0f) {
parser = new SCF.V3Parser(din, out, header, offset);
} else {
throw new UnsupportedChromatogramFormatException(
"Only version 2 and version 3 SCFs are supported (not "
+ new String(header.version));
}
return parser;
}
}
static interface BaseCallUncertaintyDecoder {
/**
* Returns an appropriate Symbol from the DNA alphabet for
* an encoded byte.
*/
public Symbol decode(byte call) throws IllegalSymbolException;
}
/**
* A BaseCallUncertaintyDecoder that works for type 0 (default) and
* type 4 (ABI)
* code sets.
*/
static class DefaultUncertaintyDecoder implements
BaseCallUncertaintyDecoder {
public DefaultUncertaintyDecoder() { }
public Symbol decode(byte call) throws IllegalSymbolException {
char c = (char) call;
switch (c) {
case 'a': case 'A':
return DNATools.a();
case 'c': case 'C':
return DNATools.c();
case 'g': case 'G':
return DNATools.g();
case 't': case 'T':
return DNATools.t();
case 'n': case 'N':
return DNATools.n();
case 'm': case 'M':
return DNATools.m();
case 'r': case 'R':
return DNATools.r();
case 'w': case 'W':
return DNATools.w();
case 's': case 'S':
return DNATools.s();
case 'y': case 'Y':
return DNATools.y();
case 'k': case 'K':
return DNATools.k();
case 'v': case 'V':
return DNATools.v();
case 'h': case 'H':
return DNATools.h();
case 'd': case 'D':
return DNATools.d();
case 'b': case 'B':
return DNATools.b();
case '-':
return DNATools.getDNA().getGapSymbol();
default:
throw new IllegalSymbolException("No Symbol for " +
c);
}
}
}
abstract static class Parser {
protected long offset = 0;
protected DataInputStream din = null;
protected HeaderStruct header;
protected BaseCallUncertaintyDecoder decoder;
protected SCF out = null;
protected boolean parsed = false;
Parser(DataInputStream din, SCF out,
SCF.Parser.HeaderStruct header, long initOffset)
throws UnsupportedChromatogramFormatException {
if (din == null)
throw new IllegalArgumentException(
"Can't parse a null inputstream");
this.din = din;
if (out == null)
this.out = new SCF();
else
this.out = out;
if (header.samples > Integer.MAX_VALUE)
throw new UnsupportedChromatogramFormatException(
"Can't parse an SCF with more than " + Integer.MAX_VALUE + " trace samples");
if (header.bases > Integer.MAX_VALUE)
throw new UnsupportedChromatogramFormatException(
"Can't parse an SCF with more than " + Integer.MAX_VALUE + " called bases");
this.header = header;
this.decoder = createDecoder(header.code_set);
this.offset = initOffset;
}
/**
* Factory method to create an approriate decoder for the code
* set. Currently, only a direct interpretation of the encoded byte
* as an ASCII character is supported (see DefaultUncertaintyDecoder).
* This interpretation will be used for all code sets, but a
* warning will be printed if the code set is not known to work with this
* interpretation.
*/
private static BaseCallUncertaintyDecoder createDecoder(long
codeSet) {
if (codeSet != 0 && codeSet != 4)
System.err.println("Warning: the code set (" + codeSet +
") is not specifically supported. (It may still work, though.)");
return new DefaultUncertaintyDecoder();
}
public SCF getParsed() {
if (parsed) return out;
else return null;
}
public void parse() throws IOException,
UnsupportedChromatogramFormatException {
parsed = false;
// sort the sections of the file by ascending offset
Integer SAMPLES = new Integer(0),
BASES = new Integer(1),
COMMENTS = new Integer(2),
PRIVATE = new Integer(3);
TreeMap sectionOrder = new TreeMap();
sectionOrder.put(new Long(header.samples_offset), SAMPLES);
sectionOrder.put(new Long(header.bases_offset), BASES);
sectionOrder.put(new Long(header.comments_offset), COMMENTS);
sectionOrder.put(new Long(header.private_offset), PRIVATE);
for (Iterator it = sectionOrder.keySet().iterator() ;
it.hasNext() ;) {
Integer sect = (Integer) sectionOrder.get(it.next());
if (sect == SAMPLES) parseSamples();
else if (sect == BASES) parseBases();
else if (sect == COMMENTS) parseComments();
else if (sect == PRIVATE) parsePrivate();
}
parsed = true;
}
protected abstract void parseSamples() throws IOException,
UnsupportedChromatogramFormatException;
protected abstract void parseBases() throws IOException,
UnsupportedChromatogramFormatException;
protected void parseComments() throws IOException {
skipTo(header.comments_offset);
byte[] raw = new byte[(int)header.comments_size - 1];
din.read(raw, 0, raw.length);
BufferedReader r = new BufferedReader(
new InputStreamReader(
new ByteArrayInputStream(raw),
"ISO-8859-1"
)
);
String line, key, value;
int eqIdx = -1;
while ((line = r.readLine()) != null) {
eqIdx = line.indexOf('=');
//added line below to skip any truncated comment fields
if( eqIdx == -1 ) {
continue;
}
key = line.substring(0, eqIdx);
value = line.substring(eqIdx+1);
out.comments.setProperty(key, value);
}
}
protected void parsePrivate() throws IOException {
if (header.private_size == 0) return;
skipTo(header.private_offset);
out.privateData = new byte[(int)header.private_size];
int privRead = 0;
int thisRead = 0;
while (privRead < out.privateData.length && thisRead >= 0) {
thisRead = din.read(out.privateData, privRead,
out.privateData.length - privRead);
offset += thisRead;
privRead += thisRead;
}
}
protected final void skipTo(long newOffset) throws IOException {
if (newOffset < offset)
throw new IllegalArgumentException("Can't skip backwards: (newOffset==" + newOffset + ") < (offset==" + offset + ")");
long skip = newOffset - offset;
while (skip > 0) {
int actualSkip = din.skipBytes((int)skip);
offset += actualSkip;
skip -= actualSkip;
}
}
/**
* Does the grunt work of creating the base call alignment from
* the given lists of bases, offsets, and probabilities.
* Catches and "Can't happens" all exceptions.
*/
protected final void createAndSetBaseCallAlignment(List dna, List
offsets, List[] probs) {
try {
Map baseCalls = new SmallMap(9);
baseCalls.put(Chromatogram.DNA,
out.createImmutableSymbolList(DNATools.getDNA(), dna));
baseCalls.put(Chromatogram.OFFSETS,
out.createImmutableSymbolList(IntegerAlphabet.getInstance(), offsets));
baseCalls.put(PROB_NUC_A,
out.createImmutableSymbolList(getProbabilityAlphabet(), probs[0]));
baseCalls.put(PROB_NUC_C,
out.createImmutableSymbolList(getProbabilityAlphabet(), probs[1]));
baseCalls.put(PROB_NUC_G,
out.createImmutableSymbolList(getProbabilityAlphabet(), probs[2]));
baseCalls.put(PROB_NUC_T,
out.createImmutableSymbolList(getProbabilityAlphabet(), probs[3]));
baseCalls.put(PROB_SUBSTITUTION,
out.createImmutableSymbolList(getProbabilityAlphabet(), probs[4]));
baseCalls.put(PROB_OVERCALL,
out.createImmutableSymbolList(getProbabilityAlphabet(), probs[5]));
baseCalls.put(PROB_UNDERCALL,
out.createImmutableSymbolList(getProbabilityAlphabet(), probs[6]));
out.setBaseCallAlignment(out.createImmutableAlignment(baseCalls));
} catch (IllegalSymbolException ise) {
throw new BioError(ise,"Can't happen unless the decoder is returning non-DNA symbols");
} catch (IllegalAlphabetException iae) {
throw new BioError(iae,"Can't happen");
}
}
private static class HeaderStruct {
public static final int HEADER_LENGTH = 128;
// SCF spec uses unsigned 32-bit ints, so we'll use longs for simplicity
public long magic_number;
public long samples;
public long samples_offset;
public long bases;
public long bases_left_clip;
public long bases_right_clip;
public long bases_offset;
public long comments_size;
public long comments_offset;
public char[] version;
public long sample_size;
public long code_set;
public long private_size;
public long private_offset;
public long[] spare;
private HeaderStruct() {
version = new char[4];
spare = new long[18];
}
public static HeaderStruct create(DataInputStream din, long
initOffset) throws IOException {
HeaderStruct hs = new HeaderStruct();
if (initOffset > 4) {
throw new IllegalStateException("Can't skip more than four bytes and still have enough info to read header");
} else if (initOffset == 0) {
hs.magic_number = 0xFFFFFFFF & din.readInt();
} else {
hs.magic_number = 0;
// skip to the four-byte boundary
for (int i = 0 ; i < 4 - initOffset ; i++)
din.read();
}
hs.samples = 0xFFFFFFFF & din.readInt();
hs.samples_offset = 0xFFFFFFFF & din.readInt();
hs.bases = 0xFFFFFFFF & din.readInt();
hs.bases_left_clip = 0xFFFFFFFF & din.readInt();
hs.bases_right_clip = 0xFFFFFFFF & din.readInt();
hs.bases_offset = 0xFFFFFFFF & din.readInt();
hs.comments_size = 0xFFFFFFFF & din.readInt();
hs.comments_offset = 0xFFFFFFFF & din.readInt();
hs.version[0] = (char) din.readByte();
hs.version[1] = (char) din.readByte();
hs.version[2] = (char) din.readByte();
hs.version[3] = (char) din.readByte();
hs.sample_size = 0xFFFFFFFF & din.readInt();
hs.code_set = 0xFFFFFFFF & din.readInt();
hs.private_size = 0xFFFFFFFF & din.readInt();
hs.private_offset = 0xFFFFFFFF & din.readInt();
for (int i = 0 ; i < hs.spare.length ; i++) {
hs.spare[i] = 0xFFFFFFFF & din.readInt();
}
return hs;
}
}
}
private static class V3Parser extends Parser {
V3Parser(DataInputStream din, SCF out,
SCF.Parser.HeaderStruct header, long initOffset)
throws IOException, UnsupportedChromatogramFormatException {
super(din, out, header, initOffset);
}
protected void parseSamples() throws IOException,
UnsupportedChromatogramFormatException {
skipTo(header.samples_offset);
// load values from file
int count = (int)header.samples;
int maxValAllowed = 0;
if(header.sample_size == 1) {
maxValAllowed = BYTE_MAX_VALUE;
} else if(header.sample_size == 2) {
maxValAllowed = SHORT_MAX_VALUE;
}
int[][] trace = new int[4][count];
int[] maxVal = new int[] { Integer.MIN_VALUE,
Integer.MIN_VALUE,
Integer.MIN_VALUE,
Integer.MIN_VALUE };
for (int n = 0 ; n < 4 ; n++)
readSamplesInto(trace[n]);
// stored values are delta-delta values; reprocess into actual values
// algorithm cribbed from io_lib's delta_samples function in misc_scf.c
int[] p_sample1 = new int[] { 0, 0, 0, 0 };
int[] p_sample2 = new int[] { 0, 0, 0, 0 };
for (int n = 0 ; n < 4 ; n++) {
for (int i = 0 ; i < trace[n].length ; i++) {
//New version of the code which takes into account what the
//underlying value i.e. is it a byte or is it a short
//and slightly rejigged for speed and sanity sakes
p_sample1[n] += trace[n][i];
if(p_sample1[n] >= maxValAllowed) p_sample1[n] = p_sample1[n] -
maxValAllowed;
p_sample2[n] = p_sample1[n] + p_sample2[n];
if(p_sample2[n] >= maxValAllowed) p_sample2[n] = p_sample2[n] -
maxValAllowed;
trace[n][i] = p_sample2[n];
maxVal[n] = Math.max(maxVal[n], trace[n][i]);
}
}
// set into output SCF chromat object
try {
out.setTrace(DNATools.a(), trace[0], maxVal[0]);
out.setTrace(DNATools.c(), trace[1], maxVal[1]);
out.setTrace(DNATools.g(), trace[2], maxVal[2]);
out.setTrace(DNATools.t(), trace[3], maxVal[3]);
} catch (IllegalSymbolException ise) {
throw new BioError(ise,"Can't happen");
}
}
protected void readSamplesInto(int[] samps) throws IOException {
if (header.sample_size == 1) {
for (int i = 0 ; i < samps.length ; i++) {
samps[i] = din.readUnsignedByte();
offset += 1;
}
} else if (header.sample_size == 2) {
for (int i = 0 ; i < samps.length ; i++) {
samps[i] = din.readUnsignedShort();
offset += 2;
}
}
}
protected void parseBases() throws IOException,
UnsupportedChromatogramFormatException {
skipTo(header.bases_offset);
int count = (int) header.bases;
List[] probs = new ArrayList[7];
for (int i = 0 ; i < 7 ; i++) probs[i] = new ArrayList(count);
List offsets = new ArrayList(count);
List dna = new ArrayList(count);
long tmp;
try {
for (int i = 0 ; i < count ; i++) {
// read the first set of sequential data (the trace peak offsets)
tmp = 0xFFFFFFFF & din.readInt();
offset += 4;
if (tmp > Integer.MAX_VALUE)
throw new
UnsupportedChromatogramFormatException(
"SCF contains a base with peak offset > " + Integer.MAX_VALUE);
offsets.add(IntegerAlphabet.getInstance().getSymbol((int) tmp));
}
// read sets of probs for A, C, G, T
for (int j = 0 ; j < 4 ; j++) {
for (int i = 0 ; i < count ; i++) {
probs[j].add(getProbabilityAlphabet().getSymbol(din.readByte() & 0xFF));
offset++;
}
}
// read bases
try {
for (int i = 0 ; i < count ; i++) {
dna.add(decoder.decode(din.readByte()));
offset++;
}
} catch (IllegalSymbolException ise) {
UnsupportedChromatogramFormatException ue = new UnsupportedChromatogramFormatException("Base call decoding failure");
ue.initCause(ise);
throw ue;
}
// read 'spare' probs
for (int j = 4 ; j < 7 ; j++) {
for (int i = 0 ; i < count ; i++) {
probs[j].add(getProbabilityAlphabet().getSymbol(din.readByte() & 0xFF));
offset++;
}
}
} catch (IllegalSymbolException ise) {
throw new BioError(ise,"Can't happen unless there's a misdefinition of getProbabilityAlphabet() or IntegerAlphabet");
}
// create/set base call list
createAndSetBaseCallAlignment(dna, offsets, probs);
}
} // end SCFv3Parser
private static class V2Parser extends Parser {
V2Parser(DataInputStream din, SCF out,
SCF.Parser.HeaderStruct header, long initOffset)
throws IOException, UnsupportedChromatogramFormatException {
super(din, out, header, initOffset);
}
protected void parseSamples() throws IOException {
int count = (int) header.samples;
int[][] trace = new int[4][count];
int[] maxVal = new int[] { Integer.MIN_VALUE,
Integer.MIN_VALUE,
Integer.MIN_VALUE,
Integer.MIN_VALUE };
if (header.sample_size == 1) {
for (int i = 0 ; i < count ; i++) {
for (int n = 0 ; n < 4 ; n++) {
trace[n][i] = din.readUnsignedByte();
maxVal[n] = Math.max(trace[n][i], maxVal[n]);
offset++;
}
}
} else if (header.sample_size == 2) {
for (int i = 0 ; i < count ; i++) {
for (int n = 0 ; n < 4 ; n++) {
trace[n][i] = din.readUnsignedShort();
maxVal[n] = Math.max(trace[n][i], maxVal[n]);
offset += 2;
}
}
}
// set into output SCF chromat object
try {
out.setTrace(DNATools.a(), trace[0], maxVal[0]);
out.setTrace(DNATools.c(), trace[1], maxVal[1]);
out.setTrace(DNATools.g(), trace[2], maxVal[2]);
out.setTrace(DNATools.t(), trace[3], maxVal[3]);
} catch (IllegalSymbolException ise) {
throw new BioError(ise,"Can't happen");
}
}
protected void parseBases() throws IOException,
UnsupportedChromatogramFormatException {
skipTo(header.bases_offset);
int count = (int) header.bases;
List[] probs = new ArrayList[7];
for (int i = 0 ; i < probs.length ; i++) probs[i] = new
ArrayList(count);
List dna = new ArrayList(count);
List offsets = new ArrayList(count);
long tmp;
byte[] probTmp = new byte[7];
for (int i = 0 ; i < count ; i++) {
// read the peak index
tmp = 0xFFFFFFFF & din.readInt();
offset += 4;
if (tmp > Integer.MAX_VALUE)
throw new UnsupportedChromatogramFormatException(
"SCF contains a base with peak offset > " + Integer.MAX_VALUE);
offsets.add(IntegerAlphabet.getInstance().getSymbol((int)
tmp));
// read the per-base probabilities
din.read(probTmp, 0, 4);
offset += 4;
// read the actual base called
try {
dna.add(decoder.decode(din.readByte()));
offset += 1;
} catch (IllegalSymbolException ise) {
UnsupportedChromatogramFormatException ue = new UnsupportedChromatogramFormatException(
"Base call decoding failure");
ue.initCause(ise);
throw ue;
}
// read the spare probability fields
din.read(probTmp, 4, 3);
offset += 3;
try {
for (int p = 0 ; p < 7 ; p++)
probs[p].add(getProbabilityAlphabet().getSymbol(0xFF & probTmp[p]));
} catch (IllegalSymbolException ise) {
throw new BioError(ise,"Can't happen unless getProbabilityAlphabet() has been misdefined.");
}
}
createAndSetBaseCallAlignment(dna, offsets, probs);
}
}
}