/*
* BioJava development code
*
* This code may be freely distributed and modified under the
* terms of the GNU Lesser General Public Licence. This should
* be distributed with the code. If you do not have a copy,
* see:
*
* http://www.gnu.org/copyleft/lesser.html
*
* Copyright for this code is held jointly by the individual
* authors. These should be listed in @author doc comments.
*
* For more information on the BioJava project and its aims,
* or to join the biojava-l mailing list, visit the home page
* at:
*
* http://www.biojava.org/
*
*/
package org.biojava.bio.dist;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.List;
import java.util.Random;
import org.biojava.bio.Annotation;
import org.biojava.bio.BioError;
import org.biojava.bio.BioException;
import org.biojava.bio.seq.Sequence;
import org.biojava.bio.seq.SequenceFactory;
import org.biojava.bio.seq.impl.SimpleSequenceFactory;
import org.biojava.bio.symbol.Alignment;
import org.biojava.bio.symbol.Alphabet;
import org.biojava.bio.symbol.AlphabetIndex;
import org.biojava.bio.symbol.AlphabetManager;
import org.biojava.bio.symbol.AtomicSymbol;
import org.biojava.bio.symbol.BasisSymbol;
import org.biojava.bio.symbol.FiniteAlphabet;
import org.biojava.bio.symbol.IllegalAlphabetException;
import org.biojava.bio.symbol.IllegalSymbolException;
import org.biojava.bio.symbol.Location;
import org.biojava.bio.symbol.LocationTools;
import org.biojava.bio.symbol.PackedSymbolListFactory;
import org.biojava.bio.symbol.PointLocation;
import org.biojava.bio.symbol.SimpleSymbolListFactory;
import org.biojava.bio.symbol.Symbol;
import org.biojava.bio.symbol.SymbolList;
import org.biojava.bio.symbol.SymbolListFactory;
import org.biojava.bio.symbol.SymbolListViews;
import org.biojava.utils.AssertionFailure;
import org.biojava.utils.ChangeVetoException;
import org.xml.sax.SAXException;
/**
* A class to hold static methods for calculations and manipulations using
* Distributions.
*
* @author Mark Schreiber
* @author Matthew Pocock
* @since 1.2
*/
public final class DistributionTools {
/**
* Overide the constructer to prevent subclassing.
*/
private DistributionTools(){}
/**
* Writes a Distribution to XML that can be read with the readFromXML method.
*
* @param d the Distribution to write.
* @param os where to write it to.
* @throws IOException if writing fails
*/
public static void writeToXML(Distribution d, OutputStream os) throws IOException{
new XMLDistributionWriter().writeDistribution(d, os);
}
/**
* Read a distribution from XML.
*
* @param is an InputStream to read from
* @return a Distribution parameterised by the xml in is
* @throws IOException if is failed
* @throws SAXException if is could not be processed as XML
*/
public static Distribution readFromXML(InputStream is)throws IOException, SAXException{
XMLDistributionReader writer = new XMLDistributionReader();
return writer.parseXML(is);
}
/**
* Randomizes the weights of a Distribution
.
*
* @param d the Distribution
to randomize
* @throws ChangeVetoException if the Distribution is locked
*/
public static void randomizeDistribution(Distribution d)
throws ChangeVetoException{
Random rand = new Random();
FiniteAlphabet a = (FiniteAlphabet)d.getAlphabet();
AlphabetIndex ind = AlphabetManager.getAlphabetIndex(a);
DistributionTrainerContext dtc = new SimpleDistributionTrainerContext();
dtc.registerDistribution(d);
for(int i = 0; i < a.size(); i++){
try {
dtc.addCount(d,ind.symbolForIndex(i),rand.nextDouble());
}
catch (IllegalSymbolException ex) {
throw new BioError("Alphabet has Illegal Symbols!!", ex);
}
}
dtc.train();
}
/**
* Make a distribution from a count.
*
* @param c the count
* @return a Distrubution over the same FiniteAlphabet
as c
* and trained with the counts of c
*/
public static Distribution countToDistribution(Count c){
FiniteAlphabet a = (FiniteAlphabet)c.getAlphabet();
Distribution d = null;
try{
d = DistributionFactory.DEFAULT.createDistribution(a);
AlphabetIndex index =
AlphabetManager.getAlphabetIndex(a);
DistributionTrainerContext dtc = new SimpleDistributionTrainerContext();
dtc.registerDistribution(d);
for(int i = 0; i < a.size(); i++){
dtc.addCount(d, index.symbolForIndex(i),
c.getCount((AtomicSymbol)index.symbolForIndex(i)));
}
dtc.train();
} catch (IllegalAlphabetException iae) {
throw new AssertionFailure("Assertion failure: Alphabets don't match");
}catch(IllegalSymbolException ise){
throw new AssertionFailure("Assertion Error: Cannot convert Count to Distribution", ise);
} catch (ChangeVetoException cve) {
throw new AssertionFailure("Assertion failure: distributions or counts got locked.", cve);
}
return d;
}
/**
* Compares the emission spectra of two distributions.
*
* @return true if alphabets and symbol weights are equal for the two distributions.
* @throws BioException if one or both of the Distributions are over infinite alphabets.
* @since 1.2
* @param a A Distribution
with the same Alphabet
as
* b
* @param b A Distribution
with the same Alphabet
as
* a
*/
public static final boolean areEmissionSpectraEqual(Distribution a, Distribution b)
throws BioException{
//are either of the Dists infinite
if(a.getAlphabet() instanceof FiniteAlphabet == false
|| b.getAlphabet() instanceof FiniteAlphabet == false){
throw new IllegalAlphabetException("Cannot compare emission spectra over infinite alphabet");
}
//are alphabets equal?
if(!(a.getAlphabet().equals(b.getAlphabet()))){
return false;
}
//are emissions equal?
for(Iterator i = ((FiniteAlphabet)a.getAlphabet()).iterator();i.hasNext();){
Symbol s = (Symbol)i.next();
if(a.getWeight(s) != b.getWeight(s)) return false;
}
return true;
}
/**
* Compares the emission spectra of two distribution arrays.
*
* @return true if alphabets and symbol weights are equal for each pair
* of distributions. Will return false if the arrays are of unequal length.
* @throws BioException if one of the Distributions is over an infinite
* alphabet.
* @since 1.3
* @param a A Distribution[]
consisting of Distributions
* over a FiniteAlphabet
* @param b A Distribution[]
consisting of Distributions
* over a FiniteAlphabet
*/
public static final boolean areEmissionSpectraEqual(Distribution[] a,
Distribution[] b)
throws BioException{
if(a.length != b.length) return false;
for (int i = 0; i < a.length; i++) {
if(areEmissionSpectraEqual(a[i], b[i]) == false){
return false;
}
}
return true;
}
/**
* A method to calculate the Kullback-Liebler Distance (relative entropy).
*
* @param logBase - the log base for the entropy calculation. 2 is standard.
* @param observed - the observed frequence of Symbols
.
* @param expected - the excpected or background frequency.
* @return - A HashMap mapping Symbol to (Double)
relative entropy.
* @since 1.2
*/
public static final HashMap KLDistance(Distribution observed,
Distribution expected,
double logBase){
Iterator alpha = ((FiniteAlphabet)observed.getAlphabet()).iterator();
HashMap kldist = new HashMap(((FiniteAlphabet)observed.getAlphabet()).size());
while(alpha.hasNext()){
Symbol s = (Symbol)alpha.next();
try{
double obs = observed.getWeight(s);
double exp = expected.getWeight(s);
if(obs == 0.0){
kldist.put(s,new Double(0.0));
}else{
double entropy = obs * (Math.log(obs/exp))/Math.log(logBase);
kldist.put(s,new Double(entropy));
}
}catch(IllegalSymbolException ise){
ise.printStackTrace(System.err);
}
}
return kldist;
}
/**
* A method to calculate the Shannon Entropy for a Distribution.
*
* @param logBase - the log base for the entropy calculation. 2 is standard.
* @param observed - the observed frequence of Symbols
.
* @return - A HashMap mapping Symbol to (Double)
entropy.
* @since 1.2
*/
public static final HashMap shannonEntropy(Distribution observed, double logBase){
Iterator alpha = ((FiniteAlphabet)observed.getAlphabet()).iterator();
HashMap entropy = new HashMap(((FiniteAlphabet)observed.getAlphabet()).size());
while(alpha.hasNext()){
Symbol s = (Symbol)alpha.next();
try{
double obs = observed.getWeight(s);
if(obs == 0.0){
// entropy.put(s,new Double(0.0));
}else{
double e = -(Math.log(obs))/Math.log(logBase);
entropy.put(s,new Double(e));
}
}catch(IllegalSymbolException ise){
ise.printStackTrace(System.err);
}
}
return entropy;
}
/**
* Calculates the total Entropy for a Distribution. Entropies for individual
* Symbols
are weighted by their probability of occurence.
* @param observed the observed frequence of Symbols
.
* @return the total entropy of the Distribution
.
*/
public static double totalEntropy(Distribution observed){
HashMap ent = shannonEntropy(observed, 2.0);
double totalEntropy = 0.0;
try{
for(Iterator i = ent.keySet().iterator(); i.hasNext();){
Symbol sym = (Symbol) i.next();
totalEntropy += observed.getWeight(sym)*((Double)ent.get(sym)).doubleValue();
}
}
catch(Exception e){
e.printStackTrace(System.err);
}
return totalEntropy;
}
/**
* Calculates the total bits of information for a distribution.
* @param observed - the observed frequence of Symbols
.
* @return the total information content of the Distribution
.
* @since 1.2
*/
public static final double bitsOfInformation(Distribution observed){
double totalEntropy = totalEntropy(observed);
int size = ((FiniteAlphabet)observed.getAlphabet()).size();
return Math.log((double)size)/Math.log(2.0) - totalEntropy;
}
/**
* Equivalent to distOverAlignment(a, false, 0.0).
*
* @param a the Alignment
* @return an array of Distribution instances representing columns of the
* alignment
* @throws IllegalAlphabetException if the alignment alphabet is not
* compattible
*/
public static Distribution[] distOverAlignment(Alignment a)
throws IllegalAlphabetException{
return distOverAlignment(a,false,0.0);
}
/**
* Creates a joint distribution.
*
* @throws IllegalAlphabetException if all sequences don't use the same alphabet
* @param a the Alignment
to build the Distribution[]
over.
* @param countGaps if true gaps will be included in the distributions
* (NOT YET IMPLEMENTED!!, CURRENTLY EITHER OPTION WILL PRODUCE THE SAME RESULT)
* @param nullWeight the number of pseudo counts to add to each distribution
* @param cols a list of positions in the alignment to include in the joint distribution
* @return a Distribution
* @since 1.2
*/
public static final Distribution jointDistOverAlignment(Alignment a,
boolean countGaps,
double nullWeight,
int[] cols)
throws IllegalAlphabetException {
List seqs = a.getLabels();
FiniteAlphabet alpha =
(FiniteAlphabet)((SymbolList)a.symbolListForLabel(seqs.get(0))).getAlphabet();
for(int i = 1; i < seqs.size();i++){
FiniteAlphabet test = (FiniteAlphabet)((SymbolList)a.symbolListForLabel(seqs.get(i))).getAlphabet();
if(test != alpha){
throw new IllegalAlphabetException("Cannot Calculate jointDistOverAlignment() for alignments with"+
"mixed alphabets");
}
}
List a_list = new ArrayList();
for(int i=0; iAlignment to build the Distribution[]
over.
* @param countGaps if true gaps will be included in the distributions
* @param nullWeight the number of pseudo counts to add to each distribution,
* pseudo counts will not affect gaps, no gaps, no gap counts.
* @return a Distribution[]
where each member of the array is a
* Distribution
of the Symbols
found at that position
* of the Alignment
.
* @since 1.2
*/
public static final Distribution[] distOverAlignment(Alignment a,
boolean countGaps,
double nullWeight)
throws IllegalAlphabetException {
List seqs = a.getLabels();
FiniteAlphabet alpha = (FiniteAlphabet)((SymbolList)a.symbolListForLabel(seqs.get(0))).getAlphabet();
for(int i = 1; i < seqs.size();i++){
FiniteAlphabet test = (FiniteAlphabet)((SymbolList)a.symbolListForLabel(seqs.get(i))).getAlphabet();
if(test != alpha){
throw new IllegalAlphabetException("Cannot Calculate distOverAlignment() for alignments with"+
"mixed alphabets");
}
}
Distribution[] pos = new Distribution[a.length()];
DistributionTrainerContext dtc = new SimpleDistributionTrainerContext();
dtc.setNullModelWeight(nullWeight);
double[] adjRatios = null;
if(countGaps){
adjRatios = new double[a.length()];
}
try{
for(int i = 0; i < a.length(); i++){// For each position
double gapCount = 0.0;
double totalCount = 0.0;
pos[i] = DistributionFactory.DEFAULT.createDistribution(alpha);
dtc.registerDistribution(pos[i]);
for(Iterator j = seqs.iterator(); j.hasNext();){// of each sequence
Object seqLabel = j.next();
Symbol s = a.symbolAt(seqLabel,i + 1);
/*If this is working over a flexible alignment there is a possibility
that s could be null if this Sequence is not really preset in this
region of the Alignment. In this case it will be skipped*/
if(s == null)
continue;
Symbol gap = alpha.getGapSymbol();
if(countGaps &&
s.equals(gap)){
gapCount++; totalCount++;
}else{
dtc.addCount(pos[i],s,1.0);// count the symbol
totalCount++;
}
}
if(countGaps){
adjRatios[i] = 1.0 - (gapCount / totalCount);
}
}
dtc.train();
if(countGaps){//need to adjust counts for gaps
for (int i = 0; i < adjRatios.length; i++) {
Distribution d = pos[i];
for (Iterator iter = ((FiniteAlphabet)d.getAlphabet()).iterator();
iter.hasNext(); ) {
Symbol sym = (Symbol)iter.next();
d.setWeight(sym, (d.getWeight(sym) * adjRatios[i]));
}
}
}
}catch(Exception e){
e.printStackTrace(System.err);
}
return pos;
}
/**
* Creates an array of distributions, one for each column of the alignment.
* No pseudo counts are used.
* @param countGaps if true gaps will be included in the distributions
* @param a the Alignment
to build the Distribution[]
over.
* @throws IllegalAlphabetException if the alignment is not composed from sequences all
* with the same alphabet
* @return a Distribution[]
where each member of the array is a
* Distribution
of the Symbols
found at that position
* of the Alignment
.
* @since 1.2
*/
public static final Distribution[] distOverAlignment(Alignment a,
boolean countGaps)
throws IllegalAlphabetException {
return distOverAlignment(a,countGaps,0.0);
}
/**
* Averages two or more distributions. NOTE the current implementation ignore the null model.
* @since 1.2
* @param dists the Distributions
to average
* @return a Distribution
were the weight of each Symbol
* is the average of the weights of that Symbol
in each Distribution
.
*/
public static final Distribution average (Distribution [] dists){
Alphabet alpha = dists[0].getAlphabet();
//check if all alphabets are the same
for (int i = 1; i < dists.length; i++) {
if(!(dists[i].getAlphabet().equals(alpha))){
throw new IllegalArgumentException("All alphabets must be the same");
}
}
try{
Distribution average = DistributionFactory.DEFAULT.createDistribution(alpha);
DistributionTrainerContext dtc = new SimpleDistributionTrainerContext();
dtc.registerDistribution(average);
for (int i = 0; i < dists.length; i++) {// for each distribution
for(Iterator iter = ((FiniteAlphabet)dists[i].getAlphabet()).iterator(); iter.hasNext(); ){//for each symbol
Symbol sym = (Symbol)iter.next();
dtc.addCount(average,sym,dists[i].getWeight(sym));
}
}
dtc.train();
return average;
} catch(IllegalAlphabetException iae){//The following throw unchecked exceptions as they shouldn't happen
throw new AssertionFailure("Distribution contains an illegal alphabet", iae);
} catch(IllegalSymbolException ise){
throw new AssertionFailure("Distribution contains an illegal symbol", ise);
} catch(ChangeVetoException cve){
throw new AssertionFailure("The Distribution has become locked", cve);
}
}
/**
* Produces a sequence by randomly sampling the Distribution.
*
* @param name the name for the sequence
* @param d the distribution to sample. If this distribution is of order N a
* seed sequence is generated allowed to 'burn in' for 1000 iterations and used
* to produce a sequence over the conditioned alphabet.
* @param length the number of symbols in the sequence.
* @return a Sequence with name and urn = to name and an Empty Annotation.
*/
public static final Sequence generateSequence(String name, Distribution d, int length){
SymbolList sl = generateSymbolList(d, length);
SequenceFactory fact = new SimpleSequenceFactory();
return fact.createSequence(sl, name, name, Annotation.EMPTY_ANNOTATION);
//return new SimpleSequence(sl,name,name,Annotation.EMPTY_ANNOTATION);
}
/**
* Produces a SymbolList
by randomly sampling a Distribution.
*
* @param d the distribution to sample. If this distribution is of order N a
* seed sequence is generated allowed to 'burn in' for 1000 iterations and used
* to produce a sequence over the conditioned alphabet.
* @param length the number of symbols in the sequence.
* @return a SymbolList or length length
*/
public static final SymbolList generateSymbolList(Distribution d, int length){
if(d instanceof OrderNDistribution)
return generateOrderNSymbolList((OrderNDistribution)d, length);
SymbolList sl = null;
List l = new ArrayList(length);
for (int i = 0; i < length; i++) {
l.add(d.sampleSymbol());
}
try {
SymbolListFactory fact;
if(length < 10000){
fact = new SimpleSymbolListFactory();
}else{
fact = new PackedSymbolListFactory();
}
Symbol[] syms = new Symbol[length];
l.toArray(syms);
sl = fact.makeSymbolList(syms, length, d.getAlphabet());
//sl = new SimpleSymbolList(d.getAlphabet(),l);
}
catch (IllegalAlphabetException ex) {
//shouldn't happen but...
throw new BioError("Distribution emitting Symbols not from its Alphabet?");
}
return sl;
}
private static final SymbolList generateOrderNSymbolList(OrderNDistribution d, int length){
SymbolList sl = null;
List l = new ArrayList(length);
/*
* When emitting an orderN sequence a seed sequence is required that is of the
* length of the conditioning alphabet. The emissions will also be allowed
* to 'burn in' for 1000 emissions so that the 'end effect' of the seed
* is negated.
*/
FiniteAlphabet cond = (FiniteAlphabet)d.getConditioningAlphabet();
UniformDistribution uni = new UniformDistribution(cond);
BasisSymbol seed = (BasisSymbol)uni.sampleSymbol();
//using the linked list the seed becomes like a history buffer.
LinkedList ll = new LinkedList(seed.getSymbols());
try {
for(int i = 0; i < 1000+ length; i++){
//get a symbol using the seed
Symbol sym = d.getDistribution(seed).sampleSymbol();
if(i >= 1000){
l.add(sym);
}
//add the symbol to the end of the seed
ll.addLast(sym);
//remove the first basis symbol of the seed
ll.removeFirst();
//regenerate the seed
seed = (BasisSymbol)cond.getSymbol(ll);
}
SymbolListFactory fact;
if(length < 10000){
fact = new SimpleSymbolListFactory();
}else{
fact = new PackedSymbolListFactory();
}
Symbol[] syms = new Symbol[l.size()];
l.toArray(syms);
sl = fact.makeSymbolList(syms, length, d.getConditionedAlphabet());
//sl = new SimpleSymbolList(d.getConditionedAlphabet(),l);
}
catch (IllegalSymbolException ex) {
//shouldn't happen but...
throw new BioError("Distribution emitting Symbols not from its Alphabet?",ex);
}catch(IllegalAlphabetException ex){
//shouldn't happen but...
throw new BioError("Distribution emitting Symbols not from its Alphabet?",ex);
}
return sl;
}
/**
* Generate a sequence by sampling a distribution.
*
* @deprecated use generateSequence() or generateSymbolList() instead.
* @param name the name of the sequence
* @param d the distribution to sample
* @param length the length of the sequence
* @return a new sequence with the required composition
*/
protected static final Sequence generateOrderNSequence(String name, OrderNDistribution d, int length){
SymbolList sl = generateOrderNSymbolList(d, length);
SequenceFactory fact = new SimpleSequenceFactory();
return fact.createSequence(sl, name, name, Annotation.EMPTY_ANNOTATION);
//return new SimpleSequence(sl, name, name, Annotation.EMPTY_ANNOTATION);
}
}//End of class