/*
* BioJava development code
*
* This code may be freely distributed and modified under the
* terms of the GNU Lesser General Public Licence. This should
* be distributed with the code. If you do not have a copy,
* see:
*
* http://www.gnu.org/copyleft/lesser.html
*
* Copyright for this code is held jointly by the individual
* authors. These should be listed in @author doc comments.
*
* For more information on the BioJava project and its aims,
* or to join the biojava-l mailing list, visit the home page
* at:
*
* http://www.biojava.org/
*
*/
package org.biojava.bio.seq.io;
import java.io.IOException;
import java.io.InputStream;
import java.text.BreakIterator;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.StringTokenizer;
import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.parsers.ParserConfigurationException;
import org.biojava.bio.Annotation;
import org.biojava.bio.BioError;
import org.biojava.bio.seq.DNATools;
import org.biojava.bio.seq.Feature;
import org.biojava.bio.seq.RemoteFeature;
import org.biojava.bio.seq.StrandedFeature;
import org.biojava.bio.symbol.AlphabetIndex;
import org.biojava.bio.symbol.AlphabetManager;
import org.biojava.bio.symbol.BetweenLocation;
import org.biojava.bio.symbol.FuzzyLocation;
import org.biojava.bio.symbol.FuzzyPointLocation;
import org.biojava.bio.symbol.IllegalSymbolException;
import org.biojava.bio.symbol.Location;
import org.biojava.bio.symbol.PointLocation;
import org.biojava.bio.symbol.Symbol;
import org.biojava.utils.ClassTools;
import org.w3c.dom.Document;
import org.w3c.dom.Element;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;
import org.xml.sax.InputSource;
import org.xml.sax.SAXException;
/**
 * <code>AbstractGenEmblFileFormer</code> contains file formatting code
 * common to both GenBank and EMBL file formats.
 *
 * @author Keith James
 * @author Greg Cox
 * @since 1.2
 */
class AbstractGenEmblFileFormer
{
// Classpath-relative name of the XML resource describing feature keys
// and their qualifiers; handed to loadFeatureData by the static
// initializer below.
private static final String FEATURE_DATA_FILE =
"org/biojava/bio/seq/io/FeatureQualifier.xml";
// Filled in by loadFeatureData; keyed by feature key.
private static final Map FEATURE_DATA = new HashMap();
// Filled in by loadFeatureData; keyed by qualifier name. Each value is
// a Map holding that qualifier's "form" and "mandatory" entries
// (formatQualifier reads "form" to decide how to delimit a value).
private static final Map QUALIFIER_DATA = new HashMap();
static
{
/* This loads an XML file containing information on which
* qualifiers are valid (or even mandatory) for a particular
* feature key. It also indicates whether the value should be
* contained within quotes.
*/
loadFeatureData(FEATURE_DATA_FILE, FEATURE_DATA, QUALIFIER_DATA);
}
/* Defines the types of qualifier token encountered while wrapping:
* FIRST    - the first token of the qualifier,
* OVERWIDE - a very wide token which can not be wrapped without
*            breaking it,
* NOFIT    - a token which does not fit on the current line, but
*            may be moved to the next line without breaking it,
* FIT      - a token which is short enough to add to the current
*            line.
*/
static final int FIRST = 0;
static final int OVERWIDE = 1;
static final int NOFIT = 2;
static final int FIT = 3;
/* Defines the various types of Location available. Each is
* represented differently within an EMBL/Genbank flatfile.
*/
static final int RANGE = 0;
static final int POINT = 1;
static final int FUZZY_RANGE = 2;
static final int FUZZY_POINT = 3;
static final int BETWEEN_LOCATION = 4;
// Platform line separator, emitted wherever the formatters wrap a line
String nl = System.getProperty("line.separator");

// Tokenization of the DNA alphabet; assigned in the initializer below
SymbolTokenization dnaTokenization;

// Index over the DNA alphabet
AlphabetIndex dnaIndex = AlphabetManager.getAlphabetIndex(DNATools.getDNA());

// Counters which are not touched by this class itself; presumably
// per-base tallies maintained by the concrete formers (TODO confirm)
int aCount;
int cCount;
int gCount;
int tCount;
int oCount;

// The four DNA symbols
Symbol a = DNATools.a();
Symbol c = DNATools.c();
Symbol g = DNATools.g();
Symbol t = DNATools.t();

{
    // Obtaining the "token" tokenization may fail; any failure is
    // rethrown as a BioError which keeps the original cause
    try
    {
        dnaTokenization = DNATools.getDNA().getTokenization("token");
    }
    catch (Exception e)
    {
        throw new BioError("Couldn't initialize tokenizer for the DNA alphabet",e);
    }
}
/**
 * <code>formatSequenceProperty</code> formats text into
 * EMBL/Genbank style header lines. The text is split at the line
 * break opportunities reported by a <code>BreakIterator</code> and
 * a new line (followed by the leader) is started whenever adding
 * the next piece would bring the running text width up to
 * <code>wrapWidth</code>. The width of the leader itself is not
 * counted against <code>wrapWidth</code>.
 *
 * @param sb a <code>StringBuffer</code> to append to.
 * @param text a <code>String</code> to format.
 * @param leader a <code>String</code> to append to the start of
 * each line.
 * @param wrapWidth an <code>int</code> indicating the number of
 * columns per line.
 *
 * @return the <code>StringBuffer</code> passed in.
 */
StringBuffer formatSequenceProperty(StringBuffer sb,
                                    String       text,
                                    String       leader,
                                    int          wrapWidth)
{
    BreakIterator breaks = BreakIterator.getLineInstance();
    breaks.setText(text);

    sb.append(leader);

    // Width of the text already written on the current line
    int column = 0;
    int from   = breaks.first();

    for (int to = breaks.next();
         to != BreakIterator.DONE;
         from = to, to = breaks.next())
    {
        String piece = text.substring(from, to);

        // Start a fresh line when this piece would reach the limit
        if (column + piece.length() >= wrapWidth)
        {
            sb.append(nl).append(leader);
            column = 0;
        }

        sb.append(piece);
        column += piece.length();
    }

    return sb;
}
/**
 * <code>formatQualifierBlock</code> formats text into
 * EMBL/Genbank style qualifiers. The text is split on whitespace
 * and the tokens are laid out on lines of at most
 * <code>wrapWidth</code> columns (leader included), separated by
 * single spaces. The first token is written immediately after the
 * leader. A token too wide to fit on any line is broken across
 * lines character by character.
 *
 * @param sb a <code>StringBuffer</code> to append to.
 * @param text a <code>String</code> to format.
 * @param leader a <code>String</code> to append to the start of
 * each line.
 * @param wrapWidth an <code>int</code> indicating the number of
 * columns per line.
 *
 * @return the <code>StringBuffer</code> passed in.
 */
StringBuffer formatQualifierBlock(StringBuffer sb,
                                  String       text,
                                  String       leader,
                                  int          wrapWidth)
{
    int tokenType = FIRST;
    // Number of columns used on the current line, leader included
    int position  = leader.length();

    sb.append(leader);

    StringTokenizer tokens = new StringTokenizer(text);

    while (tokens.hasMoreTokens())
    {
        String s        = tokens.nextToken();
        int    tokenLen = s.length();

        if (tokenType == FIRST)
        {
            // The first token always starts immediately after the
            // '=' character, even if it means forcing a break
            if (position + tokenLen <= wrapWidth)
            {
                sb.append(s);
                position += tokenLen;
                tokenType = FIT;
                continue;
            }

            // Too long for the first line: break it up below. The
            // type must leave FIRST here, otherwise the following
            // token would again be treated as the first one and be
            // written without a separating space.
            tokenType = OVERWIDE;
        }
        else if (tokenLen + 1 > wrapWidth)
        {
            tokenType = OVERWIDE;
        }
        else if (position + tokenLen + 1 > wrapWidth)
        {
            tokenType = NOFIT;
        }
        else
        {
            tokenType = FIT;
        }

        switch (tokenType)
        {
            case OVERWIDE:
                // Force breaks in the token until the end is
                // reached.
                // NOTE(review): an over-wide token which is not the
                // first token is written directly after the previous
                // token, without a separating space; this is kept
                // as it was.
                for (int i = 0; i < tokenLen; i++)
                {
                    // '>=' rather than '==': a preceding NOFIT token
                    // may already have taken the line past wrapWidth
                    if (position >= wrapWidth)
                    {
                        sb.append(nl);
                        sb.append(leader);
                        position = leader.length();
                    }
                    sb.append(s.charAt(i));
                    position++;
                }
                // position now holds the true column (leader
                // included) and must not be recomputed from the
                // token length
                break;

            case NOFIT:
                // Token won't fit, so pass it to the next line
                sb.append(nl);
                sb.append(leader);
                sb.append(s);
                position = tokenLen + leader.length();
                break;

            case FIT:
                // Token fits on this line, after a single space
                sb.append(' ');
                sb.append(s);
                position += (tokenLen + 1);
                break;

            default:
                // Nothing
                break;
        } // end switch
    } // end while

    return sb;
}
/**
 * <code>formatQualifier</code> creates a qualifier string and
 * adds quotes or parens to EMBL/Genbank qualifier values as
 * specified in the internally loaded XML feature table
 * description. Qualifiers which are not described there are
 * quoted.
 *
 * @param sb a <code>StringBuffer</code> to append to.
 * @param key an <code>Object</code> (the qualifier key).
 * @param value an <code>Object</code> (the qualifier content).
 *
 * @return the <code>StringBuffer</code> passed in, with the
 * qualifier bounded by the correct tokens appended.
 */
StringBuffer formatQualifier(StringBuffer sb, Object key, Object value)
{
    sb.append('/').append(key);

    // Unknown qualifiers default to the quoted form
    String form = QUALIFIER_DATA.containsKey(key)
        ? (String) ((Map) QUALIFIER_DATA.get(key)).get("form")
        : "quoted";

    // This is a slight simplification. There are some types of
    // qualifier which are unquoted unless they contain spaces.
    if (form.equals("quoted"))
    {
        sb.append("=\"").append(value).append("\"");
    }
    else if (form.equals("paren"))
    {
        sb.append('(').append(value).append(')');
    }
    else if (! form.equals("empty"))
    {
        // "bare" and any unrecognised form: plain '=' and the value.
        // An "empty" qualifier consists of the key alone.
        sb.append('=').append(value);
    }

    return sb;
}
/**
 * <code>formatTokenBlock</code> divides up the tokens
 * representing the <code>Symbol</code>s into blocks of the
 * specified length, with a single space delimiter. A space is
 * written after every complete block, including a final one.
 *
 * @param sb a <code>StringBuffer</code> to append to.
 * @param syms a <code>Symbol []</code> array whose tokens are to
 * be formatted.
 * @param blockSize an <code>int</code> indicating the size of
 * each block.
 * @param tokenization the <code>SymbolTokenization</code> used to
 * turn each symbol into its token.
 *
 * @return the <code>StringBuffer</code> passed in.
 *
 * @exception IllegalSymbolException if a symbol can not be
 * tokenized.
 */
StringBuffer formatTokenBlock(StringBuffer       sb,
                              Symbol []          syms,
                              int                blockSize,
                              SymbolTokenization tokenization)
    throws IllegalSymbolException
{
    int written = 0;

    while (written < syms.length)
    {
        sb.append(tokenization.tokenizeSymbol(syms[written]));
        written++;

        // End of a block reached
        if (written % blockSize == 0)
        {
            sb.append(' ');
        }
    }

    return sb;
}
/**
 * Formats the location of a feature. This version is required
 * when formatting remote locations, since the location field of a
 * remote feature is the projection of that feature on the
 * sequence. The join type ("join" unless the feature's annotation
 * carries a "JoinType" entry in its property data) is used to
 * label compound locations.
 *
 * @param theFeature The feature with the location to format
 *
 * @return String The formatted location
 */
public String formatLocation(Feature theFeature)
{
    // Work out how a compound location should be labelled
    String joinType = "join";
    Annotation ann  = theFeature.getAnnotation();

    if (ann.containsProperty(Feature.PROPERTY_DATA_KEY))
    {
        Map dat = (Map) ann.getProperty(Feature.PROPERTY_DATA_KEY);
        if (dat.containsKey("JoinType"))
        {
            joinType = (String) dat.get("JoinType");
        }
    }

    // Features which carry no strand are treated as positive
    StrandedFeature.Strand featureStrand = StrandedFeature.POSITIVE;

    if (! (theFeature instanceof RemoteFeature))
    {
        if (theFeature instanceof StrandedFeature)
        {
            featureStrand = ((StrandedFeature) theFeature).getStrand();
        }

        // Leader and wrap width chosen so that no wrapping occurs
        return formatLocationBlock(new StringBuffer(),
                                   theFeature.getLocation(),
                                   featureStrand.getValue(),
                                   "",
                                   Integer.MAX_VALUE,
                                   joinType).toString();
    }

    List regions     = ((RemoteFeature) theFeature).getRegions();
    boolean compound = regions.size() > 1;
    StringBuffer out = new StringBuffer();

    if (compound)
    {
        out.append(joinType).append('(');
    }

    for (Iterator ri = regions.iterator(); ri.hasNext();)
    {
        RemoteFeature.Region region = (RemoteFeature.Region) ri.next();

        // Regions on another sequence are prefixed with its ID
        if (region.getSeqID() != null)
        {
            out.append(region.getSeqID()).append(':');
        }

        if (theFeature instanceof StrandedFeature)
        {
            featureStrand = ((StrandedFeature) theFeature).getStrand();
        }

        out.append(formatLocation(region.getLocation(), featureStrand));

        // Commas only go between two regions
        if (ri.hasNext())
        {
            out.append(',');
        }
    }

    if (compound)
    {
        out.append(')');
    }

    return out.toString();
}
/**
 * <code>formatLocation</code> creates an EMBL/Genbank style
 * representation of a <code>Location</code>. This is a
 * convenience method only. The version which has a
 * <code>StringBuffer</code> parameter (and returns the
 * <code>StringBuffer</code>) is preferred. If a compound location is
 * formatted using this method, it is returned as a join-type location
 * rather than an order-type.
 *
 * @param loc a <code>Location</code> to format.
 * @param strand a <code>StrandedFeature.Strand</code>
 * indicating the <code>Location</code>'s strand.
 *
 * @return a <code>String</code> holding the formatted location.
 */
public String formatLocation(Location               loc,
                             StrandedFeature.Strand strand)
{
    // An empty leader and an effectively unlimited wrap width
    // guarantee that the result is a single line.
    return formatLocationBlock(new StringBuffer(),
                               loc,
                               strand.getValue(),
                               "",
                               Integer.MAX_VALUE,
                               "join").toString();
}
/**
 * <code>formatLocation</code> creates an EMBL/Genbank style
 * representation of a <code>Location</code>. Supported location
 * forms:
 *
 * <pre>
 *   123
 *  &lt;123 or &gt;123
 *  (123.567)
 *  (123.567)..789
 *   123..(567.789)
 *  (123.345)..(567.789)
 *   123..456
 *  &lt;123..567 or 123..&gt;567 or &lt;123..&gt;567
 *   123^567
 *   AL123465:(123..567)
 * </pre>
 *
 * If a compound location is formatted using this method, it is
 * returned as a join-type location rather than an order-type.
 *
 * To preserve the join/order distinction; and to format locations
 * like AL123465:(123..567), use the formatLocation(Feature) method.
 *
 * @param sb a <code>StringBuffer</code> to append to.
 * @param loc a <code>Location</code> to format.
 * @param strand a <code>StrandedFeature.Strand</code>
 * indicating the <code>Location</code>'s strand.
 *
 * @return the <code>StringBuffer</code> passed in.
 */
public StringBuffer formatLocation(StringBuffer           sb,
                                   Location               loc,
                                   StrandedFeature.Strand strand)
{
    int strandValue = strand.getValue();

    // No leader and an effectively unlimited width: always one line
    return formatLocationBlock(sb,
                               loc,
                               strandValue,
                               "",
                               Integer.MAX_VALUE,
                               "join");
}
/**
 * <code>formatLocationBlock</code> creates an EMBL/Genbank style
 * representation of a <code>Location</code> wrapped to a specific
 * width.
 *
 * If a compound location is formatted using this method, it is
 * returned as a join-type location rather than an order-type.
 *
 * To preserve the join/order distinction; and to format locations
 * like AL123465:(123..567), use the formatLocation(Feature) method.
 *
 * @param sb a <code>StringBuffer</code> to append to.
 * @param loc a <code>Location</code> to use as a template.
 * @param strand an <code>int</code> indicating the
 * <code>Location</code>'s strand.
 * @param leader a <code>String</code> to append to the start of
 * each line.
 * @param wrapWidth an <code>int</code> indicating the number of
 * columns per line.
 *
 * @return the <code>StringBuffer</code> passed in.
 */
StringBuffer formatLocationBlock(StringBuffer sb,
                                 Location     loc,
                                 int          strand,
                                 String       leader,
                                 int          wrapWidth)
{
    // Compound locations are always labelled as joins here
    final String defaultJoinType = "join";

    return formatLocationBlock(sb, loc, strand, leader, wrapWidth,
                               defaultJoinType);
}
/**
 * <code>formatLocationBlock</code> creates an EMBL/Genbank style
 * representation of a <code>Location</code> wrapped to a specific
 * width.
 *
 * The blocks of the location are sorted into their natural order
 * (reversed for the negative strand, where each block is also
 * wrapped in <code>complement(...)</code>) and written separated
 * by commas. A non-contiguous location is enclosed in
 * <code>joinType(...)</code>. The leader is only written at the
 * start of continuation lines; the first line is assumed to
 * already hold <code>leader.length()</code> columns.
 *
 * @param sb a <code>StringBuffer</code> to append to.
 * @param loc a <code>Location</code> to use as a template.
 * @param strand an <code>int</code> indicating the
 * <code>Location</code>'s strand (-1 selects the complement form).
 * @param leader a <code>String</code> to append to the start of
 * each line.
 * @param wrapWidth an <code>int</code> indicating the number of
 * columns per line.
 * @param joinType Only used if the location is a compound
 * location. It is prepended to the list of locations.
 *
 * @return the <code>StringBuffer</code> passed in.
 */
StringBuffer formatLocationBlock(StringBuffer sb,
                                 Location     loc,
                                 int          strand,
                                 String       leader,
                                 int          wrapWidth,
                                 String       joinType)
{
    // Indicates how many characters have been added to the
    // current line
    int position = leader.length();

    List locs = new ArrayList();
    for (Iterator li = loc.blockIterator(); li.hasNext();)
    {
        locs.add(li.next());
    }

    /* There are issues here about choosing various forms:
     * join(complement(...),complement(...))
     * complement(join(...,...))
     *
     * The former has the locations sorted in reverse order.
     */
    Collections.sort(locs, Location.naturalOrder);

    boolean join = ! loc.isContiguous();
    if (join)
    {
        // Count what was really written, so that "order(" and any
        // other join type is accounted for as well as "join("
        int before = sb.length();
        sb.append(joinType);
        sb.append('(');
        position += sb.length() - before;
    }

    boolean complement = (strand == -1);
    if (complement)
    {
        Collections.reverse(locs);
    }

    // Number of characters added for the most recent block
    int diff = 0;

    for (Iterator li = locs.iterator(); li.hasNext();)
    {
        Location thisLoc = (Location) li.next();
        int pre = sb.length();

        String formatted = formatSingleLocation(thisLoc);
        sb.append(complement ? toComplement(formatted) : formatted);

        // If there is another location after this. Asking the
        // iterator avoids a linear search per block and stays
        // correct when two blocks compare equal.
        if (li.hasNext())
        {
            sb.append(',');
        }

        diff = sb.length() - pre;

        // If we have exceeded the line length
        if ((position + diff) > wrapWidth)
        {
            // Insert a newline just prior to this location string
            sb.insert(pre, nl + leader);
            position = leader.length() + diff;
        }
        else
        {
            position += diff;
        }
    }

    if (join)
    {
        // If adding the ")" would make the line too long, move the
        // last range to the next line. The break has to be inserted
        // before the ")" is appended; otherwise the offset
        // computed from diff lands one character inside the range.
        if ((position + 1) > wrapWidth)
        {
            sb.insert((sb.length() - diff), nl + leader);
        }
        sb.append(')');
    }

    return sb;
}

/**
 * Formats a single block of a location, choosing the
 * representation from the block's class. Anything which is not a
 * point, fuzzy, fuzzy point or between location is written as a
 * plain range.
 *
 * @param block a <code>Location</code> block.
 *
 * @return a <code>String</code> representation of the block,
 * without any strand decoration.
 */
private String formatSingleLocation(Location block)
{
    StringBuffer lb = new StringBuffer();

    if (block instanceof PointLocation)
    {
        formatPoint(lb, (PointLocation) block);
    }
    else if (block instanceof FuzzyLocation)
    {
        formatFuzzyRange(lb, (FuzzyLocation) block);
    }
    else if (block instanceof FuzzyPointLocation)
    {
        formatFuzzyPoint(lb, (FuzzyPointLocation) block);
    }
    else if (block instanceof BetweenLocation)
    {
        formatBetween(lb, (BetweenLocation) block);
    }
    else
    {
        formatRange(lb, block);
    }

    return lb.toString();
}
/**
 * <code>formatFuzzyRange</code> creates an EMBL/Genbank style
 * String representation of a <code>FuzzyLocation</code>. Each end
 * is written as <code>&lt;n</code> / <code>&gt;n</code> when it is
 * unbounded, as <code>(outer.inner)</code> /
 * <code>(inner.outer)</code> when its inner and outer positions
 * differ, and as a plain number otherwise.
 *
 * @param sb a <code>StringBuffer</code> to format the location
 * into.
 * @param fl a <code>FuzzyLocation</code>.
 *
 * @return the <code>StringBuffer</code> passed in.
 */
private StringBuffer formatFuzzyRange(StringBuffer sb, FuzzyLocation fl)
{
    // Lower end
    if (fl.hasBoundedMin())
    {
        if (fl.getOuterMin() == fl.getInnerMin())
        {
            // 123
            sb.append(fl.getMin());
        }
        else
        {
            // (123.567)
            sb.append('(').append(fl.getOuterMin())
              .append('.').append(fl.getInnerMin()).append(')');
        }
    }
    else
    {
        // <123
        sb.append('<').append(fl.getMin());
    }

    sb.append("..");

    // Upper end
    if (fl.hasBoundedMax())
    {
        if (fl.getInnerMax() == fl.getOuterMax())
        {
            // 567
            sb.append(fl.getMax());
        }
        else
        {
            // (567.789)
            sb.append('(').append(fl.getInnerMax())
              .append('.').append(fl.getOuterMax()).append(')');
        }
    }
    else
    {
        // >567
        sb.append('>').append(fl.getMax());
    }

    return sb;
}
/**
 * <code>formatFuzzyPoint</code> creates an EMBL/Genbank style
 * String representation of a <code>FuzzyPointLocation</code>. An
 * unbounded minimum takes precedence over an unbounded maximum.
 *
 * @param sb a <code>StringBuffer</code> to format the location into.
 * @param fpl a <code>FuzzyPointLocation</code>.
 *
 * @return the <code>StringBuffer</code> passed in.
 */
private StringBuffer formatFuzzyPoint(StringBuffer sb, FuzzyPointLocation fpl)
{
    if (fpl.hasBoundedMin())
    {
        if (fpl.hasBoundedMax())
        {
            // (567.789)
            sb.append('(').append(fpl.getMin())
              .append('.').append(fpl.getMax()).append(')');
        }
        else
        {
            // >567
            sb.append('>').append(fpl.getMin());
        }
    }
    else
    {
        // <123
        sb.append('<').append(fpl.getMax());
    }

    return sb;
}
/**
 * <code>formatRange</code> creates an EMBL/Genbank style String
 * representation of a <code>RangeLocation</code>, i.e. the
 * minimum and maximum joined by "..".
 *
 * @param sb a <code>StringBuffer</code> to format the location into.
 * @param rl a <code>Location</code>.
 *
 * @return the <code>StringBuffer</code> passed in.
 */
private StringBuffer formatRange(StringBuffer sb, Location rl)
{
    // 123..567
    return sb.append(rl.getMin()).append("..").append(rl.getMax());
}
/**
 * <code>formatPoint</code> creates an EMBL/Genbank style String
 * representation of a <code>PointLocation</code>: its position
 * written as a decimal number.
 *
 * @param sb a <code>StringBuffer</code> to format the location into.
 * @param pl a <code>PointLocation</code>.
 *
 * @return the <code>StringBuffer</code> passed in.
 */
private StringBuffer formatPoint(StringBuffer sb, PointLocation pl)
{
    int point = pl.getMin();
    return sb.append(point);
}
/**
 * Formats a between location x y into x^y.
 *
 * @param sb a <code>StringBuffer</code> to format the location into.
 * @param theLocation The between location object to be formatted
 *
 * @return the <code>StringBuffer</code> passed in.
 */
private StringBuffer formatBetween(StringBuffer sb, BetweenLocation theLocation)
{
    // 123^124
    return sb.append(theLocation.getMin())
             .append('^')
             .append(theLocation.getMax());
}
/**
 * <code>toComplement</code> accepts an EMBL/Genbank style String
 * representation of a <code>Location</code> and returns the
 * complementary strand version.
 *
 * @param value a <code>String</code>.
 *
 * @return a <code>String</code> representation of the
 * complementary strand location.
 */
private String toComplement(String value)
{
    StringBuffer wrapped = new StringBuffer("complement(");
    wrapped.append(value);
    wrapped.append(")");
    return wrapped.toString();
}
/**
 * <code>loadFeatureData</code> reads data describing EMBL/Genbank
 * features and qualifiers from an XML file and populates two data
 * Maps, one for features, one for qualifiers. The file describes
 * which qualifiers are optional or mandatory for a feature type
 * and which qualifiers are written within quotes (or
 * parentheses). The DTD used by the XML file is stored in the
 * file's internal subset.
 *
 * Each feature key is mapped to the <code>Set</code> of qualifier
 * names listed for that feature. Each qualifier name is mapped to
 * a <code>Map</code> holding its "form" (a <code>String</code>)
 * and "mandatory" (a <code>Boolean</code>) entries; when a
 * qualifier is listed for several features the entry read last
 * wins.
 *
 * Loading is best-effort: failures are printed to standard error
 * and are not propagated, leaving the Maps with whatever had been
 * read up to that point.
 *
 * @param featureDataFile a <code>String</code> indicating the
 * name of the file.
 * @param featureData a <code>Map</code> to populate with
 * feature data.
 * @param qualifierData a <code>Map</code> to populate with
 * qualifier data.
 */
static void loadFeatureData(String featureDataFile,
                            Map    featureData,
                            Map    qualifierData)
{
    try
    {
        InputStream featureDataStream =
            ClassTools.getClassLoader(EmblFileFormer.class).getResourceAsStream(featureDataFile);
        if (featureDataStream == null)
            throw new BioError("Failed to find resource: "
                               + featureDataFile);

        try
        {
            InputSource is = new InputSource(featureDataStream);
            DocumentBuilder parser = DocumentBuilderFactory.newInstance().newDocumentBuilder();

            // Get document and then the root element
            Document doc = parser.parse(is);
            NodeList featureNodes = doc.getDocumentElement().getChildNodes();

            // For nodes in root element (features)
            int fNodeCount = featureNodes.getLength();
            for (int i = 0; i < fNodeCount; i++)
            {
                Node featureNode = featureNodes.item(i);
                if (! (featureNode instanceof Element))
                    continue;

                Element feature = (Element) featureNode;
                if (! feature.getNodeName().equals("feature"))
                    continue;

                String featureKey = feature.getAttribute("key");

                // Names of the qualifiers listed for this feature
                // only. Storing the key set of the shared qualifier
                // Map here would give every feature the same live
                // view of all qualifiers read so far.
                java.util.Set featureQualifiers = new java.util.HashSet();

                // For nodes in each feature (qualifiers)
                NodeList qualifierNodes = feature.getChildNodes();
                int qNodeCount = qualifierNodes.getLength();
                for (int j = 0; j < qNodeCount; j++)
                {
                    Node qualifierNode = qualifierNodes.item(j);
                    if (! (qualifierNode instanceof Element))
                        continue;

                    Element qualifier = (Element) qualifierNode;
                    if (! qualifier.getNodeName().equals("qualifier"))
                        continue;

                    String qualifierName = qualifier.getAttribute("name");

                    Map qData = new HashMap();
                    qData.put("form", qualifier.getAttribute("form"));
                    qData.put("mandatory",
                              Boolean.valueOf(qualifier.getAttribute("mandatory")));

                    qualifierData.put(qualifierName, qData);
                    featureQualifiers.add(qualifierName);
                }

                featureData.put(featureKey, featureQualifiers);
            }
        }
        finally
        {
            // Close exactly once, whether or not parsing succeeded
            featureDataStream.close();
        }
    }
    catch (IOException ioe)
    {
        ioe.printStackTrace();
    }
    catch (SAXException se)
    {
        se.printStackTrace();
    }
    catch (BioError be)
    {
        be.printStackTrace();
    }
    catch (ParserConfigurationException ex)
    {
        ex.printStackTrace();
    }
}
}