/*
* BioJava development code
*
* This code may be freely distributed and modified under the
* terms of the GNU Lesser General Public Licence. This should
* be distributed with the code. If you do not have a copy,
* see:
*
* http://www.gnu.org/copyleft/lesser.html
*
* Copyright for this code is held jointly by the individual
* authors. These should be listed in @author doc comments.
*
* For more information on the BioJava project and its aims,
* or to join the biojava-l mailing list, visit the home page
* at:
*
* http://www.biojava.org/
*
*/
package org.biojava.bio.program.indexdb;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileReader;
import java.io.IOException;
import java.util.HashMap;
import java.util.Map;
import org.biojava.bio.BioException;
import org.biojava.bio.program.tagvalue.ChangeTable;
import org.biojava.bio.program.tagvalue.Indexer;
import org.biojava.bio.program.tagvalue.LineSplitParser;
import org.biojava.bio.program.tagvalue.Parser;
import org.biojava.bio.program.tagvalue.ValueChanger;
import org.biojava.bio.seq.io.SeqIOConstants;
import org.biojava.utils.CommitFailure;
import org.biojava.utils.ParserException;
import org.biojava.utils.io.CountedBufferedReader;
import org.biojava.utils.io.RAF;
import org.biojava.utils.lsid.LifeScienceIdentifier;
/**
 * <code>IndexTools</code> contains static utility methods for
 * creating flatfile indices according to the OBDA standard.
 *
 * @author Keith James
 * @author Matthew Pocock
 */
public class IndexTools
{
// Cannot be instantiated
private IndexTools() { }
/**
* indexFasta
indexes DNA, RNA or protein Fasta
* format sequence files on primary identifier.
*
* @param location a File
directory which will
* contain the indices.
* @param seqFiles a File []
array of files to index.
* @param alphabetIdentifier an int
indicating the
* type of sequence to be indexed. May be one of
* SeqIOConstants.DNA SeqIOConstants.RNA
* SeqIOConstants.AA
.
* @param name a String
arbitrary database name.
*
* @exception FileNotFoundException if an error occurs.
* @exception IOException if an error occurs.
* @exception ParserException if an error occurs.
* @exception BioException if an error occurs.
*/
public static void indexFasta(String name, File location, File [] seqFiles,
int alphabetIdentifier)
throws FileNotFoundException, IOException, ParserException,
BioException
{
BioStoreFactory bsf = new BioStoreFactory();
bsf.setStoreName(name);
switch (alphabetIdentifier)
{
case (SeqIOConstants.DNA):
bsf.setSequenceFormat(SeqIOConstants.LSID_FASTA_DNA);
break;
case (SeqIOConstants.RNA):
bsf.setSequenceFormat(SeqIOConstants.LSID_FASTA_RNA);
break;
case (SeqIOConstants.AA):
bsf.setSequenceFormat(SeqIOConstants.LSID_FASTA_AA);
break;
default:
throw new IllegalArgumentException("Unknown alphabet identifier '"
+ alphabetIdentifier
+ "'");
}
_indexFasta(bsf, location, seqFiles);
}
/**
* indexEmbl
indexes DNA, RNA or protein EMBL format
* sequence files on ID as primary identifier and AC as secondary.
*
* @param location a File
directory which will
* contain the indices.
* @param seqFiles a File []
array of files to index.
* @param alphabetIdentifier an int
indicating the
* type of sequence to be indexed. May be one of
* SeqIOConstants.DNA SeqIOConstants.RNA
* SeqIOConstants.AA
.
* @param name a String
arbitrary database name.
*
* @exception FileNotFoundException if an error occurs.
* @exception IOException if an error occurs.
* @exception ParserException if an error occurs.
* @exception BioException if an error occurs.
*/
public static void indexEmbl(String name, File location, File [] seqFiles,
int alphabetIdentifier)
throws FileNotFoundException, IOException, ParserException,
BioException
{
BioStoreFactory bsf = new BioStoreFactory();
bsf.setStoreName(name);
switch (alphabetIdentifier)
{
case (SeqIOConstants.DNA):
bsf.setSequenceFormat(SeqIOConstants.LSID_EMBL_DNA);
break;
case (SeqIOConstants.RNA):
bsf.setSequenceFormat(SeqIOConstants.LSID_EMBL_RNA);
break;
case (SeqIOConstants.AA):
bsf.setSequenceFormat(SeqIOConstants.LSID_EMBL_AA);
break;
default:
throw new IllegalArgumentException("Unknown alphabet identifier '"
+ alphabetIdentifier
+ "'");
}
_indexEmblLike(bsf, location, seqFiles);
}
/**
* indexGenbank
indexes DNA, RNA or protein Genbank
* format sequence files on LOCUS as primary identifier and
* ACCESSION as secondary.
*
* @param location a File
directory which will
* contain the indices.
* @param seqFiles a File []
array of files to index.
* @param alphabetIdentifier an int
indicating the
* type of sequence to be indexed. May be one of
* SeqIOConstants.DNA SeqIOConstants.RNA
* SeqIOConstants.AA
.
* @param name a String
arbitrary database name.
*
* @exception FileNotFoundException if an error occurs.
* @exception IOException if an error occurs.
* @exception ParserException if an error occurs.
* @exception BioException if an error occurs.
*/
public static void indexGenbank(String name, File location, File [] seqFiles,
int alphabetIdentifier)
throws FileNotFoundException, IOException, ParserException,
BioException
{
BioStoreFactory bsf = new BioStoreFactory();
bsf.setStoreName(name);
switch (alphabetIdentifier)
{
case (SeqIOConstants.DNA):
bsf.setSequenceFormat(SeqIOConstants.LSID_GENBANK_DNA);
break;
case (SeqIOConstants.RNA):
bsf.setSequenceFormat(SeqIOConstants.LSID_GENBANK_RNA);
break;
case (SeqIOConstants.AA):
bsf.setSequenceFormat(SeqIOConstants.LSID_GENBANK_AA);
break;
default:
throw new IllegalArgumentException("Unknown alphabet identifier '"
+ alphabetIdentifier
+ "'");
}
_indexGenbank(bsf, location, seqFiles);
}
/**
* indexSwissprot
indexes Swissprot format protein
* sequence files on ID as primary identifier.
*
* @param location a File
directory which will
* contain the indices.
* @param seqFiles a File []
array of files to index.
* @exception FileNotFoundException if an error occurs.
* @exception IOException if an error occurs.
* @exception ParserException if an error occurs.
* @exception BioException if an error occurs.
*/
public static void indexSwissprot(String name, File location, File [] seqFiles)
throws FileNotFoundException, IOException, ParserException,
BioException
{
BioStoreFactory bsf = new BioStoreFactory();
bsf.setStoreName(name);
bsf.setSequenceFormat(LifeScienceIdentifier.valueOf("open-bio.org",
"swiss",
"protein" ));
_indexEmblLike(bsf, location, seqFiles);
}
private static void _indexFasta(BioStoreFactory bsf,
File location, File [] seqFiles)
throws FileNotFoundException, IOException, BioException
{
bsf.setPrimaryKey("ID");
bsf.setStoreLocation(location);
bsf.addKey("ID", 10);
BioStore store = bsf.createBioStore();
for (int i = 0; i < seqFiles.length; i++)
{
// File data
long newOffset = 0L;
long oldOffset = 0L;
RAF raf = new RAF(seqFiles[i], "r");
Map map = new HashMap();
CountedBufferedReader reader =
new CountedBufferedReader(new FileReader(raf.getFile()));
// Record data
String id = "";
String line = null;
while ((line = reader.readLine()) != null)
{
if (line.startsWith(">"))
{
// Write at end of record
if (newOffset > 0)
{
store.writeRecord(raf, oldOffset,
(int) (newOffset - oldOffset),
id, map);
oldOffset = newOffset;
}
newOffset = reader.getFilePointer();
int delimeter = line.indexOf(" ");
if (delimeter < 0)
id = line.substring(1);
else
id = line.substring(1, delimeter);
}
else
{
newOffset = reader.getFilePointer();
}
}
// Write final record
store.writeRecord(raf, oldOffset,
(int) (newOffset - oldOffset),
id, map);
}
try
{
store.commit();
}
catch (CommitFailure ne)
{
throw new BioException("Failed to commit new index to file",ne);
}
}
private static void _indexEmblLike(BioStoreFactory bsf,
File location, File [] seqFiles)
throws FileNotFoundException, IOException, ParserException,
BioException
{
bsf.setPrimaryKey("ID");
bsf.setStoreLocation(location);
bsf.addKey("AC", 10);
bsf.addKey("ID", 10);
BioStore store = bsf.createBioStore();
for (int i = 0; i < seqFiles.length; i++)
{
Indexer indexer = new Indexer(seqFiles[i], store);
indexer.setPrimaryKeyName("ID");
indexer.addSecondaryKey("AC");
ChangeTable changeTable = new ChangeTable();
changeTable.setChanger("ID", new ChangeTable.Changer()
{
public Object change(Object value)
{
String s = (String) value;
int i = s.indexOf(" ");
if (i < 0)
return s;
else
return s.substring(0, i);
}
});
changeTable.setChanger("AC", new ChangeTable.Changer()
{
public Object change(Object value)
{
String s = (String) value;
int i = s.indexOf(";");
return s.substring(0, i);
}
});
ValueChanger changer = new ValueChanger(indexer, changeTable);
Parser parser = new Parser();
while(parser.read(indexer.getReader(),
LineSplitParser.EMBL, changer));
}
try
{
store.commit();
}
catch (CommitFailure ne)
{
throw new BioException("Failed to commit new index to file",ne);
}
}
private static void _indexGenbank(BioStoreFactory bsf,
File location, File [] seqFiles)
throws FileNotFoundException, IOException, ParserException,
BioException
{
bsf.setPrimaryKey("LOCUS");
bsf.setStoreLocation(location);
bsf.addKey("LOCUS", 10);
bsf.addKey("ACCESSION", 10);
BioStore store = bsf.createBioStore();
for (int i = 0; i < seqFiles.length; i++)
{
Indexer indexer = new Indexer(seqFiles[i], store);
indexer.setPrimaryKeyName("LOCUS");
indexer.addSecondaryKey("ACCESSION");
ChangeTable changeTable = new ChangeTable();
changeTable.setChanger("LOCUS", new ChangeTable.Changer()
{
public Object change(Object value)
{
String s = (String) value;
int i = s.indexOf(" ");
if (i < 0)
return s;
else
return s.substring(0, i);
}
});
ValueChanger changer = new ValueChanger(indexer, changeTable);
Parser parser = new Parser();
while(parser.read(indexer.getReader(),
LineSplitParser.GENBANK, changer));
}
try
{
store.commit();
}
catch (CommitFailure ne)
{
throw new BioException("Failed to commit new index to file",ne);
}
}
}