/* * BioJava development code * * This code may be freely distributed and modified under the * terms of the GNU Lesser General Public Licence. This should * be distributed with the code. If you do not have a copy, * see: * * http://www.gnu.org/copyleft/lesser.html * * Copyright for this code is held jointly by the individual * authors. These should be listed in @author doc comments. * * For more information on the BioJava project and its aims, * or to join the biojava-l mailing list, visit the home page * at: * * http://www.biojava.org/ * */ package org.biojava.bio.seq.db; import java.io.BufferedInputStream; import java.io.File; import java.io.FileInputStream; import java.io.FileNotFoundException; import java.io.IOException; import java.util.Collections; import java.util.HashMap; import java.util.HashSet; import java.util.Map; import java.util.Set; import org.biojava.bio.BioException; import org.biojava.bio.seq.db.emblcd.DivisionLkpReader; import org.biojava.bio.seq.db.emblcd.EmblCDROMIndexReader; import org.biojava.bio.seq.db.emblcd.EmblCDROMRandomAccess; import org.biojava.bio.seq.db.emblcd.EntryNamIdxReader; import org.biojava.bio.seq.db.emblcd.EntryNamRandomAccess; import org.biojava.bio.seq.io.SequenceBuilderFactory; import org.biojava.bio.seq.io.SequenceFormat; import org.biojava.bio.seq.io.SymbolTokenization; /** *
<p><code>EmblCDROMIndexStore</code>s implement a read-only
 * <code>IndexStore</code> backed by EMBL CD-ROM format binary
 * indices. The required index files are typically named
 * "division.lkp" and "entrynam.idx". As an <code>IndexStore</code>
 * performs lookups by sequence ID, the index files "acnum.trg" and
 * "acnum.hit" (which store additional accession number data) are not
 * used.</p>
 *
 * <p>The sequence IDs are found using a binary search via a pointer
 * into the index file. The whole file is not read unless a request
 * for all the IDs is made using the <code>getIDs()</code> method. The
 * set of IDs is then cached after the first pass. This class also has
 * a <code>close()</code> method to free resources associated with the
 * underlying <code>RandomAccessFile</code>.</p>
 *
 * <p>The binary index files may be created using the EMBOSS programs
 * dbifasta, dbiblast, dbiflat or dbigcg. The least useful from the
 * BioJava perspective is dbigcg because we do not have a
 * <code>SequenceFormat</code> implementation for GCG format
 * files.</p>
 *
 * <p>The <code>Index</code> instances returned by this class do not
 * have the record length set because this information is not
 * available in the binary index. The value -1 is used instead, as
 * described in the <code>Index</code> interface.</p>
 *
 * <p>Creates a new <code>EmblCDROMIndexStore</code> backed by a
 * random access binary index.</p>
 *
 * @param divisionLkp a <code>File</code> containing the master
 * index.
 * @param entryNamIdx a <code>File</code> containing the sequence
 * IDs and offsets.
 * @param format a <code>SequenceFormat</code>.
 * @param factory a <code>SequenceBuilderFactory</code>.
 * @param parser a <code>SymbolTokenization</code>.
 *
 * @exception IOException if an error occurs.
 */
public EmblCDROMIndexStore(File                   divisionLkp,
                           File                   entryNamIdx,
                           SequenceFormat         format,
                           SequenceBuilderFactory factory,
                           SymbolTokenization     parser)
    throws IOException
{
    // No prefix was supplied, so delegate to the six-argument
    // constructor with the empty abstract path as the prefix.
    this(new File(""),
         divisionLkp,
         entryNamIdx,
         format,
         factory,
         parser);
}
/**
 * Creates a new <code>EmblCDROMIndexStore</code> backed by a
 * random access binary index. The arguments are stored and the
 * index files are then read by <code>initialise()</code>.
 *
 * @param pathPrefix a <code>File</code> containing the abstract
 * path used as the parent of sequence database filenames retrieved
 * from the binary index.
 * @param divisionLkp a <code>File</code> containing the master
 * index.
 * @param entryNamIdx a <code>File</code> containing the sequence
 * IDs and offsets.
 * @param format a <code>SequenceFormat</code>.
 * @param factory a <code>SequenceBuilderFactory</code>.
 * @param parser a <code>SymbolTokenization</code>.
 *
 * @exception IOException if an error occurs while the index files
 * are being read.
 */
public EmblCDROMIndexStore(File                   pathPrefix,
                           File                   divisionLkp,
                           File                   entryNamIdx,
                           SequenceFormat         format,
                           SequenceBuilderFactory factory,
                           SymbolTokenization     parser)
    throws IOException
{
    // Where the sequence files live and where the indices are
    this.pathPrefix  = pathPrefix;
    this.divisionLkp = divisionLkp;
    this.entryNamIdx = entryNamIdx;

    // How the sequences are to be parsed
    this.format  = format;
    this.factory = factory;
    this.parser  = parser;

    // Must come last: it reads the index files assigned above
    initialise();
}
/**
 * <code>getPathPrefix</code> returns the abstract path currently
 * used as the parent of the raw sequence database filenames
 * extracted from the binary index.
 *
 * @return a <code>File</code> holding the current prefix.
 */
public File getPathPrefix()
{
    return this.pathPrefix;
}
/**
 * <code>setPathPrefix</code> sets the abstract path that is
 * combined with the sequence database filenames retrieved from
 * the binary index. The prefix is used as the parent path when
 * <code>fetch</code> builds the <code>File</code> of an
 * <code>Index</code>; e.g. a prefix of
 * "/usr/local/share/data/seq/" places the database file named in
 * the index beneath that directory.
 *
 * @param pathPrefix a <code>File</code> prefix specifying the
 * abstract path to use as the parent.
 */
public void setPathPrefix(File pathPrefix)
{
    // Only the reference is replaced; nothing is re-read from the index
    this.pathPrefix = pathPrefix;
}
/**
 * <code>getName</code> returns the database name that was read
 * from the header of the EMBL CD-ROM master index when the store
 * was initialised.
 *
 * @return a <code>String</code> database name.
 */
public String getName()
{
    return this.name;
}
/**
 * <code>store</code> would add an <code>Index</code> to the
 * store. As EMBL CD-ROM indices are read-only, this
 * implementation always throws a <code>BioException</code> and
 * never inspects its argument.
 *
 * @param index an <code>Index</code> (ignored).
 *
 * @exception IllegalIDException declared, but not thrown by this
 * implementation.
 * @exception BioException always, because the store is read-only.
 */
public void store(Index index)
    throws IllegalIDException, BioException
{
    String reason = "Failed to add Index: store is read-only."
        + " To add sequences use the dbi programs"
        + " supplied in EMBOSS";

    throw new BioException(reason);
}
/**
 * <code>commit</code> would commit pending changes. As EMBL
 * CD-ROM indices are read-only there is never anything to commit,
 * so this implementation always throws a
 * <code>BioException</code>.
 *
 * @exception BioException always, because the store is read-only.
 */
public void commit() throws BioException
{
    String reason = "Failed to commit: store is read-only."
        + " To add sequences use the dbi programs"
        + " supplied in EMBOSS";

    throw new BioException(reason);
}
/**
 * <code>rollback</code> would roll back changes made since the
 * last <code>commit</code>. As EMBL CD-ROM indices are read-only,
 * this implementation does nothing.
 */
public void rollback()
{
    // Intentionally empty: a read-only store has nothing to undo
}
/**
 * <code>fetch</code> looks up a sequence ID in the random access
 * ID index and returns an <code>Index</code> describing where the
 * sequence can be found. The <code>File</code> of the returned
 * <code>Index</code> is built from the current
 * <code>pathPrefix</code> (as parent) and the database filename
 * recorded for the entry; its length is always -1.
 *
 * <p>NOTE(review): per the <code>java.io.File</code>
 * documentation, when the parent is the empty abstract path the
 * child is resolved against a system-dependent default directory.
 * Confirm that this is the intended result for relative filenames
 * when no prefix has been set.</p>
 *
 * @param id a <code>String</code> sequence ID.
 *
 * @return an <code>Index</code> for the ID.
 *
 * @exception IllegalIDException if the lookup returns an empty
 * record for the ID.
 * @exception BioException if reading the index fails (the
 * <code>IOException</code> is attached as the cause) or if the
 * record refers to a file number which has no filename.
 */
public Index fetch(String id) throws IllegalIDException, BioException
{
    Object [] enRecord;

    try
    {
        enRecord = entryRandomAccess.findRecord(id);
    }
    catch (IOException ioe)
    {
        // Keep the underlying I/O failure as the cause so that it
        // is not lost to the caller
        throw new BioException("Failed to retrieve index for ID: " + id,
                               ioe);
    }

    // An empty record is how the lookup reports an unknown ID
    if (enRecord.length == 0)
        throw new IllegalIDException("Failed to find ID: " + id);

    Integer fileNumber = (Integer) enRecord[3];
    String    fileName = (String) seqFiles.get(fileNumber);

    // Without this check a missing mapping would surface as a
    // NullPointerException from the File constructor
    if (fileName == null)
        throw new BioException("Failed to retrieve index for ID: " + id
                               + ": no sequence file is known for"
                               + " file number " + fileNumber);

    long offset = ((Long) enRecord[1]).longValue();

    // Apply the current pathPrefix; -1 marks the record length as unknown
    return new SimpleIndex(new File(pathPrefix, fileName), offset, -1, id);
}
/**
 * <code>getIDs</code> returns the set of all sequence IDs in the
 * ID index. The first successful call reads every record of the
 * index file and caches the IDs; later calls return the cached
 * set.
 *
 * <p>If the index file cannot be found or read, the problem is
 * reported on <code>System.err</code> and the IDs read up to that
 * point (possibly none) are returned. Such an incomplete result is
 * not cached, so a later call will try to read the file again.</p>
 *
 * @return an unmodifiable <code>Set</code> of <code>String</code>
 * IDs.
 */
public Set getIDs()
{
    if (seqIds != null)
        return Collections.unmodifiableSet(seqIds);

    HashSet ids = new HashSet((int) entryRecordCount);
    boolean complete = false;

    BufferedInputStream bis = null;

    try
    {
        bis =
            new BufferedInputStream(new FileInputStream(entryNamIdx));
        EmblCDROMIndexReader ent = new EntryNamIdxReader(bis);

        for (long i = 0; i < entryRecordCount; i++)
        {
            Object [] enRecord = ent.readRecord();
            ids.add((String) enRecord[0]);
        }

        complete = true;
    }
    catch (FileNotFoundException fnfe)
    {
        System.err.println("Failed to find file "
                           + entryNamIdx.getName());
        fnfe.printStackTrace();
    }
    catch (IOException ioe)
    {
        System.err.println("Failed to read file "
                           + entryNamIdx.getName());
        ioe.printStackTrace();
    }
    finally
    {
        // Runs on every exit path, including unchecked exceptions.
        // The stream is null if the file was never opened.
        if (bis != null)
        {
            try
            {
                bis.close();
            }
            catch (IOException ioe2)
            {
                System.err.println("Failed to close input stream from file "
                                   + entryNamIdx.getName());
            }
        }
    }

    // Only a full pass over the index may be cached, otherwise a
    // transient failure would permanently hide IDs
    if (complete)
        seqIds = ids;

    return Collections.unmodifiableSet(ids);
}
/**
 * <code>getFiles</code> returns the sequence database filenames
 * that were read from the master index when the store was
 * initialised. The names are those recorded in the index; the
 * <code>pathPrefix</code> is not applied to them.
 *
 * @return an unmodifiable <code>Set</code> view of the filenames.
 */
public Set getFiles()
{
    Set readOnlyView = Collections.unmodifiableSet(fileSet);
    return readOnlyView;
}
/**
 * <code>getFormat</code> returns the <code>SequenceFormat</code>
 * supplied when the store was constructed.
 *
 * @return a <code>SequenceFormat</code>.
 */
public SequenceFormat getFormat()
{
    return this.format;
}
/**
 * <code>getSBFactory</code> returns the
 * <code>SequenceBuilderFactory</code> supplied when the store was
 * constructed.
 *
 * @return a <code>SequenceBuilderFactory</code>.
 */
public SequenceBuilderFactory getSBFactory()
{
    return this.factory;
}
/**
 * <code>getSymbolParser</code> returns the
 * <code>SymbolTokenization</code> supplied when the store was
 * constructed.
 *
 * @return a <code>SymbolTokenization</code>.
 */
public SymbolTokenization getSymbolParser()
{
    return this.parser;
}
/**
 * <code>close</code> closes the
 * <code>EntryNamRandomAccess</code> that was opened on the ID
 * index when the store was initialised, so that the resources
 * associated with that file can be released.
 *
 * @exception IOException if closing the random access index
 * fails.
 */
public void close() throws IOException
{
    // All of the work is delegated to the random access reader
    this.entryRandomAccess.close();
}
/**
 * <code>initialise</code> reads the headers of the index files to
 * obtain data about the record sizes and counts, database name
 * and sequence filenames. It then opens a random access file to
 * the ID index for lookups. Failures are reported on
 * <code>System.err</code> and rethrown.
 *
 * @exception IOException if an index file cannot be found or
 * read.
 */
private void initialise() throws IOException
{
    // First get details of file names and numbers from the master
    // index file, then the record layout of the sequence ID index
    readDivisionIndex();
    readEntryIndexHeader();

    // Try to set up random access file. Both input streams used
    // above have been closed by now, so there is nothing to clean
    // up here if this fails.
    try
    {
        // NOTE(review): 300 is presumably the length of the index
        // file header - confirm against EntryNamRandomAccess
        entryRandomAccess = new EntryNamRandomAccess(entryNamIdx,
                                                     300,
                                                     entryRecordLength,
                                                     entryRecordCount);
    }
    catch (FileNotFoundException fnfe)
    {
        System.err.println("Failed to find file "
                           + entryNamIdx.getName());
        // Rethrow
        throw fnfe;
    }
}

/**
 * <code>readDivisionIndex</code> reads the master index, storing
 * the record count, the database name, the file number to
 * filename mapping and a <code>Set</code> of the filenames.
 *
 * @exception IOException if the file cannot be found or read.
 */
private void readDivisionIndex() throws IOException
{
    BufferedInputStream bis = null;

    try
    {
        bis = new BufferedInputStream(new FileInputStream(divisionLkp));
        EmblCDROMIndexReader div = new DivisionLkpReader(bis);

        divRecordCount = div.readRecordCount();
        // The database name is the same in all the index headers
        name = div.readDBName();

        // Store the file number->name mapping
        seqFiles = new HashMap((int) divRecordCount);
        for (long i = 0; i < divRecordCount; i++)
        {
            Object [] divRecord = div.readRecord();

            Integer fileNumber = (Integer) divRecord[0];
            String    fileName = (String) divRecord[1];

            seqFiles.put(fileNumber, fileName);
        }

        // Keep a Set view
        fileSet = new HashSet((int) divRecordCount);
        fileSet.addAll(seqFiles.values());
    }
    catch (FileNotFoundException fnfe)
    {
        System.err.println("Failed to find file "
                           + divisionLkp.getName());
        // Rethrow
        throw fnfe;
    }
    catch (IOException ioe)
    {
        System.err.println("Failed to read file "
                           + divisionLkp.getName());
        // Rethrow
        throw ioe;
    }
    finally
    {
        closeIndexStream(bis, divisionLkp);
    }
}

/**
 * <code>readEntryIndexHeader</code> reads the record length and
 * record count from the header of the sequence ID index.
 *
 * @exception IOException if the file cannot be found or read.
 */
private void readEntryIndexHeader() throws IOException
{
    BufferedInputStream bis = null;

    try
    {
        bis = new BufferedInputStream(new FileInputStream(entryNamIdx));
        EmblCDROMIndexReader ent = new EntryNamIdxReader(bis);

        entryRecordLength = ent.readRecordLength();
        entryRecordCount  = ent.readRecordCount();
    }
    catch (FileNotFoundException fnfe)
    {
        System.err.println("Failed to find file "
                           + entryNamIdx.getName());
        // Rethrow
        throw fnfe;
    }
    catch (IOException ioe)
    {
        System.err.println("Failed to read file "
                           + entryNamIdx.getName());
        // Rethrow
        throw ioe;
    }
    finally
    {
        closeIndexStream(bis, entryNamIdx);
    }
}

/**
 * <code>closeIndexStream</code> closes an index input stream,
 * reporting (but not propagating) any failure to do so. It is
 * called from <code>finally</code> blocks so that the stream is
 * released however the read ended.
 *
 * @param stream a <code>BufferedInputStream</code>, or null if
 * the file was never opened.
 * @param source the <code>File</code> the stream was reading,
 * used only for the error message.
 */
private static void closeIndexStream(BufferedInputStream stream,
                                     File                source)
{
    if (stream == null)
        return;

    try
    {
        stream.close();
    }
    catch (IOException ioe)
    {
        System.err.println("Failed to close input stream from file "
                           + source.getName());
    }
}
}