/***************************************************************************** NewChromsweep.h (c) 2009 - Aaron Quinlan Hall Laboratory Department of Biochemistry and Molecular Genetics University of Virginia aaronquinlan@gmail.com Licenced under the GNU General Public License 2.0 license. ******************************************************************************/ #ifndef NEW_CHROMSWEEP_H #define NEW_CHROMSWEEP_H #include #include "BTlist.h" #include "RecordKeyList.h" #include "RecordKeyVector.h" #include #include #include #include #include "string.h" using namespace std; class Record; class FileRecordMgr; class ContextIntersect; class NewChromSweep { public: NewChromSweep(ContextIntersect *context); virtual ~NewChromSweep(void); virtual bool init(); typedef RecordList recListType; typedef const RecordListNode *recListIterType; virtual bool next(RecordKeyVector &next); // NOTE! You MUST call this method after sweep if you want the // getTotalRecordLength methods to return the whole length of the // record files, rather than just what was used by sweep. void closeOut(bool testChromOrder = false); unsigned long getQueryTotalRecordLength() { return _queryRecordsTotalLength; } unsigned long getDatabaseTotalRecordLength() { return _databaseRecordsTotalLength; } unsigned long getQueryTotalRecords() { return _queryTotalRecords; } unsigned long getDatabaseTotalRecords() { return _databaseTotalRecords; } protected: ContextIntersect *_context; FileRecordMgr *_queryFRM; int _numDBs; //don't really need this stored, but here for code brevity. int _numFiles; //ditto. Just numDBs + num queries, which for now is always 1. vector _dbFRMs; unsigned long _queryRecordsTotalLength; vector _dbFileRecordsLength; //each value in this vector have the //length of all records in the corresponding db file. unsigned long _databaseRecordsTotalLength; unsigned long _queryTotalRecords; unsigned long _databaseTotalRecords; bool _wasInitialized; // a cache of still active features from the database file // typedef enum { BEFORE_QUERY, NEAR_QUERY, AFTER_QUERY, EOF } cacheStatusType; // vector >_caches; vector _caches; // the set of hits in the database for the current query // recListType _hits; // the current query and db features. Record * _currQueryRec; vector _currDbRecs; // a cache of the current chrom from the query. used to handle chrom changes. string _currQueryChromName; string _prevQueryChromName; bool _runToQueryEnd; virtual void masterScan(RecordKeyVector &retList); bool nextRecord(bool query, int dbIdx = -1); //true fetches next query record, false fetches next db record. virtual void scanCache(int dbIdx, RecordKeyVector &retList); virtual void clearCache(int dbIdx); virtual bool chromChange(int dbIdx, RecordKeyVector &retList, bool wantScan); bool dbFinished(int dbIdx); bool intersects(const Record *rec1, const Record *rec2) const; bool allCachesEmpty(); bool allCurrDBrecsNull(); // // members and methods for detecting differently // sorted files without a genome file. // typedef map _orderTrackType; vector<_orderTrackType *> _fileTracks; map _filePrevChrom; bool _lexicoDisproven; //whether we've established that any file ISN'T in lexicographical order bool _lexicoAssumed; //whether we've had to try to guess that any file might be in lexicographical order. string _lexicoAssumedChromName; //which chromosome we had to make that guess for. Used in error reporting. int _lexicoAssumedFileIdx; //which file we had to make the guess for. Also for error reporting. bool _testLastQueryRec; void testChromOrder(const Record *rec); bool queryChromAfterDbRec(const Record *dbRec); int findChromOrder(const Record *rec); bool verifyChromOrderMismatch(const string & chrom, const string &prevChrom, int skipFile); void testThatAllDbChromsExistInQuery(); bool testLexicoQueryAfterDb(const Record *queryRec, const Record *dbRec); }; #endif /* NewChromSweep_H */