/** ** copyright (c) 2007-2010 Illumina, Inc. ** ** This software is covered by the "Illumina Genome Analyzer Software ** License Agreement" and the "Illumina Source Code License Agreement", ** and certain third party copyright/licenses, and any user of this ** source file is bound by the terms therein (see accompanying files ** Illumina_Genome_Analyzer_Software_License_Agreement.pdf and ** Illumina_Source_Code_License_Agreement.pdf and third party ** copyright/license notices). ** ** This file is part of the Consensus Assessment of Sequence And VAriation ** (CASAVA) software package. ** ** \file eland_ms/OligoHashTable.hh ** ** \brief Part of ELAND ** ** Part of ELAND ** ** \author Tony Cox **/ #ifndef CASAVA_ELAND_MS_OLIGO_HASH_TABLE_H #define CASAVA_ELAND_MS_OLIGO_HASH_TABLE_H #include "ElandDefines.hh" #include "Hasher.hh" #include "PartitionHashTable.hh" namespace casava { namespace eland_ms { // ELAND stands for Efficient Local Alignment of Nucleotide Data // Matches oligos to genome data allowing for substitution errors and // for ambiguity codes // The basic plan: oligos are divided into 4 contiguous fragments, e.g. // a 24-mer into 4 fragments of 6 bases - call them ABCD. // These 4 fragments can be partitioned into (assuming a 24 base oligo) // a 12 base prefix and a 12 base suffix in 6 different ways: // // Prefix Suffix // A B C D // C D A B // A C B D // B D A C // A D B C // B C A D // // The idea is to convert the prefix to a number using 2 bits per base encoding // and use this number as an index into a look up table. // // The look up table gives you the suffixes of all oligos that have the same // prefix as the oligo you are searching for. These suffixes are then compared // inexactly with the query suffix by doing an exclusive-OR then using the // result to index the "suffix score table" - another look up table which // gives the positions of any differing bases. // // The key point is that we can catch all possible 2 substitution error // matches by checking the 6 combinations above: the worst case scenario is // that the two errors occur in different fragments - for one of the six // combinations both these fragments will be in the suffix and so will be // caught by the inexact matching step. // // 'N's in the data (and some 2 base ambiguity codes) can be handled by // additionally storing an AND mask for each oligo which is applied after // the exclusive-OR and prior to accessing the suffix score table. // // It makes sense to cover the six possible partitions in 3 passes // through the genome, making use of the fact that the partitions can be // paired so that the prefix of one is the suffix of another. // The way in which an oligo matches to the genome data is described by // one instance of MatchPosition // one instance of MatchDescriptor // The match as described by these two quantities may be categorised as // being in one of seven states: // // 1: NM - No Match for this oligo // 2: UE - Unique Exact match for this oligo // 3: U1 - Unique match for this oligo with a single base substitution error // 4: U2 - Unique match for this oligo, having 2 base substitution errors // 5: RE - Unique Exact match for this oligo // 6: R1 - Unique match for this oligo with a single base substitution error // 7: R2 - Unique match for this oligo, having 2 base substitution errors // OligoHashTable: manage build and scan of the two PartitionHashTables on // each of the three passes // pass 0: // - search for partitions 0 = ABCD and 1 = CDAB // pass 1: // - search for partitions 2 = ACBD and 3 = BDAC // pass 2: // - search for partitions 4 = ADCB and 5 = CBAD template< int PASS, int OLIGO_LEN> class OligoHashTable { public: // OligoHashTable( int oligoLength, // HashTableDataStore& htds1, // HashTableDataStore& htds2, // const SuffixScoreTable& score, MatchTable& results ); // // // buildTable: read oligos from source and generate hash tables // // return false if no oligos were hashed // bool buildTable( OligoSource& oligos,bool singleseed=true ); // // // hashThisOligo: returns true if an oligo should be placed in the hash // // table on this pass // bool hashThisOligo( OligoNumber oligoNum, const vector& queryMask ); // // // scan: scan hash table across a chromosome file stored in the // // 2-bits-per-base format generated by squashGenome // MatchPosition scan( FileReader& file, const MatchPosition currentBlock ); // // // checkOligo: look in the two hash tables for matches against a position // // in the genome // inline void checkOligo // ( const Oligo& ol, const MatchPosition sequencePos ); // // void checkOligo // ( const vector& v, const MatchPosition starts ); // // // // // checkManyOligos: fast scan across multiple consecutive positions in the // // genome // MatchPosition checkManyOligos // ( Oligo& ol, const Word* pWord, // const Word* pLastWord, MatchPosition seqNum ); // // int getNumOligos( void ) const { return numOligos_; } // shiftOligo: shift a base off of thisWord and onto ol void shiftOligo( Oligo& ol, Word& thisWord ) const { register Word thisCarry; ol.ui[1]<<=numBitsPerBase; thisCarry=(ol.ui[0]>>((ElandConstants::suffixLength-1)*numBitsPerBase)); ol.ui[1]|=thisCarry; ol.ui[0]<<=numBitsPerBase; thisCarry=(thisWord>>((maxBasesPerWord-1)*numBitsPerBase)); ol.ui[0]|=thisCarry; thisWord<<=numBitsPerBase; ol.ui[1]&=(~ElandConstants::prefixMask); ol.ui[0]&=(~ElandConstants::suffixMask); } // ~shiftOligo private: // numOligos_: on pass 0, buildTable sets this to the number of oligos // found in the source int numOligos_; const int oligoLength_; const int prefixLength_; const Word prefixMask_; // data storage for match scoring table // Score of match between suffixes A and B is suffixScore_[ A XOR B ] const SuffixScoreTable& score_; // data storage for match information for each oligo MatchTable& results_; // vector matchPosition_; // vector matchType_; // data storage for oligos PartitionHashTable::useSplitPrefix, PASS, true> table1_; PartitionHashTable::useSplitPrefix, PASS, false> table2_; // hasher - templatized Hasher hash_; public: OligoHashTable ( int oligoLength, // PartitionHashTable& table1, // PartitionHashTable& table2, HashTableDataStore::useSplitPrefix>& htds1, HashTableDataStore::useSplitPrefix>& htds2, const SuffixScoreTable& score, MatchTable& results ) : numOligos_(-1), oligoLength_(oligoLength), prefixLength_(oligoLength-16), // TC 04.06.03 - changed to work for oligoLength of 32 // prefixMask_((1<<(2*prefixLength_))-1), // NB new version probably doesn't work for oligoLength of 16! prefixMask_( ((Word)~0) >> (numBitsPerBase*(maxBasesPerWord-prefixLength_)) ), // partitionNum_(0), score_(score), results_(results), table1_(htds1, results), table2_(htds2, results), hash_() { // assert(oligoLength_>=16); // assert(oligoLength_<=24); cerr << "made hash table, pointer table sizes = " << table1_.data_.entryPointer_.size() << ", " << table2_.data_.entryPointer_.size() << endl; } // ~OligoHashTable::OligoHashTable bool hashThisOligo ( OligoNumber oligoNum, const vector& queryMask ) { // if mask is empty then nothing to hash (because of QC failure) // so obviously return false if (queryMask.size()==0) { results_.setQualityFailed__(oligoNum); return false; } // ~if else { // ask MatchTable if it is interested in the results return results_.isInterested__( oligoNum, PASS, ((queryMask[0].ui[0]!=0) ||(queryMask[0].ui[1]!=0))); } // ~else } // ~OligoHashTable::hashThisOligo bool buildTable( OligoSource& oligos, const bool single, const bool isCheckHeaders) { table1_.setTable( hash_.getLengthPart1(), hash_.getLowerFragSizePart2(), hash_.getLowerFragMaskPart2(), hash_.getLowerFragScorePart2(score_), hash_.getUpperFragScorePart2(score_) ); table2_.setTable( hash_.getLengthPart2(), hash_.getLowerFragSizePart1(), hash_.getLowerFragMaskPart1(), hash_.getLowerFragScorePart1(score_), hash_.getUpperFragScorePart1(score_) ); // numOligos_=0; const char* pOligo; Oligo res, mask; // table1_.pCount_ = &table1_.entryPointer_[2]; - now done in setTable // table2_.pCount_ = &table2_.entryPointer_[2]; - now done in setTable OligoNumber oligosFound(0), oligosToHash(0); oligos.rewind(); int readLength = 0; vector seedOffsets( 4,0 ); // determine the oligo length + the placement of the seeds if ( (pOligo=oligos.getNextOligoSelect(false,false)) != NULL ) { readLength = strlen(pOligo); } if( !single ) { if( !readLength) { // no oligos could be read, assume read length to be OLIGO_LEN readLength = OLIGO_LEN; } vector so(calculateSeedOffsets(OLIGO_LEN,readLength)); using std::swap; swap(seedOffsets, so); } // rewind selector oligos.rewind(); /// -------------------------------------------------------- /// BEGIN MODIFICATIONS MULTISEED CODE /// -------------------------------------------------------- MultiSeedQueryGenerator makeOligosFromASCII(single,seedOffsets); //QueryGenerator makeOligosFromASCII(); vector queryOligo, queryMask; vector queryOligoNum; vector queryCnt; MaskMapType maskMap1,maskMap2; cerr << "counting oligos" << endl; // count occurrences while (1) { if ( (pOligo=oligos.getNextOligoSelect(isCheckHeaders,false)) == NULL ) break; // ett.translate( pOligo, ol, rc ); ++oligosFound; makeOligosFromASCII ( pOligo, 0, queryOligo, queryMask, queryOligoNum, queryCnt ); // printWord(ol.ui[1],16);cout <<'-';printWord(ol.ui[0],16); cout << endl; // if more than 3 exact matches already found, don't search for any more // if ( ( partitionNum_==0) // || ( (results_.matchPosition_[oligosFound]0, because the remaining seeds might not contain Ns) // // iterate over the vector queryMask and see if queryMask[q] is zero. bool qc_flagged = false; for( uint q(0);q tmp_v; hashThisOligo( oligosFound, tmp_v ); } } // ~while if (numOligos_==-1) { numOligos_=oligosFound; assert(numOligos_ v_tmp; hashThisOligo( j,v_tmp ); } j++; // two places to increase j: noOfSkippedSequences + regular increase } // ~while j maskMap1.clear(); maskMap2.clear(); cerr << "completed the hashing part..." << endl; // table1_.pCount_--; // now pCount_ == entryPointer_ // table2_.pCount_--; // now pCount2_ == entryPointer2_ // exit(0); // table1_.removeRepeatedEntries( results_.matchPosition_ ); // table2_.removeRepeatedEntries( results_.matchPosition_ ); table1_.removeRepeatedEntries( results_ ); table2_.removeRepeatedEntries( results_ ); cerr << "successful table build" << endl; return true; } // ~OligoHashTable::buildTable( ifstream& oligoFile ) void checkOligo ( MatchCache& cache, const Oligo& ol, const MatchPosition sequencePos ) { // cout << "co: "<< sequencePos << endl; // exit (-22); // assert(1==0); Oligo res; hash_(ol, res); table1_.sps_.check( cache, res.ui[0], res.ui[1], sequencePos ); // ( res.ui[0], res.ui[1], sequencePos, partitionNum_ ); table2_.sps_.check( cache, res.ui[1], res.ui[0], sequencePos ); } // ~OligoHashTable::checkOligo bool lessThanF( const pair& lhs, const pair& rhs ) { return (lhs.first.ui[0]& lhs, // const pair& rhs ) //{ // return (lhs.first.ui[1]& v, const MatchPosition starts ) //{ // // // cout << "Co: " << v.size() << "!!" << endl; // // vector h(v.size()); // static vector > h; // // static vector p; // h.resize(v.size()); // // p.reserve(v.size()); // // p.resize(0); // // for (uint i(0); i::iterator i(p.begin());i!=p.end();i++) // // { // // table1_.sps_.check__ // // ( i->a_, i->b_, i->suff_, i->pos_ ); // // } // // p.resize(0); // // Word swap; // // for (uint i(0);i::iterator i(p.begin());i!=p.end();i++) // // { // // table2_.sps_.check__ // // ( i->a_, i->b_, i->suff_, i->pos_ ); // // // // } // // // // for (uint i(0); ipWord_), *pLastWord(pf->pLastWord_); Word pFetch; for (;pWord n; for ( ; pWord != pLastWord ; ++pWord ) { thisWord = *pWord; for (int i(0);i<16;i++, seqNum++) { shiftOligo( ol, thisWord ); #ifdef XXX ol.ui[1]<<=numBitsPerBase; thisCarry = (ol.ui[0]>>30); ol.ui[1] |= thisCarry; ol.ui[0]<<=numBitsPerBase; thisCarry = (thisWord>>30); ol.ui[0]|=thisCarry; thisWord<<=numBitsPerBase; ol.ui[1]&=prefixMask_; #endif // n.push_back(seqNum); // n.resize(0); checkOligo( cache, ol, seqNum ); } // ~for i } // ~for pWord seqNum +=(oligoLength_-2); #ifdef PREFETCH pthread_join( prefetchThread, NULL ); #endif return seqNum; } // ~int OligoHashTable::checkManyOligos MatchPosition scan( FileReader& file, const MatchPosition currentBlock ) { // Delay creating MatchCache to this point so that the caches will still // be compatible with the threading scheme.. even though this doesn't // appear to be functional right now. // // If the multi-thread code is dismantled, this could be moved to // member data of OligoHashTable // MatchCache cache(results_); MatchPosition seqNum(0), seqLast; /* register */ Word thisWord = 0; const ValidRegion *pValid(file.getFirstValid()), *pLastValid(file.getLastValid()); const Word *pWord, *pStart(file.getSeqStart()); Oligo ol; #ifdef DEBUG_SCAN cout << pLastValid-pValid+1 << " valid regions " << endl; #endif BOOST_ASSERT((currentBlock & blockPositionMask)==0 && "currentBlock must not have any position bits set"); if( pValid == pLastValid ) { // no valid regions found in the chromosome cerr << "No valid regions found in the current chromosome, updating seqNum with the currentBlock!" << endl; seqNum += currentBlock; } for ( ; pValid!=pLastValid; ++pValid ) { #ifdef DEBUG_SCAN cerr << "got valid region " << pValid->start << " " << pValid->finish << endl; #endif if ((((int)pValid->finish)-((int)pValid->start))+1start; pWord=pStart+(seqNum>>4); seqLast = pValid->finish + currentBlock; if (seqLast >= blockRepeat) { BOOST_THROW_EXCEPTION( cc::PreConditionException( (boost::format("Reference sequence requires more than %d blocks. " "If you are using multiple short reference files, you may get around " "this issue by concatenating them into a single large one. Otherwise, " "please reduce the length of your reference sequence." ) % (blockRepeat >> blockShift)).str()) ); } // seqNum += currentBlock; // seqLast = pValid->finish + currentBlock; // if necessary, shift first word so that first required base // takes up the most significant bits if ((seqNum&0xF)!=0) { thisWord=*(pWord++); thisWord <<= ((seqNum&0xF)<<1); } // ~if seqNum // fill up ol[0] and ol[1] for ( ; seqNum < pValid->start + oligoLength_-1 ; ++seqNum ) { // cout << "start " << seqNum << endl; if ((seqNum&0xF)==0) { thisWord=*(pWord++); // printWord(thisWord,16); cout << endl; } // ~if seqNum #ifdef DEBUG_SCAN printWord( ol.ui[1], 16 ); cout << "-"; printWord( ol.ui[0], 16 ); cout << endl; #endif shiftOligo( ol, thisWord ); } // ~for seqNum // ol[0] &= prefixMask_; seqNum += currentBlock; // now scan until the next whole Word for ( ; ((seqNum <= seqLast)&&((seqNum&0xF)!=0)) ; ++seqNum ) { // cout << "aligning " << seqNum << endl; shiftOligo( ol, thisWord ); #ifdef DEBUG_SCAN printWord( ol.ui[1], 16 ); cout << "-"; printWord( ol.ui[0], 16 ); cout << " - about to check" << endl; #endif // cerr << "scan: " << ol << " " << ol[0] << " " << ol[1] // << " " << seqNum << endl; // checkOligo(ol, seqNum ); checkOligo(cache, ol, seqNum-oligoLength_+2 ); } // ~for seqNum if (seqNum>=seqLast) { #ifdef DEBUG_SCAN cout << "end of region" << endl; #endif // cerr << "B: " // << pValid->start << " " // << pValid->finish << " " // << pValid->finish-pValid->start+1 << " " // << currentBlock << " " // << seqNum << " " // << seqLast << " " // << endl; // Bug fix: need to scan one more base if at a word boundary, // therefore do not exit loop iteration yet TC - 27.11.03 // continue; if ((seqNum&0xF)!=0) continue; } // assert((seqNum&0xF)==0); const Word* pLastWord(pStart+((1+pValid->finish)>>4)); // finish = 33 -> pLastWord = pStart_ + 2 // finish = 32 -> pLastWord = pStart_ + 2 // finish = 31 -> pLastWord = pStart_ + 2 // finish = 30 -> pLastWord = pStart_ + 1 #ifdef DEBUG_SCAN cout << pLastWord-pWord << endl; #endif // now scan word by word // cout << "cmo " << seqNum << " " << pLastWord-pWord << endl; seqNum=checkManyOligos(cache, ol, pWord, pLastWord, seqNum ); const int basesInLast((1+pValid->finish)&0xF); // finish = 33 -> basesInLast = 2 // finish = 32 -> basesInLast = 1 // finish = 31 -> basesInLast = 0 // finish = 30 -> basesInLast = 15 // if required, do the incomplete word at the end if (basesInLast!=0) { thisWord = *pLastWord; for (int i(0);i>blockShift)+1)<