/**
 ** copyright (c) 2007-2010 Illumina, Inc.
 **
 ** This software is covered by the "Illumina Genome Analyzer Software
 ** License Agreement" and the "Illumina Source Code License Agreement",
 ** and certain third party copyright/licenses, and any user of this
 ** source file is bound by the terms therein (see accompanying files
 ** Illumina_Genome_Analyzer_Software_License_Agreement.pdf and
 ** Illumina_Source_Code_License_Agreement.pdf and third party
 ** copyright/license notices).
 **
 ** This file is part of the Consensus Assessment of Sequence And VAriation
 ** (CASAVA) software package.
 **
 ** \file eland_ms/OligoHashTable.hh
 **
 ** \brief Part of ELAND
 **
 ** Part of ELAND
 **
 ** \author Tony Cox
 **/

#ifndef CASAVA_ELAND_MS_OLIGO_HASH_TABLE_H
#define CASAVA_ELAND_MS_OLIGO_HASH_TABLE_H

#include "ElandDefines.hh"
#include "Hasher.hh"
#include "PartitionHashTable.hh"

namespace casava
{
namespace eland_ms
{
// ELAND stands for Efficient Local Alignment of Nucleotide Data

// Matches oligos to genome data allowing for substitution errors and
// for ambiguity codes

// The basic plan: oligos are divided into 4 contiguous fragments, e.g.
// a 24-mer into 4 fragments of 6 bases - call them ABCD.
// These 4 fragments can be partitioned into (assuming a 24 base oligo)
// a 12 base prefix and a 12 base suffix in 6 different ways:
//
// Prefix  Suffix
// A B     C D
// C D     A B
// A C     B D
// B D     A C
// A D     B C
// B C     A D
//
// The idea is to convert the prefix to a number using 2 bits per base encoding
// and use this number as an index into a look up table.
//
// The look up table gives you the suffixes of all oligos that have the same
// prefix as the oligo you are searching for. These suffixes are then compared
// inexactly with the query suffix by doing an exclusive-OR then using the
// result to index the "suffix score table" - another look up table which
// gives the positions of any differing bases.
//
// The key point is that we can catch all possible 2 substitution error
// matches by checking the 6 combinations above: the worst case scenario is
// that the two errors occur in different fragments - for one of the six
// combinations both these fragments will be in the suffix and so will be
// caught by the inexact matching step.
//
// 'N's in the data (and some 2 base ambiguity codes) can be handled by
// additionally storing an AND mask for each oligo which is applied after
// the exclusive-OR and prior to accessing the suffix score table.
//
// It makes sense to cover the six possible partitions in 3 passes
// through the genome, making use of the fact that the partitions can be
// paired so that the prefix of one is the suffix of another.


// The way in which an oligo matches to the genome data is described by
// one instance of MatchPosition
// one instance of MatchDescriptor
// The match as described by these two quantities may be categorised as
// being in one of seven states:
//
// 1: NM - No Match for this oligo
// 2: UE - Unique Exact match for this oligo
// 3: U1 - Unique match for this oligo with a single base substitution error
// 4: U2 - Unique match for this oligo, having 2 base substitution errors
// 5: RE - Unique Exact match for this oligo
// 6: R1 - Unique match for this oligo with a single base substitution error
// 7: R2 - Unique match for this oligo, having 2 base substitution errors

// OligoHashTable: manage build and scan of the two PartitionHashTables on
// each of the three passes
// pass 0:
// - search for partitions 0 = ABCD and 1 = CDAB
// pass 1:
// - search for partitions 2 = ACBD and 3 = BDAC
// pass 2:
// - search for partitions 4 = ADCB and 5 = CBAD
template< int PASS, int OLIGO_LEN> class OligoHashTable
{
public:
//  OligoHashTable( int oligoLength,
//		  HashTableDataStore<OLIGO_LEN>& htds1,
//		  HashTableDataStore<OLIGO_LEN>& htds2,
//		  const SuffixScoreTable& score, MatchTable& results );
//
//  // buildTable: read oligos from source and generate hash tables
//  // return false if no oligos were hashed
//  bool buildTable( OligoSource& oligos,bool singleseed=true );
//
//  // hashThisOligo: returns true if an oligo should be placed in the hash
//  // table on this pass
//  bool hashThisOligo( OligoNumber oligoNum, const vector<Oligo>& queryMask );
//
//  // scan: scan hash table across a chromosome file stored in the
//  // 2-bits-per-base format generated by squashGenome
//  MatchPosition scan( FileReader& file, const MatchPosition currentBlock );
//
//  // checkOligo: look in the two hash tables for matches against a position
//  // in the genome
//  inline void checkOligo
//  ( const Oligo& ol, const MatchPosition sequencePos );
//
//  void checkOligo
//    ( const vector<Oligo>& v, const MatchPosition starts );
//
//
//
//  // checkManyOligos: fast scan across multiple consecutive positions in the
//  // genome
//  MatchPosition checkManyOligos
//  ( Oligo& ol, const Word* pWord,
//    const Word* pLastWord, MatchPosition seqNum );
//
//  int getNumOligos( void ) const { return numOligos_; }

  // shiftOligo: shift a base off of thisWord and onto ol
  void shiftOligo( Oligo& ol, Word& thisWord ) const
  {
    register Word thisCarry;

    ol.ui[1]<<=numBitsPerBase;
    thisCarry=(ol.ui[0]>>((ElandConstants<OLIGO_LEN>::suffixLength-1)*numBitsPerBase));
    ol.ui[1]|=thisCarry;
    ol.ui[0]<<=numBitsPerBase;
    thisCarry=(thisWord>>((maxBasesPerWord-1)*numBitsPerBase));
    ol.ui[0]|=thisCarry;
    thisWord<<=numBitsPerBase;
    ol.ui[1]&=(~ElandConstants<OLIGO_LEN>::prefixMask);
    ol.ui[0]&=(~ElandConstants<OLIGO_LEN>::suffixMask);
  } // ~shiftOligo

private:

  // numOligos_: on pass 0, buildTable sets this to the number of oligos
  // found in the source
  int numOligos_;

  const int oligoLength_;
  const int prefixLength_;

  const Word prefixMask_;

  // data storage for match scoring table

  // Score of match between suffixes A and B is suffixScore_[ A XOR B ]
  const SuffixScoreTable& score_;

  // data storage for match information for each oligo

  MatchTable& results_;
  //  vector<MatchPosition> matchPosition_;
  //  vector<MatchDescriptor> matchType_;

  // data storage for oligos
  PartitionHashTable<ElandConstants<OLIGO_LEN>::useSplitPrefix, PASS, true> table1_;
  PartitionHashTable<ElandConstants<OLIGO_LEN>::useSplitPrefix, PASS, false> table2_;

  // hasher - templatized
  Hasher<PASS, OLIGO_LEN> hash_;

public:

OligoHashTable
( int oligoLength,
  //  PartitionHashTable<useSplitPrefix>& table1,
  //  PartitionHashTable<useSplitPrefix>& table2,
  HashTableDataStore<ElandConstants<OLIGO_LEN>::useSplitPrefix>& htds1,
  HashTableDataStore<ElandConstants<OLIGO_LEN>::useSplitPrefix>& htds2,
  const SuffixScoreTable& score, MatchTable& results ) :
  numOligos_(-1),
  oligoLength_(oligoLength),
  prefixLength_(oligoLength-16),
  // TC 04.06.03 - changed to work for oligoLength of 32
  //  prefixMask_((1<<(2*prefixLength_))-1),
  // NB new version probably doesn't work for oligoLength of 16!
  prefixMask_( ((Word)~0)
	       >> (numBitsPerBase*(maxBasesPerWord-prefixLength_)) ),
  //  partitionNum_(0),
  score_(score),
  results_(results),
  table1_(htds1, results),
  table2_(htds2, results),
  hash_()
{
  //  assert(oligoLength_>=16);
  //  assert(oligoLength_<=24);

  cerr << "made hash table, pointer table sizes = "
       << table1_.data_.entryPointer_.size() << ", "
       << table2_.data_.entryPointer_.size() << endl;

} // ~OligoHashTable::OligoHashTable


bool hashThisOligo
( OligoNumber oligoNum, const vector<Oligo>& queryMask )
{

  // if mask is empty then nothing to hash (because of QC failure)
  // so obviously return false
  if (queryMask.size()==0)
  {
    results_.setQualityFailed__(oligoNum);
    return false;
  } // ~if
  else
  { // ask MatchTable if it is interested in the results
    return results_.isInterested__( oligoNum,
                                    PASS,
                                    ((queryMask[0].ui[0]!=0)
                                     ||(queryMask[0].ui[1]!=0)));
  } // ~else
} // ~OligoHashTable<PASS>::hashThisOligo


bool buildTable( OligoSource& oligos,
                 const bool single,
                 const bool isCheckHeaders)
{
  table1_.setTable( hash_.getLengthPart1(),
      hash_.getLowerFragSizePart2(),
      hash_.getLowerFragMaskPart2(),
      hash_.getLowerFragScorePart2(score_),
      hash_.getUpperFragScorePart2(score_) );

  table2_.setTable( hash_.getLengthPart2(),
      hash_.getLowerFragSizePart1(),
      hash_.getLowerFragMaskPart1(),
      hash_.getLowerFragScorePart1(score_),
      hash_.getUpperFragScorePart1(score_) );

  //  numOligos_=0;

  const char* pOligo;

  Oligo res, mask;

  //  table1_.pCount_ = &table1_.entryPointer_[2]; - now done in setTable
  //  table2_.pCount_ = &table2_.entryPointer_[2]; - now done in setTable

  OligoNumber oligosFound(0), oligosToHash(0);

  oligos.rewind();

  int readLength = 0;
  vector<int> seedOffsets( 4,0 );

  // determine the oligo length + the placement of the seeds
  if ( (pOligo=oligos.getNextOligoSelect(false,false)) != NULL )
  {
      readLength = strlen(pOligo);
  }

  if( !single )
  {
    if( !readLength)
    {
      // no oligos could be read, assume read length to be OLIGO_LEN
      readLength = OLIGO_LEN;
    }

    vector<int> so(calculateSeedOffsets(OLIGO_LEN,readLength));
    using std::swap; swap(seedOffsets, so);
  }

  // rewind selector
  oligos.rewind();

  /// --------------------------------------------------------
  /// BEGIN MODIFICATIONS MULTISEED CODE
  /// --------------------------------------------------------
  MultiSeedQueryGenerator<OLIGO_LEN> makeOligosFromASCII(single,seedOffsets);
  //QueryGenerator makeOligosFromASCII();

  vector<Oligo> queryOligo, queryMask;
  vector<OligoNumber> queryOligoNum;
  vector<uint> queryCnt;
  MaskMapType maskMap1,maskMap2;

  cerr << "counting oligos" << endl;

  // count occurrences
  while (1)
  {
    if ( (pOligo=oligos.getNextOligoSelect(isCheckHeaders,false)) == NULL ) break;
    //    ett.translate( pOligo, ol, rc );
    ++oligosFound;

    makeOligosFromASCII
      ( pOligo, 0, queryOligo, queryMask, queryOligoNum, queryCnt );

    //  printWord(ol.ui[1],16);cout <<'-';printWord(ol.ui[0],16); cout << endl;

    // if more than 3 exact matches already found, don't search for any more
    //    if ( ( partitionNum_==0)
    // || ( (results_.matchPosition_[oligosFound]<blockRepeat)
    //      && ( ((matchType_[oligosFound].errorType&0x3)!=0)
    //	   || (matchType_[oligosFound].r[0]<=3) ) ) )


    // 29/04/09 markus :
    //
    // the problem here is that even if we have multiple seeds per
    // read, we only take the oligosFound which is the oligoNum for
    // the first seed. So, if we have multiple seeds and the first one
    // is QC, then we do not report QC, but NM (since
    // queryMask.size()>0, because the remaining seeds might not contain Ns)
    //
    // iterate over the vector queryMask and see if queryMask[q] is zero.
    bool qc_flagged = false;
    for( uint q(0);q<queryCnt.size();q++ ) {
      if( queryCnt[q] == 0 ) {
	qc_flagged = true; break;
      }
    }

    if( qc_flagged == false )
      {
	if (hashThisOligo( oligosFound, queryMask )==true)
	  {

	    ++oligosToHash;
	    for ( uint i(0) ; i<queryOligo.size(); i++)
	      {
		hash_(queryMask[i],mask);
		hash_(queryOligo[i],res);

		table1_.sps_.countKey( maskMap1, res.ui[0], mask.ui[0], mask.ui[1] );
		table2_.sps_.countKey( maskMap2, res.ui[1], mask.ui[1], mask.ui[0] );

	      } // ~for
	  } // ~if
      } else
	{
	  vector<Oligo> tmp_v;
	  hashThisOligo( oligosFound, tmp_v );
	}

  } // ~while

  if (numOligos_==-1)
  {
    numOligos_=oligosFound;
    assert(numOligos_<maxNumOligos);
  } // ~if
  else
  {
    if (((int)oligosFound)!=numOligos_)
    {
      cerr << "Error: expecting to read " << numOligos_
	   << " oligos, but found " << oligosFound << endl;
      exit (1);
    } // ~if
  } // ~else

  cerr << "read " << numOligos_ << " oligos from source" << endl;
  cerr << "Will hash " << oligosToHash << " oligos" << endl;

  if (oligosToHash==0)
  {
    cerr << "No oligos to hash, returning" << endl;

    // if results table has not been built, do so - this ensure alignment
    // output always produced, even if all sequences are QC failures
    if (results_.size()==0)
    {
      results_.resize(numOligos_+1);
    } // ~if
    return false;
  } // ~if

  if (PASS==0)
  {
    //    if (results_.matchPosition_.size()==0)
    if (results_.size()==0)
    {
      results_.resize(numOligos_+1);
    } // ~if
    else
    {
      //      assert(((int)results_.size())==numOligos_+1);
    } // ~else
  } // ~if


  cerr << "converting count values to pointer arrays" << endl;

  table1_.makePointerArray(maskMap1);
  table2_.makePointerArray(maskMap2);

  // rewind and fill in tables

  oligos.rewind();

  //  table1_.pCount_--; // now pCount_ == entryPointer_ + 1
  //  table2_.pCount_--; // now pCount2_ == entryPointer2_ + 1


  int j=1;
  while( j<=numOligos_ )
  {
    if ( (pOligo=oligos.getNextOligoSelect(false,false)) == NULL )
    {
      cerr << "Error: expecting " << numOligos_ << " oligos, only found "
	   << j << endl;
      exit (1);
    } // ~if


#ifdef NO_OF_SKIPPED_SEQUENCES
    j+= oligos.getNoSkippedSequences();
#endif


    makeOligosFromASCII
      ( pOligo, j, queryOligo, queryMask, queryOligoNum, queryCnt );

    bool qc_flagged = false;
    for( uint q(0);q<queryCnt.size();q++ ) {
      if( queryCnt[q] == 0 ) {
        qc_flagged = true; break;
      }
    }

    if( qc_flagged == false )
    {
      if (hashThisOligo( j, queryMask) ==true )
      {
        for ( uint i(0) ; i<queryOligo.size(); i++)
        {
          assert(queryOligo.size()!=0);

          hash_(queryMask[i],mask);
          hash_(queryOligo[i],res);

          table1_.sps_.hashEntry( maskMap1, res.ui[0], mask.ui[0], res.ui[1], mask.ui[1], queryOligoNum[i] );
          table2_.sps_.hashEntry( maskMap2, res.ui[1], mask.ui[1], res.ui[0], mask.ui[0], queryOligoNum[i] );

        } // ~for i
      } // ~if
    }
    else {
      // QC flag this oligo
      vector<Oligo> v_tmp;
      hashThisOligo( j,v_tmp );
    }
    j++; // two places to increase j: noOfSkippedSequences + regular increase
  } // ~while j

  maskMap1.clear();
  maskMap2.clear();

  cerr << "completed the hashing part..." << endl;

  //  table1_.pCount_--; // now pCount_ == entryPointer_
  //  table2_.pCount_--; // now pCount2_ == entryPointer2_

  //  exit(0);

  //  table1_.removeRepeatedEntries( results_.matchPosition_ );
  //  table2_.removeRepeatedEntries( results_.matchPosition_ );
  table1_.removeRepeatedEntries( results_ );
  table2_.removeRepeatedEntries( results_ );

  cerr << "successful table build" << endl;
  return true;
} // ~OligoHashTable::buildTable( ifstream& oligoFile )

void checkOligo
( MatchCache& cache, const Oligo& ol, const MatchPosition sequencePos )
{
  //  cout << "co: "<< sequencePos << endl;
  //  exit (-22);
  //  assert(1==0);
  Oligo res;
  hash_(ol, res);

  table1_.sps_.check( cache, res.ui[0], res.ui[1], sequencePos );
  //    ( res.ui[0], res.ui[1], sequencePos, partitionNum_ );
  table2_.sps_.check( cache, res.ui[1], res.ui[0], sequencePos );
} // ~OligoHashTable::checkOligo

bool lessThanF( const pair<Oligo, MatchPosition>& lhs,
                const pair<Oligo, MatchPosition>& rhs )
{
  return (lhs.first.ui[0]<rhs.first.ui[0]);
}

//bool lessThanR( const pair<Oligo, MatchPosition>& lhs,
//                const pair<Oligo, MatchPosition>& rhs )
//{
//  return (lhs.first.ui[1]<rhs.first.ui[1]);
//}


//  struct S
//  {
//    S() {}
//    S(uint a, uint b, Word suff, MatchPosition pos): a_(a), b_(b), pos_(pos), suff_(suff) {}
//    uint a_;
//    uint b_;
//    Word suff_;
//    MatchPosition pos_;
//  }; // ~S

//
//void checkOligo
//( const vector<Oligo>& v, const MatchPosition starts )
//{
//
//  //  cout << "Co: " << v.size() << "!!" << endl;
//  //  vector<Oligo> h(v.size());
//  static vector<pair<Oligo, MatchPosition> > h;
//  //  static vector<S> p;
//  h.resize(v.size());
//  //  p.reserve(v.size());
//  //  p.resize(0);
//
//  for (uint i(0); i<v.size();i++)
//  {
//    hash_(v[i],h[i].first);
//    h[i].second=starts+i;
//  }
//  //    cout << i << " " << h.back().second << " ";
//  //   printWord(h.back().first.ui[0],16); cout << " ";
//  //   printWord(h.back().first.ui[1],16); cout << endl;
//
//  //  uint a, b;
//  sort(h.begin(),h.end(),lessThanF);
//
//  //  for (uint i(0); i<h.size();i++)
//  //  {
//    //    a=table1_.sps_.pCount_[h[i].first.ui[0]];
//    //   b=table1_.sps_.pCount_[h[i].first.ui[0]+1];
//    //    if (a!=b) p.push_back(S(a,b,h[i].first.ui[1],h[i].second));
//    // if (a!=b) table1_.sps_.check__(a,b,h[i].first.ui[1],h[i].second);
//
//  table1_.sps_.check__( h );
//
//    //  }
//
//  //  cout << p.size() << " " << h.size() << " "<< (((double)p.size())/h.size()) << endl;
//  //  for (vector<S>::iterator i(p.begin());i!=p.end();i++)
//  //  {
//  //  table1_.sps_.check__
//  //   ( i->a_, i->b_, i->suff_, i->pos_ );
//  //  }
//  //  p.resize(0);
//
//  Word swap;
//
//  for (uint i(0);i<h.size();i++)
//  {
//    swap=h[i].first.ui[0];
//    h[i].first.ui[0]=h[i].first.ui[1];
//    h[i].first.ui[1]=swap;
//  }
//
//  sort(h.begin(),h.end(),lessThanF);
//
//  //  sort(h.begin(),h.end(),lessThanR);
//
//  table2_.sps_.check__(h);
//  //  for (uint i(0); i<h.size();i++)
//  //  {
//  //   a=table2_.sps_.pCount_[h[i].first.ui[1]];
//  //   b=table2_.sps_.pCount_[h[i].first.ui[1]+1];
//  //  if (a!=b) table2_.sps_.check__(a,b,h[i].first.ui[0],h[i].second);
//    //    if (a!=b) p.push_back(S(a,b,h[i].first.ui[0],h[i].second));
//  //  }
//  //  for (vector<S>::iterator i(p.begin());i!=p.end();i++)
//  //  {
//  //  table2_.sps_.check__
//  //      ( i->a_, i->b_, i->suff_, i->pos_ );
//  //
//    //  }
//
//
//  //  for (uint i(0); i<h.size();i++)
//  //  {
//  //   table1_.sps_.check
//  //     ( h[i].first.ui[0], h[i].first.ui[1], h[i].second );
//  //  }
//
//
//  //  for (uint i(0); i<h.size();i++)
//  //  {
//  //   table2_.sps_.check
//  //      ( h[i].first.ui[1], h[i].first.ui[0], h[i].second );
//  //  }
//
//  //  Oligo res;
//  //  hash_(ol, res);
//
//
//  //  table2_.sps_.check
//  //  ( res.ui[1], res.ui[0], sequencePos );
//
//} // ~OligoHashTable::checkOligo


#ifdef PREFETCH
struct PrefetchStruct
{
  PrefetchStruct( const Word* pWord, const Word* pLastWord ) :
    pWord_(pWord), pLastWord_(pLastWord) {}
  const Word* pWord_;
  const Word* pLastWord_;
};

void* prefetch( void* pv )
{
  PrefetchStruct* pf((PrefetchStruct*)pv);
  const Word* pWord(pf->pWord_), *pLastWord(pf->pLastWord_);
  Word pFetch;
  for (;pWord<pLastWord;pWord+=512)
  {
    pFetch = *pWord;
  } // ~for
  //  delete pf;
  return NULL;
} // ~prefetch
#endif


MatchPosition checkManyOligos
( MatchCache& cache, Oligo& ol, const Word* pWord, const Word* pLastWord, MatchPosition seqNum )
{

    /* register */ Word thisWord; //, thisCarry;

#ifdef PREFETCH
  PrefetchStruct pf(pWord, pLastWord);
  pthread_t prefetchThread;
  pthread_create( &prefetchThread, NULL, prefetch, (void*) &pf );
  pthread_detach( prefetchThread ); // fire and forget
#endif

  seqNum -=(oligoLength_-2);


  //  vector<MatchPosition> n;

  for ( ; pWord != pLastWord ; ++pWord )
  {
    thisWord = *pWord;
    for (int i(0);i<16;i++, seqNum++)
    {

      shiftOligo( ol, thisWord );
#ifdef XXX
      ol.ui[1]<<=numBitsPerBase;
      thisCarry = (ol.ui[0]>>30);
      ol.ui[1] |= thisCarry;
      ol.ui[0]<<=numBitsPerBase;
      thisCarry = (thisWord>>30);
      ol.ui[0]|=thisCarry;
      thisWord<<=numBitsPerBase;
      ol.ui[1]&=prefixMask_;
#endif
      //      n.push_back(seqNum);
        //        n.resize(0);

      checkOligo( cache, ol, seqNum );
    } // ~for i
  } // ~for pWord

  seqNum +=(oligoLength_-2);
#ifdef PREFETCH
  pthread_join( prefetchThread, NULL );
#endif
  return seqNum;


} // ~int OligoHashTable::checkManyOligos


MatchPosition scan( FileReader& file, const MatchPosition currentBlock )
{

  // Delay creating MatchCache to this point so that the caches will still
  // be compatible with the threading scheme.. even though this doesn't
  // appear to be functional right now.
  //
  // If the multi-thread code is dismantled, this could be moved to
  // member data of OligoHashTable
  //
  MatchCache cache(results_);

  MatchPosition seqNum(0), seqLast;
  /* register */ Word thisWord = 0;
  const ValidRegion *pValid(file.getFirstValid()), *pLastValid(file.getLastValid());

  const Word *pWord, *pStart(file.getSeqStart());

  Oligo ol;
#ifdef DEBUG_SCAN
  cout << pLastValid-pValid+1 << " valid regions " << endl;
#endif

  BOOST_ASSERT((currentBlock & blockPositionMask)==0 && "currentBlock must not have any position bits set");

  if( pValid == pLastValid )
  {
      // no valid regions found in the chromosome
      cerr << "No valid regions found in the current chromosome, updating seqNum with the currentBlock!" << endl;
      seqNum += currentBlock;
  }


  for ( ; pValid!=pLastValid; ++pValid )
  {
#ifdef DEBUG_SCAN
    cerr << "got valid region " << pValid->start << " "
	 << pValid->finish << endl;
#endif
    if ((((int)pValid->finish)-((int)pValid->start))+1<oligoLength_)
    {
#ifdef DEBUG
      cout << "region too short, ignoring" << endl;
#endif
      continue;
    }

    seqNum = pValid->start;
    pWord=pStart+(seqNum>>4);
    seqLast = pValid->finish + currentBlock;
    if (seqLast >= blockRepeat)
    {
        BOOST_THROW_EXCEPTION(
            cc::PreConditionException(
                (boost::format("Reference sequence requires more than %d blocks. "
                               "If you are using multiple short reference files, you may get around "
                               "this issue by concatenating them into a single large one. Otherwise, "
                               "please reduce the length of your reference sequence."
                               ) % (blockRepeat >> blockShift)).str())
                );
    }
    //    seqNum += currentBlock;
    //   seqLast = pValid->finish + currentBlock;


    // if necessary, shift first word so that first required base
    // takes up the most significant bits
    if ((seqNum&0xF)!=0)
    {
      thisWord=*(pWord++);
      thisWord <<= ((seqNum&0xF)<<1);

    } // ~if seqNum

    // fill up ol[0] and ol[1]
    for ( ; seqNum < pValid->start + oligoLength_-1 ; ++seqNum )
    {
      //      cout << "start " << seqNum << endl;
      if ((seqNum&0xF)==0)
      {
      	thisWord=*(pWord++);
      	//	printWord(thisWord,16); cout << endl;
      } // ~if seqNum

#ifdef DEBUG_SCAN
      printWord( ol.ui[1], 16 ); cout << "-";
      printWord( ol.ui[0], 16 ); cout << endl;
#endif


      shiftOligo( ol, thisWord );


    } // ~for seqNum
    //    ol[0] &= prefixMask_;

    seqNum += currentBlock;

    // now scan until the next whole Word
    for ( ; ((seqNum <= seqLast)&&((seqNum&0xF)!=0)) ; ++seqNum )
    {
      //   cout << "aligning " << seqNum << endl;
      shiftOligo( ol, thisWord );
#ifdef DEBUG_SCAN
      printWord( ol.ui[1], 16 ); cout << "-";
      printWord( ol.ui[0], 16 ); cout << " - about to check" << endl;
#endif

	// cerr << "scan: " << ol << " " << ol[0] << " " << ol[1]
	// << " " << seqNum << endl;
	//	checkOligo(ol, seqNum );
        checkOligo(cache, ol, seqNum-oligoLength_+2 );

    } // ~for seqNum

    if (seqNum>=seqLast)
    {
#ifdef DEBUG_SCAN
      cout << "end of region" << endl;
#endif

      //      cerr << "B: "
      //   << pValid->start << " "
      //   << pValid->finish << " "
      //   << pValid->finish-pValid->start+1 << " "
      //   << currentBlock << " "
      //   << seqNum << " "
      //   << seqLast << " "
      //   << endl;

      // Bug fix: need to scan one more base if at a word boundary,
      // therefore do not exit loop iteration yet TC - 27.11.03
      //      continue;
      if ((seqNum&0xF)!=0) continue;

    }
    //    assert((seqNum&0xF)==0);


    const Word* pLastWord(pStart+((1+pValid->finish)>>4));
    // finish = 33 -> pLastWord = pStart_ + 2
    // finish = 32 -> pLastWord = pStart_ + 2
    // finish = 31 -> pLastWord = pStart_ + 2
    // finish = 30 -> pLastWord = pStart_ + 1

#ifdef DEBUG_SCAN
    cout << pLastWord-pWord << endl;
#endif

    // now scan word by word
    //    cout << "cmo " << seqNum << " " << pLastWord-pWord << endl;
    seqNum=checkManyOligos(cache, ol, pWord, pLastWord, seqNum );

    const int basesInLast((1+pValid->finish)&0xF);
    // finish = 33 -> basesInLast = 2
    // finish = 32 -> basesInLast = 1
    // finish = 31 -> basesInLast = 0
    // finish = 30 -> basesInLast = 15


    // if required, do the incomplete word at the end
    if (basesInLast!=0)
    {
      thisWord = *pLastWord;

      for (int i(0);i<basesInLast;i++, seqNum++)
      {
          shiftOligo( ol, thisWord );

	//	cerr << "scanend: " << ol << " " << ol[0] << " " << ol[1] << " " << seqNum << endl;
      //   cout << "finito " << seqNum << endl;
	//        checkOligo(ol, seqNum );
          checkOligo(cache, ol, seqNum-oligoLength_+2 );

      } // ~for i

    } // ~if basesInLast

  } // ~for pValid

  return (((seqNum>>blockShift)+1)<<blockShift);

}
};

} //namespace eland_ms
} //namespace casava

#endif // CASAVA_ELAND_MS_OLIGO_HASH_TABLE_H