/** ** copyright (c) 2007-2010 Illumina, Inc. ** ** This software is covered by the "Illumina Genome Analyzer Software ** License Agreement" and the "Illumina Source Code License Agreement", ** and certain third party copyright/licenses, and any user of this ** source file is bound by the terms therein (see accompanying files ** Illumina_Genome_Analyzer_Software_License_Agreement.pdf and ** Illumina_Source_Code_License_Agreement.pdf and third party ** copyright/license notices). ** ** This file is part of the Consensus Assessment of Sequence And VAriation ** (CASAVA) software package. ** ** \file eland_ms/QueryGenerator.hh ** ** \brief Part of ELAND ** ** Part of ELAND ** ** \author Tony Cox **/ #ifndef CASAVA_ELAND_MS_QUERY_GENERATOR_H #define CASAVA_ELAND_MS_QUERY_GENERATOR_H #include "ElandDefines.hh" #include "ReverseShifter.hh" namespace casava { namespace eland_ms { // Oligo format, e.g. for 21 mers, 1 character = 1 bit: // ui0.............................ui1............................. // uc0.....uc1.....uc2.....uc3.....uc4.....uc5.....uc6.....uc7..... // 2.1.1.1.1.1.1.1.1.1.1.9.8.7.6.5.4.3.2.1.0. <-base numbers 0 20 // 0 9 8 7 6 5 4 3 2 1 0 // QueryGenerator: generates a set of query sequences for an oligo, allowing // for: // i) reverse complements // ii) Ns // iii) 2 base ambiguity codes (TBD) template class QueryGenerator { public: QueryGenerator() {} // // generate a set of query sequences from ASCII sequence data // int operator()( const char* buf, const uint oligoNum, // vector& queryOligo, // vector& queryMask, // vector& queryOligoNum ); // // // convert a single oligo from ASCII to binary. // // Fail if the oligo contains Ns or other funny stuff // void operator()( const char* buf, Oligo& oligo ); // // // convert an oligo from ASCII to binary and produce a // // mask for the Ns - return number of Ns found // int encodeOligo( const char* buf, Oligo& oligo, Oligo& mask ); void reverseOligo( const Oligo& ol, Oligo& rc ) const { rc.uc[0]=reverseChar[ol.uc[7]]; rc.uc[1]=reverseChar[ol.uc[6]]; rc.uc[2]=reverseChar[ol.uc[5]]; rc.uc[3]=reverseChar[ol.uc[4]]; rc.uc[4]=reverseChar[ol.uc[3]]; rc.uc[5]=reverseChar[ol.uc[2]]; rc.uc[6]=reverseChar[ol.uc[1]]; rc.uc[7]=reverseChar[ol.uc[0]]; reverseShift_( ol, rc ); } // ~reverseOligo protected: char tempBuf1[OLIGO_LEN+1]; char tempBuf2[OLIGO_LEN+2]; ReverseShifter::prefixLength-ElandConstants::suffixLength, OLIGO_LEN> reverseShift_; // int convert_( const char* buf, const uint oligoNum, // vector& queryOligo, // vector& queryMask, // vector& queryOligoNum, const short& seed_no ); public: // convert an oligo from ASCII to binary and produce a // mask for the Ns - return number of Ns found // template< int oligoLength > int encodeOligo( const char* buf, Oligo& oligo, Oligo& mask ) { int numNs(0); bool isN; for (int i(0);i::prefixLength;++i, ++buf) { isN=isBlank(*buf); // isN=( (*buf=='n')||(*buf=='N') ); oligo.ui[1]<<=numBitsPerBase; oligo.ui[1]|=(isN?0:whichBase[(uint)*buf]); mask.ui[1]<<=numBitsPerBase; mask.ui[1]|=0x3*isN; numNs+=1*isN; } // ~for i for (int i(0);i::suffixLength;i++, ++buf) // was max bases per word { isN=isBlank(*buf); // isN=( (*buf=='n')||(*buf=='N') ); oligo.ui[0]<<=numBitsPerBase; oligo.ui[0]|=(isN?0:whichBase[(uint)*buf]); mask.ui[0]<<=numBitsPerBase; mask.ui[0]|=0x3*isN; numNs+=1*isN; } // ~for i return numNs; } // ~int encodeOligo( const char* buf, Oligo& oligo, Oligo& mask ) // convert a single oligo from ASCII to binary. // Fail if the oligo contains Ns or other funny stuff // template< int oligoLength > void operator()( const char* buf, Oligo& oligo ) { Oligo dummy; oligo.ui[0]=0; oligo.ui[1]=0; if (encodeOligo( buf, oligo, dummy)!=0) { cerr << "Error: did not expect to find Ns in oligo!" << endl; } } // ~void operator()( const char* buf, Oligo& oligo ) #ifdef ORIGINAL_CODE // generate a set of query sequences from ASCII sequence data // template< int oligoLength > int QueryGenerator::operator() ( const char* buf, const uint oligoNum, vector& queryOligo, vector& queryMask, vector& queryOligoNum ) { int numNs(0), tailSize(0), headSize(0); // char tempBuf1[OLIGO_LENGTH+1], tempBuf2[OLIGO_LENGTH+1]; tempBuf1[OLIGO_LENGTH]='\0'; tempBuf2[OLIGO_LENGTH]='\0'; queryOligo.clear(); queryMask.clear(); queryOligoNum.clear(); // cout << endl << buf << " - initial " << endl; for (int i(0);i4) // { // cout << "Too many Ns, rejecting" << endl; // return numNs; // } // ~if // while ( ((buf[headSize]=='N')||(buf[headSize]=='n')) // &&(headSizetail for ( int i(0);i2) ||((numInternalNs==2)&&(tailSize!=0)) ||((numInternalNs==1)&&(tailSize>fragLengthA)) ||((numInternalNs==0)&&(tailSize>fragLengthA+fragLengthD))) { // cout << "Too many separate Ns, rejecting" << endl; return numNs; } // OK, output oligo as is // cout << tempBuf1 << " - shifted " << endl; queryOligo.push_back( Oligo() ); queryMask.push_back( Oligo() ); queryOligoNum.push_back(oligoNum); assert( numNs==encodeOligo(tempBuf1, queryOligo.back(), queryMask.back())); // printWord(queryOligo.back().ui[1],prefixLength_); // printWord(queryOligo.back().ui[0],16); // cout << " - encoded oligo" << endl; // printWord(queryMask.back().ui[1],prefixLength_); // printWord(queryMask.back().ui[0],16); // cout << " - encoded mask" << endl; #ifdef ALLOW_N_TO_BE_DELETION // if >=1 error, copy removing first N, send to output list if (numInternalNs>=1) { int i, firstN, secondN; tempBuf2[OLIGO_LENGTH-1]='n'; // for (i=0;((i> ((maxBasesPerWord-prefixLength_)<<1) ); // Changed to match new oligo format TC 05.08.03 queryMask.back().ui[0] ^= ( ((uint)~0) >> ((maxBasesPerWord-suffixLength)<<1) ); queryMask.back().ui[1] ^= ( ((uint)~0) >> ((maxBasesPerWord-prefixLength)<<1) ); } // ~if else { // self-complementary queryOligo.pop_back(); queryMask.pop_back(); queryOligoNum.pop_back(); // cout << oligoNum << " is self-complementary" << endl; } // ~else } // ~for #endif // ~ifndef DONT_SEARCH_REVERSE_STRAND return numNs; } // ~int QueryGenerator::operator() #endif // ORIGINAL_CODE // generate a set of query sequences from ASCII sequence data // template< int oligoLength > int operator() ( const char* buf, const uint oligoNum, vector& queryOligo, vector& queryMask, vector& queryOligoNum ) { // set the stage queryOligo.clear(); queryMask.clear(); queryOligoNum.clear(); // _convert now encapsulates the functionality below return convert_( buf, oligoNum, queryOligo, queryMask, queryOligoNum,0 ); } // ~int QueryGenerator::operator() protected: // private method that encapsulates the main functionality of converting // buf into oligoNum int convert_( const char* buf, const uint oligoNum, vector& queryOligo, vector& queryMask, vector& queryOligoNum, const short& seed_no ) { // the following should be encapsulated int numNs(0), tailSize(0), headSize(0); // char tempBuf1[OLIGO_LENGTH+1], tempBuf2[OLIGO_LENGTH+1]; tempBuf1[OLIGO_LEN]='\0'; tempBuf2[OLIGO_LEN]='\0'; // cout << endl << buf << " - initial " << endl; for (int i(0);i4) // { // cout << "Too many Ns, rejecting" << endl; // return numNs; // } // ~if // while ( ((buf[headSize]=='N')||(buf[headSize]=='n')) // &&(headSizetail for ( int i(0);i2) ||((numInternalNs==2)&&(tailSize!=0)) ||((numInternalNs==1)&&(tailSize>ElandConstants::fragLengthA)) ||((numInternalNs==0)&&(tailSize>ElandConstants::fragLengthA+ElandConstants::fragLengthD))) { // cout << "Too many separate Ns, rejecting" << endl; return numNs; } // OK, output oligo as is // cout << tempBuf1 << " - shifted " << endl; queryOligo.push_back( Oligo() ); queryMask.push_back( Oligo() ); queryOligoNum.push_back( ((oligoNum|seed_bits[seed_no])) ); assert( numNs==encodeOligo(tempBuf1, queryOligo.back(), queryMask.back())); // printWord(queryOligo.back().ui[1],prefixLength_); // printWord(queryOligo.back().ui[0],16); // cout << " - encoded oligo" << endl; // printWord(queryMask.back().ui[1],prefixLength_); // printWord(queryMask.back().ui[0],16); // cout << " - encoded mask" << endl; #ifdef ALLOW_N_TO_BE_DELETION // if >=1 error, copy removing first N, send to output list if (numInternalNs>=1) { int i, firstN, secondN; tempBuf2[OLIGO_LEN-1]='n'; // for (i=0;((i> ((maxBasesPerWord-prefixLength_)<<1) ); // Changed to match new oligo format TC 05.08.03 queryMask.back().ui[0] ^= ( ((uint)~0) >> ((maxBasesPerWord-ElandConstants::suffixLength)<<1) ); queryMask.back().ui[1] ^= ( ((uint)~0) >> ((maxBasesPerWord-ElandConstants::prefixLength)<<1) ); } // ~if else { // self-complementary queryOligo.pop_back(); queryMask.pop_back(); queryOligoNum.pop_back(); // cout << oligoNum << " is self-complementary" << endl; } // ~else } // ~for #endif // ~ifndef DONT_SEARCH_REVERSE_STRAND return numNs; } }; // ====================================================================== // ========================== MULTI-SEED ================================ // ====================================================================== // now let's create a multi-seed query generator template class MultiSeedQueryGenerator : public QueryGenerator { public: // c'tor MultiSeedQueryGenerator(bool single, const vector& seedOffsets ):single_(single),seedOffsets_( seedOffsets ) { // nothing to do } // d'tor ~MultiSeedQueryGenerator(void) { // nothing to do so far } // generate a set of query sequences from ASCII sequence data int operator()( const char* buf, const uint oligoNum, vector& queryOligo, vector& queryMask, vector& queryOligoNum, vector& queryCnt ) { queryOligo.clear(); queryMask.clear(); queryOligoNum.clear(); queryCnt.clear(); // by default we pack as many seeds into the read as possible if // single_ is true (ie the user specified only one seed per // read, set no_of_seeds accordingly int startIdx = 1; uint no_of_seeds = 4; if( single_ == true ) { no_of_seeds = 1; startIdx = 0; } // in the multiseed phase, take overlapping seeds to be as // sensitive as possible; if we are in multiseed mode, we know // that the first 32bp either did not match (NM), or are a // hypermatch (255:255:255). Therefore, start the first seed at // base 16. // // set up an array with the starting positions for all the // seeds; consider those starting positions when inserting the // matches into StateMachine if( single_ == false ) { assert( no_of_seeds == seedOffsets_.size() ); no_of_seeds = 4; } int total_count = 0; // create a set of vectors for each vector to keep the // intermediate results // // the reason for not passing on queryOligo, queryMask or // queryOligoNum is that in the convert_ method we iterate over // the entire vector (create, for instance, the reverse // oligos,...) // cut the buffer into different seeds for(uint i=startIdx;i tmp_queryOligo; vector tmp_queryMask; vector tmp_queryOligoNum; char seed_buf[1024]; strncpy( seed_buf,(buf+seedOffsets_[i]),1024 ); total_count += QueryGenerator::convert_( seed_buf, oligoNum, tmp_queryOligo, tmp_queryMask, tmp_queryOligoNum,(short)i ); // append the temporary vectors queryOligo.insert( queryOligo.end(),tmp_queryOligo.begin(),tmp_queryOligo.end() ); queryMask.insert( queryMask.end(),tmp_queryMask.begin(),tmp_queryMask.end() ); queryOligoNum.insert( queryOligoNum.end(),tmp_queryOligoNum.begin(),tmp_queryOligoNum.end() ); queryCnt.push_back( tmp_queryMask.size() ); } // put everything together return total_count; } private: bool single_; vector seedOffsets_; }; } //namespace eland_ms } //namespace casava #endif // CASAVA_ELAND_MS_QUERY_GENERATOR_H