/** ** Copyright (c) 2007-2010 Illumina, Inc. ** ** This software is covered by the "Illumina Genome Analyzer Software ** License Agreement" and the "Illumina Source Code License Agreement", ** and certain third party copyright/licenses, and any user of this ** source file is bound by the terms therein (see accompanying files ** Illumina_Genome_Analyzer_Software_License_Agreement.pdf and ** Illumina_Source_Code_License_Agreement.pdf and third party ** copyright/license notices). ** ** This file is part of the Consensus Assessment of Sequence And VAriation ** (CASAVA) software package. ** ** \file phageAlign.cpp ** ** \brief Phage align. ** ** Phage align. ** ** \author Tony Cox **/ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "common/MathCompatibility.hh" #include "alignment/GlobalUtilities.hh" #include "common/Alignment.hh" #include "common/Exceptions.hh" #include "alignment/MatchDescriptor.hh" #define SWITCHED_TO_EXPORT_OUTPUT 1 // Simple program to match sequences against a small genome allowing for an // arbitrary number of errors using namespace std; typedef unsigned int uint; //const uint lineLength(50); //const uint bufSize(128); //char oligoBuffer[bufSize]; //const uint maxSeqSize(60); //const uint numBases(4); // Assume will encounter A, C, G, T or '.' in real sequence const uint numReadableBases(numDifferentBases+1); const int veryBadScore(-99999999); const short scoreNotSet=-20000; typedef short ScoreTable[maxSeqSize][numReadableBases][numDifferentBases]; // +20 because title line can overflow sequence line length // Comment out following line to disable support for circularized genome #define CIRCULARIZE // Uncomment out following line to only search forward strand //#define DONT_SEARCH_REVERSE_STRAND // Uncomment next line to scan for indels as well as substitution errors //#define CHECK_FOR_INDELS char titleBuf[256]; // NB scores are stored as shorts, so there will be problems // if max(defaultMatch, defaultMismatch)*seqSize>32767. // defaultMatch=500 OK for read lengths<=65 // Assumed scoring scheme= 100*log10(odds of outcome); const short defaultMatch=500; const short defaultMismatch=-547; // defaultMatch=500 is equivalent to correct:wrong=100000:1 // assuming the three possible wrong bases for any sequenced base // are equiprobable, then the corresponding defaultMismatch is // 100*log10(1/300000)=-547 const short defaultBlank=-47; // "-47" comes from 100*log10(1/3) - log odds score if all four bases // are equally likely to be sequenced as a blank struct GenomeFrag { GenomeFrag( const char* seq, const int pos ) : seq_(seq), pos_(pos) {} const char* seq_; uint pos_; }; // ~GenomeFrag bool lessThanGenomeFrag( const GenomeFrag& a, const GenomeFrag& b ) { return (strcmp(a.seq_, b.seq_)<0); } // ~bool lessThanGenomeFrag const uint reverseFlag(((uint)1)<<31); // Sort GenomeFrags using a least-significant-digit first radix sort void mySort( vector& v, const int seqSize ) { // Written somewhat quickly, probably does more copying of data than // it needs to vector vtemp(v); vector v1, v2; for (uint i(0);i > m; for (int i(seqSize-1);i>=0;i--) { // bin entries v1 by seqSize-th digit for (uint j(0); j >::iterator i(m.begin());i!=m.end();++i) { if (!i->second.empty()) { v2.insert(v2.end(),i->second.begin(),i->second.end()); i->second.clear(); } // ~if } // ~for v1=v2; } // ~for // Now overwrite v with the finished product for (uint i(0);i ScoreByBase; typedef vector ScoreByCycle; typedef vector ScoreBySeq; struct QuerySeqs { QuerySeqs( int seqSize ) : scores_( seqSize+1, ScoreByCycle(4) ) {} void addSeq( const OligoSource& oligos, const ScoreSource& scores, const uint seqSize ); ScoreBySeq scores_; vector maxScore_; }; // ~struct QuerySeqs void QuerySeqs::addSeq(const OligoSource& , const ScoreSource& scores, const uint seqSize) { BaseScore t1,t2; // strncpy( data_, oligos.getLastOligo(), seqSize ); // for (uint i(0); it1) t1=t2; } // ~for j maxScore_.back()+=t1; // cout << i << '\t' << t1; for (uint j(0); j bestPos_; }; // ~struct QueryResults struct QuerySeq { QuerySeq( const OligoSource& oligos, const ScoreSource& scores, const uint seqSize, const uint seqNum); char data_[maxSeqSize]; }; // ~struct QuerySeq QuerySeq::QuerySeq( const OligoSource& oligos, const ScoreSource& , const uint seqSize, const uint ) { strncpy( data_, oligos.getLastOligo(), seqSize ); data_[seqSize]='\0'; } // ~QuerySeq::c'tor bool lessThanQuerySeq( const QuerySeq& a, const QuerySeq& b ) { return (strcmp(a.data_, b.data_)<0); } // ~bool lessThanQuerySeq // returns first position at which a and b differ (or seqSize if identical) int firstDiff( const QuerySeq& a, const QuerySeq& b, const int seqSize ) { int i(0); for (;i& frags, vector& tempFragStore) { OligoSource* pTags(OligoSource::getOligoSource(tagFileName)); cerr << "Opened tag file " << tagFileName << endl; #ifndef SWITCHED_TO_EXPORT_OUTPUT printf( "#TAG_FILE %s\n", tagFileName ); #endif const char* tagBuf; while((tagBuf=pTags->getNextOligo())!=NULL) { tempFragStore.push_back( Seq() ); if (strlen(tagBuf) frags; vector tempFragStore; vector fragStore; char* buf(NULL); char* bufR(NULL); char* tagFileName(NULL); char* genomeFileName(NULL); if (strcmp( args[thisArg], "-tag")==0) { tagFileName=args[++thisArg]; } // ~if else { genomeFileName=args[thisArg]; } // ~else uint maxNumBlanks(atoi(args[++thisArg])); cerr << "Will ignore sequences containing more than " << maxNumBlanks << " blanks" << endl; #ifndef SWITCHED_TO_EXPORT_OUTPUT printf( "#MAX_BLANKS %d\n", maxNumBlanks ); #endif uint seqSize =atoi(args[++thisArg]); cerr << "Assuming sequence length of " << seqSize << " bases" << endl; #ifndef SWITCHED_TO_EXPORT_OUTPUT printf( "#SEQ_LENGTH %d\n", seqSize ); #endif // assert(seqSize=maxSeqSize)) { cerr << "Error: specified sequence size (" << seqSize << ") is outside valid range (1 to " << maxSeqSize-1 << ")" << endl; exit (1); } std::string chrName; if (tagFileName!=NULL) { // sequences are matched against a set of tags // tempFragStore contains the actual strings - frags just points to them. storeTags(tagFileName, seqSize, frags, tempFragStore); chrName = boost::filesystem::path(tagFileName).leaf(); } // ~if else { // assume target sequence is a single entry FASTA format file assert(genomeFileName!=NULL); FILE* pGenome; if ((pGenome=fopen(genomeFileName, "r" ))==NULL) { cerr << "Error: could not open genome file " << args[1] << endl; exit (1); } // ~if cerr << "Opened genome file " << genomeFileName << endl; #ifndef SWITCHED_TO_EXPORT_OUTPUT printf( "#GENOME_FILE %s\n", genomeFileName ); #endif chrName = boost::filesystem::path(genomeFileName).leaf(); int numRead(0); // read comment line; assert(fgets(titleBuf, 256, pGenome)!=NULL); assert(titleBuf[0]=='>'); titleBuf[strlen(titleBuf)-1]='\0'; if (titleBuf[strlen(titleBuf)-1]=='\r') { cerr << "ERROR: genome file " << genomeFileName << " is a DOS format text file!!\n"; exit (-1); } // if // Create a buffer to hold the genome data numRead=ftell(pGenome); fseek(pGenome,0,SEEK_END); genomeSize=ftell(pGenome)-numRead+seqSize; cerr << "Creating buffer of size " << genomeSize << " bytes" << endl; buf=new char[genomeSize]; char* p(buf); fseek(pGenome,numRead,SEEK_SET); genomeSize=0; while (fgets(p, 256, pGenome)!=NULL) { numRead=strlen(p)-1; // -1 ignores carriage return at end p+=numRead; genomeSize+=numRead; } // ~while fclose(pGenome); cerr << "Read in genome " << titleBuf << " of size " << genomeSize << endl; #ifdef CIRCULARIZE for (uint i(0);i" << genomeSize-seqSize+1 << " cross cut point" << endl; genomeSize+=seqSize-1; #endif *p++='\0'; frags.reserve(genomeSize-seqSize+1); bool isValid; for (uint i(0);i<=genomeSize-seqSize; i++) { // don't match if fragment contains non-base characters, e.g. Ns isValid=true; for (uint j(seqSize); ((j!=0)&&isValid);) { --j; isValid=(realBaseCodes[(uint)*(buf+i+j)]!=nc); } // ~for if (isValid) { frags.push_back( GenomeFrag( buf+i, i)); } // ~if } // ~if #ifndef DONT_SEARCH_REVERSE_STRAND frags.reserve(2*frags.size()); bufR = new char[genomeSize+1]; bufR[genomeSize]='\0'; // reverse complement the genome not the queries for (int i(0);i sequenceVec; vector seqs; QuerySeqs seqs_(seqSize); for (int i(thisArg); i(pOligosRaw); if (pScoresRaw!=NULL) { #ifndef SWITCHED_TO_EXPORT_OUTPUT printf ("#SCORE_FILE %s\n", args[i]); #endif scoreFilter.link(pScoresRaw); oligoFilter.link(pOligosRaw); pScores=&scoreFilter; pOligos=&oligoFilter; } // ~if else { // pScoresRaw=new ScoreSourceBasic pScoresRaw=new ScoreSourceBasic ( *pOligos, defaultMatch, defaultBlank, defaultMismatch ); pScores=pScoresRaw; oligoFilter.link(pOligosRaw); pOligos=&oligoFilter; #ifndef SWITCHED_TO_EXPORT_OUTPUT printf ("#SCORE_FILE none\n"); #endif } // ~else } // ~else // pScoresRaw and pOligosRaw never accessed directly from now on // ...until they are no longer needed and are deleted bool isValid(true); casava::common::Sequence sequence(pOligos->getNextSequence(isValid)); while (isValid) { if (sequence.getData().size() < seqSize) { cerr << "ERROR: all query sequences in file " << args[i] << " need to be at least " << seqSize << " bases long, but sequence " << sequence.getData() << " (seq num " << seqNum + 1 << ") is only " << sequence.getData().size() << " bases long." << endl; exit (-1); } // ~if std::string sequenceData = sequence.getData(); std::replace(sequenceData.begin(), sequenceData.end(), '.', 'N'); sequence.setData(sequenceData); sequenceVec.push_back(sequence); seqs_.addSeq( *pOligos, *pScores, seqSize ); seqs.push_back( QuerySeq( *pOligos, *pScores, seqSize, seqNum++)); // cout << seqs[seqNum-1].num_ << " " << seqs[seqNum-1].maxScore_ // << seqs[seqNum-1].data_ << endl; #ifdef DEBUG cout << pOligos->getLastOligo() << endl; for (int i(0) ; igetLastOligo()[i] << endl; for (int j(0);j<4;j++) { cout << baseNames[j] << " " << scoreTable[i] [baseCodes[pOligos->getLastOligo()[i]]] [baseCodes[baseNames[j]]] << " " <getScore(baseNames[j], i) << endl; assert (scoreTable[i][baseCodes[pOligos->getLastOligo()[i]]] [baseCodes[baseNames[j]]] ==pScores->getScore(baseNames[j], i)); } // ~for j } // ~for i #endif sequence = pOligos->getNextSequence(isValid); } // ~while if(dynamic_cast(pOligosRaw)==NULL) delete pScoresRaw; delete pOligosRaw; // fclose(pSeq); } // ~for i const uint numFrags(frags.size()); const uint numSeqs(seqs.size()); assert(numSeqs==seqNum); cerr << "Sequences of length " << seqSize << endl; cerr << "Total sequences: " << numSeqs+numIgnored << endl; cerr << "Will ignore " << numIgnored << endl; cerr << "Will align " << numSeqs << endl; // standard sort is horrendously slow for large genome // (4Mb = ~8 million frags) - replace with radix sort // sort( frags.begin(), frags.end(), lessThanGenomeFrag ); mySort( frags, seqSize ); // vector fragStore; fragStore.reserve(numFrags); fragStore.push_back( Seq() ); strncpy(fragStore[0].data, frags[0].seq_, seqSize); fragStore[0].data[seqSize]='\0'; frags[0].seq_=fragStore[0].data; for (uint i(1);i commonPrefix(numFrags); for (uint i(0);i