/// -*- mode: c++; indent-tabs-mode: nil; -*- // /// \file /** * Project : CASAVA * Module : $RCSfile: AlignContig.cpp,v $ * @author : Tony Cox * Copyright : Copyright (c) Illumina 2008, 2009. All rights reserved. * ** This software is covered by the "Illumina Genome Analyzer Software ** License Agreement" and the "Illumina Source Code License Agreement", ** and certain third party copyright/licenses, and any user of this ** source file is bound by the terms therein (see accompanying files ** Illumina_Genome_Analyzer_Software_License_Agreement.pdf and ** Illumina_Source_Code_License_Agreement.pdf and third party ** copyright/license notices). * */ #include "applications/AlignContig.hh" #include "variance/Aligner.hh" #include "blt_util/seq_util.hh" #include #include #include #include #include #include #include #include #include //#define DEBUG_ALCTG 1 namespace ca { namespace applications { namespace cigarWriter { static const char align_state_label[] = "XMID"; enum align_t { NONE, MATCH, INSERT, DELETE }; static void write_cigar_update(const align_t align_state, align_t& last_align_state, unsigned& state_count, int& first_indel_offset, std::ostream& os) { if((align_state==DELETE or align_state==INSERT) and (first_indel_offset == -1)){ first_indel_offset=state_count; } if((state_count>0) and (align_state != last_align_state)){ os << state_count << align_state_label[last_align_state]; state_count = 0; } last_align_state=align_state; } static int write_cigar_descriptor(const char* align_contig, const unsigned align_contig_start, const unsigned align_contig_end, const unsigned contig_end, const char* align_ref, std::ostream& os){ static const char skip('-'); int first_indel_offset(-1); assert(align_contig_start <= align_contig_end); assert(align_contig_end <= contig_end); const unsigned contig_end_clip(contig_end-align_contig_end); align_t align_state(NONE); align_t last_align_state(NONE); unsigned state_count(0); // Test for soft clip at the begining of the alignment (ie. not // all of the contig has been aligned). If this is found, annotate // this as 'M' for now, although it should be 'S' -- could switch // this over after more varling debugging // if(align_contig_start){ last_align_state=MATCH; state_count += align_contig_start; } const unsigned ns(strlen(align_contig)); assert(ns==strlen(align_ref)); for(unsigned i(0);i=min_context) and (downstreamContext>=min_context)); //return ((upstreamContext>=min_context) or (downstreamContext>=min_context)); return ( (upstreamContext>=min_context and downstreamContext>=min_context) or (upstreamContext>=min_context and align.at_[lastPos] == ' ') or (downstreamContext>=min_context and align.at_[0] == ' ') ); } using namespace std; using namespace ca::variance_detection; // print alignment information - either as one-line-per-variant or as a full alignment // in BLAST style format class AlignmentPrinter { public: AlignmentPrinter(CasavaOptions& op) : numPoorContexts_(0),options_(op) { } virtual ~AlignmentPrinter() {} virtual void operator()( int &variantNum, const int contigNum, const char* contigName, const int numReadsInContig, Aligner& align, const int genomeStart )=0; // returns number of contigs with not enough matches bases on either side of the alignment int getNumPoorContexts() const { return numPoorContexts_; } protected: int numPoorContexts_; CasavaOptions& options_; }; // --report mode : one variant per line class AlignmentPrinterByVariant : public AlignmentPrinter { public: AlignmentPrinterByVariant( FILE* pOutfile, string& genomeFileName, CasavaOptions& options ) : AlignmentPrinter(options), pOutfile_(pOutfile), genomeFileName_(genomeFileName) {} virtual void operator()( int &variantNum, const int contigNum, const char* contigName, const int numReadsInContig, Aligner& align, const int genomeStart ); private: FILE* pOutfile_; std::string genomeFileName_; }; // ~struct AlignmentPrinterByVariant void AlignmentPrinterByVariant::operator()( int& variantNum, const int contigNum, const char* /*contigName*/, const int numReadsInContig, Aligner& align, const int genomeStart ) { // skip contigs with insufficient coverage if (!isSufficientContext(align,options_.minContext())) { ++numPoorContexts_; return; } const int l(align.yt_.size()); int lastVariantPos(-1); // pos of variant in genome //int lastVariantIndex = 0; // pos of variant in string -- dummy init. //int numMatchingBases(0); std::string genome, variant, transc, upstream, downstream; //bool isMatch, isMismatch, isIndel; //, hasBeenPrinted(false); int genomePos(genomeStart+align.y_end_); //int printedVariant(0); if (variantNum==0) { // a helpful friendly message comes free with your first indel fprintf(pOutfile_, "#variant_nr\tcontig_nr\tgenome\tstart\twild-type/variant\tedit_transcript\tupstream\tdownstream\treads_nr\talignment_score\n"); //cerr << "#variant_nr\tcontig_nr\tgenome\tstart\twild-type/variant\tedit_transcript\tupstream\tdownstream\treads_nr\talignment_score\n" << endl; //cerr << "\n\n" << endl; } /* cerr << "NEXT VAR:\n" << endl; cerr << align.yt_ << endl; cerr << align.at_ << endl; cerr << align.xt_ << endl; */ // // So it seems the previous code simply prints out the // variant after it notices the first gap as long as // there is enough context on each side. // // Given that assumption we will check first for one of // four possibilities: // // 1.) Alignment starts with a gap // 2.) Alignment ends with a gap // 3.) Alignment contains a gap // 4.) There is no gap // int varStart = -1; int varEnd = -1; bool isSnp = false; bool foundGap = false; if (align.at_[0] == ' ') { //cerr << "Found open-ended start, length: " << l << endl; // 1.) Alignment starts with a gap // Now find where the first non-gap is foundGap = true; int ii(0); while(ii < l) { if (align.at_[ii] != ' ') { break; } ii++; } varStart = 0; varEnd = ii; } else if (align.at_[l - 1] == ' ') { //cerr << "Found open-ended end" << endl; // 2.) Alignment ends with a gap // Now find where the first gap is foundGap = true; int ii(l - 1); while(ii >= 0) { if (align.at_[ii] != ' ') { break; } ii--; } varStart = ii + 1; varEnd = l; } else { //cerr << "Found SNP or indel" << endl; // Check for 3 or 4 for (int ii(0); ii < l; ii++) { if (align.at_[ii] == ' ') { foundGap = true; ii = l; } } if (foundGap) { //cerr << "Found indel" << endl; // 3.) Alignment contains a gap // Now find where the first gap is and following // non-gap for (int ii(0); ii < l; ii++) { if (align.at_[ii] == ' ' && varStart == -1) { varStart = ii; } if (varStart != -1 && align.at_[ii] != ' ') { varEnd = ii; break; } } } else { //cerr << "Found SNP" << endl; // 4.) There is no gap // This is a SNP, so find first mis-match isSnp = true; for (int ii(0); ii < l; ii++) { if (align.at_[ii] == '.') { varStart = ii; varEnd = ii + 1; } } } } // Make sure we set them //assert(varStart != -1); //assert(varEnd != -1); // Some sanity checking //if (! foundGap && ! isSnp) { //cerr << "[ERROR]: No gap or snp!" << endl; //exit(1); //} //cerr << "Variant: index, start, end:" << variantNum << "\t" << varStart << "\t" << varEnd << endl; if (foundGap) { //upstream=align.yt_.substr(0,varStart); //downstream=align.yt_.substr(varEnd + genome.size()); upstream=align.xt_.substr(0,varStart); downstream=align.xt_.substr(varEnd + genome.size()); lastVariantPos = genomePos + varStart; // May need to use GenomeStart instead of GenomePos, neet to test /* cerr << "UP\n" << upstream << endl; cerr << "DN\n" << downstream << endl; cerr << "last\n" << lastVariantPos << endl << endl; */ // Build edit string transc.resize(varEnd - varStart); for (int ii(varStart); ii < varEnd; ii++) { // Do a quick sanity check here, with this new method, only gaps // and not mismatches should be captured in the edit string // // This should not change the content, but may change the output // format from what was previously done... if (align.at_[ii] != ' ' && ! isSnp) { cerr << "[ERROR]: While building edit string (non-SNP) found non-gap character: '" << align.at_[ii] << "'" << endl; exit(1); } if (align.at_[ii] != '.' && isSnp) { cerr << "[ERROR]: While building edit string (SNP) found non-gap character: " << align.at_[ii] << endl; exit(1); } if (isSnp) { transc[ii - varStart] = align.at_[ii]; } else { transc[ii - varStart] = '-'; } } // Get varaint string variant.resize((varEnd - varStart) + (varEnd - varStart) + 1); for(int ii(0); ii < varEnd - varStart; ii++) { variant[ii] = align.yt_[ii + varStart]; } variant[varEnd - varStart] = '/'; for (int ii(varStart); ii < varEnd; ii++) { variant[ii - varStart + varEnd - varStart + 1] = align.xt_[ii]; } //transc[varEnd - varStart + 1] = '\0'; //variant[(varEnd - varStart) + (varEnd - varStart) + 2] = '\0'; /* cerr << "TRANS\n" << transc << endl; cerr << "VAR\n" << variant << ", length: " << variant.length() << ", max_size: " << variant.max_size() << ", capacity: " << variant.capacity() << endl; cerr << "VARC\n" << variant.c_str() << "." << endl; cerr << "trnC\n" << transc.c_str() << "." << endl; cerr << "\n\n\n" << endl; */ // Increment variant number variantNum++; fprintf(pOutfile_, "%d\t%d\t%s\t%d\t%s\t%s\t%s\t%s\t%d\t%d\n", variantNum, contigNum, genomeFileName_.c_str(), lastVariantPos, variant.c_str(), transc.c_str(), upstream.c_str(), downstream.c_str(), numReadsInContig, align.score_); } //printedVariant = 1; lastVariantPos = -1; genome.clear(); variant.clear(); transc.clear(); upstream.clear(); downstream.clear(); } // ~printByVariant // Full alignment output. Removes snps and alignments without indels. class AlignmentPrinterFull : public AlignmentPrinter { public: AlignmentPrinterFull( FILE* pOutfile, string& genomeFileName, CasavaOptions& options ): AlignmentPrinter(options), pOutfile_(pOutfile), genomeFileName_(genomeFileName) {} virtual void operator()( int &variantNum, const int contigNum, const char* contigName, const int numReadsInContig, Aligner& align, const int genomeStart ); private: FILE* pOutfile_; std::string genomeFileName_; }; // ~struct AlignmentPrinterFull void AlignmentPrinterFull::operator()( int& /*variantNum*/, const int /*contigNum*/, const char* contigName, const int /*numReadsInContig*/, Aligner& align, const int genomeStart ) { if (!isSufficientContext(align,options_.minContext())) { #ifdef DEBUG_ALCTG cout << "AlignmentPrinterFull::operator()" << endl; cout << "Context too small. Discarding." << endl; cout << "Threshold : " << options_.minContext() << endl; #endif ++numPoorContexts_; return; } if ((options_.variantOnlyMode())&&(strpbrk(align.at_.c_str(), ". ")==NULL)) { #ifdef DEBUG_ALCTG cout << "Exact match. Discarding." << endl; #endif return; } // ~if else if ((options_.indelOnlyMode())&&(strpbrk(align.at_.c_str(), " ")==NULL)) { #ifdef DEBUG_ALCTG cout << "No indel. Discarding." << endl; #endif return; } // ~if else if ((options_.snpOnlyMode())&&((strpbrk(align.at_.c_str(), " ")!=NULL)||(strpbrk(align.at_.c_str(), ".")==NULL))) { #ifdef DEBUG_ALCTG cout << "No snp. Discarding." << endl; #endif return; } // ~if fprintf(pOutfile_,"Aligning: %s\n",contigName); fprintf(pOutfile_,"Score: %d\n",align.score_); fprintf(pOutfile_,"%9d %s %9d\n", align.x_end_+1,align.xt_.c_str(),align.x_start_); fprintf(pOutfile_," %s\n", align.at_.c_str()); fprintf(pOutfile_,"%9d %s %9d\n", align.y_end_+genomeStart, align.yt_.c_str(),align.y_start_+genomeStart-1); fprintf(pOutfile_,"\n"); } // ~void AlignmentPrinterFull::operator() bool AlignContig::readNextHeader(char* header, /*static*/ const int bufSize, FILE* pContigs, int& contigNum, int& numReadsInContig, int& genomeStart, int& genomeEnd) { if (fgets(header, bufSize, pContigs) == NULL) return false; if (header[0]!='>') { cerr << "Expected fasta format, got line = " << header << "\n"; //exit(EXIT_FAILURE); return false; } // ~if header[strlen(header)-1]='\0'; //TODO: Refactor to API? if (sscanf(header,">contig=%d|group=%*d|reads=%d|start=%d|end=%d",&contigNum, &numReadsInContig, &genomeStart, &genomeEnd)!=4) { fprintf(stderr, "ERROR:: Could not parse start and end from %s\n",header); //exit(EXIT_FAILURE); return false; } // ~if if (genomeEnd(genome_size)) && genomeStart(genome_size)); if ( (genomeEnd>=static_cast(genome_size)) ||(genomeStart>=static_cast(genome_size)) ) { cerr << "WARNING: contig " << buf << " not aligned - alignment across boundary is TBD" << endl; continue; } // ~if // align contig to genome with affine gap costs align(buf2, genome.c_str()+genomeStart-1, strlen(buf2), genomeEnd-genomeStart+1); #ifdef DEBUG_ALCTG cout << "Alignment: " << endl; cout << align.at_.c_str() << endl; #endif if (align.score_<_options.minScore()) { #ifdef DEBUG_ALCTG cout << "Alignment score too low : " << align.score_ << " " << _options.minScore() << endl; #endif ++numBadAlignments; continue; } // ~if //cerr << "START printing alignment\n" << endl; //try to see if there are N's in genome being aligned to std::string genomeAligned; if (strchr(align.at_.c_str(),' ') != NULL) genomeAligned=genome.substr(align.y_end_+genomeStart+(strchr(align.at_.c_str(),' ')-align.at_.c_str()-1), (strrchr(align.at_.c_str(),' ')-strchr(align.at_.c_str(),' ')+1) ); if (strchr(genomeAligned.c_str(), 'N') !=NULL ) continue; //end of checking if there are N's in the genome (*pAlignmentPrinter)( variantNum, contigNum,buf, numReadsInContig, align, genomeStart ); //cerr << "Done printing alignment\n" << endl; // write varling output bool is_indel_context(false); if(is_write_varling_indel_contigs) { const bool is_indel(strpbrk(align.at_.c_str(), " ")!=NULL); if(is_indel) { is_indel_context = isSufficientContext(align,_options.minContext()); } } //cerr << "is indel context: " << is_indel_context << "." << endl; if(is_write_varling_indel_contigs and is_indel_context) { const unsigned start(align.y_end_+genomeStart); const char* contig_seq(buf2); contig_os << ">IDC" << contigNum << "|" << genomeFileName << "|" << start << "|"; int first_indel_offset(0); { // generate and write match descriptor: const char* align_contig(align.xt_.c_str()); assert(align.x_start_>=0); assert(align.x_end_>=0); const unsigned align_contig_start(align.x_end_); const unsigned align_contig_end(align.x_start_); const unsigned contig_end(strlen(contig_seq)); const char* align_ref(align.yt_.c_str()); first_indel_offset=cigarWriter::write_cigar_descriptor(align_contig,align_contig_start,align_contig_end,contig_end,align_ref,contig_os); } //cerr << "first indel offset: " << first_indel_offset << "." << endl; // start of the first indel in the contig: const unsigned indel_start(first_indel_offset>=0 ? start+first_indel_offset : 0); contig_os << "|" << indel_start << "\n"; // \todo line-wrap the sequence: contig_os << contig_seq << "\n"; //cerr << "Done varling" << endl; } // ~if (is_write_varling_indel_contigs) //cerr << "Done contigs" << endl; } // ~while (contigs) fclose(pContigs); fclose(pOutfile); fprintf(stderr, "Ignored %d contigs with alignment score less than %d\n",numBadAlignments, _options.minScore()); fprintf(stderr, "Ignored %d contigs with context (either upstream or downstream) less than %d\n", pAlignmentPrinter->getNumPoorContexts(),_options.minContext()); if (pAlignmentPrinter!=NULL) delete pAlignmentPrinter; return 0; } // end AlignContig::run() } } // end namespace casava{ namespace { applications // End of file