/** ** Copyright (c) 2007-2010 Illumina, Inc. ** ** This software is covered by the "Illumina Genome Analyzer Software ** License Agreement" and the "Illumina Source Code License Agreement", ** and certain third party copyright/licenses, and any user of this ** source file is bound by the terms therein (see accompanying files ** Illumina_Genome_Analyzer_Software_License_Agreement.pdf and ** Illumina_Source_Code_License_Agreement.pdf and third party ** copyright/license notices). ** ** This file is part of the Consensus Assessment of Sequence And VAriation ** (CASAVA) software package. ** ** \file orphanAligner.cpp ** ** \brief Phage align. ** ** Phage align. ** ** \author Markus Bauer **/ #include #include #include #include #include #include #include #include "alignment/ELAND_unsquash.h" #include "alignment/aligner.h" #include "common/ExtendedFileReader.h" #define FRAGMENT_LENGTH 450 #define UPPER_BOUND_OCC 1 #define MAX_ERROR_RATE 0.1 #define REQUEST_SIZE 262144 // singleton request struct SingletonRequest { SingletonRequest( int readNum, int left_or_right, uint fileIndex, uint contigNum, uint filePos, char strand, string orphan ) : readNum_(readNum), left_or_right_(left_or_right), fileIndex_(fileIndex), contigNum_(contigNum), filePos_(filePos), strand_(strand), orphan_(orphan) {} int readNum_; int left_or_right_; uint fileIndex_; uint contigNum_; uint filePos_; char strand_; string orphan_; }; // ~struct SeqRequest // singleton request struct SingletonAlignment { SingletonAlignment( int readNum, int left_or_right, uint fileIndex, uint contigNum, int filePos, char strand, string orphan, int aligned_position, string matchDesc ) : readNum_(readNum), left_or_right_(left_or_right), fileIndex_(fileIndex), contigNum_(contigNum), filePos_(filePos), strand_(strand), orphan_(orphan), aligned_position_(aligned_position), matchDesc_(matchDesc) {} int readNum_; int left_or_right_; uint fileIndex_; uint contigNum_; int filePos_; char strand_; string orphan_; int aligned_position_; string matchDesc_; }; // ~struct SeqRequest bool parseMatch( const string& match,string &chr,char& strand,uint& pos ); void parseXYZ( const char* xyz,int& oneError,int& twoError ); string reverseComplement( const string& s ); int global_upper_bound_occ = UPPER_BOUND_OCC; int global_fragment_length = FRAGMENT_LENGTH; class OrphanAligner { public: OrphanAligner( const string& squashed_genome, const ga::alignment::ScoreType match, const ga::alignment::ScoreType mismatch, const ga::alignment::ScoreType gapopen, const ga::alignment::ScoreType gapextend, const int& read_length, const int& maxNumberMismatches, const int& expected_insertsize, const int& expDeviation ) : squashed_genome_(squashed_genome), align_forward_(match,mismatch,gapopen,gapextend,global_fragment_length,expected_insertsize,expDeviation), align_reverse_(match,mismatch,gapopen,gapextend,global_fragment_length,expected_insertsize,expDeviation), maxNumberMismatches_(maxNumberMismatches) { // if the read length was greater then the expected insert size, we'd have overlapping reads // assert(expected_insertsize>read_length); // calculate where to jump, F -> jump over the read + 50 bases more jump_forward_ = read_length; // for the moment, do not jump forward any additional bases // TODO: estimate the actual fragment size from the expected insert size and the read lengths; // right now it's only hardcoded-values // 300bp quality string --> string of Q30 qualities scales the match/mismatch contributions to 1 qual_string_ = "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa"; int expected_gap_forward = -1; int expected_gap_reverse = -1; // set the gaps favourably for the forward alignment expected_gap_forward = global_fragment_length - expected_insertsize; // set the gaps favourably for the reverse alignment expected_gap_reverse = expected_insertsize - ( jump_forward_ ); cerr << "expeced_gap_forward/expected_gap_reverse : " << expected_gap_forward << "\t" << expected_gap_reverse << endl; align_forward_.init( read_length,global_fragment_length,expected_gap_forward,expDeviation ); align_forward_.allowInserts( 1 ); align_forward_.allowDeletions( 1 ); align_reverse_.init( read_length,global_fragment_length,expected_gap_reverse,expDeviation ); align_reverse_.allowInserts( 1 ); align_reverse_.allowDeletions( 1 ); #ifdef DEBUG cerr << "setting jump_forward = " << jump_forward_ << endl; #endif } ~OrphanAligner() {}; vector pullOutFragments( StringIndex& files,vector& req,uint& orphans_rescued ); private: boost::shared_ptr pSquash_; string squashed_genome_; string qual_string_; ga::alignment::Aligner align_forward_; ga::alignment::Aligner align_reverse_; uint jump_forward_; uint jump_backward_; // threshold for orphans to be rescued. or not. const int maxNumberMismatches_; }; bool lessThanSingleton(const SingletonRequest& a, const SingletonRequest& b) { // return ((a.fileIndex_ " << endl << "(input file has to be in the export file format, genome has to point to a squashed genome directory)" << endl; exit (1); } string squashed_genome = string(args[3]); StringIndex files(squashed_genome); int cur_request = 0; vector req; ExtendedFileReaderActual i_export_left(args[1]); ExtendedFileReader left(&i_export_left); ExtendedFileReaderActual i_export_right(args[2]); ExtendedFileReader right(&i_export_right); ofstream out_left_extended( string(string(args[1])+ string(args[4]) ).c_str() ); ofstream out_right_extended( string(string(args[2])+ string(args[4]) ).c_str() ); // check that the eland_extended files are not empty if (!left.getNextEntry()) { cerr << "WARNING: input file " << args[1] << " is empty" << endl; return 0; } left.rewind(); cerr << "estimating insert size..."; vector insertSizes; // come up with statistics for the insert size distribution while( left.getNextEntry() == true ) { right.getNextEntry(); int left_matchcounter = left.getMatchCounter(); string left_matches = string(left.getMatches()); int right_matchcounter = right.getMatchCounter(); string right_matches = string(right.getMatches()); int currentInsertSize = -1; // we are only looking at read pairs where either read can be placed uniquely if( (left_matchcounter == 1) && (right_matchcounter==1) ) { string left_match_chr = ""; uint left_match_position = 0; char left_strand = 'F'; string right_match_chr = ""; uint right_match_position = 0; char right_strand = 'F'; parseMatch( left_matches,left_match_chr,left_strand,left_match_position ); parseMatch( right_matches,right_match_chr,right_strand,right_match_position ); // we have to be on the very same chromosome if( left_match_chr != right_match_chr ) { continue; } // we are only looking at read pairs that have proper orientation if( left_strand=='F' && right_strand=='R' ) { // left_match_position < right_match_position if( left_match_position < right_match_position ) { currentInsertSize = right_match_position - left_match_position; } } else if( left_strand=='R' && right_strand=='F' ) { // right_match_position < left_match_position if( right_match_position < left_match_position ) { currentInsertSize = left_match_position - right_match_position; } } } if( currentInsertSize != -1 ) { insertSizes.push_back( currentInsertSize ); // cout << currentInsertSize << endl; } } cerr << "done." << endl; // insertSizes now hold all the information // get the median sort( insertSizes.begin(),insertSizes.end() ); int medianDistribution = 0; // calculate variance + standard deviation double varianceDistribution = 0.0; double d_stdDeviationDistribution = 0.0; int i_stdDeviationDistribution = 0; int meanDistribution = 0; if( insertSizes.size() > 0 ) { medianDistribution = insertSizes[ insertSizes.size()/2 ]; // get the mean int sumOfAll = 0; for( unsigned int i=0;i(d_stdDeviationDistribution); cerr << "variance of distribution : " << varianceDistribution << endl; cerr << "standard deviation of distribution : " << d_stdDeviationDistribution << endl; } // rewind everything left.rewind(); right.rewind(); // setting the number of occurrences, upper bound global_upper_bound_occ=atoi(args[5]); ga::alignment::ScoreType match = 6; ga::alignment::ScoreType mismatch = -1; ga::alignment::ScoreType gapopen = 15; ga::alignment::ScoreType gapextend = 3; // determine read length left.getNextEntry(); int left_read_length = string(left.getRead()).size(); left.rewind(); right.getNextEntry(); int right_read_length = string(right.getRead()).size(); right.rewind(); int read_length = (left_read_length>right_read_length)?left_read_length:right_read_length; // compute the number of mismatches that we allow for an orphan to be rescued const int maxNumberMismatches = (int)(read_length*MAX_ERROR_RATE); cerr << "Setting the orphan rescue threshold to " << maxNumberMismatches << endl; OrphanAligner oa( squashed_genome, match, mismatch, gapopen, gapextend, read_length, maxNumberMismatches, medianDistribution, i_stdDeviationDistribution ); vector alns; uint cur_line_idx = 0; uint orphans_rescued = 0; uint total_no_of_candidates = 0; // main loop while( left.getNextEntry()==true ) { right.getNextEntry(); int left_matchcounter = left.getMatchCounter(); string left_matches = string(left.getMatches()); int right_matchcounter = right.getMatchCounter(); string right_matches = string(right.getMatches()); string match_chr = ""; uint i_match_position = 0; char strand = 'F'; string orphan_read = string(left.getRead()); short candidate = 0; /// split up the input at the ',' marks vector v_chrom; vector v_strand; vector v_match_position; vector split_matches; // matchcounter returns the minimal number of the X:Y:Z read: // this is 0 for NM and QC // if min_numer > 10 then we do not report any hits if( (left_matchcounter > 0) && (left_matchcounter <= global_upper_bound_occ) && right_matchcounter == 255 ) { size_t pos_in_string = 0; size_t next_comma = -1; #ifdef DEBUG cerr << "left_matches: " << left_matches << "\t" << left.getXYZ() << "\t" << right.getXYZ() << endl; #endif next_comma = left_matches.find(",",pos_in_string); while( next_comma != string::npos ) { string cur_substr = left_matches.substr( pos_in_string,(next_comma-pos_in_string) ); #ifdef DEBUG cerr << "SPLIT : " << cur_substr << endl; #endif split_matches.push_back( cur_substr ); pos_in_string = next_comma + 1; next_comma = left_matches.find(",",pos_in_string); } #ifdef DEBUG cerr << "SPLIT : " << left_matches.substr( pos_in_string,(next_comma-pos_in_string) ) << endl; #endif split_matches.push_back( left_matches.substr( pos_in_string,1024 ) ); for( vector::iterator it=split_matches.begin();it!=split_matches.end();it++ ) { // take left.getMatches() and parse the hit // chrX.fa:55351604F102C5 parseMatch( *it,match_chr,strand,i_match_position ); orphan_read = string(right.getRead()); candidate = 2; #ifdef DEBUG cerr << match_chr << "/" << strand << "/" << i_match_position << endl; #endif // push them back if( i_match_position > 0 ) { v_chrom.push_back( match_chr ); v_strand.push_back( strand ); v_match_position.push_back( i_match_position ); } } #ifdef DEBUG cerr << endl << endl; #endif } if( left_matchcounter == 255 && (right_matchcounter > 0) && (right_matchcounter <= global_upper_bound_occ) ) { size_t pos_in_string = 0; size_t next_comma = -1; #ifdef DEBUG cerr << "right_matches: " << right_matches << "\t" << left.getXYZ() << "\t" << right.getXYZ() << endl; #endif next_comma = right_matches.find(",",pos_in_string); while( next_comma != string::npos ) { string cur_substr = right_matches.substr( pos_in_string,(next_comma-pos_in_string) ); #ifdef DEBUG cerr << "SPLIT : " << cur_substr << endl; #endif split_matches.push_back( cur_substr ); pos_in_string = next_comma + 1; next_comma = right_matches.find(",",pos_in_string); } #ifdef DEBUG cerr << "SPLIT : " << right_matches.substr( pos_in_string,(next_comma-pos_in_string) ) << endl; #endif split_matches.push_back( right_matches.substr( pos_in_string,1024 ) ); for( vector::iterator it=split_matches.begin();it!=split_matches.end();it++ ) { // take right.getMatches() and parse the hit // chrX.fa:55351604F102C5 parseMatch( *it,match_chr,strand,i_match_position ); orphan_read = string(left.getRead()); candidate = 1; #ifdef DEBUG cerr << match_chr << "/" << strand << "/" << i_match_position << endl; #endif if( i_match_position > 0 ) { v_chrom.push_back( match_chr ); v_strand.push_back( strand ); v_match_position.push_back( i_match_position ); } } #ifdef DEBUG cerr << endl << endl; #endif } // we need the following to make sure that the assert statement does not kick off // when we have a hit at position 0 or < 0 if( v_match_position.size() > 0 ) { for( unsigned int ii=0;ii REQUEST_SIZE ) { cerr << "pulling out fragments..." << endl; total_no_of_candidates += cur_request; vector orphan_alignments = oa.pullOutFragments( files,req,orphans_rescued ); alns.insert( alns.end(),orphan_alignments.begin(),orphan_alignments.end() ); { vector req_tmp; req.swap( req_tmp ); req.clear(); } cur_request = 0; } // else } // candidate cur_line_idx++; } // while if( req.size() > 0 ) { vector orphan_alignments = oa.pullOutFragments( files,req,orphans_rescued ); alns.insert( alns.end(),orphan_alignments.begin(),orphan_alignments.end() ); total_no_of_candidates += cur_request; req.clear(); cur_request = 0; } // if( req.size() ... ) cerr << "total number of candidates = " << total_no_of_candidates << endl; cerr << "orphans rescued = " << orphans_rescued << endl; cerr << "list size = " << alns.size() << endl; cerr << "sorting singleton alignments..."; sort( alns.begin(),alns.end(),lessThanSingletonAlignment ); cerr << "done." << endl; // =================================== OUTPUT =================================== cerr << "writing output..."; // print the new alignments to the extended files cerr << "we got " << alns.size() << " orphans to write." << endl; left.rewind(); right.rewind(); int line_cnt_extended = 0; uint cur_idx_alns = 0; while( left.getNextEntry()==true ) { right.getNextEntry(); if( alns.size() > 0 ) { if( alns[cur_idx_alns].readNum_ == line_cnt_extended ) { short left_or_right = alns[cur_idx_alns].left_or_right_; int cnt_matches = 0; // build up the new match string, extend it to a contig level, // we only have to extend the loop below ostringstream new_match; uint cur_chrom = UINT_INIT; // initializes to 2^XX uint cur_contig = UINT_INIT; // initializes to 2^XX int cur_offset = 0; while( (alns[cur_idx_alns].readNum_ == line_cnt_extended) && (cur_idx_alns= 0 ) { pos = pos_tmp; } else { pos = 0; } return true; } vector OrphanAligner::pullOutFragments( StringIndex& files,vector& req,uint& orphans_rescued ) { vector res; if( req.size() == 0 ) { return res; } sort( req.begin(),req.end(),lessThanSingleton ); uint cur_file_idx = UINT_INIT; for( uint j=0;j( new SquashFile(squashed_genome_, files.names_[req[j].fileIndex_], files)); cur_file_idx = req[j].fileIndex_; } uint adapted_pos = (req[j].filePos_-1); if( req[j].strand_=='R' ) { // check that we are not at the beginning of a chromosome, // creating an integer underflow // if( ((int)adapted_pos - global_fragment_length)<0 ) // OLD CHECK DOES NOT WORK FOR unsigned ints! if( adapted_pos < (uint)global_fragment_length ) { continue; } adapted_pos -= global_fragment_length; } else { adapted_pos += jump_forward_; } #ifdef DEBUG cerr << "adapted position = " << adapted_pos << endl; #endif string result = ""; string non_reversed_result = ""; pSquash_->goToPos ( req[j].contigNum_,adapted_pos ); if( req[j].strand_=='R' ) { for (int jj1(0);jj1getNextBase(); } } else { // pull out the reverse complement for (int jj1(0);jj1getNextBase(); result += reverseCharASCII[(uint)(cur_base) ]; non_reversed_result+=cur_base; } reverse( result.begin(),result.end() ); } #ifdef DEBUG cerr << "FRAGMENT PULLED OUT: " << result << endl; cerr << req[j].strand_ << endl << req[j].orphan_ << endl << result << endl << endl; #endif ga::alignment::Aligner* curAligner = &align_forward_; if( req[j].strand_ == 'F' ) { // we need to take the align_forward_object curAligner = &align_reverse_; } int retCode = (*curAligner)( qual_string_.c_str(),req[j].orphan_.c_str(),result.c_str(),req[j].orphan_.size(),result.size(),(req[j].strand_=='R') ); assert(retCode>0); // assume everything went fine // return code of 2 is a repeat candidate if( retCode == 2 ) { #ifdef DEBUG cerr << "------------------------------------------------------------" << endl; #endif } #ifdef DEBUG int i=0; int reverse_i = curAligner->xt_.size()-1; while( ixt_.size() && (curAligner->xt_[i]=='-') ) {i++;} while( reverse_i>0 && (curAligner->xt_[reverse_i]=='-') ) {reverse_i--;} assert(reverse_i>i); int no_matches = 0; int sizeOverlap = reverse_i-i; for( int overlap_i=i;overlap_i<=reverse_i;++overlap_i ) { if( curAligner->xt_[overlap_i]==curAligner->yt_[overlap_i] ) { no_matches++; } } if( ((double)(no_matches)/(double)(sizeOverlap))>0.95 ) { cerr << req[j].strand_ << "\t" << ((double)(no_matches)/(double)(sizeOverlap)) << endl << curAligner->xt_ << endl << curAligner->yt_ << endl << endl; } #endif #ifdef SANITY_CHECK if( req[j].strand_ == 'F' ) { // check if we get the same alignment if we reverse-complement the read and align it against the non-reversed reference string orphan_revComplement = reverseComplement( req[j].orphan_ ); align_( qual_string_.c_str(),orphan_revComplement.c_str(),non_reversed_result.c_str(),orphan_revComplement.size(),non_reversed_result.size() ); cerr << endl << "NON REVERSED" << endl; cerr << curAligner->xt_ << endl << curAligner->yt_ << endl; } else { cerr << endl; } #endif #ifdef CUT_OUT_THE_ALIGNED_FRAGMENT int i=0; while( ixt_.size() && (curAligner->xt_[i]=='-') ) {i++;} if( (i-20+160)xt_.size() && (i>20) ) { cerr << curAligner->xt_.substr(i-20,160) << endl << curAligner->yt_.substr(i-20,160) << endl; int cur_score = 0; while(ixt_.size() && curAligner->xt_[i]!='-' ) { if( curAligner->xt_[i]==curAligner->yt_[i] ) { cur_score+=2; } else { cur_score-=1; } i++; } } #endif int mismatches = 0; int offset_begin = 0; int offset_end = 0; string new_ad = curAligner->convertToNewAlignmentDescriptor(curAligner->xt_,curAligner->yt_,mismatches,offset_begin,offset_end ); #ifdef DEBUG cerr << "DEBUG:" << endl << curAligner->xt_ << endl << curAligner->yt_ << endl << "# mismatches/ad: " << mismatches << "\t" << new_ad << endl; #endif if( new_ad != "" && (mismatches=0;i-- ) { if( s[i] == 'A' ) { result += 'T'; } else if( s[i] == 'T' ) { result += 'A'; } else if( s[i] == 'G' ) { result += 'C'; } else if( s[i] == 'C' ) { result += 'G'; } else { result += s[i]; } } return result; }