#include "common/SequenceUtils.hh" #include "boost/scoped_array.hpp" #include #include /** * Project : CASAVA * Module : $RCSfile: SequenceUtils.cpp,v $ * @author : Tony Cox * Copyright : Copyright (c) Illumina 2008, 2009. All rights reserved. * ** This software is covered by the "Illumina Genome Analyzer Software ** License Agreement" and the "Illumina Source Code License Agreement", ** and certain third party copyright/licenses, and any user of this ** source file is bound by the terms therein (see accompanying files ** Illumina_Genome_Analyzer_Software_License_Agreement.pdf and ** Illumina_Source_Code_License_Agreement.pdf and third party ** copyright/license notices). * */ namespace casava { namespace common { /// returns log(1+x), switches to special libc function when abs(x) is small /// static double log1p_switch(const double x){ // better number?? static const double smallx_thresh(0.01); if(std::abs(x)= 48) { tmp *= 10; tmp += (type[i] - '0'); } else if (type[i] == 'A' || type[i] == 'C' || type[i] == 'G' || type[i] == 'T') { posInRead += tmp; tmp=0; // mismatch alignScore += _logpcorrectratio[static_cast(qualityString[posInRead])]; ++posInRead; } else if (type[i] == 'N') { posInRead += tmp; tmp=0; // what should N in the reference be?? for now it counts as a match ++posInRead; } else if (type[i]=='^' || type[i] == '$' ) { } else { std::cerr << "ERROR: Unexpected match descriptor: " << type << "\n"; exit(EXIT_FAILURE); } } return alignScore; } const double invlog10(1./(std::log(10.))); double SequenceUtils:: ReadScorer:: getSemiAlignedReadMetric(const Alignment& alignment) const { return ((getAlignmentScore(alignment.getQuality().c_str(),alignment.getMatchDescriptor().c_str()))*invlog10); } double SequenceUtils:: getSemiAlignedReadMetric(const Alignment& alignment, const bool isQphred) { return ReadScorer::get(isQphred).getSemiAlignedReadMetric(alignment); } /*****************************************************************************/ bool SequenceUtils:: getReadPairInfo(const Alignment& alignment, unsigned int readLen, unsigned int partnerReadLen, bool& isChimeric, long& leftPos, unsigned int& insertSize, RelOrient& relOrient) { // const std::string& chrom(alignment.getChromosome()); const Match& partnerMatch(alignment.getPartnerMatch()); const std::string& partnerChrom(partnerMatch.getChromosome()); const long pos(alignment.getPosition()); long partnerPos(partnerMatch.getPosition()); Match::Strand strand(alignment.getStrand()); Match::Strand partnerStrand(partnerMatch.getStrand()); const bool isRead1(alignment.getReadNumber() == 1); if (!(((strand == Match::Forward) || (strand == Match::Reverse)) && ((partnerStrand == Match::Forward) || (partnerStrand == Match::Reverse)))) { // Interested only in pairs where both partners are aligned. return false; } // Possible chimeric translocation. if (!partnerChrom.empty()) { // Partner Chrom is blank if the two chromosomes are the same. isChimeric = true; leftPos = pos; insertSize = 0; return true; } isChimeric = false; // Same chromosome -> partnerPos is offset -> convert to absolute position. partnerPos += pos; if (partnerPos == pos) { // Not interested in dimers/palindromes but values as per SVfinder.pl. leftPos = pos; insertSize = (isRead1 ? readLen : partnerReadLen); return false; } const bool isLeftRead(pos < partnerPos); leftPos = isLeftRead ? pos : partnerPos; relOrient = RelOrient(isRead1 ? pos : partnerPos, isRead1 ? strand : partnerStrand, isRead1 ? partnerPos : pos, isRead1 ? partnerStrand : strand); insertSize = (isLeftRead ? (partnerPos - pos + partnerReadLen) : (pos - partnerPos + readLen)); return true; } /*****************************************************************************/ bool SequenceUtils:: ReadScorer:: isValidQualString(const char* const qual) const { const char* q(qual); while(*q != '\0'){ const int q64val(*q); if(q64val < _qmin or q64val >= MAX_Q64){ std::cerr << "ERROR:: Invalid quality value: " << static_cast(*q)-64 << " in position: " << (q-qual+1) << " of quality string: " << qual << "\n"; return false; } ++q; } return true; } bool SequenceUtils:: isValidQualString(const char* const qual, const bool isQphred) { return ReadScorer::get(isQphred).isValidQualString(qual); } void SequenceUtils:: comp_error(const char c) { std::cerr << "ERROR: unexpected input to comp(): " << c << "\n"; exit(EXIT_FAILURE); } /* void SequenceUtils::getAlignmentScore( const std::string& qv, const std::string& type, double& score ) { boost::regex optionRegex("(\\d+)*([ATCG]+)"); boost::smatch matches; std::string::const_iterator it=type.begin(); std::string::const_iterator end=type.end(); uint posInRead=0; while (boost::regex_search(it, end, matches, optionRegex)) { ++posInRead; posInRead += boost::lexical_cast(matches[1].str()); char c = qv.at(posInRead-1); int b = static_cast(c) - 64; double scoreTmp = log( 1.0 - pow(10, (-b/10.0)) ); score += log((1-scoreTmp)/3)-log(scoreTmp); it = matches[0].second; } score = exp(score); } */ double SequenceUtils:: ReadScorer:: getBreakPointHypothesisScore(const Alignment& alignment, const unsigned breakPointPos, const bool enteringBreakPoint) const { unsigned j=0; const char* qualityString = alignment.getQuality().c_str(); const char* type = alignment.getMatchDescriptor().c_str(); unsigned lengthOfAlignment = strlen (qualityString); double alignScore = 0.0; // std::cerr << "In getBreakPointHypothesisScore: qualityString " << qualityString << "\t" << "type " << type << "\t" << "breakPointPos " << breakPointPos << "\n"; if (breakPointPos==0) { if (enteringBreakPoint) alignScore = ReadScorer::getAlignmentScoreAfterBreakPoint(qualityString,type); else alignScore = ReadScorer::getAlignmentScore(qualityString,type); //std::cerr << "alignScore = " << alignScore << "\n"; //std::cerr.flush(); return alignScore; } else if (breakPointPos>=lengthOfAlignment) { if (enteringBreakPoint) alignScore = ReadScorer::getAlignmentScore(qualityString,type); else alignScore = ReadScorer::getAlignmentScoreAfterBreakPoint(qualityString,type); //std::cerr << "alignScore = " << alignScore << "\n"; return alignScore; } //if all matches const unsigned nt(strlen(type)); bool allMatches = true; for (unsigned i=0;i 57 || (int) type[i] < 48) { //not between 0 and 9 allMatches = false; break; } } // std::cerr << " lengthOfAlignment = " << lengthOfAlignment << "\n"; const std::string qualityStringBeforeBreakPoint(qualityString,qualityString+breakPointPos); const std::string qualityStringAfterBreakPoint(qualityString+breakPointPos,qualityString+lengthOfAlignment); if (allMatches) { int numAfterBreakPoint = lengthOfAlignment-breakPointPos; char numBeforeBreakPointStr[5]; char numAfterBreakPointStr[5]; sprintf(numBeforeBreakPointStr, "%d", breakPointPos); sprintf(numAfterBreakPointStr, "%d", numAfterBreakPoint); if (enteringBreakPoint) { alignScore = ReadScorer::getAlignmentScoreAfterBreakPoint(qualityStringAfterBreakPoint.c_str(),numAfterBreakPointStr); } else { alignScore = ReadScorer::getAlignmentScore(qualityStringBeforeBreakPoint.c_str(),numBeforeBreakPointStr); } return (alignScore); } //all matches, like 100 //figure out where to break the type string j=0; int posInRead = 0; //const unsigned nt(strlen(type)); unsigned i=0, tmp=0; boost::scoped_array typePtr(new char[2*(nt+1)]); char* typeBeforeBreakPoint(typePtr.get()); char* typeAfterBreakPoint(typePtr.get()+(nt+1)); bool breakPointFound = false; unsigned typeCutPosBefore, typeCutPosAfter; //char* typeComplete = (char*) malloc (lengthOfAlignment+1); for (i=0;i= 48) { tmp *= 10; tmp += (type[i] - '0'); //if (posInRead+tmp>=lengthOfAlignment) {break;} } else { if (posInRead+tmp==breakPointPos) { unsigned typeCutPos=i-1; for (j=0; jbreakPointPos) { unsigned numBaseBeforeBreakPoint = breakPointPos-posInRead; unsigned numBaseAfterBreakPoint = posInRead+tmp-breakPointPos; unsigned startPosAfterBreakPoint = 0; unsigned digitsOfTmp = (int)(log(tmp)/log(10))+1; typeCutPosBefore=i-digitsOfTmp; //if (tmp>9) typeCutPosBefore=i-2; else typeCutPosBefore=i-1; typeCutPosAfter = i; if (numBaseAfterBreakPoint>9) { for (j=0; j=typeCutPosAfter) { typeAfterBreakPoint[startPosAfterBreakPoint+j-typeCutPosAfter]=type[j]; } } typeAfterBreakPoint[startPosAfterBreakPoint+j-typeCutPosAfter]='\0'; if (numBaseBeforeBreakPoint>9) { for (j=0; j9) typeCutPosBefore=i-2; else typeCutPosBefore=i-1; //typeCutPosAfter = i; if (numBaseAfterBreakPoint>9) { for (j=0; j9) { for (j=0; j0) { typeBeforeBreakPoint[typeCutPosBefore]=(int)(numBaseBeforeBreakPoint)+48; typeBeforeBreakPoint[typeCutPosBefore+1]='\0'; } else { typeBeforeBreakPoint[typeCutPosBefore]='\0'; } typeAfterBreakPoint[startPosAfterBreakPoint]='\0'; } if (breakPointPos>0) { if (enteringBreakPoint) alignScore += ReadScorer::getAlignmentScore(qualityStringBeforeBreakPoint.c_str(),typeBeforeBreakPoint); else alignScore += ReadScorer::getAlignmentScoreAfterBreakPoint(qualityStringBeforeBreakPoint.c_str(),typeBeforeBreakPoint); } if (breakPointPos= 48) //between 0 and 9 { tmp *= 10; tmp += (type[i] - '0'); } else if (type[i] == 'A' || type[i] == 'C' || type[i] == 'G' || type[i] == 'T' || type[i]=='N') { // match but we penalize for match for (unsigned j=0; j(qualityString[posInRead+j])]; } posInRead += tmp; tmp=0; ++posInRead; } else if (type[i]=='^' || type[i] == '$' ) { } else { std::cerr << "ERROR: Unexpected match descriptor in getAlignmentScoreAfterBreakPoint: type " << type << "\n"; std::cerr << "qualityString " << qualityString << "\n"; exit(EXIT_FAILURE); } } //matches if (tmp>0) { for (unsigned j=0; j(qualityString[posInRead+j])]; } } return alignScore; } } }