//Copyright 2017 Ryan Wick //This file is part of Bandage. //Bandage is free software: you can redistribute it and/or modify //it under the terms of the GNU General Public License as published by //the Free Software Foundation, either version 3 of the License, or //(at your option) any later version. //Bandage is distributed in the hope that it will be useful, //but WITHOUT ANY WARRANTY; without even the implied warranty of //MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the //GNU General Public License for more details. //You should have received a copy of the GNU General Public License //along with Bandage. If not, see . #include "blastquerypath.h" #include "blastquery.h" #include "../graph/debruijnnode.h" #include "../program/globals.h" #include "../graph/assemblygraph.h" #include "../graph/debruijnedge.h" #include BlastQueryPath::BlastQueryPath(Path path, BlastQuery * query) : m_path(path), m_query(query) { //This function follows the path, returning the BLAST hits it finds for the //query. It requires that the hits occur in order, i.e. that each hit in //the path begins later in the query than the previous hit. BlastHit * previousHit = 0; QList pathNodes = m_path.getNodes(); for (int i = 0; i < pathNodes.size(); ++i) { DeBruijnNode * node = pathNodes[i]; QList hitsThisNode; QList< QSharedPointer > queryHits = query->getHits(); for (int j = 0; j < queryHits.size(); ++j) { BlastHit * hit = queryHits[j].data(); if (hit->m_node->getName() == node->getName()) hitsThisNode.push_back(hit); } std::sort(hitsThisNode.begin(), hitsThisNode.end(), BlastHit::compareTwoBlastHitPointers); for (int j = 0; j < hitsThisNode.size(); ++j) { BlastHit * hit = hitsThisNode[j]; //First check to make sure the hits are within the path. This means //if we are in the first or last nodes of the path, we need to make //sure that our hit is contained within the start/end positions. if ( (i != 0 || hit->m_nodeStart >= m_path.getStartLocation().getPosition()) && (i != pathNodes.size()-1 || hit->m_nodeEnd <= m_path.getEndLocation().getPosition())) { //Now make sure that the hit follows the previous hit in the //query. if (previousHit == 0 || hit->m_queryStart > previousHit->m_queryStart) { m_hits.push_back(hit); previousHit = hit; } } } } } double BlastQueryPath::getMeanHitPercIdentity() const { int totalHitLength = 0; double sum = 0.0; for (int i = 0; i < m_hits.size(); ++i) { int hitLength = m_hits[i]->m_alignmentLength; totalHitLength += hitLength; double hitIdentity = m_hits[i]->m_percentIdentity; sum += hitIdentity * hitLength; } if (totalHitLength == 0) return 0.0; else return sum / totalHitLength; } //This function looks at all of the hits in the path for this query and //multiplies the e-values together. If the hits overlap each other, then //this function reduces the e-values accoringly (effectively to prevent //the overlapping region from being counted twice). SciNot BlastQueryPath::getEvalueProduct() const { double coefficientProduct = 1.0; int exponentSum = 0; for (int i = 0; i < m_hits.size(); ++i) { BlastHit * thisHit = m_hits[i]; SciNot thisHitEValue = thisHit->m_eValue; double eValueLenToRemove = 0.0; if (i > 0) { BlastHit * previousHit = m_hits[i-1]; int overlap = getHitOverlap(previousHit, thisHit); if (overlap > 0) eValueLenToRemove += overlap / 2.0; } if (i < m_hits.size() - 1) { BlastHit * nextHit = m_hits[i+1]; int overlap = getHitOverlap(thisHit, nextHit); if (overlap > 0) eValueLenToRemove += overlap / 2.0; } if (eValueLenToRemove > 0.0) { int thisHitLength = thisHit->getNodeLength(); double reduction = (thisHitLength - eValueLenToRemove) / thisHitLength; thisHitEValue.power(reduction); } coefficientProduct *= thisHitEValue.getCoefficient(); exponentSum += thisHitEValue.getExponent(); } return SciNot(coefficientProduct, exponentSum); } int BlastQueryPath::getHitOverlap(BlastHit * hit1, BlastHit * hit2) const { int hit1Start, hit1End, hit2Start, hit2End; QPair possibleEdge(hit1->m_node, hit2->m_node); // Overlap in the same node is simple. if (hit1->m_node == hit2->m_node) { hit1Start = hit1->m_nodeStart - 1; hit1End = hit1->m_nodeEnd; hit2Start = hit2->m_nodeStart - 1; hit2End = hit2->m_nodeEnd; } // Overlap in connected nodes is a bit more complex - we need to express // the second hit's coordinates in terms of the first hit's node. else if (g_assemblyGraph->m_deBruijnGraphEdges.contains(possibleEdge)) { DeBruijnEdge * edge = g_assemblyGraph->m_deBruijnGraphEdges[possibleEdge]; int overlap = edge->getOverlap(); hit1Start = hit1->m_nodeStart; hit1End = hit1->m_nodeEnd; int hit1NodeLen = hit1->m_node->getLength(); hit2Start = hit2->m_nodeStart + hit1NodeLen - overlap; hit2End = hit2->m_nodeEnd + hit1NodeLen - overlap; } else return 0; int overlap = std::min(hit1End, hit2End) - std::max(hit1Start, hit2Start); if (overlap > 0) return overlap; else return 0; } //This function looks at the length of the given path and compares it to how //long the path should be for the hits it contains (i.e. if the path perfectly //matched up the query). double BlastQueryPath::getRelativeLengthDiscrepancy() const { if (m_hits.empty()) return std::numeric_limits::max(); int hitQueryLength = getHitQueryLength(); int discrepancy = m_path.getLength() - hitQueryLength; return double(discrepancy) / hitQueryLength; } //This function gets the length of the path relative to the how long it should //be. A value of 1 means a perfect match; less than 1 means it is too short; //more than 1 means it is too long. double BlastQueryPath::getRelativePathLength() const { return double(m_path.getLength()) / getHitQueryLength(); } //This function gets the difference between how long the path is vs how long it //should be. A value of 0 means a perfect match; less than 0 means it is too //short; more than 0 means it is too long. int BlastQueryPath::getAbsolutePathLengthDifference() const { return m_path.getLength() - getHitQueryLength(); } QString BlastQueryPath::getAbsolutePathLengthDifferenceString(bool commas) const { int lengthDisc = getAbsolutePathLengthDifference(); QString lengthDiscSign = ""; if (lengthDisc > 0) lengthDiscSign = "+"; if (commas) return lengthDiscSign + formatIntForDisplay(lengthDisc); else return lengthDiscSign + QString::number(lengthDisc); } //This function returns the fraction of the query that is covered by the entire //path. double BlastQueryPath::getPathQueryCoverage() const { if (m_hits.empty()) return 0.0; int queryStart = m_hits.front()->m_queryStart; int queryEnd = m_hits.back()->m_queryEnd; int queryLength = m_query->getLength(); int notIncluded = queryStart - 1; notIncluded += queryLength - queryEnd; return 1.0 - notIncluded / double(queryLength); } //This function returns the fraction of the query that is covered by hits in the //path. double BlastQueryPath::getHitsQueryCoverage() const { return m_query->fractionCoveredByHits(&m_hits); } //This function returns the length of the query which is covered by the path. //It is returned in bp, whether or not the query is a protein or nucleotide //sequence. int BlastQueryPath::getHitQueryLength() const { int queryStart = m_hits.front()->m_queryStart; int queryEnd = m_hits.back()->m_queryEnd; int hitQueryLength = queryEnd - queryStart + 1; if (m_query->getSequenceType() == PROTEIN) hitQueryLength *= 3; return hitQueryLength; } int BlastQueryPath::getTotalHitMismatches() const { int total = 0; for (int i = 0; i < m_hits.size(); ++i) total += m_hits[i]->m_numberMismatches; return total; } int BlastQueryPath::getTotalHitGapOpens() const { int total = 0; for (int i = 0; i < m_hits.size(); ++i) total += m_hits[i]->m_numberGapOpens; return total; } //This function is used for sorting the paths for a query from best to worst. //it uses < to mean 'better than'. bool BlastQueryPath::operator<(BlastQueryPath const &other) const { //First we compare using the E-value product. This seems to value stronger //hits as well as paths with fewer, longer hits. SciNot aEValueProduct = getEvalueProduct(); SciNot bEValueProduct = other.getEvalueProduct(); if (aEValueProduct != bEValueProduct) return aEValueProduct < bEValueProduct; //If the code got here, then the two paths have the same e-value product, //possibly because they contain the same hits, or possibly because they both //contain hits so strong as to have an e-value of zero. //Now we compare using mean percent identity. double aMeanPercIdentity = getMeanHitPercIdentity(); double bMeanPercIdentity = other.getMeanHitPercIdentity(); if (aMeanPercIdentity != bMeanPercIdentity) return aMeanPercIdentity > bMeanPercIdentity; //Now we use the absolute value of the length discrepancy. double aLengthDiscrepancy = fabs(getRelativeLengthDiscrepancy()); double bLengthDiscrepancy = fabs(other.getRelativeLengthDiscrepancy()); if (aLengthDiscrepancy != bLengthDiscrepancy) return aLengthDiscrepancy < bLengthDiscrepancy; //Now we use fraction of query covered by hits. double aHitsQueryCoverage = getHitsQueryCoverage(); double bHitsQueryCoverage = other.getHitsQueryCoverage(); if (aHitsQueryCoverage != bHitsQueryCoverage) return aHitsQueryCoverage > bHitsQueryCoverage; return false; }