//Copyright 2017 Ryan Wick
//This file is part of Bandage
//Bandage is free software: you can redistribute it and/or modify
//it under the terms of the GNU General Public License as published by
//the Free Software Foundation, either version 3 of the License, or
//(at your option) any later version.
//Bandage is distributed in the hope that it will be useful,
//but WITHOUT ANY WARRANTY; without even the implied warranty of
//MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
//GNU General Public License for more details.
//You should have received a copy of the GNU General Public License
//along with Bandage. If not, see .
#include "blastquery.h"
#include "../program/settings.h"
#include "../graph/path.h"
#include "../graph/debruijnnode.h"
#include
#include
#include
BlastQuery::BlastQuery(QString name, QString sequence) :
m_name(name), m_sequence(sequence), m_searchedFor(false), m_shown(true)
{
autoSetSequenceType();
}
//This function looks at the query sequence to decide if it is
//a nucleotide or protein sequence.
void BlastQuery::autoSetSequenceType()
{
//If the sequence contains a letter that's in the protein
//alphabet but not in the extended DNA alphabet, then it's
//a protein
if (m_sequence.contains('e') || m_sequence.contains('E') ||
m_sequence.contains('f') || m_sequence.contains('F') ||
m_sequence.contains('i') || m_sequence.contains('I') ||
m_sequence.contains('l') || m_sequence.contains('L') ||
m_sequence.contains('p') || m_sequence.contains('P') ||
m_sequence.contains('q') || m_sequence.contains('Q'))
{
m_sequenceType = PROTEIN;
return;
}
//If the code got here, then it's a bit trickier. It could
//possibly be an extended alphabet DNA sequence or a protein
//sequence without particular amino acids.
//Look to see if A, C, G, T and N make up 75% or more of
//the sequence. If so, it's DNA. If not, it's
//protein.
int length = m_sequence.length();
int nuclLetters = m_sequence.count('a') + m_sequence.count('A') +
m_sequence.count('c') + m_sequence.count('C') +
m_sequence.count('g') + m_sequence.count('G') +
m_sequence.count('t') + m_sequence.count('T') +
m_sequence.count('n') + m_sequence.count('N');
if (double(nuclLetters) / length >= 0.75)
m_sequenceType = NUCLEOTIDE;
else
m_sequenceType = PROTEIN;
}
QString BlastQuery::getTypeString() const
{
if (m_sequenceType == NUCLEOTIDE)
return "nucl";
else
return "prot";
}
void BlastQuery::clearSearchResults()
{
m_searchedFor = false;
m_hits.clear();
}
//This function tries to find the paths through the graph which cover the query.
void BlastQuery::findQueryPaths()
{
m_paths = QList();
if (m_hits.size() > g_settings->maxHitsForQueryPath)
return;
int queryLength = m_sequence.length();
if (m_sequenceType == PROTEIN)
queryLength *= 3;
//Find all possible path starts within an acceptable distance from the query
//start.
QList possibleStarts;
double acceptableStartFraction = 1.0 - g_settings->minQueryCoveredByPath;
for (int i = 0; i < m_hits.size(); ++i)
{
BlastHit * hit = m_hits[i].data();
if (hit->m_queryStartFraction <= acceptableStartFraction)
possibleStarts.push_back(hit);
}
//Find all possible path ends.
QList possibleEnds;
double acceptableEndFraction = g_settings->minQueryCoveredByPath;
for (int i = 0; i < m_hits.size(); ++i)
{
BlastHit * hit = m_hits[i].data();
if (hit->m_queryEndFraction >= acceptableEndFraction)
possibleEnds.push_back(hit);
}
//For each possible start, find paths to each possible end.
QList possiblePaths;
for (int i = 0; i < possibleStarts.size(); ++i)
{
BlastHit * start = possibleStarts[i];
GraphLocation startLocation = start->getHitStart();
for (int j = 0; j < possibleEnds.size(); ++j)
{
BlastHit * end = possibleEnds[j];
GraphLocation endLocation = end->getHitEnd();
//Assuming there is a path from the start hit to the end hit,
//determine the ideal length. This is the query length minus the
//parts of the query not covered by the start and end.
int partialQueryLength = queryLength;
int pathStart = start->m_queryStart - 1;
int pathEnd = end->m_queryEnd;
if (m_sequenceType == PROTEIN)
{
pathStart *= 3;
pathEnd *= 3;
}
partialQueryLength -= pathStart;
partialQueryLength -= queryLength - pathEnd;
//Determine the minimum and maximum lengths allowed for the path.
int minLength;
if (g_settings->minLengthPercentage.on && g_settings->minLengthBaseDiscrepancy.on) //both on
minLength = std::max(int(partialQueryLength * g_settings->minLengthPercentage + 0.5), partialQueryLength + g_settings->minLengthBaseDiscrepancy);
else if (g_settings->minLengthPercentage.on && !g_settings->minLengthBaseDiscrepancy.on) //just relative
minLength = int(partialQueryLength * g_settings->minLengthPercentage + 0.5);
else if (!g_settings->minLengthPercentage.on && g_settings->minLengthBaseDiscrepancy.on) //just absolute
minLength = partialQueryLength + g_settings->minLengthBaseDiscrepancy;
else //neither are on
minLength = 1;
int maxLength;
if (g_settings->maxLengthPercentage.on && g_settings->maxLengthBaseDiscrepancy.on) //both on
maxLength = std::min(int(partialQueryLength * g_settings->maxLengthPercentage + 0.5), partialQueryLength + g_settings->maxLengthBaseDiscrepancy);
else if (g_settings->maxLengthPercentage.on && !g_settings->maxLengthBaseDiscrepancy.on) //just relative
maxLength = int(partialQueryLength * g_settings->maxLengthPercentage + 0.5);
else if (!g_settings->maxLengthPercentage.on && g_settings->maxLengthBaseDiscrepancy.on) //just absolute
maxLength = partialQueryLength + g_settings->maxLengthBaseDiscrepancy;
else //neither are on
maxLength = std::numeric_limits::max();
possiblePaths.append(Path::getAllPossiblePaths(startLocation,
endLocation,
g_settings->maxQueryPathNodes - 1,
minLength,
maxLength));
}
}
//Now we use the Path objects to make BlastQueryPath objects. These contain
//BLAST-specific information that the Path class doesn't.
QList blastQueryPaths;
for (int i = 0; i < possiblePaths.size(); ++i)
blastQueryPaths.push_back(BlastQueryPath(possiblePaths[i], this));
//We now want to throw out any paths for which the hits fail to meet the
//thresholds in settings.
QList sufficientCoveragePaths;
for (int i = 0; i < blastQueryPaths.size(); ++i)
{
if (blastQueryPaths[i].getPathQueryCoverage() < g_settings->minQueryCoveredByPath)
continue;
if (g_settings->minQueryCoveredByHits.on && blastQueryPaths[i].getHitsQueryCoverage() < g_settings->minQueryCoveredByHits)
continue;
if (g_settings->maxEValueProduct.on && blastQueryPaths[i].getEvalueProduct() > g_settings->maxEValueProduct)
continue;
if (g_settings->minMeanHitIdentity.on && blastQueryPaths[i].getMeanHitPercIdentity() < 100.0 * g_settings->minMeanHitIdentity)
continue;
if (g_settings->minLengthPercentage.on && blastQueryPaths[i].getRelativePathLength() < g_settings->minLengthPercentage)
continue;
if (g_settings->maxLengthPercentage.on && blastQueryPaths[i].getRelativePathLength() > g_settings->maxLengthPercentage)
continue;
if (g_settings->minLengthBaseDiscrepancy.on && blastQueryPaths[i].getAbsolutePathLengthDifference() < g_settings->minLengthBaseDiscrepancy)
continue;
if (g_settings->maxLengthBaseDiscrepancy.on && blastQueryPaths[i].getAbsolutePathLengthDifference() > g_settings->maxLengthBaseDiscrepancy)
continue;
sufficientCoveragePaths.push_back(blastQueryPaths[i]);
}
//We now want to throw out any paths which are sub-paths of other, larger
//paths.
for (int i = 0; i < sufficientCoveragePaths.size(); ++i)
{
bool throwOut = false;
for (int j = 0; j < sufficientCoveragePaths.size(); ++j)
{
//No need to compare a path with itself.
if (i == j)
continue;
if (sufficientCoveragePaths[i].getPath().hasNodeSubset(sufficientCoveragePaths[j].getPath()))
{
throwOut = true;
break;
}
}
if (!throwOut)
m_paths.push_back(sufficientCoveragePaths[i]);
}
//Now we sort the paths from best to worst.
std::sort(m_paths.begin(), m_paths.end());
}
//This function returns the fraction of the query that is covered by BLAST hits.
//If a list of BLAST hits is passed to the function, it only looks in those
//hits. If no such list is passed, it looks in all hits for this query.
// http://stackoverflow.com/questions/5276686/merging-ranges-in-c
double BlastQuery::fractionCoveredByHits(const QList * hitsToCheck) const
{
int hitBases = 0;
int queryLength = getLength();
if (queryLength == 0)
return 0.0;
std::vector > ranges;
if (hitsToCheck == 0) {
for (int i = 0; i < m_hits.size(); ++i) {
BlastHit * hit = m_hits[i].data();
ranges.push_back(std::pair(hit->m_queryStart - 1, hit->m_queryEnd));
}
}
else {
for (int i = 0; i < hitsToCheck->size(); ++i) {
BlastHit * hit = (*hitsToCheck)[i];
ranges.push_back(std::pair(hit->m_queryStart - 1, hit->m_queryEnd));
}
}
if (ranges.size() == 0)
return 0.0;
std::sort(ranges.begin(), ranges.end());
std::vector > mergedRanges;
std::vector >::iterator it = ranges.begin();
std::pair current = *(it)++;
while (it != ranges.end())
{
if (current.second >= it->first) {
current.second = std::max(current.second, it->second);
} else {
mergedRanges.push_back(current);
current = *(it);
}
it++;
}
mergedRanges.push_back(current);
for (size_t i = 0; i < mergedRanges.size(); ++i)
hitBases += mergedRanges[i].second - mergedRanges[i].first;
return double(hitBases) / queryLength;
}