//Copyright 2017 Ryan Wick //This file is part of Bandage //Bandage is free software: you can redistribute it and/or modify //it under the terms of the GNU General Public License as published by //the Free Software Foundation, either version 3 of the License, or //(at your option) any later version. //Bandage is distributed in the hope that it will be useful, //but WITHOUT ANY WARRANTY; without even the implied warranty of //MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the //GNU General Public License for more details. //You should have received a copy of the GNU General Public License //along with Bandage. If not, see . #include "blastsearch.h" #include "../graph/assemblygraph.h" #include #include #include "buildblastdatabaseworker.h" #include "runblastsearchworker.h" #include "../program/settings.h" #include #include "../graph/debruijnnode.h" #include "../program/memory.h" #include BlastSearch::BlastSearch() : m_blastQueries(), m_tempDirectory("bandage_temp/") { } BlastSearch::~BlastSearch() { cleanUp(); } void BlastSearch::clearBlastHits() { m_allHits.clear(); m_blastQueries.clearSearchResults(); m_blastOutput = ""; } void BlastSearch::cleanUp() { clearBlastHits(); m_blastQueries.clearAllQueries(); emptyTempDirectory(); } //This function uses the contents of m_blastOutput (the raw output from the //BLAST search) to construct the BlastHit objects. //It looks at the filters to possibly exclude hits which fail to meet user- //defined thresholds. void BlastSearch::buildHitsFromBlastOutput() { QStringList blastHitList = m_blastOutput.split("\n", Qt::SkipEmptyParts); for (int i = 0; i < blastHitList.size(); ++i) { QString hitString = blastHitList[i]; QStringList alignmentParts = hitString.split('\t'); if (alignmentParts.size() < 12) continue; QString queryName = alignmentParts[0]; QString nodeLabel = alignmentParts[1]; double percentIdentity = alignmentParts[2].toDouble(); int alignmentLength = alignmentParts[3].toInt(); int numberMismatches = alignmentParts[4].toInt(); int numberGapOpens = alignmentParts[5].toInt(); int queryStart = alignmentParts[6].toInt(); int queryEnd = alignmentParts[7].toInt(); int nodeStart = alignmentParts[8].toInt(); int nodeEnd = alignmentParts[9].toInt(); SciNot eValue(alignmentParts[10]); double bitScore = alignmentParts[11].toDouble(); //Only save BLAST hits that are on forward strands. if (nodeStart > nodeEnd) continue; QString nodeName = getNodeNameFromString(nodeLabel); DeBruijnNode * node; if (g_assemblyGraph->m_deBruijnGraphNodes.contains(nodeName)) node = g_assemblyGraph->m_deBruijnGraphNodes[nodeName]; else continue; BlastQuery * query = g_blastSearch->m_blastQueries.getQueryFromName(queryName); if (query == 0) continue; QSharedPointer hit(new BlastHit(query, node, percentIdentity, alignmentLength, numberMismatches, numberGapOpens, queryStart, queryEnd, nodeStart, nodeEnd, eValue, bitScore)); //Check the user-defined filters. if (g_settings->blastAlignmentLengthFilter.on && alignmentLength < g_settings->blastAlignmentLengthFilter) continue; if (g_settings->blastQueryCoverageFilter.on) { double hitCoveragePercentage = 100.0 * hit->getQueryCoverageFraction(); if (hitCoveragePercentage < g_settings->blastQueryCoverageFilter) continue; } if (g_settings->blastIdentityFilter.on && percentIdentity < g_settings->blastIdentityFilter) continue; if (g_settings->blastEValueFilter.on && eValue > g_settings->blastEValueFilter) continue; if (g_settings->blastBitScoreFilter.on && bitScore < g_settings->blastBitScoreFilter) continue; m_allHits.push_back(hit); query->addHit(hit); } } //This function looks at each BLAST query and tries to find a path through //the graph which covers the maximal amount of the query. void BlastSearch::findQueryPaths() { m_blastQueries.findQueryPaths(); } QString BlastSearch::getNodeNameFromString(QString nodeString) { QStringList nodeStringParts = nodeString.split("_"); //The node string format should look like this: //NODE_nodename_length_123_cov_1.23 if (nodeStringParts.size() < 6) return ""; if (nodeStringParts.size() == 6) return nodeStringParts[1]; //If the code got here, there are more than 6 parts. This means there are //underscores in the node name (happens a lot with Trinity graphs). So we //need to pull out the parts which consitute the name. int underscoreCount = nodeStringParts.size() - 6; QString nodeName = ""; for (int i = 0; i <= underscoreCount; ++i) { nodeName += nodeStringParts[1+i]; if (i < underscoreCount) nodeName += "_"; } return nodeName; } #ifdef Q_OS_WIN32 //On Windows, we use the WHERE command to find a program. bool BlastSearch::findProgram(QString programName, QString * command) { QProcess find; find.start("WHERE", QStringList(programName)); find.waitForFinished(); *command = programName; return (find.exitCode() == 0); } #else //On Mac/Linux we use the which command to find a program. bool BlastSearch::findProgram(QString programName, QString * command) { QProcess find; //On Mac, it's necessary to adjust the PATH variable in order //for which to work. #ifdef Q_OS_MAC QProcessEnvironment env = QProcessEnvironment::systemEnvironment(); QStringList envlist = env.toStringList(); //Add some paths to the process environment envlist.replaceInStrings(QRegularExpression("^(?i)PATH=(.*)"), "PATH=" "/usr/bin:" "/bin:" "/usr/sbin:" "/sbin:" "/opt/local/bin:" "/usr/local/bin:" "/opt/homebrew/bin:" "$HOME/bin:" "$HOME/.local/bin:" "$HOME/miniconda3/bin:" "/usr/local/ncbi/blast/bin:" "\\1"); find.setEnvironment(envlist); #endif find.start("which", QStringList(programName)); find.waitForFinished(); //On a Mac, we need to use the full path to the program. #ifdef Q_OS_MAC *command = QString(find.readAll()).simplified(); #else *command = programName; #endif return (find.exitCode() == 0); } #endif void BlastSearch::clearSomeQueries(std::vector queriesToRemove) { //Remove any hits that are for queries that will be deleted. QList< QSharedPointer >::iterator i = m_allHits.begin(); while (i != m_allHits.end()) { BlastQuery * hitQuery = (*i)->m_query; bool hitIsForDeletedQuery = (std::find(queriesToRemove.begin(), queriesToRemove.end(), hitQuery) != queriesToRemove.end()); if (hitIsForDeletedQuery) i = m_allHits.erase(i); else ++i; } //Now actually delete the queries. m_blastQueries.clearSomeQueries(queriesToRemove); } void BlastSearch::emptyTempDirectory() { //Safety checks if (g_blastSearch->m_tempDirectory == "") return; if (!g_blastSearch->m_tempDirectory.contains("bandage_temp")) return; QDir tempDirectory(m_tempDirectory); tempDirectory.setNameFilters(QStringList() << "*.*"); tempDirectory.setFilter(QDir::Files); foreach(QString dirFile, tempDirectory.entryList()) tempDirectory.remove(dirFile); } //This function carries out the entire BLAST search procedure automatically, without user input. //It returns an error string which is empty if all goes well. QString BlastSearch::doAutoBlastSearch() { cleanUp(); QString makeblastdbCommand; if (!findProgram("makeblastdb", &makeblastdbCommand)) return "Error: The program makeblastdb was not found. Please install NCBI BLAST to use this feature."; BuildBlastDatabaseWorker buildBlastDatabaseWorker(makeblastdbCommand); buildBlastDatabaseWorker.buildBlastDatabase(); if (buildBlastDatabaseWorker.m_error != "") return buildBlastDatabaseWorker.m_error; loadBlastQueriesFromFastaFile(g_settings->blastQueryFilename); QString blastnCommand; if (!findProgram("blastn", &blastnCommand)) return "Error: The program blastn was not found. Please install NCBI BLAST to use this feature."; QString tblastnCommand; if (!findProgram("tblastn", &tblastnCommand)) return "Error: The program tblastn was not found. Please install NCBI BLAST to use this feature."; RunBlastSearchWorker runBlastSearchWorker(blastnCommand, tblastnCommand, g_settings->blastSearchParameters); runBlastSearchWorker.runBlastSearch(); if (runBlastSearchWorker.m_error != "") return runBlastSearchWorker.m_error; blastQueryChanged("all"); return ""; } //This function returns the number of queries loaded from the FASTA file. int BlastSearch::loadBlastQueriesFromFastaFile(QString fullFileName) { int queriesBefore = int(g_blastSearch->m_blastQueries.m_queries.size()); std::vector queryNames; std::vector querySequences; AssemblyGraph::readFastaOrFastqFile(fullFileName, &queryNames, &querySequences); for (size_t i = 0; i < queryNames.size(); ++i) { QApplication::processEvents(); //We only use the part of the query name up to the first space. QStringList queryNameParts = queryNames[i].split(" "); QString queryName; if (queryNameParts.size() > 0) queryName = cleanQueryName(queryNameParts[0]); g_blastSearch->m_blastQueries.addQuery(new BlastQuery(queryName, querySequences[i])); } int queriesAfter = int(g_blastSearch->m_blastQueries.m_queries.size()); return queriesAfter - queriesBefore; } QString BlastSearch::cleanQueryName(QString queryName) { //Replace whitespace with underscores queryName = queryName.replace(QRegularExpression("\\s"), "_"); //Remove any dots from the end of the query name. BLAST doesn't //include them in its results, so if we don't remove them, then //we won't be able to find a match between the query name and //the BLAST hit. while (queryName.length() > 0 && queryName[queryName.size() - 1] == '.') queryName = queryName.left(queryName.size() - 1); return queryName; } void BlastSearch::blastQueryChanged(QString queryName) { g_assemblyGraph->clearAllBlastHitPointers(); std::vector queries; //If "all" is selected, then we'll display each of the BLAST queries if (queryName == "all") queries = g_blastSearch->m_blastQueries.m_queries; //If only one query is selected, then just display that one. else { BlastQuery * query = g_blastSearch->m_blastQueries.getQueryFromName(queryName); if (query != 0) queries.push_back(query); } //We now filter out any queries that have been hidden by the user. std::vector shownQueries; for (size_t i = 0; i < queries.size(); ++i) { BlastQuery * query = queries[i]; if (query->isShown()) shownQueries.push_back(query); } //Add the blast hit pointers to nodes that have a hit for //the selected target(s). for (size_t i = 0; i < shownQueries.size(); ++i) { BlastQuery * query = shownQueries[i]; for (int j = 0; j < g_blastSearch->m_allHits.size(); ++j) { BlastHit * hit = g_blastSearch->m_allHits[j].data(); if (hit->m_query == query) hit->m_node->addBlastHit(hit); } } }