//Copyright 2017 Ryan Wick //This file is part of Bandage //Bandage is free software: you can redistribute it and/or modify //it under the terms of the GNU General Public License as published by //the Free Software Foundation, either version 3 of the License, or //(at your option) any later version. //Bandage is distributed in the hope that it will be useful, //but WITHOUT ANY WARRANTY; without even the implied warranty of //MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the //GNU General Public License for more details. //You should have received a copy of the GNU General Public License //along with Bandage. If not, see . #include "assemblygraph.h" #include #include "../program/globals.h" #include "../program/settings.h" #include #include #include "../graph/debruijnnode.h" #include "../graph/debruijnedge.h" #include "../graph/graphicsitemnode.h" #include #include #include #include "../graph/graphicsitemedge.h" #include "../blast/blastsearch.h" #include "../ogdf/energybased/FMMMLayout.h" #include "../program/graphlayoutworker.h" #include "../program/memory.h" #include "path.h" #include "../ui/myprogressdialog.h" #include #include #include #include #include #include #include #include #include "ogdfnode.h" #include "../command_line/commoncommandlinefunctions.h" AssemblyGraph::AssemblyGraph() : m_kmer(0), m_contiguitySearchDone(false), m_sequencesLoadedFromFasta(NOT_READY) { m_ogdfGraph = new ogdf::Graph(); m_edgeArray = new ogdf::EdgeArray(*m_ogdfGraph); m_graphAttributes = new ogdf::GraphAttributes(*m_ogdfGraph, ogdf::GraphAttributes::nodeGraphics | ogdf::GraphAttributes::edgeGraphics); clearGraphInfo(); } AssemblyGraph::~AssemblyGraph() { delete m_graphAttributes; delete m_edgeArray; delete m_ogdfGraph; } void AssemblyGraph::cleanUp() { QMapIterator i(m_deBruijnGraphNodes); while (i.hasNext()) { i.next(); delete i.value(); } m_deBruijnGraphNodes.clear(); QMapIterator, DeBruijnEdge*> j(m_deBruijnGraphEdges); while (j.hasNext()) { j.next(); delete j.value(); } m_deBruijnGraphEdges.clear(); m_contiguitySearchDone = false; clearGraphInfo(); } //This function makes a double edge: in one direction for the given nodes //and the opposite direction for their reverse complements. It adds the //new edges to the vector here and to the nodes themselves. void AssemblyGraph::createDeBruijnEdge(QString node1Name, QString node2Name, int overlap, EdgeOverlapType overlapType) { QString node1Opposite = getOppositeNodeName(node1Name); QString node2Opposite = getOppositeNodeName(node2Name); //Quit if any of the nodes don't exist. if (!m_deBruijnGraphNodes.contains(node1Name) || !m_deBruijnGraphNodes.contains(node2Name) || !m_deBruijnGraphNodes.contains(node1Opposite) || !m_deBruijnGraphNodes.contains(node2Opposite)) return; DeBruijnNode * node1 = m_deBruijnGraphNodes[node1Name]; DeBruijnNode * node2 = m_deBruijnGraphNodes[node2Name]; DeBruijnNode * negNode1 = m_deBruijnGraphNodes[node1Opposite]; DeBruijnNode * negNode2 = m_deBruijnGraphNodes[node2Opposite]; //Quit if the edge already exists const std::vector * edges = node1->getEdgesPointer(); for (size_t i = 0; i < edges->size(); ++i) { if ((*edges)[i]->getStartingNode() == node1 && (*edges)[i]->getEndingNode() == node2) return; } //Usually, an edge has a different pair, but it is possible //for an edge to be its own pair. bool isOwnPair = (node1 == negNode2 && node2 == negNode1); DeBruijnEdge * forwardEdge = new DeBruijnEdge(node1, node2); DeBruijnEdge * backwardEdge; if (isOwnPair) backwardEdge = forwardEdge; else backwardEdge = new DeBruijnEdge(negNode2, negNode1); forwardEdge->setReverseComplement(backwardEdge); backwardEdge->setReverseComplement(forwardEdge); forwardEdge->setOverlap(overlap); backwardEdge->setOverlap(overlap); forwardEdge->setOverlapType(overlapType); backwardEdge->setOverlapType(overlapType); m_deBruijnGraphEdges.insert(QPair(forwardEdge->getStartingNode(), forwardEdge->getEndingNode()), forwardEdge); if (!isOwnPair) m_deBruijnGraphEdges.insert(QPair(backwardEdge->getStartingNode(), backwardEdge->getEndingNode()), backwardEdge); node1->addEdge(forwardEdge); node2->addEdge(forwardEdge); negNode1->addEdge(backwardEdge); negNode2->addEdge(backwardEdge); } void AssemblyGraph::clearOgdfGraphAndResetNodes() { QMapIterator i(m_deBruijnGraphNodes); while (i.hasNext()) { i.next(); i.value()->resetNode(); } m_ogdfGraph->clear(); m_edgeArray->init(*m_ogdfGraph); } //http://www.code10.info/index.php?option=com_content&view=article&id=62:articledna-reverse-complement&catid=49:cat_coding_algorithms_bioinformatics&Itemid=74 QByteArray AssemblyGraph::getReverseComplement(QByteArray forwardSequence) { QByteArray reverseComplement; for (int i = forwardSequence.length() - 1; i >= 0; --i) { char letter = forwardSequence.at(i); switch (letter) { case 'A': reverseComplement.append('T'); break; case 'T': reverseComplement.append('A'); break; case 'G': reverseComplement.append('C'); break; case 'C': reverseComplement.append('G'); break; case 'a': reverseComplement.append('t'); break; case 't': reverseComplement.append('a'); break; case 'g': reverseComplement.append('c'); break; case 'c': reverseComplement.append('g'); break; case 'R': reverseComplement.append('Y'); break; case 'Y': reverseComplement.append('R'); break; case 'S': reverseComplement.append('S'); break; case 'W': reverseComplement.append('W'); break; case 'K': reverseComplement.append('M'); break; case 'M': reverseComplement.append('K'); break; case 'r': reverseComplement.append('y'); break; case 'y': reverseComplement.append('r'); break; case 's': reverseComplement.append('s'); break; case 'w': reverseComplement.append('w'); break; case 'k': reverseComplement.append('m'); break; case 'm': reverseComplement.append('k'); break; case 'B': reverseComplement.append('V'); break; case 'D': reverseComplement.append('H'); break; case 'H': reverseComplement.append('D'); break; case 'V': reverseComplement.append('B'); break; case 'b': reverseComplement.append('v'); break; case 'd': reverseComplement.append('h'); break; case 'h': reverseComplement.append('d'); break; case 'v': reverseComplement.append('b'); break; case 'N': reverseComplement.append('N'); break; case 'n': reverseComplement.append('n'); break; case '.': reverseComplement.append('.'); break; case '-': reverseComplement.append('-'); break; case '?': reverseComplement.append('?'); break; case '*': reverseComplement.append('*'); break; } } return reverseComplement; } void AssemblyGraph::resetEdges() { QMapIterator, DeBruijnEdge*> i(m_deBruijnGraphEdges); while (i.hasNext()) { i.next(); i.value()->reset(); } } double AssemblyGraph::getMeanDepth(bool drawnNodesOnly) { long double depthSum = 0.0; long long totalLength = 0; QMapIterator i(m_deBruijnGraphNodes); while (i.hasNext()) { i.next(); DeBruijnNode * node = i.value(); if (drawnNodesOnly && node->isNotDrawn()) continue; totalLength += node->getLength(); depthSum += node->getLength() * node->getDepth(); } if (totalLength == 0) return 0.0; else return depthSum / totalLength; } double AssemblyGraph::getMeanDepth(std::vector nodes) { if (nodes.size() == 0) return 0.0; if (nodes.size() == 1) return nodes[0]->getDepth(); int nodeCount = 0; long double depthSum = 0.0; long long totalLength = 0; for (size_t i = 0; i < nodes.size(); ++i) { DeBruijnNode * node = nodes[i]; ++nodeCount; totalLength += node->getLength(); depthSum += node->getLength() * node->getDepth(); } //If the total length is zero, that means all nodes have a length of zero. //In this case, just return the average node depth. if (totalLength == 0) { long double depthSum = 0.0; for (size_t i = 0; i < nodes.size(); ++i) depthSum += nodes[i]->getDepth(); return depthSum / nodes.size(); } return depthSum / totalLength; } double AssemblyGraph::getMeanDepth(QList nodes) { int nodeCount = 0; long double depthSum = 0.0; long long totalLength = 0; for (int i = 0; i < nodes.size(); ++i) { DeBruijnNode * node = nodes[i]; ++nodeCount; totalLength += node->getLength(); depthSum += node->getLength() * node->getDepth(); } if (totalLength == 0) return 0.0; else return depthSum / totalLength; } void AssemblyGraph::resetNodeContiguityStatus() { QMapIterator i(m_deBruijnGraphNodes); while (i.hasNext()) { i.next(); i.value()->resetContiguityStatus(); } m_contiguitySearchDone = false; resetAllNodeColours(); } void AssemblyGraph::resetAllNodeColours() { QMapIterator i(m_deBruijnGraphNodes); while (i.hasNext()) { i.next(); if (i.value()->getGraphicsItemNode() != 0) i.value()->getGraphicsItemNode()->setNodeColour(); } } void AssemblyGraph::clearAllBlastHitPointers() { QMapIterator i(m_deBruijnGraphNodes); while (i.hasNext()) { i.next(); i.value()->clearBlastHits(); } } void AssemblyGraph::determineGraphInfo() { m_shortestContig = std::numeric_limits::max(); m_longestContig = 0; int nodeCount = 0; long long totalLength = 0; std::vector nodeDepths; QMapIterator i(m_deBruijnGraphNodes); while (i.hasNext()) { i.next(); long long nodeLength = i.value()->getLength(); if (nodeLength < m_shortestContig) m_shortestContig = nodeLength; if (nodeLength > m_longestContig) m_longestContig = nodeLength; //Only add up the length for positive nodes if (i.value()->isPositiveNode()) { totalLength += nodeLength; ++nodeCount; } nodeDepths.push_back(i.value()->getDepth()); } //Count up the edges that will be shown in single mode (i.e. positive //edges). int edgeCount = 0; QMapIterator, DeBruijnEdge*> j(m_deBruijnGraphEdges); while (j.hasNext()) { j.next(); DeBruijnEdge * edge = j.value(); if (edge->isPositiveEdge()) ++edgeCount; } m_nodeCount = nodeCount; m_edgeCount = edgeCount; m_totalLength = totalLength; m_meanDepth = getMeanDepth(); std::sort(nodeDepths.begin(), nodeDepths.end()); double firstQuartileIndex = (nodeDepths.size() - 1) / 4.0; double medianIndex = (nodeDepths.size() - 1) / 2.0; double thirdQuartileIndex = (nodeDepths.size() - 1) * 3.0 / 4.0; m_firstQuartileDepth = getValueUsingFractionalIndex(&nodeDepths, firstQuartileIndex); m_medianDepth = getValueUsingFractionalIndex(&nodeDepths, medianIndex); m_thirdQuartileDepth = getValueUsingFractionalIndex(&nodeDepths, thirdQuartileIndex); //Set the auto node length setting. This is determined by aiming for a //target average node length. But if the graph is small, the value will be //increased (to avoid having an overly small and simple graph layout). double targetDrawnGraphLength = std::max(m_nodeCount * g_settings->meanNodeLength, g_settings->minTotalGraphLength); double megabases = totalLength / 1000000.0; if (megabases > 0.0) g_settings->autoNodeLengthPerMegabase = targetDrawnGraphLength / megabases; else g_settings->autoNodeLengthPerMegabase = 10000.0; } template double AssemblyGraph::getValueUsingFractionalIndex(std::vector * v, double index) const { if (v->size() == 0) return 0.0; if (v->size() == 1) return double((*v)[0]); int wholePart = floor(index); if (wholePart < 0) return double((*v)[0]); if (wholePart >= int(v->size()) - 1) return double((*v)[v->size() - 1]); double fractionalPart = index - wholePart; double piece1 = double((*v)[wholePart]); double piece2 = double((*v)[wholePart+1]); return piece1 * (1.0 - fractionalPart) + piece2 * fractionalPart; } void AssemblyGraph::clearGraphInfo() { m_totalLength = 0; m_shortestContig = 0; m_longestContig = 0; m_meanDepth = 0.0; m_firstQuartileDepth = 0.0; m_medianDepth = 0.0; m_thirdQuartileDepth = 0.0; } void AssemblyGraph::buildDeBruijnGraphFromLastGraph(QString fullFileName) { m_graphFileType = LAST_GRAPH; m_filename = fullFileName; m_depthTag = "KC"; bool firstLine = true; QFile inputFile(fullFileName); if (inputFile.open(QIODevice::ReadOnly)) { QTextStream in(&inputFile); while (!in.atEnd()) { QApplication::processEvents(); QString line = in.readLine(); if (firstLine) { QStringList firstLineParts = line.split(QRegularExpression("\\s+")); if (firstLineParts.size() > 2) m_kmer = firstLineParts[2].toInt(); firstLine = false; } if (line.startsWith("NODE")) { QStringList nodeDetails = line.split(QRegularExpression("\\s+")); if (nodeDetails.size() < 4) throw "load error"; QString nodeName = nodeDetails.at(1); QString posNodeName = nodeName + "+"; QString negNodeName = nodeName + "-"; int nodeLength = nodeDetails.at(2).toInt(); double nodeDepth; if (nodeLength > 0) nodeDepth = double(nodeDetails.at(3).toInt()) / nodeLength; //IS THIS COLUMN ($COV_SHORT1) THE BEST ONE TO USE? else nodeDepth = double(nodeDetails.at(3).toInt()); QByteArray sequence = in.readLine().toLocal8Bit(); QByteArray revCompSequence = in.readLine().toLocal8Bit(); DeBruijnNode * node = new DeBruijnNode(posNodeName, nodeDepth, sequence); DeBruijnNode * reverseComplementNode = new DeBruijnNode(negNodeName, nodeDepth, revCompSequence); node->setReverseComplement(reverseComplementNode); reverseComplementNode->setReverseComplement(node); m_deBruijnGraphNodes.insert(posNodeName, node); m_deBruijnGraphNodes.insert(negNodeName, reverseComplementNode); } //ARC lines contain edges. else if (line.startsWith("ARC")) { QStringList arcDetails = line.split(QRegularExpression("\\s+")); if (arcDetails.size() < 3) throw "load error"; QString node1Name = convertNormalNumberStringToBandageNodeName(arcDetails.at(1)); QString node2Name = convertNormalNumberStringToBandageNodeName(arcDetails.at(2)); createDeBruijnEdge(node1Name, node2Name); } //NR lines occur after ARC lines, so we can quit looking when we see one. else if (line.startsWith("NR")) break; } inputFile.close(); setAllEdgesExactOverlap(0); } if (m_deBruijnGraphNodes.size() == 0) throw "load error"; } //This function takes a normal number string like "5" or "-6" and changes //it to "5+" or "6-" - the format of Bandage node names. QString AssemblyGraph::convertNormalNumberStringToBandageNodeName(QString number) { if (number.at(0) == '-') { number.remove(0, 1); return number + "-"; } else return number + "+"; } //This function loads a graph from a GFA file. It reports whether or not it //encountered an unsupported CIGAR string, whether the GFA has custom labels //and whether it has custom colours. void AssemblyGraph::buildDeBruijnGraphFromGfa(QString fullFileName, bool *unsupportedCigar, bool *customLabels, bool *customColours, QString *bandageOptionsError) { m_graphFileType = GFA; m_filename = fullFileName; *unsupportedCigar = false; *customLabels = false; *customColours = false; *bandageOptionsError = ""; QFile inputFile(fullFileName); if (inputFile.open(QIODevice::ReadOnly)) { std::vector edgeStartingNodeNames; std::vector edgeEndingNodeNames; std::vector edgeOverlaps; QMap colours; QMap labels; QTextStream in(&inputFile); while (!in.atEnd()) { QApplication::processEvents(); QString line = in.readLine(); QStringList lineParts = line.split(QRegularExpression("\t")); if (lineParts.size() < 1) continue; //Lines beginning with "H" are header lines. if (lineParts.at(0) == "H") { // Check for a tag containing Bandage options. for (int i = 1; i < lineParts.size(); ++i) { QString part = lineParts.at(i); if (part.size() < 6) continue; if (part.left(5) != "bn:Z:") continue; QString bandageOptionsString = part.right(part.length() - 5); QStringList bandageOptions = bandageOptionsString.split(' ', Qt::SkipEmptyParts); QStringList bandageOptionsCopy = bandageOptions; *bandageOptionsError = checkForInvalidOrExcessSettings(&bandageOptionsCopy); if (bandageOptionsError->length() == 0) parseSettings(bandageOptions); } } //Lines beginning with "S" are sequence (node) lines. if (lineParts.at(0) == "S") { if (lineParts.size() < 3) throw "load error"; QString nodeName = lineParts.at(1); if (nodeName.isEmpty()) nodeName = getUniqueNodeName("node"); if (m_deBruijnGraphNodes.contains(nodeName + "+")) throw "load error"; QByteArray sequence = lineParts.at(2).toLocal8Bit(); //Get the tags. bool kcFound = false, rcFound = false, fcFound = false, dpFound = false; double kc = 0.0, rc = 0.0, fc = 0.0, dp = 0.0; int ln = 0; QString lb, l2; QColor cl, c2; for (int i = 3; i < lineParts.size(); ++i) { QString part = lineParts.at(i); if (part.size() < 6) continue; if (part.at(2) != ':') continue; QString tag = part.left(2).toUpper(); QString valString = part.right(part.length() - 5); if (tag == "KC") { kcFound = true; kc = valString.toDouble(); } if (tag == "RC") { rcFound = true; rc = valString.toDouble(); } if (tag == "FC") { fcFound = true; fc = valString.toDouble(); } if (tag == "DP") { dpFound = true; dp = valString.toDouble(); } if (tag == "LN") ln = valString.toInt(); if (tag == "LB") lb = valString; if (tag == "CL") cl = QColor(valString); if (tag == "L2") l2 = valString; if (tag == "C2") c2 = QColor(valString); } //GFA can use * to indicate that the sequence is not in the //file. In this case, try to use the LN tag for length. If //that's not available, use a length of 0. //If there is a sequence, then the LN tag will be ignored. int length; if (sequence == "*" || sequence == "") { length = ln; sequence = ""; } else length = sequence.length(); //If there is an attribute holding the depth, we'll use that. //If there isn't, then we'll use 1.0. //We try to load 'DP' (depth), 'KC' (k-mer count), 'RC' //(read count) or 'FC'(fragment count) in that order of //preference. //If we use KC, RC or FC for the depth, then that is really a //count, so we need to divide by the sequence length to get the //depth. //We also remember which tag was used so if the graph is saved //we can use the same tag in the output. double nodeDepth = 1.0; if (dpFound) { m_depthTag = "DP"; nodeDepth = dp; } else if (kcFound) { m_depthTag = "KC"; if (length > 0) nodeDepth = kc / length; } else if (rcFound) { m_depthTag = "RC"; if (length > 0) nodeDepth = rc / length; } else if (fcFound) { m_depthTag = "FC"; if (length > 0) nodeDepth = fc / length; } //We check to see if the node ended in a "+" or "-". //If so, we assume that is giving the orientation and leave it. //And if it doesn't end in a "+" or "-", we assume "+" and add //that to the node name. QString lastChar = nodeName.right(1); if (lastChar != "+" && lastChar != "-") nodeName += "+"; // Canu nodes start with "tig" which we can remove for simplicity. nodeName = simplifyCanuNodeName(nodeName); //Save custom colours and labels to be applied later, after //reverse complement nodes are built. if (cl.isValid()) { *customColours = true; colours.insert(nodeName, cl); } if (c2.isValid()) { *customColours = true; colours.insert(getOppositeNodeName(nodeName), c2); } if (!lb.isEmpty()) { *customLabels = true; labels.insert(nodeName, lb); } if (!l2.isEmpty()) { *customLabels = true; labels.insert(getOppositeNodeName(nodeName), l2); } DeBruijnNode * node = new DeBruijnNode(nodeName, nodeDepth, sequence, length); m_deBruijnGraphNodes.insert(nodeName, node); } //Lines beginning with "L" are link (edge) lines else if (lineParts.at(0) == "L") { //Edges aren't made now, in case their sequence hasn't yet been specified. //Instead, we save the starting and ending nodes and make the edges after //we're done looking at the file. if (lineParts.size() < 6) throw "load error"; //Parts 1 and 3 hold the node names and parts 2 and 4 hold the corresponding +/-. QString startingNode = lineParts.at(1) + lineParts.at(2); QString endingNode = lineParts.at(3) + lineParts.at(4); startingNode = simplifyCanuNodeName(startingNode); endingNode = simplifyCanuNodeName(endingNode); edgeStartingNodeNames.push_back(startingNode); edgeEndingNodeNames.push_back(endingNode); //Part 5 holds the node overlap cigar string. A "*" means unspecified, so //we 0 for that. QString cigar = lineParts.at(5); if (cigar == "*") edgeOverlaps.push_back(0); if (cigarContainsOnlyM(cigar)) edgeOverlaps.push_back(getLengthFromSimpleCigar(cigar)); else { edgeOverlaps.push_back(getLengthFromCigar(cigar)); *unsupportedCigar = true; } } } //Pair up reverse complements, creating them if necessary. QMapIterator i(m_deBruijnGraphNodes); while (i.hasNext()) { i.next(); DeBruijnNode * node = i.value(); makeReverseComplementNodeIfNecessary(node); } pointEachNodeToItsReverseComplement(); //Add any custom colours or labels that were loaded. QMapIterator j(colours); while (j.hasNext()) { j.next(); QString nodeName = j.key(); if (m_deBruijnGraphNodes.contains(nodeName)) m_deBruijnGraphNodes[nodeName]->setCustomColour(j.value()); } QMapIterator k(labels); while (k.hasNext()) { k.next(); QString nodeName = k.key(); if (m_deBruijnGraphNodes.contains(nodeName)) m_deBruijnGraphNodes[nodeName]->setCustomLabel(k.value()); } //Create all of the edges. for (size_t i = 0; i < edgeStartingNodeNames.size(); ++i) { QString node1Name = edgeStartingNodeNames[i]; QString node2Name = edgeEndingNodeNames[i]; int overlap = edgeOverlaps[i]; createDeBruijnEdge(node1Name, node2Name, overlap, EXACT_OVERLAP); } } if (m_deBruijnGraphNodes.size() == 0) throw "load error"; // For Canu graphs, if there is a file called *.layout.readToTig, then we // can use that to get better read depth values. QFileInfo gfaFileInfo(m_filename); QString baseName = gfaFileInfo.completeBaseName(); QString readToTigFilename = gfaFileInfo.dir().filePath(baseName + ".layout.readToTig"); QFileInfo readToTigFileInfo(readToTigFilename); if (readToTigFileInfo.exists()) { QFile readToTigFile(readToTigFilename); if (readToTigFile.open(QIODevice::ReadOnly)) { // Keep track of how many bases are put into each node. QMap baseCounts; QMapIterator i(m_deBruijnGraphNodes); while (i.hasNext()) { i.next(); DeBruijnNode * node = i.value(); if (node->isPositiveNode()) baseCounts[node->getNameWithoutSign()] = 0; } QTextStream in(&readToTigFile); while (!in.atEnd()) { QApplication::processEvents(); QString line = in.readLine(); QStringList lineParts = line.split(QRegularExpression("\t")); if (lineParts.length() >= 5) { bool conversionOkay; long long readStart = lineParts[3].toLongLong(&conversionOkay); if (!conversionOkay) continue; long long readEnd = lineParts[4].toLongLong(&conversionOkay); if (!conversionOkay) continue; long long readLength = (readEnd < readStart) ? (readStart - readEnd) : (readEnd - readStart); QString nodeName = lineParts[1]; if (baseCounts.contains(nodeName)) baseCounts[nodeName] += readLength; } } // A node's depth is its total bases divided by its length. QMapIterator j(m_deBruijnGraphNodes); while (j.hasNext()) { j.next(); DeBruijnNode * node = j.value(); if (node->isPositiveNode()) { QString nodeName = node->getNameWithoutSign(); double depth; if (node->getLength() > 0) depth = double(baseCounts[nodeName]) / double(node->getLength()); else depth = 1.0; node->setDepth(depth); node->getReverseComplement()->setDepth(depth); } } } } m_sequencesLoadedFromFasta = NOT_TRIED; } bool AssemblyGraph::cigarContainsOnlyM(QString cigar) { QRegularExpression rx("\\d+M"); return rx.match(cigar).hasMatch(); } //This function assumes that the cigar string is simple: just digits followed //by "M". int AssemblyGraph::getLengthFromSimpleCigar(QString cigar) { cigar.chop(1); return cigar.toInt(); } //This function returns the length defined by a cigar string, relative to the //second sequence in the edge (the CIGAR reference). //Bandage does not fully support non-M CIGAR strings, so this is fairly crude //at the moment. int AssemblyGraph::getLengthFromCigar(QString cigar) { int length = 0; length = getCigarCount("M", cigar); length += getCigarCount("=", cigar); length += getCigarCount("X", cigar); length += getCigarCount("I", cigar); length -= getCigarCount("D", cigar); length -= getCigarCount("N", cigar); length += getCigarCount("S", cigar); length += getCigarCount("H", cigar); length += getCigarCount("P", cigar); return length; } //This function totals up the numbers for any given CIGAR code. int AssemblyGraph::getCigarCount(QString cigarCode, QString cigar) { QRegularExpression re("(\\d+)" + cigarCode); QStringList list; auto it = re.globalMatch(cigar); int sum = 0; while (it.hasNext()) { auto match = it.next(); sum += match.captured(1).toInt(); } return sum; } void AssemblyGraph::buildDeBruijnGraphFromFastg(QString fullFileName) { m_graphFileType = FASTG; m_filename = fullFileName; m_depthTag = "KC"; QFile inputFile(fullFileName); if (inputFile.open(QIODevice::ReadOnly)) { std::vector edgeStartingNodeNames; std::vector edgeEndingNodeNames; DeBruijnNode * node = 0; QTextStream in(&inputFile); while (!in.atEnd()) { QApplication::processEvents(); QString nodeName; double nodeDepth; QString line = in.readLine(); //If the line starts with a '>', then we are beginning a new node. if (line.startsWith(">")) { line.remove(0, 1); //Remove '>' from start line.chop(1); //Remove ';' from end QStringList nodeDetails = line.split(":"); QString thisNode = nodeDetails.at(0); //A single quote as the last character indicates a negative node. bool negativeNode = thisNode.at(thisNode.size() - 1) == '\''; QStringList thisNodeDetails = thisNode.split("_"); if (thisNodeDetails.size() < 6) throw "load error"; nodeName = thisNodeDetails.at(1); if (negativeNode) nodeName += "-"; else nodeName += "+"; if (m_deBruijnGraphNodes.contains(nodeName)) throw "load error"; QString nodeDepthString = thisNodeDetails.at(5); if (negativeNode) { //It may be necessary to remove a single quote from the end of the depth if (nodeDepthString.at(nodeDepthString.size() - 1) == '\'') nodeDepthString.chop(1); } nodeDepth = nodeDepthString.toDouble(); //Make the node node = new DeBruijnNode(nodeName, nodeDepth, ""); //Sequence string is currently empty - will be added to on subsequent lines of the fastg file m_deBruijnGraphNodes.insert(nodeName, node); //The second part of nodeDetails is a comma-delimited list of edge nodes. //Edges aren't made right now (because the other node might not yet exist), //so they are saved into vectors and made after all the nodes have been made. if (nodeDetails.size() == 1 || nodeDetails.at(1).isEmpty()) continue; QStringList edgeNodes = nodeDetails.at(1).split(","); for (int i = 0; i < edgeNodes.size(); ++i) { QString edgeNode = edgeNodes.at(i); QChar lastChar = edgeNode.at(edgeNode.size() - 1); bool negativeNode = false; if (lastChar == '\'') { negativeNode = true; edgeNode.chop(1); } QStringList edgeNodeDetails = edgeNode.split("_"); if (edgeNodeDetails.size() < 2) throw "load error"; QString edgeNodeName = edgeNodeDetails.at(1); if (negativeNode) edgeNodeName += "-"; else edgeNodeName += "+"; edgeStartingNodeNames.push_back(nodeName); edgeEndingNodeNames.push_back(edgeNodeName); } } //If the line does not start with a '>', then this line is part of the //sequence for the last node. else { QByteArray sequenceLine = line.simplified().toLocal8Bit(); if (node != 0) node->appendToSequence(sequenceLine); } } inputFile.close(); //If all went well, each node will have a reverse complement and the code //will never get here. However, I have noticed that some SPAdes fastg files //have, for some reason, negative nodes with no positive counterpart. For //that reason, we will now make any reverse complement nodes for nodes that //lack them. QMapIterator i(m_deBruijnGraphNodes); while (i.hasNext()) { i.next(); DeBruijnNode * node = i.value(); makeReverseComplementNodeIfNecessary(node); } pointEachNodeToItsReverseComplement(); //Create all of the edges. for (size_t i = 0; i < edgeStartingNodeNames.size(); ++i) { QString node1Name = edgeStartingNodeNames[i]; QString node2Name = edgeEndingNodeNames[i]; createDeBruijnEdge(node1Name, node2Name); } } autoDetermineAllEdgesExactOverlap(); if (m_deBruijnGraphNodes.size() == 0) throw "load error"; } void AssemblyGraph::makeReverseComplementNodeIfNecessary(DeBruijnNode * node) { QString reverseComplementName = getOppositeNodeName(node->getName()); DeBruijnNode * reverseComplementNode = m_deBruijnGraphNodes[reverseComplementName]; if (reverseComplementNode == 0) { QByteArray nodeSequence; if (node->sequenceIsMissing()) nodeSequence = "*"; else nodeSequence = node->getSequence(); DeBruijnNode * newNode = new DeBruijnNode(reverseComplementName, node->getDepth(), getReverseComplement(nodeSequence), node->getLength()); m_deBruijnGraphNodes.insert(reverseComplementName, newNode); } } void AssemblyGraph::pointEachNodeToItsReverseComplement() { QMapIterator i(m_deBruijnGraphNodes); while (i.hasNext()) { i.next(); DeBruijnNode * positiveNode = i.value(); if (positiveNode->isPositiveNode()) { DeBruijnNode * negativeNode = m_deBruijnGraphNodes[getOppositeNodeName(positiveNode->getName())]; if (negativeNode != 0) { positiveNode->setReverseComplement(negativeNode); negativeNode->setReverseComplement(positiveNode); } } } } void AssemblyGraph::buildDeBruijnGraphFromTrinityFasta(QString fullFileName) { m_graphFileType = TRINITY; m_filename = fullFileName; m_depthTag = ""; std::vector names; std::vector sequences; readFastaFile(fullFileName, &names, &sequences); std::vector edgeStartingNodeNames; std::vector edgeEndingNodeNames; for (size_t i = 0; i < names.size(); ++i) { QApplication::processEvents(); QString name = names[i]; QByteArray sequence = sequences[i]; //The header can come in a few different formats: // TR1|c0_g1_i1 len=280 path=[274:0-228 275:229-279] [-1, 274, 275, -2] // TRINITY_DN31_c1_g1_i1 len=301 path=[279:0-300] [-1, 279, -2] // GG1|c0_g1_i1 len=302 path=[1:0-301] // comp0_c0_seq1 len=286 path=[6:0-285] // c0_g1_i1 len=363 path=[119:0-185 43:186-244 43:245-303 43:304-362] //The node names will begin with a string that contains everything //up to the component number (e.g. "c0"), in the same format as it is //in the Trinity.fasta file. If the node name begins with "TRINITY_DN" //or "TRINITY_GG", "TR" or "GG", then that will be trimmed off. if (name.length() < 4) throw "load error"; int componentStartIndex = name.indexOf(QRegularExpression("c\\d+_")); int componentEndIndex = name.indexOf("_", componentStartIndex); if (componentStartIndex < 0 || componentEndIndex < 0) throw "load error"; QString component = name.left(componentEndIndex); if (component.startsWith("TRINITY_DN") || component.startsWith("TRINITY_GG")) component = component.remove(0, 10); else if (component.startsWith("TR") || component.startsWith("GG")) component = component.remove(0, 2); if (component.length() < 2) throw "load error"; int pathStartIndex = name.indexOf("path=[") + 6; int pathEndIndex = name.indexOf("]", pathStartIndex); if (pathStartIndex < 0 || pathEndIndex < 0) throw "load error"; int pathLength = pathEndIndex - pathStartIndex; QString path = name.mid(pathStartIndex, pathLength); if (path.size() == 0) throw "load error"; QStringList pathParts = path.split(" "); //Each path part is a node QString previousNodeName; for (int i = 0; i < pathParts.length(); ++i) { QString pathPart = pathParts.at(i); QStringList nodeParts = pathPart.split(":"); if (nodeParts.size() < 2) throw "load error"; //Most node numbers will be formatted simply as the number, but some //(I don't know why) have '@' and the start and '@!' at the end. In //these cases, we must strip those extra characters off. QString nodeNumberString = nodeParts.at(0); if (nodeNumberString.at(0) == '@') nodeNumberString = nodeNumberString.mid(1, nodeNumberString.length() - 3); QString nodeName = component + "_" + nodeNumberString + "+"; //If the node doesn't yet exist, make it now. if (!m_deBruijnGraphNodes.contains(nodeName)) { QString nodeRange = nodeParts.at(1); QStringList nodeRangeParts = nodeRange.split("-"); if (nodeRangeParts.size() < 2) throw "load error"; int nodeRangeStart = nodeRangeParts.at(0).toInt(); int nodeRangeEnd = nodeRangeParts.at(1).toInt(); int nodeLength = nodeRangeEnd - nodeRangeStart + 1; QByteArray nodeSequence = sequence.mid(nodeRangeStart, nodeLength); DeBruijnNode * node = new DeBruijnNode(nodeName, 1.0, nodeSequence); m_deBruijnGraphNodes.insert(nodeName, node); } //Remember to make an edge for the previous node to this one. if (i > 0) { edgeStartingNodeNames.push_back(previousNodeName); edgeEndingNodeNames.push_back(nodeName); } previousNodeName = nodeName; } } //Even though the Trinity.fasta file only contains positive nodes, Bandage //expects negative reverse complements nodes, so make them now. QMapIterator i(m_deBruijnGraphNodes); while (i.hasNext()) { i.next(); DeBruijnNode * node = i.value(); makeReverseComplementNodeIfNecessary(node); } pointEachNodeToItsReverseComplement(); //Create all of the edges. The createDeBruijnEdge function checks for //duplicates, so it's okay if we try to add the same edge multiple times. for (size_t i = 0; i < edgeStartingNodeNames.size(); ++i) { QString node1Name = edgeStartingNodeNames[i]; QString node2Name = edgeEndingNodeNames[i]; createDeBruijnEdge(node1Name, node2Name); } setAllEdgesExactOverlap(0); if (m_deBruijnGraphNodes.size() == 0) throw "load error"; } //This function builds a graph from an ASQG file. Bandage expects edges to //conform to its expectation: overlaps are only at the ends of sequences and //always have the same length in each of the two sequences. It will not load //edges which fail to meet this expectation. The function's return value is //the number of edges which could not be loaded. int AssemblyGraph::buildDeBruijnGraphFromAsqg(QString fullFileName) { m_graphFileType = ASQG; m_filename = fullFileName; m_depthTag = ""; int badEdgeCount = 0; QFile inputFile(fullFileName); if (inputFile.open(QIODevice::ReadOnly)) { std::vector edgeStartingNodeNames; std::vector edgeEndingNodeNames; std::vector edgeOverlaps; QTextStream in(&inputFile); while (!in.atEnd()) { QApplication::processEvents(); QString line = in.readLine(); QStringList lineParts = line.split(QRegularExpression("\t")); if (lineParts.size() < 1) continue; //Lines beginning with "VT" are sequence (node) lines if (lineParts.at(0) == "VT") { if (lineParts.size() < 3) throw "load error"; //We treat all nodes in this file as positive nodes and add "+" to the end of their names. QString nodeName = lineParts.at(1); if (nodeName.isEmpty()) nodeName = "node"; nodeName += "+"; QByteArray sequence = lineParts.at(2).toLocal8Bit(); int length = sequence.length(); //ASQG files don't seem to include depth, so just set this to one for every node. double nodeDepth = 1.0; DeBruijnNode * node = new DeBruijnNode(nodeName, nodeDepth, sequence, length); m_deBruijnGraphNodes.insert(nodeName, node); } //Lines beginning with "ED" are edge lines else if (lineParts.at(0) == "ED") { //Edges aren't made now, in case their sequence hasn't yet been specified. //Instead, we save the starting and ending nodes and make the edges after //we're done looking at the file. if (lineParts.size() < 2) throw "load error"; QStringList edgeParts = lineParts[1].split(" "); if (edgeParts.size() < 8) throw "load error"; QString s1Name = edgeParts.at(0); QString s2Name = edgeParts.at(1); int s1OverlapStart = edgeParts.at(2).toInt(); int s1OverlapEnd = edgeParts.at(3).toInt(); int s1Length = edgeParts.at(4).toInt(); int s2OverlapStart = edgeParts.at(5).toInt(); int s2OverlapEnd = edgeParts.at(6).toInt(); int s2Length = edgeParts.at(7).toInt(); //We want the overlap region of s1 to be at the end of the node sequence. If it isn't, we use the //negative node and flip the overlap coordinates. if (s1OverlapEnd == s1Length - 1) s1Name += "+"; else { s1Name += "-"; int newOverlapStart = s1Length - s1OverlapEnd - 1; int newOverlapEnd = s1Length - s1OverlapStart - 1; s1OverlapStart = newOverlapStart; s1OverlapEnd = newOverlapEnd; } //We want the overlap region of s2 to be at the start of the node sequence. If it isn't, we use the //negative node and flip the overlap coordinates. if (s2OverlapStart == 0) s2Name += "+"; else { s2Name += "-"; int newOverlapStart = s2Length - s2OverlapEnd - 1; int newOverlapEnd = s2Length - s2OverlapStart - 1; s2OverlapStart = newOverlapStart; s2OverlapEnd = newOverlapEnd; } int s1OverlapLength = s1OverlapEnd - s1OverlapStart + 1; int s2OverlapLength = s2OverlapEnd - s2OverlapStart + 1; //If the overlap between the two nodes is in agreement and the overlap regions extend to the ends of the //nodes, then we will make the edge. if (s1OverlapLength == s2OverlapLength && s1OverlapEnd == s1Length - 1 && s2OverlapStart == 0) { edgeStartingNodeNames.push_back(s1Name); edgeEndingNodeNames.push_back(s2Name); edgeOverlaps.push_back(s1OverlapLength); } else ++badEdgeCount; } } //Pair up reverse complements, creating them if necessary. QMapIterator i(m_deBruijnGraphNodes); while (i.hasNext()) { i.next(); DeBruijnNode * node = i.value(); makeReverseComplementNodeIfNecessary(node); } pointEachNodeToItsReverseComplement(); //Create all of the edges. for (size_t i = 0; i < edgeStartingNodeNames.size(); ++i) { QString node1Name = edgeStartingNodeNames[i]; QString node2Name = edgeEndingNodeNames[i]; int overlap = edgeOverlaps[i]; createDeBruijnEdge(node1Name, node2Name, overlap, EXACT_OVERLAP); } } if (m_deBruijnGraphNodes.size() == 0) throw "load error"; return badEdgeCount; } void AssemblyGraph::buildDeBruijnGraphFromPlainFasta(QString fullFileName) { m_graphFileType = PLAIN_FASTA; m_filename = fullFileName; m_depthTag = ""; std::vector names; std::vector sequences; readFastaFile(fullFileName, &names, &sequences); std::vector circularNodeNames; for (size_t i = 0; i < names.size(); ++i) { QApplication::processEvents(); QString name = names[i]; QString lowerName = name.toLower(); double depth = 1.0; QByteArray sequence = sequences[i]; // Check to see if the node name matches the Velvet/SPAdes contig format. If so, we can get the depth and node // number. QStringList thisNodeDetails = name.split("_"); if (thisNodeDetails.size() >= 6 && thisNodeDetails[2] == "length" && thisNodeDetails[4] == "cov") { name = thisNodeDetails[1]; depth = thisNodeDetails[5].toDouble(); m_depthTag = "KC"; } // Check to see if the name matches SKESA format, in which case we can get the depth and node number. else if (thisNodeDetails.size() >= 3 && thisNodeDetails[0] == "Contig" && thisNodeDetails[1].toInt() > 0) { name = thisNodeDetails[1]; bool ok; double convertedDepth = thisNodeDetails[2].toDouble(&ok); if (ok) depth = convertedDepth; m_depthTag = "KC"; } // If it doesn't match, then we will use the sequence name up to the first space. else { QStringList nameParts = name.split(" "); if (nameParts.size() > 0) name = nameParts[0]; } name = cleanNodeName(name); name = getUniqueNodeName(name) + "+"; // Look for "depth=" and "circular=" in the full header and use them if possible. if (lowerName.contains("depth=")) { QString depthString = lowerName.split("depth=")[1]; if (depthString.contains("x")) depthString = depthString.split("x")[0]; else depthString = depthString.split(" ")[0]; bool ok; double depthFromString = depthString.toFloat(&ok); if (ok) depth = depthFromString; } if (lowerName.contains("circular=true")) circularNodeNames.push_back(name); // SKESA circularity if (thisNodeDetails.size() == 4 and thisNodeDetails[3] == "Circ") circularNodeNames.push_back(name); if (name.length() < 1) throw "load error"; DeBruijnNode * node = new DeBruijnNode(name, depth, sequence); m_deBruijnGraphNodes.insert(name, node); makeReverseComplementNodeIfNecessary(node); } pointEachNodeToItsReverseComplement(); // For any circular nodes, make an edge connecting them to themselves. for (auto circularNodeName : circularNodeNames) { createDeBruijnEdge(circularNodeName, circularNodeName, 0, EXACT_OVERLAP); } } //This function adjusts a node name to make sure it is valid for use in Bandage. QString AssemblyGraph::cleanNodeName(QString name) { //Replace whitespace with underscores name = name.replace(QRegularExpression("\\s"), "_"); //Remove any commas. name = name.replace(",", ""); //Remove any trailing + or -. if (name.endsWith('+') || name.endsWith('-')) name.chop(1); return name; } GraphFileType AssemblyGraph::getGraphFileTypeFromFile(QString fullFileName) { if (checkFileIsLastGraph(fullFileName)) return LAST_GRAPH; if (checkFileIsFastG(fullFileName)) return FASTG; if (checkFileIsGfa(fullFileName)) return GFA; if (checkFileIsTrinityFasta(fullFileName)) return TRINITY; if (checkFileIsAsqg(fullFileName)) return ASQG; if (checkFileIsFasta(fullFileName)) return PLAIN_FASTA; return UNKNOWN_FILE_TYPE; } //Cursory look to see if file appears to be a LastGraph file. bool AssemblyGraph::checkFileIsLastGraph(QString fullFileName) { return checkFirstLineOfFile(fullFileName, "^\\d+\\s+\\d+\\s+\\d+\\s+\\d+"); } //Cursory look to see if file appears to be a FASTG file. bool AssemblyGraph::checkFileIsFastG(QString fullFileName) { return checkFirstLineOfFile(fullFileName, "^>(NODE|EDGE).*;"); } //Cursory look to see if file appears to be a FASTA file. bool AssemblyGraph::checkFileIsFasta(QString fullFileName) { return checkFirstLineOfFile(fullFileName, "^>"); } //Cursory look to see if file appears to be a GFA file. bool AssemblyGraph::checkFileIsGfa(QString fullFileName) { return checkFirstLineOfFile(fullFileName, "^[SLH]\t"); } //Cursory look to see if file appears to be a Trinity.fasta file. bool AssemblyGraph::checkFileIsTrinityFasta(QString fullFileName) { return checkFirstLineOfFile(fullFileName, "path=\\["); } //Cursory look to see if file appears to be an ASQG file. bool AssemblyGraph::checkFileIsAsqg(QString fullFileName) { return checkFirstLineOfFile(fullFileName, "^HT\t"); } bool AssemblyGraph::checkFirstLineOfFile(QString fullFileName, QString regExp) { QFile inputFile(fullFileName); if (inputFile.open(QIODevice::ReadOnly)) { QTextStream in(&inputFile); if (in.atEnd()) return false; QRegularExpression rx(regExp); QString line = in.readLine(); if (rx.match(line).hasMatch()) return true; } return false; } /* Split a QString according to CSV rules * * @param line line of a csv * @param sep field separator to use * @result list of fields with escaping removed * * Known Bugs: CSV (as per RFC4180) allows multi-line fields (\r\n between "..."), which * can't be parsed line-by line an hence isn't supported. */ QStringList AssemblyGraph::splitCsv(QString line, QString sep) { QRegularExpression rx(R"(("(?:[^"]|"")*"|[^)" + sep + "]*)"); QStringList list; auto it = rx.globalMatch(line); while (it.hasNext()) { auto match = it.next(); QString field = match.captured().replace("\"\"", "\""); if (field[0] == '"' && field[field.length() - 1] == '"') { field = field.mid(1, field.length() - 2); } list << field; } return list; } /* Load data from CSV and add to deBruijnGraphNodes * * @param filename the full path of the file to be loaded * @param *columns will contain the names of each column after loading data * (to add these to the GUI) * @param *errormsg if not empty, message to be displayed to user containing warning * or other information * @returns true/false if loading data worked */ bool AssemblyGraph::loadCSV(QString filename, QStringList * columns, QString * errormsg, bool * coloursLoaded) { clearAllCsvData(); QFile inputFile(filename); if (!inputFile.open(QIODevice::ReadOnly)) { *errormsg = "Unable to read from specified file."; return false; } QTextStream in(&inputFile); QString line = in.readLine(); // guess at separator; this assumes that any tab in the first line means // we have a tab separated file QString sep = "\t"; if (line.split(sep).length() == 1) { sep = ","; if (line.split(sep).length() == 1) { *errormsg = "Neither tab nor comma in first line. Please check file format."; return false; } } int unmatched_nodes = 0; // keep a counter for lines in file that can't be matched to nodes QStringList headers = splitCsv(line, sep); if (headers.size() < 2) { *errormsg = "Not enough CSV headers: at least two required."; return false; } headers.pop_front(); //Check to see if any of the columns holds colour data. int colourCol = -1; for (int i = 0; i < headers.size(); ++i) { QString header = headers[i].toLower(); if (header == "colour" || header == "color") { colourCol = i; *coloursLoaded = true; break; } } *columns = headers; int columnCount = headers.size(); QMap colourCategories; std::vector presetColours = getPresetColours(); while (!in.atEnd()) { QApplication::processEvents(); QStringList cols = splitCsv(in.readLine(), sep); QString nodeName = getNodeNameFromString(cols[0]); //Get rid of the node name - no need to save that. cols.pop_front(); //If one of the columns holds colour data, get the colour from that one. //Acceptable colour formats: 6-digit hex colour (e.g. #FFB6C1), an 8-digit hex colour (e.g. #7FD2B48C) or a //standard colour name (e.g. skyblue). //If the colour value is something other than one of these, a colour will be assigned to the value. That way //categorical names can be used and automatically given colours. QColor colour; if (colourCol != -1 && cols.size() > colourCol) { QString colourString = cols[colourCol]; colour = QColor(colourString); if (!colour.isValid()) { if (!colourCategories.contains(colourString)) { int nextColourIndex = colourCategories.size(); colourCategories[colourString] = presetColours[nextColourIndex]; } colour = colourCategories[colourString]; } } //Get rid of any extra data that doesn't have a header. while (cols.size() > columnCount) cols.pop_back(); if (nodeName != "" && m_deBruijnGraphNodes.contains(nodeName)) { m_deBruijnGraphNodes[nodeName]->setCsvData(cols); if (colour.isValid()) m_deBruijnGraphNodes[nodeName]->setCustomColour(colour); } else ++unmatched_nodes; } if (unmatched_nodes) *errormsg = "There were " + QString::number(unmatched_nodes) + " unmatched entries in the CSV."; return true; } //This function extracts a node name from a string. //The string may be in this Bandage format: // NODE_6+_length_50434_cov_42.3615 //Or in a number of variations of that format. //If the node name it finds does not end in a '+' or '-', it will add '+'. QString AssemblyGraph::getNodeNameFromString(QString string) { // First check for the most obvious case, where the string is already a node name. if (m_deBruijnGraphNodes.contains(string)) return string; if (m_deBruijnGraphNodes.contains(string + "+")) return string + "+"; QStringList parts = string.split("_"); if (parts.size() == 0) return ""; if (parts[0] == "NODE") parts.pop_front(); if (parts.size() == 0) return ""; QString nodeName; //This checks for the standard Bandage format where the node name does //not have any underscores. if (parts.size() == 5 && parts[1] == "length") nodeName = parts[0]; //This checks for the simple case of nothing but a node name. else if (parts.size() == 1) nodeName = parts[0]; //If the code got here, then it is likely that the node name contains //underscores. Grab everything in the string up until we encounter //"length". else { for (int i = 0; i < parts.size(); ++i) { if (parts[i] == "length") break; if (nodeName.length() > 0) nodeName += "_"; nodeName += parts[i]; } } int nameLength = nodeName.length(); if (nameLength == 0) return ""; QChar lastChar = nodeName.at(nameLength - 1); if (lastChar == '+' || lastChar == '-') return nodeName; else return nodeName + "+"; } //Returns true if successful, false if not. bool AssemblyGraph::loadGraphFromFile(QString filename) { GraphFileType graphFileType = getGraphFileTypeFromFile(filename); if (graphFileType == UNKNOWN_FILE_TYPE) return false; try { if (graphFileType == LAST_GRAPH) buildDeBruijnGraphFromLastGraph(filename); if (graphFileType == FASTG) buildDeBruijnGraphFromFastg(filename); if (graphFileType == GFA) { bool unsupportedCigar, customLabels, customColours; QString bandageOptionsError; buildDeBruijnGraphFromGfa(filename, &unsupportedCigar, &customLabels, &customColours, &bandageOptionsError); } if (graphFileType == TRINITY) buildDeBruijnGraphFromTrinityFasta(filename); if (graphFileType == ASQG) buildDeBruijnGraphFromAsqg(filename); if (graphFileType == PLAIN_FASTA) buildDeBruijnGraphFromPlainFasta(filename); } catch (...) { return false; } determineGraphInfo(); g_memory->clearGraphSpecificMemory(); return true; } //The startingNodes and nodeDistance parameters are only used if the graph scope //is not WHOLE_GRAPH. void AssemblyGraph::buildOgdfGraphFromNodesAndEdges(std::vector startingNodes, int nodeDistance) { if (g_settings->graphScope == WHOLE_GRAPH) { QMapIterator i(m_deBruijnGraphNodes); while (i.hasNext()) { i.next(); //If double mode is off, only positive nodes are drawn. If it's //on, all nodes are drawn. if (i.value()->isPositiveNode() || g_settings->doubleMode) i.value()->setAsDrawn(); } } else //The scope is either around specified nodes, around nodes with BLAST hits or a depth range. { //Distance is only used for around nodes and around blast scopes, not //for the depth range scope. if (g_settings->graphScope == DEPTH_RANGE) nodeDistance = 0; for (size_t i = 0; i < startingNodes.size(); ++i) { DeBruijnNode * node = startingNodes[i]; //If we are in single mode, make sure that each node is positive. if (!g_settings->doubleMode && node->isNegativeNode()) node = node->getReverseComplement(); node->setAsDrawn(); node->setAsSpecial(); node->labelNeighbouringNodesAsDrawn(nodeDistance, 0); } } // If performing a linear layout, we first sort the drawn nodes and add them left-to-right. if (g_settings->linearLayout) { QList sortedDrawnNodes; // We first try to sort the nodes numerically. QList> numericallySortedDrawnNodes; QMapIterator i(m_deBruijnGraphNodes); bool successfulIntConversion = true; while (i.hasNext()) { i.next(); DeBruijnNode * node = i.value(); if (node->isDrawn() && node->thisOrReverseComplementNotInOgdf()) { int nodeInt = node->getNameWithoutSign().toInt(&successfulIntConversion); if (!successfulIntConversion) break; numericallySortedDrawnNodes.push_back(QPair(nodeInt, node)); } } if (successfulIntConversion) { std::sort(numericallySortedDrawnNodes.begin(), numericallySortedDrawnNodes.end(), [](const QPair & a, const QPair & b) {return a.first < b.first;}); for (int i = 0; i < numericallySortedDrawnNodes.size(); ++i) { sortedDrawnNodes.reserve(numericallySortedDrawnNodes.size()); sortedDrawnNodes.push_back(numericallySortedDrawnNodes[i].second); } } // If any of the conversions from node name to integer failed, then we instead sort the nodes alphabetically. else { i = QMapIterator(m_deBruijnGraphNodes); while (i.hasNext()) { i.next(); DeBruijnNode * node = i.value(); if (node->isDrawn()) sortedDrawnNodes.push_back(node); } std::sort(sortedDrawnNodes.begin(), sortedDrawnNodes.end(), [](DeBruijnNode * a, DeBruijnNode * b) {return QString::localeAwareCompare(a->getNameWithoutSign().toUpper(), b->getNameWithoutSign().toUpper()) < 0;}); } // Now we add the drawn nodes to the OGDF graph, given them initial positions based on their sort order. QSet > usedStartPositions; double lastXPos = 0.0; for (int i = 0; i < sortedDrawnNodes.size(); ++i) { DeBruijnNode * node = sortedDrawnNodes[i]; if (node->thisOrReverseComplementInOgdf()) continue; std::vector upstreamNodes = node->getUpstreamNodes(); for (size_t j = 0; j < upstreamNodes.size(); ++j) { DeBruijnNode * upstreamNode = upstreamNodes[j]; if (!upstreamNode->inOgdf()) continue; ogdf::node upstreamEnd = upstreamNode->getOgdfNode()->getLast(); double upstreamEndPos = m_graphAttributes->x(upstreamEnd); if (j == 0) lastXPos = upstreamEndPos; else lastXPos = std::max(lastXPos, upstreamEndPos); } double xPos = lastXPos + g_settings->edgeLength; double yPos = 0.0; long long intXPos = (long long)(xPos * 100.0); long long intYPos = (long long)(yPos * 100.0); while (usedStartPositions.contains(QPair(intXPos, intYPos))) { yPos += g_settings->edgeLength; intYPos = (long long)(yPos * 100.0); } node->addToOgdfGraph(m_ogdfGraph, m_graphAttributes, m_edgeArray, xPos, yPos); usedStartPositions.insert(QPair(intXPos, intYPos)); lastXPos = m_graphAttributes->x(node->getOgdfNode()->getLast()); } } // If the layout isn't linear, then we don't worry about the initial positions because they'll be randomised anyway. else { QMapIterator i(m_deBruijnGraphNodes); while (i.hasNext()) { i.next(); DeBruijnNode * node = i.value(); if (node->isDrawn() && node->thisOrReverseComplementNotInOgdf()) node->addToOgdfGraph(m_ogdfGraph, m_graphAttributes, m_edgeArray, 0.0, 0.0); } } //Then loop through each edge determining its drawn status and adding it to OGDF if it is drawn. QMapIterator, DeBruijnEdge*> j(m_deBruijnGraphEdges); while (j.hasNext()) { j.next(); DeBruijnEdge * edge = j.value(); edge->determineIfDrawn(); if (edge->isDrawn()) edge->addToOgdfGraph(m_ogdfGraph, m_edgeArray); } } void AssemblyGraph::addGraphicsItemsToScene(MyGraphicsScene * scene) { scene->clear(); double meanDrawnDepth = getMeanDepth(true); //First make the GraphicsItemNode objects QMapIterator i(m_deBruijnGraphNodes); while (i.hasNext()) { i.next(); DeBruijnNode * node = i.value(); if (node->isDrawn()) { if (meanDrawnDepth == 0) node->setDepthRelativeToMeanDrawnDepth(1.0); else node->setDepthRelativeToMeanDrawnDepth(node->getDepth() / meanDrawnDepth); GraphicsItemNode * graphicsItemNode = new GraphicsItemNode(node, m_graphAttributes); node->setGraphicsItemNode(graphicsItemNode); graphicsItemNode->setFlag(QGraphicsItem::ItemIsSelectable); graphicsItemNode->setFlag(QGraphicsItem::ItemIsMovable); } } resetAllNodeColours(); //Then make the GraphicsItemEdge objects and add them to the scene first //so they are drawn underneath QMapIterator, DeBruijnEdge*> j(m_deBruijnGraphEdges); while (j.hasNext()) { j.next(); DeBruijnEdge * edge = j.value(); if (edge->isDrawn()) { GraphicsItemEdge * graphicsItemEdge = new GraphicsItemEdge(edge); edge->setGraphicsItemEdge(graphicsItemEdge); graphicsItemEdge->setFlag(QGraphicsItem::ItemIsSelectable); scene->addItem(graphicsItemEdge); } } //Now add the GraphicsItemNode objects to the scene so they are drawn //on top QMapIterator k(m_deBruijnGraphNodes); while (k.hasNext()) { k.next(); DeBruijnNode * node = k.value(); if (node->hasGraphicsItem()) scene->addItem(node->getGraphicsItemNode()); } } std::vector AssemblyGraph::getStartingNodes(QString * errorTitle, QString * errorMessage, bool doubleMode, QString nodesList, QString blastQueryName) { std::vector startingNodes; if (g_settings->graphScope == AROUND_NODE) { if (checkIfStringHasNodes(nodesList)) { *errorTitle = "No starting nodes"; *errorMessage = "Please enter at least one node when drawing the graph using the 'Around node(s)' scope. " "Separate multiple nodes with commas."; return startingNodes; } //Make sure the nodes the user typed in are actually in the graph. std::vector nodesNotInGraph; std::vector nodesInGraph = getNodesFromString(nodesList, g_settings->startingNodesExactMatch, &nodesNotInGraph); if (nodesNotInGraph.size() > 0) { *errorTitle = "Nodes not found"; *errorMessage = generateNodesNotFoundErrorMessage(nodesNotInGraph, g_settings->startingNodesExactMatch); if (nodesInGraph.size() == 0) return startingNodes; } } else if (g_settings->graphScope == AROUND_BLAST_HITS) { std::vector startingNodes = getNodesFromBlastHits(blastQueryName); if (startingNodes.size() == 0) { *errorTitle = "No BLAST hits"; *errorMessage = "To draw the graph around BLAST hits, you must first conduct a BLAST search."; return startingNodes; } } else if (g_settings->graphScope == DEPTH_RANGE) { if (g_settings->minDepthRange > g_settings->maxDepthRange) { *errorTitle = "Invalid depth range"; *errorMessage = "The maximum depth must be greater than or equal to the minimum depth."; return startingNodes; } std::vector startingNodes = getNodesInDepthRange(g_settings->minDepthRange, g_settings->maxDepthRange); if (startingNodes.size() == 0) { *errorTitle = "No nodes in range"; *errorMessage = "There are no nodes with depths in the specified range."; return startingNodes; } } g_settings->doubleMode = doubleMode; clearOgdfGraphAndResetNodes(); if (g_settings->graphScope == AROUND_NODE) startingNodes = getNodesFromString(nodesList, g_settings->startingNodesExactMatch); else if (g_settings->graphScope == AROUND_BLAST_HITS) startingNodes = getNodesFromBlastHits(blastQueryName); else if (g_settings->graphScope == DEPTH_RANGE) startingNodes = getNodesInDepthRange(g_settings->minDepthRange, g_settings->maxDepthRange); return startingNodes; } bool AssemblyGraph::checkIfStringHasNodes(QString nodesString) { nodesString = nodesString.simplified(); QStringList nodesList = nodesString.split(","); nodesList = removeNullStringsFromList(nodesList); return (nodesList.size() == 0); } QString AssemblyGraph::generateNodesNotFoundErrorMessage(std::vector nodesNotInGraph, bool exact) { QString errorMessage; if (exact) errorMessage += "The following nodes are not in the graph:\n"; else errorMessage += "The following queries do not match any nodes in the graph:\n"; for (size_t i = 0; i < nodesNotInGraph.size(); ++i) { errorMessage += nodesNotInGraph[i]; if (i != nodesNotInGraph.size() - 1) errorMessage += ", "; } errorMessage += "\n"; return errorMessage; } std::vector AssemblyGraph::getNodesFromString(QString nodeNamesString, bool exactMatch, std::vector * nodesNotInGraph) { nodeNamesString = nodeNamesString.simplified(); QStringList nodesList = nodeNamesString.split(","); if (exactMatch) return getNodesFromListExact(nodesList, nodesNotInGraph); else return getNodesFromListPartial(nodesList, nodesNotInGraph); } //Given a list of node names (as strings), this function will return all nodes which match //those names exactly. The last +/- on the end of the node name is optional - if missing //both + and - nodes will be returned. std::vector AssemblyGraph::getNodesFromListExact(QStringList nodesList, std::vector * nodesNotInGraph) { std::vector returnVector; for (int i = 0; i < nodesList.size(); ++i) { QString nodeName = nodesList.at(i).simplified(); if (nodeName == "") continue; //If the node name ends in +/-, then we assume the user was specifying the exact //node in the pair. If the node name does not end in +/-, then we assume the //user is asking for either node in the pair. QChar lastChar = nodeName.at(nodeName.length() - 1); if (lastChar == '+' || lastChar == '-') { if (m_deBruijnGraphNodes.contains(nodeName)) returnVector.push_back(m_deBruijnGraphNodes[nodeName]); else if (nodesNotInGraph != 0) nodesNotInGraph->push_back(nodesList.at(i).trimmed()); } else { QString posNodeName = nodeName + "+"; QString negNodeName = nodeName + "-"; bool posNodeFound = false; if (m_deBruijnGraphNodes.contains(posNodeName)) { returnVector.push_back(m_deBruijnGraphNodes[posNodeName]); posNodeFound = true; } bool negNodeFound = false; if (m_deBruijnGraphNodes.contains(negNodeName)) { returnVector.push_back(m_deBruijnGraphNodes[negNodeName]); negNodeFound = true; } if (!posNodeFound && !negNodeFound && nodesNotInGraph != 0) nodesNotInGraph->push_back(nodesList.at(i).trimmed()); } } return returnVector; } std::vector AssemblyGraph::getNodesFromListPartial(QStringList nodesList, std::vector * nodesNotInGraph) { std::vector returnVector; for (int i = 0; i < nodesList.size(); ++i) { QString queryName = nodesList.at(i).simplified(); if (queryName == "") continue; bool found = false; QMapIterator j(m_deBruijnGraphNodes); while (j.hasNext()) { j.next(); QString nodeName = j.value()->getName(); if (nodeName.contains(queryName)) { found = true; returnVector.push_back(j.value()); } } if (!found && nodesNotInGraph != 0) nodesNotInGraph->push_back(queryName.trimmed()); } return returnVector; } std::vector AssemblyGraph::getNodesFromBlastHits(QString queryName) { std::vector returnVector; if (g_blastSearch->m_blastQueries.m_queries.size() == 0) return returnVector; std::vector queries; //If "all" is selected, then we'll display nodes with hits from any query if (queryName == "all") queries = g_blastSearch->m_blastQueries.m_queries; //If only one query is selected, then we just display nodes with hits from that query else queries.push_back(g_blastSearch->m_blastQueries.getQueryFromName(queryName)); //Add pointers to nodes that have a hit for the selected target(s). for (size_t i = 0; i < queries.size(); ++i) { BlastQuery * currentQuery = queries[i]; for (int j = 0; j < g_blastSearch->m_allHits.size(); ++j) { if (g_blastSearch->m_allHits[j]->m_query == currentQuery) returnVector.push_back(g_blastSearch->m_allHits[j]->m_node); } } return returnVector; } std::vector AssemblyGraph::getNodesInDepthRange(double min, double max) { std::vector returnVector; QMapIterator i(m_deBruijnGraphNodes); while (i.hasNext()) { i.next(); DeBruijnNode * node = i.value(); if (node->isInDepthRange(min, max)) returnVector.push_back(node); } return returnVector; } QStringList AssemblyGraph::removeNullStringsFromList(QStringList in) { QStringList out; for (int i = 0; i < in.size(); ++i) { QString string = in.at(i); if (string.length() > 0) out.push_back(string); } return out; } //Unlike the equivalent function in MainWindow, this does the graph layout in the main thread. void AssemblyGraph::layoutGraph() { ogdf::FMMMLayout fmmm; GraphLayoutWorker * graphLayoutWorker = new GraphLayoutWorker(&fmmm, m_graphAttributes, m_edgeArray, g_settings->graphLayoutQuality, useLinearLayout(), g_settings->componentSeparation); graphLayoutWorker->layoutGraph(); } void AssemblyGraph::setAllEdgesExactOverlap(int overlap) { QMapIterator, DeBruijnEdge*> i(m_deBruijnGraphEdges); while (i.hasNext()) { i.next(); i.value()->setExactOverlap(overlap); } } void AssemblyGraph::autoDetermineAllEdgesExactOverlap() { int edgeCount = int(m_deBruijnGraphEdges.size()); if (edgeCount == 0) return; //Determine the overlap for each edge. QMapIterator, DeBruijnEdge*> i(m_deBruijnGraphEdges); while (i.hasNext()) { i.next(); i.value()->autoDetermineExactOverlap(); } //The expectation here is that most overlaps will be //the same or from a small subset of possible sizes. //Edges with an overlap that do not match the most common //overlap(s) are suspected of having their overlap //misidentified. They are therefore rechecked using the //common ones. std::vector overlapCounts = makeOverlapCountVector(); //Sort the overlaps in order of decreasing numbers of edges. //I.e. the first overlap size in the vector will be the most //common overlap, the second will be the second most common, //etc. std::vector sortedOverlaps; int overlapsSoFar = 0; double fractionOverlapsFound = 0.0; while (fractionOverlapsFound < 1.0) { int mostCommonOverlap = 0; int mostCommonOverlapCount = 0; //Find the overlap size with the most instances. for (size_t i = 0; i < overlapCounts.size(); ++i) { if (overlapCounts[i] > mostCommonOverlapCount) { mostCommonOverlap = int(i); mostCommonOverlapCount = overlapCounts[i]; } } //Add that overlap to the common collection and remove it from the counts. sortedOverlaps.push_back(mostCommonOverlap); overlapsSoFar += mostCommonOverlapCount; fractionOverlapsFound = double(overlapsSoFar) / edgeCount; overlapCounts[mostCommonOverlap] = 0; } //For each edge, see if one of the more common overlaps also works. //If so, use that instead. QMapIterator, DeBruijnEdge*> j(m_deBruijnGraphEdges); while (j.hasNext()) { j.next(); DeBruijnEdge * edge = j.value(); for (size_t k = 0; k < sortedOverlaps.size(); ++k) { if (edge->getOverlap() == sortedOverlaps[k]) break; else if (edge->testExactOverlap(sortedOverlaps[k])) { edge->setOverlap(sortedOverlaps[k]); break; } } } } //This function produces a vector for which the values are the number //of edges that have an overlap of the index length. //E.g. if overlapVector[61] = 123, that means that 123 edges have an //overlap of 61. std::vector AssemblyGraph::makeOverlapCountVector() { std::vector overlapCounts; QMapIterator, DeBruijnEdge*> i(m_deBruijnGraphEdges); while (i.hasNext()) { i.next(); int overlap = i.value()->getOverlap(); //Add the overlap to the count vector if (int(overlapCounts.size()) < overlap + 1) overlapCounts.resize(overlap + 1, 0); ++overlapCounts[overlap]; } return overlapCounts; } //The function returns a node name, replacing "+" at the end with "-" or //vice-versa. QString AssemblyGraph::getOppositeNodeName(QString nodeName) { QChar lastChar = nodeName.at(nodeName.size() - 1); nodeName.chop(1); if (lastChar == '-') return nodeName + "+"; else return nodeName + "-"; } void AssemblyGraph::readFastaOrFastqFile(QString filename, std::vector * names, std::vector * sequences) { QChar firstChar = QChar(0); QFile inputFile(filename); if (inputFile.open(QIODevice::ReadOnly)) { QTextStream in(&inputFile); QString firstLine = in.readLine(); firstChar = firstLine.at(0); inputFile.close(); } if (firstChar == '>') readFastaFile(filename, names, sequences); else if (firstChar == '@') readFastqFile(filename, names, sequences); } void AssemblyGraph::readFastaFile(QString filename, std::vector * names, std::vector * sequences) { QFile inputFile(filename); if (inputFile.open(QIODevice::ReadOnly)) { QString name = ""; QByteArray sequence = ""; QTextStream in(&inputFile); while (!in.atEnd()) { QApplication::processEvents(); QString line = in.readLine(); if (line.length() == 0) continue; if (line.at(0) == '>') { //If there is a current sequence, add it to the vectors now. if (name.length() > 0) { names->push_back(name); sequences->push_back(sequence); } line.remove(0, 1); //Remove '>' from start name = line; sequence = ""; } else //It's a sequence line sequence += line.simplified().toUtf8(); } //Add the last target to the results now. if (name.length() > 0) { names->push_back(name); sequences->push_back(sequence); } inputFile.close(); } } void AssemblyGraph::readFastqFile(QString filename, std::vector * names, std::vector * sequences) { QFile inputFile(filename); if (inputFile.open(QIODevice::ReadOnly)) { QTextStream in(&inputFile); while (!in.atEnd()) { QApplication::processEvents(); QString name = in.readLine().simplified(); QByteArray sequence = in.readLine().simplified().toLocal8Bit(); in.readLine(); // separator in.readLine(); // qualities if (name.length() == 0) continue; if (sequence.length() == 0) continue; if (name.at(0) != '@') continue; name.remove(0, 1); //Remove '@' from start names->push_back(name); sequences->push_back(sequence); } inputFile.close(); } } void AssemblyGraph::recalculateAllDepthsRelativeToDrawnMean() { double meanDrawnDepth = getMeanDepth(true); QMapIterator k(m_deBruijnGraphNodes); while (k.hasNext()) { k.next(); DeBruijnNode * node = k.value(); double depthRelativeToMeanDrawnDepth; if (meanDrawnDepth == 0) depthRelativeToMeanDrawnDepth = 1.0; else depthRelativeToMeanDrawnDepth = node->getDepth() / meanDrawnDepth; node->setDepthRelativeToMeanDrawnDepth(depthRelativeToMeanDrawnDepth); } } void AssemblyGraph::recalculateAllNodeWidths() { QMapIterator i(m_deBruijnGraphNodes); while (i.hasNext()) { i.next(); GraphicsItemNode * graphicsItemNode = i.value()->getGraphicsItemNode(); if (graphicsItemNode != 0) graphicsItemNode->setWidth(); } } void AssemblyGraph::clearAllCsvData() { QMapIterator i(m_deBruijnGraphNodes); while (i.hasNext()) { i.next(); i.value()->clearCsvData(); } } int AssemblyGraph::getDrawnNodeCount() const { int nodeCount = 0; QMapIterator i(m_deBruijnGraphNodes); while (i.hasNext()) { i.next(); DeBruijnNode * node = i.value(); if (node->isDrawn()) ++nodeCount; } return nodeCount; } void AssemblyGraph::deleteNodes(std::vector * nodes) { //Build a list of nodes to delete. QList nodesToDelete; for (size_t i = 0; i < nodes->size(); ++i) { DeBruijnNode * node = (*nodes)[i]; DeBruijnNode * rcNode = node->getReverseComplement(); if (!nodesToDelete.contains(node)) nodesToDelete.push_back(node); if (!nodesToDelete.contains(rcNode)) nodesToDelete.push_back(rcNode); } //Build a list of edges to delete. std::vector edgesToDelete; for (int i = 0; i < nodesToDelete.size(); ++i) { DeBruijnNode * node = nodesToDelete[i]; const std::vector * nodeEdges = node->getEdgesPointer(); for (size_t j = 0; j < nodeEdges->size(); ++j) { DeBruijnEdge * edge = (*nodeEdges)[j]; bool alreadyAdded = std::find(edgesToDelete.begin(), edgesToDelete.end(), edge) != edgesToDelete.end(); if (!alreadyAdded) edgesToDelete.push_back(edge); } } //Build a list of node names to delete. QStringList nodesNamesToDelete; for (int i = 0; i < nodesToDelete.size(); ++i) { DeBruijnNode * node = nodesToDelete[i]; nodesNamesToDelete.push_back(node->getName()); } //Remove the edges from the graph, deleteEdges(&edgesToDelete); //Remove the nodes from the graph. for (int i = 0; i < nodesNamesToDelete.size(); ++i) { QString nodeName = nodesNamesToDelete[i]; m_deBruijnGraphNodes.remove(nodeName); } for (int i = 0; i < nodesToDelete.size(); ++i) { DeBruijnNode * node = nodesToDelete[i]; delete node; } } void AssemblyGraph::deleteEdges(std::vector * edges) { //Build a list of edges to delete. QList edgesToDelete; for (size_t i = 0; i < edges->size(); ++i) { DeBruijnEdge * edge = (*edges)[i]; DeBruijnEdge * rcEdge = edge->getReverseComplement(); if (!edgesToDelete.contains(edge)) edgesToDelete.push_back(edge); if (!edgesToDelete.contains(rcEdge)) edgesToDelete.push_back(rcEdge); } //Remove the edges from the graph, for (int i = 0; i < edgesToDelete.size(); ++i) { DeBruijnEdge * edge = edgesToDelete[i]; DeBruijnNode * startingNode = edge->getStartingNode(); DeBruijnNode * endingNode = edge->getEndingNode(); m_deBruijnGraphEdges.remove(QPair(startingNode, endingNode)); startingNode->removeEdge(edge); endingNode->removeEdge(edge); delete edge; } } //This function assumes it is receiving a positive node. It will duplicate both //the positive and negative node in the pair. It divided their depth in //two, giving half to each node. void AssemblyGraph::duplicateNodePair(DeBruijnNode * node, MyGraphicsScene * scene) { DeBruijnNode * originalPosNode = node; DeBruijnNode * originalNegNode = node->getReverseComplement(); QString newNodeBaseName = getNewNodeName(originalPosNode->getName()); QString newPosNodeName = newNodeBaseName + "+"; QString newNegNodeName = newNodeBaseName + "-"; double newDepth = node->getDepth() / 2.0; //Create the new nodes. DeBruijnNode * newPosNode = new DeBruijnNode(newPosNodeName, newDepth, originalPosNode->getSequence()); DeBruijnNode * newNegNode = new DeBruijnNode(newNegNodeName, newDepth, originalNegNode->getSequence()); newPosNode->setReverseComplement(newNegNode); newNegNode->setReverseComplement(newPosNode); //Copy over additional stuff from the original nodes. newPosNode->setCustomColour(originalPosNode->getCustomColour()); newNegNode->setCustomColour(originalNegNode->getCustomColour()); newPosNode->setCustomLabel(originalPosNode->getCustomLabel()); newNegNode->setCustomLabel(originalNegNode->getCustomLabel()); newPosNode->setCsvData(originalPosNode->getAllCsvData()); newNegNode->setCsvData(originalNegNode->getAllCsvData()); m_deBruijnGraphNodes.insert(newPosNodeName, newPosNode); m_deBruijnGraphNodes.insert(newNegNodeName, newNegNode); std::vector leavingEdges = originalPosNode->getLeavingEdges(); for (size_t i = 0; i < leavingEdges.size(); ++i) { DeBruijnEdge * edge = leavingEdges[i]; DeBruijnNode * downstreamNode = edge->getEndingNode(); createDeBruijnEdge(newPosNodeName, downstreamNode->getName(), edge->getOverlap(), edge->getOverlapType()); } std::vector enteringEdges = originalPosNode->getEnteringEdges(); for (size_t i = 0; i < enteringEdges.size(); ++i) { DeBruijnEdge * edge = enteringEdges[i]; DeBruijnNode * upstreamNode = edge->getStartingNode(); createDeBruijnEdge(upstreamNode->getName(), newPosNodeName, edge->getOverlap(), edge->getOverlapType()); } originalPosNode->setDepth(newDepth); originalNegNode->setDepth(newDepth); double meanDrawnDepth = getMeanDepth(true); double depthRelativeToMeanDrawnDepth; if (meanDrawnDepth == 0) depthRelativeToMeanDrawnDepth = 1.0; else depthRelativeToMeanDrawnDepth = originalPosNode->getDepth() / meanDrawnDepth; originalPosNode->setDepthRelativeToMeanDrawnDepth(depthRelativeToMeanDrawnDepth); originalNegNode->setDepthRelativeToMeanDrawnDepth(depthRelativeToMeanDrawnDepth); newPosNode->setDepthRelativeToMeanDrawnDepth(depthRelativeToMeanDrawnDepth); newPosNode->setDepthRelativeToMeanDrawnDepth(depthRelativeToMeanDrawnDepth); duplicateGraphicsNode(originalPosNode, newPosNode, scene); duplicateGraphicsNode(originalNegNode, newNegNode, scene); } QString AssemblyGraph::getNewNodeName(QString oldNodeName) { oldNodeName.chop(1); //Remove trailing +/- QString newNodeNameBase = oldNodeName + "_copy"; QString newNodeName = newNodeNameBase; int suffix = 1; while (m_deBruijnGraphNodes.contains(newNodeName + "+")) { ++suffix; newNodeName = newNodeNameBase + QString::number(suffix); } return newNodeName; } void AssemblyGraph::duplicateGraphicsNode(DeBruijnNode * originalNode, DeBruijnNode * newNode, MyGraphicsScene * scene) { GraphicsItemNode * originalGraphicsItemNode = originalNode->getGraphicsItemNode(); if (originalGraphicsItemNode == 0) return; GraphicsItemNode * newGraphicsItemNode = new GraphicsItemNode(newNode, originalGraphicsItemNode); newNode->setGraphicsItemNode(newGraphicsItemNode); newGraphicsItemNode->setFlag(QGraphicsItem::ItemIsSelectable); newGraphicsItemNode->setFlag(QGraphicsItem::ItemIsMovable); originalGraphicsItemNode->shiftPointsLeft(); newGraphicsItemNode->shiftPointsRight(); originalGraphicsItemNode->fixEdgePaths(); originalGraphicsItemNode->setNodeColour(); newGraphicsItemNode->setNodeColour(); originalGraphicsItemNode->setWidth(); scene->addItem(newGraphicsItemNode); const std::vector * newEdges = newNode->getEdgesPointer(); for (size_t i = 0; i < newEdges->size(); ++i) { DeBruijnEdge * newEdge = (*newEdges)[i]; GraphicsItemEdge * graphicsItemEdge = new GraphicsItemEdge(newEdge); graphicsItemEdge->setZValue(-1.0); newEdge->setGraphicsItemEdge(graphicsItemEdge); graphicsItemEdge->setFlag(QGraphicsItem::ItemIsSelectable); scene->addItem(graphicsItemEdge); } } //This function will merge the given nodes, if possible. Nodes can only be //merged if they are in a simple, unbranching path with no extra edges. If the //merge is successful, it returns true, otherwise false. bool AssemblyGraph::mergeNodes(QList nodes, MyGraphicsScene * scene, bool recalulateDepth) { if (nodes.size() == 0) return true; //We now need to sort the nodes into merge order. QList orderedList; orderedList.push_back(nodes[0]); nodes.pop_front(); bool addedNode; do { addedNode = false; for (int i = 0; i < nodes.size(); ++i) { DeBruijnNode * potentialNextNode = nodes[i]; //Check if the node can be added to the end of the list. if (canAddNodeToEndOfMergeList(&orderedList, potentialNextNode)) { orderedList.push_back(potentialNextNode); nodes.removeAt(i); addedNode = true; break; } //Check if the node can be added to the front of the list. if (canAddNodeToStartOfMergeList(&orderedList, potentialNextNode)) { orderedList.push_front(potentialNextNode); nodes.removeAt(i); addedNode = true; break; } //If neither of those worked, then we should try the node's reverse //complement. DeBruijnNode * potentialNextNodeRevComp = potentialNextNode->getReverseComplement(); if (canAddNodeToEndOfMergeList(&orderedList, potentialNextNodeRevComp)) { orderedList.push_back(potentialNextNodeRevComp); nodes.removeAt(i); addedNode = true; break; } if (canAddNodeToStartOfMergeList(&orderedList, potentialNextNodeRevComp)) { orderedList.push_front(potentialNextNodeRevComp); nodes.removeAt(i); addedNode = true; break; } } if (nodes.size() == 0) break; } while (addedNode); //If there are still nodes left in the first list, then they don't form a //nice simple path and the merge won't work. if (nodes.size() > 0) return false; double mergedNodeDepth = getMeanDepth(orderedList); Path posPath = Path::makeFromOrderedNodes(orderedList, false); QByteArray mergedNodePosSequence = posPath.getPathSequence(); QList revCompOrderedList; for (int i = 0; i < orderedList.size(); ++i) revCompOrderedList.push_front(orderedList[i]->getReverseComplement()); Path negPath = Path::makeFromOrderedNodes(revCompOrderedList, false); QByteArray mergedNodeNegSequence = negPath.getPathSequence(); QString newNodeBaseName; for (int i = 0; i < orderedList.size(); ++i) { newNodeBaseName += orderedList[i]->getNameWithoutSign(); if (i < orderedList.size() - 1) newNodeBaseName += "_"; } newNodeBaseName = getUniqueNodeName(newNodeBaseName); QString newPosNodeName = newNodeBaseName + "+"; QString newNegNodeName = newNodeBaseName + "-"; DeBruijnNode * newPosNode = new DeBruijnNode(newPosNodeName, mergedNodeDepth, mergedNodePosSequence); DeBruijnNode * newNegNode = new DeBruijnNode(newNegNodeName, mergedNodeDepth, mergedNodeNegSequence); newPosNode->setReverseComplement(newNegNode); newNegNode->setReverseComplement(newPosNode); m_deBruijnGraphNodes.insert(newPosNodeName, newPosNode); m_deBruijnGraphNodes.insert(newNegNodeName, newNegNode); std::vector leavingEdges = orderedList.back()->getLeavingEdges(); for (size_t i = 0; i < leavingEdges.size(); ++i) { DeBruijnEdge * leavingEdge = leavingEdges[i]; createDeBruijnEdge(newPosNodeName, leavingEdge->getEndingNode()->getName(), leavingEdge->getOverlap(), leavingEdge->getOverlapType()); } std::vector enteringEdges = orderedList.front()->getEnteringEdges(); for (size_t i = 0; i < enteringEdges.size(); ++i) { DeBruijnEdge * enteringEdge = enteringEdges[i]; createDeBruijnEdge(enteringEdge->getStartingNode()->getName(), newPosNodeName, enteringEdge->getOverlap(), enteringEdge->getOverlapType()); } if (recalulateDepth) { double meanDrawnDepth = getMeanDepth(true); double depthRelativeToMeanDrawnDepth; if (meanDrawnDepth == 0) depthRelativeToMeanDrawnDepth = 1.0; else depthRelativeToMeanDrawnDepth = newPosNode->getDepth() / meanDrawnDepth; newPosNode->setDepthRelativeToMeanDrawnDepth(depthRelativeToMeanDrawnDepth); newNegNode->setDepthRelativeToMeanDrawnDepth(depthRelativeToMeanDrawnDepth); } else { newPosNode->setDepthRelativeToMeanDrawnDepth(1.0); newNegNode->setDepthRelativeToMeanDrawnDepth(1.0); } mergeGraphicsNodes(&orderedList, &revCompOrderedList, newPosNode, scene); std::vector nodesToDelete; for (int i = 0; i < orderedList.size(); ++i) nodesToDelete.push_back(orderedList[i]); deleteNodes(&nodesToDelete); return true; } bool AssemblyGraph::canAddNodeToStartOfMergeList(QList * mergeList, DeBruijnNode * potentialNode) { DeBruijnNode * firstNode = mergeList->front(); std::vector edgesEnteringFirstNode = firstNode->getEnteringEdges(); std::vector edgesLeavingPotentialNode = potentialNode->getLeavingEdges(); return (edgesEnteringFirstNode.size() == 1 && edgesLeavingPotentialNode.size() == 1 && edgesEnteringFirstNode[0]->getStartingNode() == potentialNode && edgesLeavingPotentialNode[0]->getEndingNode() == firstNode); } bool AssemblyGraph::canAddNodeToEndOfMergeList(QList * mergeList, DeBruijnNode * potentialNode) { DeBruijnNode * lastNode = mergeList->back(); std::vector edgesLeavingLastNode = lastNode->getLeavingEdges(); std::vector edgesEnteringPotentialNode = potentialNode->getEnteringEdges(); return (edgesLeavingLastNode.size() == 1 && edgesEnteringPotentialNode.size() == 1 && edgesLeavingLastNode[0]->getEndingNode() == potentialNode && edgesEnteringPotentialNode[0]->getStartingNode() == lastNode); } QString AssemblyGraph::getUniqueNodeName(QString baseName) { //If the base name is untaken, then that's it! if (!m_deBruijnGraphNodes.contains(baseName + "+")) return baseName; int suffix = 1; while (true) { ++suffix; QString potentialUniqueName = baseName + "_" + QString::number(suffix); if (!m_deBruijnGraphNodes.contains(potentialUniqueName + "+")) return potentialUniqueName; } //Code should never get here. return baseName; } void AssemblyGraph::mergeGraphicsNodes(QList * originalNodes, QList * revCompOriginalNodes, DeBruijnNode * newNode, MyGraphicsScene * scene) { bool success = mergeGraphicsNodes2(originalNodes, newNode, scene); if (success) newNode->setAsDrawn(); if (g_settings->doubleMode) { DeBruijnNode * newRevComp = newNode->getReverseComplement(); bool revCompSuccess = mergeGraphicsNodes2(revCompOriginalNodes, newRevComp, scene); if (revCompSuccess) newRevComp->setAsDrawn(); } std::vector nodesToRemove; for (int i = 0; i < originalNodes->size(); ++i) nodesToRemove.push_back((*originalNodes)[i]); removeGraphicsItemNodes(&nodesToRemove, true, scene); } bool AssemblyGraph::mergeGraphicsNodes2(QList * originalNodes, DeBruijnNode * newNode, MyGraphicsScene * scene) { bool success = true; std::vector linePoints; for (int i = 0; i < originalNodes->size(); ++i) { DeBruijnNode * node = (*originalNodes)[i]; //If we are in single mode, then we should check for a GraphicsItemNode only //in the positive nodes. bool opposite = false; if (!g_settings->doubleMode && node->isNegativeNode()) { node = node->getReverseComplement(); opposite = true; } GraphicsItemNode * originalGraphicsItemNode = node->getGraphicsItemNode(); if (originalGraphicsItemNode == 0) { success = false; break; } std::vector originalLinePoints = originalGraphicsItemNode->m_linePoints; //Add the original line points to the new line point collection. If we //are working with an opposite node, then we need to reverse the order. if (opposite) { for (size_t j = originalLinePoints.size(); j > 0; --j) linePoints.push_back(originalLinePoints[j-1]); } else { for (size_t j = 0; j < originalLinePoints.size(); ++j) linePoints.push_back(originalLinePoints[j]); } } if (success) { GraphicsItemNode * newGraphicsItemNode = new GraphicsItemNode(newNode, linePoints); newNode->setGraphicsItemNode(newGraphicsItemNode); newGraphicsItemNode->setFlag(QGraphicsItem::ItemIsSelectable); newGraphicsItemNode->setFlag(QGraphicsItem::ItemIsMovable); newGraphicsItemNode->setNodeColour(); scene->addItem(newGraphicsItemNode); const std::vector * newEdges = newNode->getEdgesPointer(); for (size_t i = 0; i < newEdges->size(); ++i) { DeBruijnEdge * newEdge = (*newEdges)[i]; GraphicsItemEdge * graphicsItemEdge = new GraphicsItemEdge(newEdge); graphicsItemEdge->setZValue(-1.0); newEdge->setGraphicsItemEdge(graphicsItemEdge); graphicsItemEdge->setFlag(QGraphicsItem::ItemIsSelectable); scene->addItem(graphicsItemEdge); } } return success; } //If reverseComplement is true, this function will also remove the graphics items for reverse complements of the nodes. void AssemblyGraph::removeGraphicsItemNodes(const std::vector * nodes, bool reverseComplement, MyGraphicsScene * scene) { QSet graphicsItemNodesToDelete; for (size_t i = 0; i < nodes->size(); ++i) { DeBruijnNode * node = (*nodes)[i]; removeAllGraphicsEdgesFromNode(node, reverseComplement, scene); GraphicsItemNode * graphicsItemNode = node->getGraphicsItemNode(); if (graphicsItemNode != 0 && !graphicsItemNodesToDelete.contains(graphicsItemNode)) graphicsItemNodesToDelete.insert(graphicsItemNode); node->setGraphicsItemNode(0); if (reverseComplement) { DeBruijnNode * rcNode = node->getReverseComplement(); GraphicsItemNode * rcGraphicsItemNode = rcNode->getGraphicsItemNode(); if (rcGraphicsItemNode != 0 && !graphicsItemNodesToDelete.contains(rcGraphicsItemNode)) graphicsItemNodesToDelete.insert(rcGraphicsItemNode); rcNode->setGraphicsItemNode(0); } } if (scene != 0) scene->blockSignals(true); QSetIterator i(graphicsItemNodesToDelete); while (i.hasNext()) { GraphicsItemNode * graphicsItemNode = i.next(); if (graphicsItemNode != 0) { if (scene != 0) scene->removeItem(graphicsItemNode); delete graphicsItemNode; } } if (scene != 0) scene->blockSignals(false); } void AssemblyGraph::removeAllGraphicsEdgesFromNode(DeBruijnNode * node, bool reverseComplement, MyGraphicsScene * scene) { const std::vector * edges = node->getEdgesPointer(); removeGraphicsItemEdges(edges, reverseComplement, scene); } void AssemblyGraph::removeGraphicsItemEdges(const std::vector * edges, bool reverseComplement, MyGraphicsScene * scene) { QSet graphicsItemEdgesToDelete; for (size_t i = 0; i < edges->size(); ++i) { DeBruijnEdge * edge = (*edges)[i]; GraphicsItemEdge * graphicsItemEdge = edge->getGraphicsItemEdge(); if (graphicsItemEdge != 0 && !graphicsItemEdgesToDelete.contains(graphicsItemEdge)) graphicsItemEdgesToDelete.insert(graphicsItemEdge); edge->setGraphicsItemEdge(0); if (reverseComplement) { DeBruijnEdge * rcEdge = edge->getReverseComplement(); GraphicsItemEdge * rcGraphicsItemEdge = rcEdge->getGraphicsItemEdge(); if (rcGraphicsItemEdge != 0 && !graphicsItemEdgesToDelete.contains(rcGraphicsItemEdge)) graphicsItemEdgesToDelete.insert(rcGraphicsItemEdge); rcEdge->setGraphicsItemEdge(0); } } if (scene != 0) scene->blockSignals(true); QSetIterator i(graphicsItemEdgesToDelete); while (i.hasNext()) { GraphicsItemEdge * graphicsItemEdge = i.next(); if (graphicsItemEdge != 0) { if (scene != 0) scene->removeItem(graphicsItemEdge); delete graphicsItemEdge; } } if (scene != 0) scene->blockSignals(false); } //This function simplifies the graph by merging all possible nodes in a simple //line. It returns the number of merges that it did. //It gets a pointer to the progress dialog as well so it can check to see if the //user has cancelled the merge. int AssemblyGraph::mergeAllPossible(MyGraphicsScene * scene, MyProgressDialog * progressDialog) { //Create a set of all nodes. QSet uncheckedNodes; QMapIterator i(m_deBruijnGraphNodes); while (i.hasNext()) { i.next(); uncheckedNodes.insert(i.value()); } //Create a list of all merges to be done. QList< QList > allMerges; QMapIterator j(m_deBruijnGraphNodes); while (j.hasNext()) { j.next(); DeBruijnNode * node = j.value(); //If the current node isn't checked, then we will find the longest //possible mergable sequence containing this node. if (uncheckedNodes.contains(node)) { QList nodesToMerge; nodesToMerge.push_back(node); uncheckedNodes.remove(node); uncheckedNodes.remove(node->getReverseComplement()); //Extend forward as much as possible. bool extended; do { extended = false; DeBruijnNode * last = nodesToMerge.back(); std::vector outgoingEdges = last->getLeavingEdges(); if (outgoingEdges.size() == 1) { DeBruijnEdge * potentialEdge = outgoingEdges[0]; DeBruijnNode * potentialNode = potentialEdge->getEndingNode(); std::vector edgesEnteringPotentialNode = potentialNode->getEnteringEdges(); if (edgesEnteringPotentialNode.size() == 1 && edgesEnteringPotentialNode[0] == potentialEdge && !nodesToMerge.contains(potentialNode) && uncheckedNodes.contains(potentialNode)) { nodesToMerge.push_back(potentialNode); uncheckedNodes.remove(potentialNode); uncheckedNodes.remove(potentialNode->getReverseComplement()); extended = true; } } } while (extended); //Extend backward as much as possible. do { extended = false; DeBruijnNode * first = nodesToMerge.front(); std::vector incomingEdges = first->getEnteringEdges(); if (incomingEdges.size() == 1) { DeBruijnEdge * potentialEdge = incomingEdges[0]; DeBruijnNode * potentialNode = potentialEdge->getStartingNode(); std::vector edgesLeavingPotentialNode = potentialNode->getLeavingEdges(); if (edgesLeavingPotentialNode.size() == 1 && edgesLeavingPotentialNode[0] == potentialEdge && !nodesToMerge.contains(potentialNode) && uncheckedNodes.contains(potentialNode)) { nodesToMerge.push_front(potentialNode); uncheckedNodes.remove(potentialNode); uncheckedNodes.remove(potentialNode->getReverseComplement()); extended = true; } } } while (extended); if (nodesToMerge.size() > 1) allMerges.push_back(nodesToMerge); } } //Now do the actual merges. QApplication::processEvents(); emit setMergeTotalCount(allMerges.size()); for (int i = 0; i < allMerges.size(); ++i) { if (progressDialog != 0 && progressDialog->wasCancelled()) break; mergeNodes(allMerges[i], scene, false); emit setMergeCompletedCount(i+1); QApplication::processEvents(); } recalculateAllDepthsRelativeToDrawnMean(); recalculateAllNodeWidths(); return allMerges.size(); } void AssemblyGraph::saveEntireGraphToFasta(QString filename) { QFile file(filename); file.open(QIODevice::WriteOnly | QIODevice::Text); QTextStream out(&file); QMapIterator i(m_deBruijnGraphNodes); while (i.hasNext()) { i.next(); out << i.value()->getFasta(true); } } void AssemblyGraph::saveEntireGraphToFastaOnlyPositiveNodes(QString filename) { QFile file(filename); file.open(QIODevice::WriteOnly | QIODevice::Text); QTextStream out(&file); QMapIterator i(m_deBruijnGraphNodes); while (i.hasNext()) { i.next(); DeBruijnNode * node = i.value(); if (node->isPositiveNode()) out << node->getFasta(false); } } bool AssemblyGraph::saveEntireGraphToGfa(QString filename) { QFile file(filename); bool success = file.open(QIODevice::WriteOnly | QIODevice::Text); if (!success) return false; QTextStream out(&file); QMapIterator i(m_deBruijnGraphNodes); while (i.hasNext()) { i.next(); DeBruijnNode * node = i.value(); if (node->isPositiveNode()) out << node->getGfaSegmentLine(m_depthTag); } QList edgesToSave; QMapIterator, DeBruijnEdge*> j(m_deBruijnGraphEdges); while (j.hasNext()) { j.next(); DeBruijnEdge * edge = j.value(); if (edge->isPositiveEdge()) edgesToSave.push_back(edge); } std::sort(edgesToSave.begin(), edgesToSave.end(), DeBruijnEdge::compareEdgePointers); for (int i = 0; i < edgesToSave.size(); ++i) out << edgesToSave[i]->getGfaLinkLine(); return true; } bool AssemblyGraph::saveVisibleGraphToGfa(QString filename) { QFile file(filename); bool success = file.open(QIODevice::WriteOnly | QIODevice::Text); if (!success) return false; QTextStream out(&file); QMapIterator i(m_deBruijnGraphNodes); while (i.hasNext()) { i.next(); DeBruijnNode * node = i.value(); if (node->thisNodeOrReverseComplementIsDrawn() && node->isPositiveNode()) out << node->getGfaSegmentLine(m_depthTag); } QList edgesToSave; QMapIterator, DeBruijnEdge*> j(m_deBruijnGraphEdges); while (j.hasNext()) { j.next(); DeBruijnEdge * edge = j.value(); if (edge->getStartingNode()->thisNodeOrReverseComplementIsDrawn() && edge->getEndingNode()->thisNodeOrReverseComplementIsDrawn() && edge->isPositiveEdge()) edgesToSave.push_back(edge); } std::sort(edgesToSave.begin(), edgesToSave.end(), DeBruijnEdge::compareEdgePointers); for (int i = 0; i < edgesToSave.size(); ++i) out << edgesToSave[i]->getGfaLinkLine(); return true; } //This function changes the name of a node pair. The new and old names are //both assumed to not include the +/- at the end. void AssemblyGraph::changeNodeName(QString oldName, QString newName) { if (checkNodeNameValidity(newName) != NODE_NAME_OKAY) return; QString posOldNodeName = oldName + "+"; QString negOldNodeName = oldName + "-"; if (!m_deBruijnGraphNodes.contains(posOldNodeName)) return; if (!m_deBruijnGraphNodes.contains(negOldNodeName)) return; DeBruijnNode * posNode = m_deBruijnGraphNodes[posOldNodeName]; DeBruijnNode * negNode = m_deBruijnGraphNodes[negOldNodeName]; m_deBruijnGraphNodes.remove(posOldNodeName); m_deBruijnGraphNodes.remove(negOldNodeName); QString posNewNodeName = newName + "+"; QString negNewNodeName = newName + "-"; posNode->setName(posNewNodeName); negNode->setName(negNewNodeName); m_deBruijnGraphNodes.insert(posNewNodeName, posNode); m_deBruijnGraphNodes.insert(negNewNodeName, negNode); } //This function checks whether a new node name is okay. It takes a node name //without a +/- at the end. NodeNameStatus AssemblyGraph::checkNodeNameValidity(QString nodeName) { if (nodeName.contains('\t')) return NODE_NAME_CONTAINS_TAB; if (nodeName.contains('\n')) return NODE_NAME_CONTAINS_NEWLINE; if (nodeName.contains(',')) return NODE_NAME_CONTAINS_COMMA; if (nodeName.contains(' ')) return NODE_NAME_CONTAINS_SPACE; if (m_deBruijnGraphNodes.contains(nodeName + "+")) return NODE_NAME_TAKEN; return NODE_NAME_OKAY; } void AssemblyGraph::changeNodeDepth(std::vector * nodes, double newDepth) { if (nodes->size() == 0) return; for (size_t i = 0; i < nodes->size(); ++i) { (*nodes)[i]->setDepth(newDepth); (*nodes)[i]->getReverseComplement()->setDepth(newDepth); } //If this graph does not already have a depthTag, give it a depthTag of KC //so the depth info will be saved. if (m_depthTag == "") m_depthTag = "KC"; } //This function is used when making FASTA outputs - it breaks a sequence into //separate lines. The default interval is 70, as that seems to be what NCBI //uses. //The returned string always ends in a newline. QByteArray AssemblyGraph::addNewlinesToSequence(QByteArray sequence, int interval) { QByteArray output; int charactersRemaining = sequence.length(); int currentIndex = 0; while (charactersRemaining > interval) { output += sequence.mid(currentIndex, interval); output += "\n"; charactersRemaining -= interval; currentIndex += interval; } output += sequence.mid(currentIndex); output += "\n"; return output; } //This function returns the number of dead ends in the graph. //It looks only at positive nodes, which can have 0, 1 or 2 dead ends each. //This value therefore varies between zero and twice the node count (specifically //the positive node count). int AssemblyGraph::getDeadEndCount() const { int deadEndCount = 0; QMapIterator i(m_deBruijnGraphNodes); while (i.hasNext()) { i.next(); DeBruijnNode * node = i.value(); if (node->isPositiveNode()) deadEndCount += node->getDeadEndCount(); } return deadEndCount; } void AssemblyGraph::getNodeStats(int * n50, int * shortestNode, int * firstQuartile, int * median, int * thirdQuartile, int * longestNode) const { if (m_totalLength == 0.0) return; std::vector nodeLengths; QMapIterator i(m_deBruijnGraphNodes); while (i.hasNext()) { i.next(); DeBruijnNode * node = i.value(); if (node->isPositiveNode()) nodeLengths.push_back(node->getLength()); } if (nodeLengths.size() == 0) return; std::sort(nodeLengths.begin(), nodeLengths.end()); *shortestNode = nodeLengths.front(); *longestNode = nodeLengths.back(); double firstQuartileIndex = (nodeLengths.size() - 1) / 4.0; double medianIndex = (nodeLengths.size() - 1) / 2.0; double thirdQuartileIndex = (nodeLengths.size() - 1) * 3.0 / 4.0; *firstQuartile = round(getValueUsingFractionalIndex(&nodeLengths, firstQuartileIndex)); *median = round(getValueUsingFractionalIndex(&nodeLengths, medianIndex)); *thirdQuartile = round(getValueUsingFractionalIndex(&nodeLengths, thirdQuartileIndex)); double halfTotalLength = m_totalLength / 2.0; long long totalSoFar = 0; for (int i = int(nodeLengths.size()) - 1; i >= 0 ; --i) { totalSoFar += nodeLengths[i]; if (totalSoFar >= halfTotalLength) { *n50 = nodeLengths[i]; break; } } } //This function uses an algorithm adapted from: http://math.hws.edu/eck/cs327_s04/chapter9.pdf void AssemblyGraph::getGraphComponentCountAndLargestComponentSize(int * componentCount, int * largestComponentLength) const { *componentCount = 0; *largestComponentLength = 0; QSet visitedNodes; QList< QList > connectedComponents; //Loop through all positive nodes. QMapIterator i(m_deBruijnGraphNodes); while (i.hasNext()) { i.next(); DeBruijnNode * v = i.value(); if (v->isNegativeNode()) continue; //If the node has not yet been visited, then it must be the start of a new connected component. if (!visitedNodes.contains(v)) { QList connectedComponent; QQueue q; q.enqueue(v); visitedNodes.insert(v); while (!q.isEmpty()) { DeBruijnNode * w = q.dequeue(); connectedComponent.push_back(w); std::vector connectedNodes = w->getAllConnectedPositiveNodes(); for (size_t j = 0; j < connectedNodes.size(); ++j) { DeBruijnNode * k = connectedNodes[j]; if (!visitedNodes.contains(k)) { visitedNodes.insert(k); q.enqueue(k); } } } connectedComponents.push_back(connectedComponent); } } //Now that the list of connected components is built, we look for the //largest one (as measured by total node length). *componentCount = connectedComponents.size(); for (int i = 0; i < *componentCount; ++i) { int componentLength = 0; for (int j = 0; j < connectedComponents[i].size(); ++j) componentLength += connectedComponents[i][j]->getLength(); if (componentLength > *largestComponentLength) *largestComponentLength = componentLength; } } bool compareNodeDepth(DeBruijnNode * a, DeBruijnNode * b) {return (a->getDepth() < b->getDepth());} double AssemblyGraph::getMedianDepthByBase() const { if (m_totalLength == 0) return 0.0; //Make a list of all nodes. long long totalLength = 0; QList nodeList; QMapIterator i(m_deBruijnGraphNodes); while (i.hasNext()) { i.next(); DeBruijnNode * node = i.value(); if (node->isPositiveNode()) { nodeList.push_back(node); totalLength += node->getLength(); } } //If there is only one node, then its depth is the median. if (nodeList.size() == 1) return nodeList[0]->getDepth(); //Sort the node list from low to high depth. std::sort(nodeList.begin(), nodeList.end(), compareNodeDepth); if (totalLength % 2 == 0) //Even total length { long long medianIndex2 = totalLength / 2; long long medianIndex1 = medianIndex2 - 1; double depth1 = findDepthAtIndex(&nodeList, medianIndex1); double depth2 = findDepthAtIndex(&nodeList, medianIndex2); return (depth1 + depth2) / 2.0; } else //Odd total length { long long medianIndex = (totalLength - 1) / 2; return findDepthAtIndex(&nodeList, medianIndex); } } //This function takes a node list sorted by depth and a target index (in terms of //the whole sequence length). It returns the depth at that index. double AssemblyGraph::findDepthAtIndex(QList * nodeList, long long targetIndex) const { long long lengthSoFar = 0; for (int i = 0; i < nodeList->size(); ++i) { DeBruijnNode * node = (*nodeList)[i]; lengthSoFar += node->getLength(); long long currentIndex = lengthSoFar - 1; if (currentIndex >= targetIndex) return node->getDepth(); } return 0.0; } long long AssemblyGraph::getEstimatedSequenceLength() const { return getEstimatedSequenceLength(getMedianDepthByBase()); } long long AssemblyGraph::getEstimatedSequenceLength(double medianDepthByBase) const { long long estimatedSequenceLength = 0; if (medianDepthByBase == 0.0) return 0; QMapIterator i(m_deBruijnGraphNodes); while (i.hasNext()) { i.next(); DeBruijnNode * node = i.value(); if (node->isPositiveNode()) { int nodeLength = node->getLengthWithoutTrailingOverlap(); double relativeDepth = node->getDepth() / medianDepthByBase; int closestIntegerDepth = round(relativeDepth); int lengthAdjustedForDepth = nodeLength * closestIntegerDepth; estimatedSequenceLength += lengthAdjustedForDepth; } } return estimatedSequenceLength; } long long AssemblyGraph::getTotalLengthMinusEdgeOverlaps() const { long long totalLength = 0; QMapIterator i(m_deBruijnGraphNodes); while (i.hasNext()) { i.next(); DeBruijnNode * node = i.value(); if (node->isPositiveNode()) { totalLength += node->getLength(); const std::vector * edges = node->getEdgesPointer(); int maxOverlap = 0; for (size_t j = 0; j < edges->size(); ++j) { int edgeOverlap = (*edges)[j]->getOverlap(); maxOverlap = std::max(maxOverlap, edgeOverlap); } totalLength -= maxOverlap; } } return totalLength; } QPair AssemblyGraph::getOverlapRange() const { int smallestOverlap = std::numeric_limits::max(); int largestOverlap = 0; QMapIterator, DeBruijnEdge*> i(m_deBruijnGraphEdges); while (i.hasNext()) { i.next(); int overlap = i.value()->getOverlap(); if (overlap < smallestOverlap) smallestOverlap = overlap; if (overlap > largestOverlap) largestOverlap = overlap; } if (smallestOverlap == std::numeric_limits::max()) smallestOverlap = 0; return QPair(smallestOverlap, largestOverlap); } //This function will look to see if there is a FASTA file (.fa or .fasta) with //the same base name as the graph. If so, it will load it and give its //sequences to the graph nodes with matching names. This is useful for GFA //files which have no sequences (just '*') like ABySS makes. //Returns true if any sequences were loaded (doesn't have to be all sequences //in the graph). bool AssemblyGraph::attemptToLoadSequencesFromFasta() { if (m_sequencesLoadedFromFasta == NOT_READY || m_sequencesLoadedFromFasta == TRIED) return false; m_sequencesLoadedFromFasta = TRIED; QFileInfo gfaFileInfo(m_filename); QString baseName = gfaFileInfo.completeBaseName(); QString fastaName = gfaFileInfo.dir().filePath(baseName + ".fa"); QFileInfo fastaFileInfo(fastaName); if (!fastaFileInfo.exists()) { fastaName = gfaFileInfo.dir().filePath(baseName + ".fasta"); fastaFileInfo = QFileInfo(fastaName); } if (!fastaFileInfo.exists()) { fastaName = gfaFileInfo.dir().filePath(baseName + ".contigs.fasta"); fastaFileInfo = QFileInfo(fastaName); } if (!fastaFileInfo.exists()) return false; bool atLeastOneNodeSequenceLoaded = false; std::vector names; std::vector sequences; readFastaFile(fastaName, &names, &sequences); for (size_t i = 0; i < names.size(); ++i) { QString name = names[i]; name = simplifyCanuNodeName(name); name = name.split(QRegularExpression("\\s+"))[0]; if (m_deBruijnGraphNodes.contains(name + "+")) { DeBruijnNode * posNode = m_deBruijnGraphNodes[name + "+"]; if (posNode->sequenceIsMissing()) { atLeastOneNodeSequenceLoaded = true; posNode->setSequence(sequences[i]); DeBruijnNode * negNode = m_deBruijnGraphNodes[name + "-"]; negNode->setSequence(getReverseComplement(sequences[i])); } } } return atLeastOneNodeSequenceLoaded; } // Returns true if every node name in the graph starts with the string. bool AssemblyGraph::allNodesStartWith(QString start) const { QMapIterator i(m_deBruijnGraphNodes); while (i.hasNext()) { i.next(); DeBruijnNode * node = i.value(); if (!node->getName().startsWith(start)) return false; } return true; } QString AssemblyGraph::simplifyCanuNodeName(QString oldName) const { QString newName; // Remove "tig" from front. if (!oldName.startsWith("tig")) return oldName; newName = oldName.remove(0, 3); if (newName.isEmpty()) return oldName; // Remove +/- from end. QChar sign = oldName[oldName.length()-1]; newName.chop(1); if (newName.isEmpty()) return oldName; // Remove leading zeros. while (newName.length() > 1 && newName[0] == '0') newName.remove(0, 1); return newName + sign; } long long AssemblyGraph::getTotalLengthOrphanedNodes() const { long long total = 0; QMapIterator i(m_deBruijnGraphNodes); while (i.hasNext()) { i.next(); DeBruijnNode * node = i.value(); if (node->isPositiveNode() && node->getDeadEndCount() == 2) total += node->getLength(); } return total; } bool AssemblyGraph::useLinearLayout() const { // If the graph has no edges, then we use a linear layout. Otherwise check the setting. if (m_edgeCount == 0) return true; else return g_settings->linearLayout; }