/** * Author: Mark Larkin * * Copyright (c) 2007 Des Higgins, Julie Thompson and Toby Gibson. */ /** * Changes: * * 16-02-07,Nigel Brown(EMBL): Added friend NameIterator to allow a caller to * process the name vector. * * 05-03-07,Nigel Brown(EMBL): modified searchForString() to skip over gaps in * sequence while matching. * * 26-03-07,Nigel Brown(EMBL): suppressed error message box for name clashes; * added testUniqueNames() predicate, which compares new sequence names with * those in the alignment vector BEFORE appending them. */ #ifdef HAVE_CONFIG_H #include "config.h" #endif #include #include #include #include "Alignment.h" using namespace std; namespace clustalw { Alignment::Alignment() : gapPos1(userParameters->getGapPos1()), gapPos2(userParameters->getGapPos2()) { maxNames = 0; numSeqs = 0; maxAlignmentLength = 0; lengthLongestSequence = 0; profile1NumSeqs = 0; } // Andreas Wilm (UCD): shouldn't resetProfile1 and resetProfile2 be merged? /** remove gaps from older alignments (code = gap_pos1) */ void Alignment::resetProfile1() { register int sl; /* which have code = gap_pos2 */ int i,j; int _gapPos1 = userParameters->getGapPos1(); int _gapPos2 = userParameters->getGapPos2(); bool _resetAlignNew = userParameters->getResetAlignmentsNew(); bool _resetAlignAll = userParameters->getResetAlignmentsAll(); if (userParameters->getStructPenalties1() != NONE) { sl = 0; for (j = 0; j < gapPenaltyMask1.size(); ++j) { if (gapPenaltyMask1[j] == _gapPos1 && (_resetAlignNew || _resetAlignAll)) { continue; } if (gapPenaltyMask1[j] == _gapPos2 && (_resetAlignAll)) { continue; } gapPenaltyMask1[sl] = gapPenaltyMask1[j]; ++sl; } } if (userParameters->getStructPenalties1() == SECST) { sl = 0; for (j = 0; j < secStructMask1.size(); ++j) { if (secStructMask1[j] == _gapPos1 && (_resetAlignNew || _resetAlignAll)) { continue; } if (secStructMask1[j] == _gapPos2 && (_resetAlignAll)) { continue; } secStructMask1[sl] = secStructMask1[j]; ++sl; } } for(i = 1; i <= profile1NumSeqs; ++i) { sl = 0; for(j = 1; j <= getSeqLength(i); ++j) { if(seqArray[i][j] == _gapPos1 && (_resetAlignNew || _resetAlignAll)) { continue; } if(seqArray[i][j] == _gapPos2 && (_resetAlignAll)) { continue; } ++sl; seqArray[i][sl] = seqArray[i][j]; } // Andreas Wilm (UCD): added 2008-03-07: // Remove excess bit at end of sequence int numExtraElements = seqArray[i].size() - 1 - sl; for(int k = 0; k < numExtraElements; k++) { seqArray[i].pop_back(); } } } // Andreas Wilm (UCD): shouldn't resetProfile1 and resetProfile2 be merged? void Alignment::resetProfile2() { register int sl; /* which have code = gap_pos2 */ int i, j; int _gapPos1 = userParameters->getGapPos1(); int _gapPos2 = userParameters->getGapPos2(); bool _resetAlignNew = userParameters->getResetAlignmentsNew(); bool _resetAlignAll = userParameters->getResetAlignmentsAll(); int _profile1NumSeqs = profile1NumSeqs; if (userParameters->getStructPenalties2() != NONE) { sl = 0; for (j = 0; j < gapPenaltyMask2.size(); ++j) { if (gapPenaltyMask2[j] == _gapPos1 && (_resetAlignNew || _resetAlignAll)) { continue; } if (gapPenaltyMask2[j] == _gapPos2 && (_resetAlignAll)) { continue; } gapPenaltyMask2[sl] = gapPenaltyMask2[j]; ++sl; } } if (userParameters->getStructPenalties2() == SECST) { sl = 0; for (j = 0; j < secStructMask2.size(); ++j) { if (secStructMask2[j] == _gapPos1 && (_resetAlignNew || _resetAlignAll)) { continue; } if (secStructMask2[j] == _gapPos2 && (_resetAlignAll)) { continue; } secStructMask2[sl] = secStructMask2[j]; ++sl; } } for(i = _profile1NumSeqs + 1; i <= numSeqs; ++i) { sl = 0; for(j = 1; j <= getSeqLength(i); ++j) { if(seqArray[i][j] == _gapPos1 && (_resetAlignNew || _resetAlignAll)) { continue; } if(seqArray[i][j] == _gapPos2 && (_resetAlignAll)) { continue; } ++sl; seqArray[i][sl] = seqArray[i][j]; } // Andreas Wilm (UCD) added 2008-03-07: // Remove excess bit at end of sequence int numExtraElements = seqArray[i].size() - 1 - sl; for(int k = 0; k < numExtraElements; k++) { seqArray[i].pop_back(); } } } void Alignment::resetAllSeqWeights() { seqWeight.clear(); seqWeight.resize(numSeqs + 1, 100); } /* * The function addOutputIndex must check that the number of outputindex * Not sure what to do with this one. */ bool Alignment::addOutputIndex(vector* outputIndexToAdd) { // First need to clear the old outputIndex outputIndex.clear(); // Check if the size is correct if(outputIndexToAdd->size() == numSeqs) { // Add the output index outputIndex = *outputIndexToAdd; return true; } else { clearAlignment(); return false; // Could not add them } } /* * The function appendOutputIndex is used when we are appending the outputIndex * to the current one. We do this when we are adding a profile2. */ bool Alignment::appendOutputIndex(vector* outputIndexToAppend) { // Check if the size is correct if((outputIndexToAppend->size() + outputIndex.size()) == numSeqs) { // Append the outputIndex vector::iterator outIndexIter = outputIndexToAppend->begin(); while(outIndexIter != outputIndexToAppend->end()) { outputIndex.push_back(*outIndexIter); outIndexIter++; } if(outputIndex.size() == numSeqs) { return true; } else { clearAlignment(); cout << "There is a problem with adding the sequences\n"; return false; } } else { clearAlignment(); return false; } } /* * The function addSequences takes a vector of sequences and adds it to the Alignment. * This is used if there is nothing in memory that we need. * */ void Alignment::addSequences(vector* seqVector) { clearAlignment(); numSeqs = seqVector->size(); vector emptyVec; // NOTE push dummy sequences on first! /******************************************************** * It was decided to stay with the seqs and seqArray * * index begining at 1. This is because it is difficult * * to change in the code, so we push dummy seqs on * ********************************************************/ seqArray.push_back(emptyVec); // EMPTY sequence names.push_back(string("")); titles.push_back(string("")); sequenceIds.push_back(0); addSequencesToVector(seqVector); calculateMaxLengths(); seqWeight.resize(numSeqs + 1, 100); } /* * The function appendSequences is used when we have a profile2. * */ void Alignment::appendSequences(vector* seqVector) { numSeqs += seqVector->size(); // Add to the number we already have! addSequencesToVector(seqVector); seqWeight.clear(); seqWeight.resize(numSeqs + 1, 100); calculateMaxLengths(); } void Alignment::pasteSequencesIntoPosition(vector* seqVector, int pos, bool explicitPasteToProfile2) { SeqArray::iterator seqArrayIterator; vector::iterator namesIterator; vector::iterator titlesIterator; vector::iterator sequenceIdsIterator; int profNum = userParameters->getProfileNum(); int numSeqsToInsert = seqVector->size(); if(numSeqsToInsert == 0 || pos < 0) { return; } if(pos == numSeqs) { seqArrayIterator = seqArray.end(); namesIterator = names.end(); titlesIterator = titles.end(); sequenceIdsIterator = sequenceIds.end(); } else { seqArrayIterator = seqArray.begin() + pos + 1; namesIterator = names.begin() + pos + 1; titlesIterator = titles.begin() + pos + 1; sequenceIdsIterator = sequenceIds.begin() + pos + 1; } int prof1NumSeqs = profile1NumSeqs; int initialProf1NumSeqs = prof1NumSeqs; for(int i = numSeqsToInsert - 1; i >= 0; i--) { seqArray.insert(seqArrayIterator, *(*seqVector)[i].getSequence()); names.insert(namesIterator, (*seqVector)[i].getName()); titles.insert(titlesIterator, (*seqVector)[i].getTitle()); sequenceIds.insert(sequenceIdsIterator, (*seqVector)[i].getIdentifier()); numSeqs++; if(profNum != 0 && !explicitPasteToProfile2 && pos <= prof1NumSeqs) { prof1NumSeqs++; } } if(profNum != 0 && pos <= prof1NumSeqs) { profile1NumSeqs = prof1NumSeqs; } resetAllSeqWeights(); setDefaultOutputIndex(); } void Alignment::printAllNames() { vector::iterator nameIter = names.begin(); while(nameIter != names.end()) { cout << *nameIter << endl; nameIter++; } } void Alignment::NameIterator::begin(Alignment *a) { //cout << "begin()\n"; if (a) { alignment = a; i = alignment->names.begin(); } } const string Alignment::NameIterator::next() { //cout << "next()\n"; if (!alignment) return string(); if (i == alignment->names.end()) return string(); return *i++; } bool Alignment::NameIterator::end() { //cout << "end()\n"; if (!alignment) return true; if (i == alignment->names.end()) return true; return false; } // 26-03-03,nige: test before appending seqVector bool Alignment::testUniqueNames(vector* seqVector) { vector::iterator oldName; vector::iterator newName; bool unique = true; //iterate over new candidate names for (newName = seqVector->begin(); unique && newName != seqVector->end(); newName++) { //iterate over old stored names for (oldName = names.begin() + 1; unique && oldName != names.end(); oldName++) { if (*oldName == newName->getName()) unique = false; } } return unique; } /* * The function addSequencesToVector is used to add sequences to our seqVector * It is used by both addSequences and appendSequences. This is to reduce code * duplication. */ void Alignment::addSequencesToVector(vector* seqVector) { std::vector::iterator itSeq; for(itSeq = seqVector->begin(); itSeq != seqVector->end(); ++itSeq) { seqArray.push_back(*(*itSeq).getSequence()); names.push_back((*itSeq).getName()); titles.push_back((*itSeq).getTitle()); sequenceIds.push_back((*itSeq).getIdentifier()); } if(!((seqArray.size() == numSeqs + 1) && (names.size() == numSeqs + 1) && (titles.size() == numSeqs + 1) && (sequenceIds.size() == numSeqs + 1))) { cout << "There has been an error adding the sequences to Alignment.\n" << "Must terminate the program. EaddSequencesrror occured in addSequences.\n"; exit(1); } } void Alignment::clearAlignment() { // Erase all the elements from the vector! clearSeqArray(); names.clear(); titles.clear(); outputIndex.clear(); sequenceIds.clear(); clearSecStruct1(); clearSecStruct2(); seqWeight.clear(); maxNames = 0; numSeqs = 0; maxAlignmentLength = 0; lengthLongestSequence = 0; userParameters->setProfileNum(0); userParameters->setProfile1Empty(true); userParameters->setProfile2Empty(true); } void Alignment::clearSeqArray() { for(int i = 0; i < seqArray.size(); i++) { seqArray[i].clear(); } seqArray.clear(); } void Alignment::clearSecStruct1() { gapPenaltyMask1.clear(); secStructMask1.clear(); secStructName1 = ""; } void Alignment::clearSecStruct2() { gapPenaltyMask2.clear(); secStructMask2.clear(); secStructName2 = ""; } /* * The function alignScore is used to score the alignment! * This is used by other classes to see what score the alignment has gotten. */ int Alignment::alignScore(void) { int maxRes; int seq1, seq2, res1, res2; int ngaps; int i, len1, len2; int score; int _maxAA = userParameters->getMaxAA(); float _gapOpen = userParameters->getGapOpen(); int matrix[NUMRES][NUMRES]; // // calculate an overall score for the alignment by summing the // scores for each pairwise alignment // maxRes = subMatrix->getAlnScoreMatrix(matrix); if (maxRes == 0) { utilityObject->error("Matrix for alignment scoring not found\n"); return 0; } score = 0; for (seq1 = 1; seq1 <= numSeqs; seq1++) { for (seq2 = 1; seq2 < seq1; seq2++) { len1 = seqArray[seq1].size() - 1; len2 = seqArray[seq2].size() - 1; for (i = 1; i < len1 && i < len2; i++) { res1 = seqArray[seq1][i]; res2 = seqArray[seq2][i]; if ((res1 >= 0) && (res1 <= _maxAA) && (res2 >= 0) && (res2 <= _maxAA)) { score += matrix[res1][res2]; } } ngaps = countGaps(seq1, seq2, len1); score = static_cast(score - (100 * _gapOpen * ngaps)); // Mark change 17-5-07 } } score /= 100; utilityObject->info("Alignment Score %d\n", score); return score; } int Alignment::countGaps(int seq1, int seq2, int len) { int i, g; int q, r;//, *Q, *R; vector Q, R; Q.resize(len + 2, 0); R.resize(len + 2, 0); int _maxAA = userParameters->getMaxAA(); try { Q[0] = R[0] = g = 0; for (i = 1; i < len; i++) { if (seqArray[seq1][i] > _maxAA) { q = 1; } else { q = 0; } if (seqArray[seq2][i] > _maxAA) { r = 1; } else { r = 0; } // NOTE I havent a clue what this does!!!! if (((Q[i - 1] <= R[i - 1]) && (q != 0) && (1-r != 0)) || ((Q[i - 1] >= R[i - 1]) && (1-q != 0) && (r != 0))) { g += 1; } if (q != 0) { Q[i] = Q[i - 1] + 1; } else { Q[i] = 0; } if (r != 0) { R[i] = R[i - 1] + 1; } else { R[i] = 0; } } } catch(const exception &ex) { cerr << ex.what() << endl; cerr << "Terminating program. Cannot continue. Function = countGaps\n"; exit(1); } return (g); } void Alignment::resetAlign() { /* remove gaps from older alignments (code = gap_pos1) EXCEPT for gaps that were INPUT with the seqs. which have code = gap_pos2 */ register int sl; int i, j; int _gapPos1 = userParameters->getGapPos1(); int _gapPos2 = userParameters->getGapPos2(); bool _resetAlignNew = userParameters->getResetAlignmentsNew(); bool _resetAlignAll = userParameters->getResetAlignmentsAll(); for(i = 1; i <= numSeqs;++i) { sl = 0; for(j = 1; j <= getSeqLength(i); ++j) { if(seqArray[i][j] == _gapPos1 && ( _resetAlignNew || _resetAlignAll)) { continue; } if(seqArray[i][j] == _gapPos2 && (_resetAlignAll)) { continue; } ++sl; seqArray[i][sl] = seqArray[i][j]; } // Andreas Wilm (UCD) added 2008-03-07: // Remove excess bit at end of sequence int numExtraElements = seqArray[i].size() - 1 - sl; for(int k = 0; k < numExtraElements; k++) { seqArray[i].pop_back(); } } } void Alignment::fixGaps() { int i,j; if (userParameters->getStructPenalties1() != NONE) { for (j = 0; j < getSeqLength(1); ++j) { if (gapPenaltyMask1[j] == userParameters->getGapPos1()) { gapPenaltyMask1[j] = userParameters->getGapPos2(); } } } if (userParameters->getStructPenalties1() == SECST) { for (j = 0; j < getSeqLength(1); ++j) { if (secStructMask1[j] == userParameters->getGapPos1()) { secStructMask1[j] = userParameters->getGapPos2(); } } } for(i = 1; i <= numSeqs; ++i) { for(j = 1; j <= getSeqLength(i); ++j) { if(seqArray[i][j] == userParameters->getGapPos1()) { seqArray[i][j] = userParameters->getGapPos2(); } } } } float Alignment::countid(int s1, int s2) { int c1, c2; int i; int count, total; float score; int shorterSeqLength = (getSeqLength(s1) < getSeqLength(s2)) ? getSeqLength(s1) : getSeqLength(s2); count = total = 0; for (i = 1; i <= shorterSeqLength; i++) // NOTE june29 { c1 = seqArray[s1][i]; c2 = seqArray[s2][i]; if ((c1 >= 0) && (c1 < userParameters->getMaxAA())) { total++; if (c1 == c2) { count++; } } } if (total == 0) { score = 0; } else { score = 100.0 *(float)count / (float)total; } return (score); } void Alignment::printSequences() { cout << std::endl; for(int i = 0; i < seqArray.size(); i++) { for(int j = 0; j < seqArray[i].size(); j++) { if(seqArray[i][j] > 9) cout << " " << seqArray[i][j]; else cout << " " << seqArray[i][j]; } cout << std::endl; } } /* * Note the max_aln_length is now called maxAlignmentLength, and it will be stored * and calculated in this class. Mostly it is used for allocating arrays. But not always. */ void Alignment::calculateMaxLengths() { maxAlignmentLength = 0; lengthLongestSequence = 0; if(seqArray.size() > 0) { SeqArray::iterator seqArrayIter = seqArray.begin(); while(seqArrayIter != seqArray.end()) { // NOTE I needed to change this to >= for a bug I had!!!!!!! if((*seqArrayIter).size() - 1 >= lengthLongestSequence) { lengthLongestSequence = (*seqArrayIter).size(); } ++seqArrayIter; } if(lengthLongestSequence > 0) { maxAlignmentLength = (lengthLongestSequence * 2) - 2; lengthLongestSequence -= 1; // MADE A CHANGE HERE AS WELL!! } else { lengthLongestSequence = 0; maxAlignmentLength = 0; } } else { maxAlignmentLength = 0; } maxNames = 0; if(names.size() > 0) { vector::iterator nameVecIter = names.begin(); while(nameVecIter != names.end()) { if((*nameVecIter).size() > maxNames) { maxNames = (*nameVecIter).size(); } ++nameVecIter; } if(maxNames < 10) { maxNames = 10; } } else { maxNames = 0; } } /* * This function checks to see if all names are different. It returns true if they * are all different, and false if there are 2 the same. */ bool Alignment::checkAllNamesDifferent() { bool different = true; // NOTE I added the + 1 here because, if we had a sequence with a name as blank // this would be the same as the first one! vector::iterator namesIter1 = names.begin() + 1; vector::iterator namesIter2; int counter1 = 1; int counter2 = 2; while(namesIter1 != names.end()) { namesIter2 = namesIter1 + 1; while(namesIter2 != names.end()) { if((*namesIter1).compare(*namesIter2) == 0) // If we have 2 strings the same. { different = false; /* 23-03-2007,nige: let someone up the stack deal with this - GUI is too deeply entangled. * utilityObject->error("Multiple sequences found with same name '%s' (first %d chars are significant)\n", namesIter1->c_str(), MAXNAMES); */ clearAlignment(); return different; // Not all different! } namesIter2++; counter2++; } namesIter1++; counter1++; counter2 = counter1 + 1; } return different; } void Alignment::addSecStructMask1(vector* secStructMaskToAdd) { secStructMask1 = *secStructMaskToAdd; } void Alignment::addSecStructMask2(vector* secStructMaskToAdd) { secStructMask2 = *secStructMaskToAdd; } void Alignment::addGapPenaltyMask1(vector* gapPenaltyMaskToAdd) { gapPenaltyMask1 = *gapPenaltyMaskToAdd; } void Alignment::addGapPenaltyMask2(vector* gapPenaltyMaskToAdd) { gapPenaltyMask2 = *gapPenaltyMaskToAdd; } void Alignment::addSecStructName1(string nameToAdd) { secStructName1 = nameToAdd; } void Alignment::addSecStructName2(string nameToAdd) { secStructName2 = nameToAdd; } void Alignment::addSeqWeight(vector* _seqWeight) { if(seqWeight.size() == _seqWeight->size()) { int size = seqWeight.size(); for(int i = 0; i < size; i++) { seqWeight[i] = (*_seqWeight)[i]; } } } void Alignment::printSequencesAddedInfo() { if(userParameters->getDisplayInfo()) { int startSeq = userParameters->getProfile2Empty() ? 1: profile1NumSeqs + 1; string dnaFlag = userParameters->getDNAFlag() ? "bp" : "aa"; for(int i = startSeq; i <= numSeqs; i++) { cout << "Sequence " << i << ": " << std::left << setw(maxNames) << names.at(i) << std::right << setw(6) << getSequenceLength(i) << " " << dnaFlag << std::endl; } } } void Alignment::debugPrintOutAlignInfo() { for(int i = 1; i <= numSeqs; i++) { cout << "Sequence " << i << ": " << std::left << setw(maxNames) << names.at(i) << std::right << setw(6) << getSequenceLength(i) << std::endl; } } int Alignment::getSequenceLength(int index) { return seqArray.at(index).size() - 1; } int Alignment::getLengthLongestSequence() { int _lengthLongestSequence = 0; for(int i = 1; i <= numSeqs; i++) { if(getSeqLength(i) > _lengthLongestSequence) { _lengthLongestSequence = getSeqLength(i); } } return _lengthLongestSequence; } /** * This function is used to find the length of the longest sequence in the range. */ int Alignment::getLengthLongestSequence(int firstSeq, int lastSeq) { int _lengthLongestSequence = 0; if(firstSeq >= 1 && lastSeq <= numSeqs) { for(int i = firstSeq; i <= lastSeq; i++) { if(getSeqLength(i) > _lengthLongestSequence) { _lengthLongestSequence = getSeqLength(i); } } } return _lengthLongestSequence; // Will return 0 if cant check seqs } int Alignment::getMaxNames() { return maxNames; } string Alignment::getSecStructName1() { return secStructName1; } string Alignment::getSecStructName2() { return secStructName2; } vector* Alignment::getGapPenaltyMask1() { return &gapPenaltyMask1; } vector* Alignment::getGapPenaltyMask2() { return &gapPenaltyMask2; } vector* Alignment::getSecStructMask1() { return &secStructMask1; } vector* Alignment::getSecStructMask2() { return &secStructMask2; } int Alignment::getSecStructMask1Element(int index) { if(index > 0 && index < secStructMask1.size()) { return secStructMask1[index]; } else { throw VectorOutOfRange(string("secStructMask1"), index, secStructMask1.size() - 1); } } int Alignment::getSecStructMask2Element(int index) { if(index > 0 && index < secStructMask2.size()) { return secStructMask2[index]; } else { throw VectorOutOfRange(string("secStructMask2"), index, secStructMask2.size() - 1); } } int Alignment::getGapPenaltyMask1Element(int index) { if(index > 0 && index < gapPenaltyMask1.size()) { return gapPenaltyMask1[index]; } else { throw VectorOutOfRange(string("gapPenaltyMask1"), index, gapPenaltyMask1.size() - 1); } } int Alignment::getGapPenaltyMask2Element(int index) { if(index > 0 && index < gapPenaltyMask2.size()) { return gapPenaltyMask2[index]; } else { throw VectorOutOfRange(string("gapPenaltyMask2"), index, gapPenaltyMask2.size() - 1); } } string Alignment::getName(int index) { if(index > 0 && index < names.size()) { return names[index]; } else { throw VectorOutOfRange(string("names"), index, names.size() - 1); } } /** * Change: * Mark 25-1-2007: This function was added to allow access to the unique id. */ unsigned long Alignment::getUniqueId(int seq) { if(seq > 0 && seq < sequenceIds.size()) { return sequenceIds[seq]; } else { throw VectorOutOfRange(string("sequenceIds"), seq, sequenceIds.size() - 1); } } string Alignment::getTitle(int index) { if(index > 0 && index < titles.size()) { return titles[index]; } else { throw VectorOutOfRange(string("titles"), index, titles.size() - 1); } } int Alignment::getOutputIndex(int index) { if(index >= 0 && index < outputIndex.size()) { return outputIndex[index]; } else { throw VectorOutOfRange(string("outputIndex"), index, outputIndex.size() - 1); } } int Alignment::getSeqWeight(int index) const { if(index >= 0 && index < seqWeight.size()) { return seqWeight[index]; } else { throw VectorOutOfRange(string("seqWeight"), index, seqWeight.size() - 1); } } /** * This function is used by Qt. It is used to calculate the histogram values for the * widget in clustalQt. The function is in here because it needs access to the SubMatrix * class. * @param matNum The number of the matrix to be used in the comparison. * @return A pointer to a vector containing the histogram values. */ vector* Alignment::QTcalcHistColumnHeights(int firstSeq, int nSeqs, Array2D* exceptionalRes) { int n, i, s, p, r, r1; int numColumns = getLengthLongestSequence(); int scoreScale = userParameters->getQTScorePlotScale();//5; // From ClautalX. int scoreCutOff = userParameters->getQTResExceptionCutOff(); bool includeGaps = false; //short *mat_xref, *matptr; float median, mean; float t, q1, q3, ul; vector seqdist, sorteddist; float diff; int maxRes; vector seqvector; vector freq; vector > profile; int matrix[NUMRES][NUMRES]; int _maxAA = userParameters->getMaxAA(); int _gapPos1 = userParameters->getGapPos1(); int _gapPos2 = userParameters->getGapPos2(); histogramColumnHeights.resize(numColumns); //panel_data data1; subMatrix->getQTMatrixForHistogram(matrix); profile.resize(numColumns + 2, vector(_maxAA + 2)); freq.resize(_maxAA + 2); for(p = 0; p < numColumns; p++) { for(r = 0; r < _maxAA; r++) { freq[r] = 0; } for(s = firstSeq; s < firstSeq + nSeqs; s++) { if(p < getSeqLength(s + 1) && seqArray[s + 1][p + 1] >= 0 && seqArray[s + 1][p + 1] < _maxAA) { freq[seqArray[s + 1][p + 1]]++; } } for(r = 0; r < _maxAA; r++) { profile[p][r] = 0; for(r1 = 0; r1 < _maxAA; r1++) { profile[p][r] += freq[r1] * matrix[r1][r]; } profile[p][r] = static_cast(profile[p][r] / static_cast(nSeqs)); // Mark change 17-5-07 } } seqvector.resize(_maxAA + 2); seqdist.resize(nSeqs + 1); sorteddist.resize(nSeqs + 1); for(p = 0; p < numColumns; p++) { for(s = firstSeq; s < firstSeq + nSeqs; s++) { if (p < getSeqLength(s + 1)) { for (r = 0; r < _maxAA; r++) { seqvector[r] = matrix[r][seqArray[s + 1][p + 1]]; } } else { for (r = 0; r < _maxAA; r++) { seqvector[r] = matrix[r][_gapPos1]; } } seqdist[s - firstSeq] = 0.0; for(r = 0; r < _maxAA; r++) { diff = profile[p][r] - seqvector[r]; diff /= 1000.0; seqdist[s - firstSeq] += diff * diff; } seqdist[s - firstSeq] = sqrt((double)seqdist[s - firstSeq]); } // calculate mean,median and rms of seq distances mean = median = 0.0; if(includeGaps) { for(s = 0; s < nSeqs; s++) { mean += seqdist[s]; } mean /= nSeqs; n = nSeqs; for(s = 0; s < nSeqs; s++) { sorteddist[s] = seqdist[s]; } } else { n = 0; for(s = firstSeq; s < firstSeq + nSeqs; s++) { if(p < getSeqLength(s + 1) && seqArray[s + 1][p + 1] >= 0 && seqArray[s + 1][p + 1] < _maxAA) { mean += seqdist[s - firstSeq]; n++; } } if(n > 0) { mean /= n; } for(s = firstSeq, i = 0; s < firstSeq + nSeqs; s++) { if(p < getSeqLength(s + 1) && seqArray[s + 1][p + 1] >= 0 && seqArray[s + 1][p + 1] < _maxAA) { sorteddist[i++] = seqdist[s - firstSeq]; } } } sortScores(&sorteddist, 0, n - 1); if(n == 0) { median = 0; } else if(n % 2 == 0) { median = (sorteddist[n / 2 - 1] + sorteddist[n / 2]) / 2.0; } else { median = sorteddist[n / 2]; } if(scoreScale <= 5) { histogramColumnHeights[p] = static_cast(exp((double)(-mean * (6 - scoreScale) / 4.0)) * 100.0 * n / nSeqs); } else { histogramColumnHeights[p] = static_cast(exp((double)(-mean / (4.0 * (scoreScale - 4)))) * 100.0 * n / nSeqs); } if(n == 0) { ul = 0; } else { t = n/4.0 + 0.5; if(t - (int)t == 0.5) { q3 = (sorteddist[(int)t] + sorteddist[(int)t + 1]) / 2.0; q1 = (sorteddist[n-(int)t] + sorteddist[n - (int)t - 1]) / 2.0; } else if(t - (int)t > 0.5) { q3 = sorteddist[(int)t + 1]; q1 = sorteddist[n - (int)t - 1]; } else { q3 = sorteddist[(int)t]; q1 = sorteddist[n - (int)t]; } if (n < 4) { ul = sorteddist[0]; } else { ul = q3 + (q3 - q1) * ((float)scoreCutOff / 2.0); } } if((exceptionalRes->getRowSize() >= nSeqs) && exceptionalRes->getColSize() >= numColumns) { for(int s = firstSeq; s < firstSeq + nSeqs; s++) { if(seqdist[s - firstSeq] > ul && p < getSeqLength(s + 1) && seqArray[s + 1][p + 1] >= 0 && seqArray[s + 1][p + 1] < userParameters->getMaxAA()) { (*exceptionalRes)[s - firstSeq][p] = 1; } else { (*exceptionalRes)[s - firstSeq][p] = 0; } } } } return &histogramColumnHeights; } void Alignment::sortScores(vector* scores, int f, int l) { int i,last; if(f >= l) return; swap(scores, f, (f + l) / 2); last = f; for(i = f + 1; i <= l; i++) { if((*scores)[i] > (*scores)[f]) { swap(scores, ++last, i); } } swap(scores, f, last); sortScores(scores, f, last - 1); sortScores(scores, last + 1, l); } void Alignment::swap(vector* scores, int s1, int s2) { float temp; temp = (*scores)[s1]; (*scores)[s1] = (*scores)[s2]; (*scores)[s2] = temp; } int Alignment::searchForString(bool* found, int seq, int beginRes, string search) { int lengthSeq = getSeqLength(seq); if(beginRes > lengthSeq) { *found = false; return beginRes; } int res = beginRes; // First need to convert search into a vector of ints! vector codedSearch; int size = search.size(); codedSearch.resize(size); int code; for(int i = 0; i < size; i++) { code = userParameters->resIndex(userParameters->getAminoAcidCodes(), search[i]); codedSearch[i] = code; } int numSame = 0; int startPos = -1; int searchSize = codedSearch.size(); // Now check for the string of ints!!!! for(; res <= lengthSeq; res++) //- nige: res starts at 1 { if(seqArray[seq][res] == codedSearch[0]) { startPos = res; //- nige for(int i = 0; i < searchSize && (i + res) <= lengthSeq; i++) //- nige: res starts at 1 { if(seqArray[seq][res + i] == codedSearch[i]) { numSame++; } //nige: hack: encoded gap character: see also AlignmentScroll.cpp else if (seqArray[seq][res + i] == 31 || seqArray[seq][res + i] == 30) { res++; i--; } else { break; // Not the same } } if(numSame == searchSize) { *found = true; return startPos; } else { numSame = 0; } } } *found = false; return startPos; // Not found!!!!!!! } void Alignment::removeGapsFromSelectedSeqs(vector* selected) { //getNumSeqs() int size = selected->size(); int lengthOfSelectedSeq = 0; int gapPos1 = userParameters->getGapPos1(); int gapPos2 = userParameters->getGapPos2(); int s1; for(int i = 1; i <= getNumSeqs() && i < size; i++) { if((*selected)[i] == 1) { // remove gaps from this seq! lengthOfSelectedSeq = getSeqLength(i); s1 = 0; for(int j = 1; j <= lengthOfSelectedSeq; j++) { if(seqArray[i][j] == gapPos1 || seqArray[i][j] == gapPos2) { continue; } ++s1; seqArray[i][s1] = seqArray[i][j]; } // Then remove the excess bit at the end of the array int numExtraElements = lengthOfSelectedSeq - s1; if(seqArray[i].size() > numExtraElements) { for(int k = 0; k < numExtraElements; k++) { seqArray[i].pop_back(); } } } } } void Alignment::removeGapOnlyColsFromSelectedSeqs(vector* selected) { int numGaps = 0; int NoneSelected = -1; int numColumns = 0; int sizeSelected = selected->size(); int firstSeqSelected = NoneSelected; int gapPos1 = userParameters->getGapPos1(); int gapPos2 = userParameters->getGapPos2(); int k; for(int i = 1; i < sizeSelected; i++) { if((*selected)[i] == 1) { numColumns++; if(firstSeqSelected == -1) { firstSeqSelected = i; } } } if(firstSeqSelected == NoneSelected) { cout << "No Sequences have been selected\n"; return; } for(int i = 1; i <= getSeqLength(firstSeqSelected);) { numGaps = 0; for(int j = firstSeqSelected; j < sizeSelected && (*selected)[j] == 1; j++) { if(getSeqLength(j) >= i) { if(seqArray[j][i] == gapPos1 || seqArray[j][i] == gapPos2) { numGaps++; } } } if(numGaps == numColumns) { //cout << " removing a gap column\n\n"; for(int j = firstSeqSelected; j < sizeSelected && (*selected)[j] == 1; j++) { for(k = i + 1; k <= getSeqLength(j) + 1 && seqArray[j].size() > k; k++) { seqArray[j][k - 1] = seqArray[j][k]; } seqArray[j].pop_back(); // Remove the last element!! if(getSeqLength(firstSeqSelected) <= 0) { break; } } } else { i++; } } } void Alignment::removeAllGapOnlyColumns(int fSeq, int lSeq, int profileNum) { if(fSeq >= lSeq) { return; } int gapPos1 = userParameters->getGapPos1(); int gapPos2 = userParameters->getGapPos2(); int numGaps = 0; int numColumns = lSeq - fSeq + 1; int k; // We must cheack each column to see if it consists of only '-' for(int i = 1; i <= getSeqLength(fSeq);) { numGaps = 0; for(int j = fSeq; j <= lSeq; j++) { if(getSeqLength(j) >= i) { if(seqArray[j][i] == gapPos1 || seqArray[j][i] == gapPos2) { numGaps++; } } } if(numGaps == numColumns) { for(int j = fSeq; j <= lSeq; j++) { for(k = i + 1; k <= getSeqLength(j) + 1; k++) { seqArray[j][k - 1] = seqArray[j][k]; } seqArray[j].pop_back(); // Remove the last element!! if(profileNum == 1) { int lengthSecStruct = secStructMask1.size(); int lengthGapMask = gapPenaltyMask1.size(); for(k = i; k <= getSeqLength(fSeq) && k < lengthSecStruct; k++) { secStructMask1[k - 1] = secStructMask1[k]; } for(k = i; k <= getSeqLength(fSeq) && k < lengthGapMask; k++) { gapPenaltyMask1[k - 1] = gapPenaltyMask1[k]; } } if(profileNum == 2) { int lengthSecStruct = secStructMask2.size(); int lengthGapMask = gapPenaltyMask2.size(); for(k = i; k <= getSeqLength(fSeq) && k < lengthSecStruct; k++) { secStructMask2[k - 1] = secStructMask2[k]; } for(k = i; k <= getSeqLength(fSeq) && k < lengthGapMask; k++) { gapPenaltyMask2[k - 1] = gapPenaltyMask2[k]; } } if(getSeqLength(fSeq) <= 0) { break; } } } else { i++; } } } vector Alignment::cutSelectedSequencesFromAlignment(vector* selected) { vector cutSequences; int sizeOfSelected = selected->size(); SeqArray::iterator seqArrayIterator; vector::iterator namesIterator; vector::iterator titlesIterator; vector::iterator seqWeightIterator; vector::iterator sequenceIdsIterator; int newProfile1NumSeqs = profile1NumSeqs; int profNum = userParameters->getProfileNum(); int prof1NumSeqs = profile1NumSeqs; int numCutSoFar = 0; int intialNumSeqs = numSeqs; for(int i = 1; i < sizeOfSelected && i <= intialNumSeqs; i++) { if((*selected)[i] == 1) { // Cut the sequence from the alignment! seqArrayIterator = seqArray.begin() + i - numCutSoFar; namesIterator = names.begin() + i - numCutSoFar; titlesIterator = titles.begin() + i - numCutSoFar; seqWeightIterator = seqWeight.begin() + i - numCutSoFar; sequenceIdsIterator = sequenceIds.begin() + i - numCutSoFar; Sequence SeqToCut(&seqArray[i - numCutSoFar], *namesIterator, *titlesIterator, *sequenceIdsIterator); numCutSoFar++; seqArrayIterator->clear(); seqArray.erase(seqArrayIterator); names.erase(namesIterator); titles.erase(titlesIterator); seqWeight.erase(seqWeightIterator); sequenceIds.erase(sequenceIdsIterator); if(numSeqs > 0) { numSeqs--; } else { numSeqs = 0; break; } if(profNum > 0 && i <= prof1NumSeqs) { if(newProfile1NumSeqs > 0) { newProfile1NumSeqs--; } else { newProfile1NumSeqs = 0; } } cutSequences.push_back(SeqToCut); } } profile1NumSeqs = newProfile1NumSeqs; setDefaultOutputIndex(); resetAllSeqWeights(); return cutSequences; } void Alignment::setDefaultOutputIndex() { outputIndex.clear(); outputIndex.resize(numSeqs); for(int iseq = 1; iseq <= numSeqs; iseq++) { outputIndex[iseq - 1] = iseq; } } bool Alignment::removeAllOutsideRange(int beginPos, int endPos) { bool ok; if(beginPos < 0 || endPos > getLengthLongestSequence()) { return false; // cannot do it!!!! } // trim the seqArray ok = keepPortionOfSeqArray(beginPos, endPos); if(!ok) { cout << "There was a problem removing a portion of the array\n"; return false; } // recalculate the maxLengths calculateMaxLengths(); // Clear the histogram columns histogramColumnHeights.clear(); // reset the weights resetAllSeqWeights(); return true; } bool Alignment::keepPortionOfSeqArray(int beginRangeIndex, int endRangeIndex) { SeqArray sectionToRealign; vector emptyVec; sectionToRealign.push_back(emptyVec); // EMPTY sequence SeqArray::iterator posToAddTo = sectionToRealign.begin(); // erase from all sequences the range specified here!!!!! if(beginRangeIndex < 0 || endRangeIndex < 0) { return false; } SeqArray::iterator mainBeginIt = seqArray.begin() + 1; SeqArray::iterator mainEndIt = seqArray.end(); vector::iterator begin, end, beginRange, endRange, beginCopyRange, endCopyRange; for(; mainBeginIt != mainEndIt; mainBeginIt++) { vector vecToAdd; begin = mainBeginIt->begin() + 1; end = mainBeginIt->end(); beginRange = begin + beginRangeIndex; endRange = begin + endRangeIndex + 1; beginCopyRange = beginRange; endCopyRange = endRange; // We need to copy all of this into another vector. if(endCopyRange < end && beginCopyRange < end) { vecToAdd.push_back(0); for(; beginCopyRange != endCopyRange; beginCopyRange++) { vecToAdd.push_back(*beginCopyRange); } sectionToRealign.push_back(vecToAdd); } else { return false; } if(endRange < end && beginRange < end) { mainBeginIt->erase(beginRange, endRange); } else { return false; } } clearSeqArray(); seqArray = sectionToRealign; return true; } void Alignment::printSeqArray(SeqArray* arrayToPrint) { // I need to use iterators for everything here. SeqArray::iterator mainBeginIt = arrayToPrint->begin(); SeqArray::iterator mainEndIt = arrayToPrint->end(); vector::iterator begin, end; string aaCodes = userParameters->getAminoAcidCodes(); for(; mainBeginIt != mainEndIt; mainBeginIt++) { if(mainBeginIt->size() > 0) { begin = mainBeginIt->begin() + 1; end = mainBeginIt->end(); for(; begin != end; begin++) { if(*begin < aaCodes.size()) { cout << aaCodes[*begin]; } else { cout << "-"; } } cout << "\n"; } } } void Alignment::printProfile1() { cout << "************** PROFILE1 *********************\n"; SeqArray::iterator mainBeginIt = seqArray.begin() + 1; SeqArray::iterator mainEndIt = mainBeginIt + profile1NumSeqs; vector::iterator begin, end; string aaCodes = userParameters->getAminoAcidCodes(); for(; mainBeginIt != mainEndIt; mainBeginIt++) { cout << "PROFILE1 SEQ: "; if(mainBeginIt->size() > 0) { begin = mainBeginIt->begin() + 1; end = mainBeginIt->end(); for(; begin != end; begin++) { if(*begin < aaCodes.size()) { cout << aaCodes[*begin]; } else { cout << "-"; } } cout << "\n"; } } } void Alignment::printProfile2() { cout << "************** PROFILE2 *********************\n"; SeqArray::iterator mainBeginIt = seqArray.begin() + 1 + profile1NumSeqs; SeqArray::iterator mainEndIt = seqArray.end(); vector::iterator begin, end; string aaCodes = userParameters->getAminoAcidCodes(); for(; mainBeginIt != mainEndIt; mainBeginIt++) { cout << "PROFILE2 SEQ: "; if(mainBeginIt->size() > 0) { begin = mainBeginIt->begin() + 1; end = mainBeginIt->end(); for(; begin != end; begin++) { if(*begin < aaCodes.size()) { cout << aaCodes[*begin]; } else { cout << "-"; } } cout << "\n"; } } } bool Alignment::updateRealignedRange(SeqArray realignedSeqs, int beginPos, int endPos) { if(realignedSeqs.size() != seqArray.size()) { return false; } if(beginPos < 0 || endPos < 0) { return false; } // erase from all sequences the range specified here!!!!! SeqArray::iterator mainBeginIt = seqArray.begin() + 1; SeqArray::iterator mainEndIt = seqArray.end(); SeqArray::iterator pasteBeginIt = realignedSeqs.begin() + 1; SeqArray::iterator pasteEndIt = realignedSeqs.end(); vector::iterator begin, end, beginRange, endRange; for(; mainBeginIt != mainEndIt && pasteBeginIt != pasteEndIt; mainBeginIt++) { vector vecToAdd; begin = mainBeginIt->begin() + 1; end = mainBeginIt->end(); beginRange = begin + beginPos; endRange = begin + endPos + 1; if(endRange < end && beginRange < end) { mainBeginIt->erase(beginRange, endRange); mainBeginIt->insert(beginRange, pasteBeginIt->begin() + 1, pasteBeginIt->end()); } else { return false; } pasteBeginIt++; } return true; } bool Alignment::reloadAlignment() { if(getNumSeqs() <= 0) { return false; } if(userParameters->getOutputOrder() == INPUT) { return true; } if(outputIndex.size() != getNumSeqs()) { outputIndex.clear(); return false; } vector emptyVec; string emptyString = ""; SeqArray outputOrderSeqArray; outputOrderSeqArray.resize(getNumSeqs() + 1); outputOrderSeqArray[0] = emptyVec; vector outputOrderNames; outputOrderNames.resize(getNumSeqs() + 1); outputOrderNames[0] = emptyString; vector outputOrderTitles; outputOrderTitles.resize(getNumSeqs() + 1); outputOrderTitles[0] = emptyString; vector outputOrderSequenceIds; outputOrderSequenceIds.resize(getNumSeqs() + 1); outputOrderSequenceIds[0] = 0; int size = seqArray.size(); if((seqArray.size() != names.size()) || (seqArray.size() != titles.size()) || sequenceIds.size() != names.size()) { return false; } int _outIndex; // Now for each seq, for(int i = 1; i < size; i++) { if(i < outputOrderSeqArray.size() && i - 1 < outputIndex.size() && outputIndex[i - 1] < size) { _outIndex = outputIndex[i - 1]; outputOrderSeqArray[i] = seqArray[_outIndex]; outputOrderNames[i] = names[_outIndex]; outputOrderTitles[i] = titles[_outIndex]; outputOrderSequenceIds[i] = sequenceIds[_outIndex]; } else { return false; } } // Now we have a copy in the correct order. // Remove all the elements from the old ones and set them to be these arrays clearSeqArray(); seqArray = outputOrderSeqArray; names.clear(); names = outputOrderNames; titles.clear(); titles = outputOrderTitles; sequenceIds.clear(); sequenceIds = outputOrderSequenceIds; } const vector* Alignment::getSequenceFromUniqueId(unsigned long id) { for(int i = 0; i < sequenceIds.size(); i++) { if(sequenceIds[i] == id) { return getSequence(i); } } // We have not found it, throw an exception!!! throw SequenceNotFoundException(); } /** * Change: * Mark 26-1-2007: This function was added to allow access to the unique id. */ void Alignment::updateSequence(int index, const vector* seq) { if(index >= 1 && index < seqArray.size()) { seqArray[index] = *seq; } } /** * Change: * Mark 17-2-2007: This function was added to check if a res is a gap or not. */ bool Alignment::isGap(int seq, int col) const { int res = seqArray[seq][col]; if(res == gapPos1 || res == gapPos2) { return true; } else { return false; } } /** * This function will be used so that we can create an alignment object from a seqArray. * This will be used for the tree iteration. */ void Alignment::addSequences(SeqArray* seqVector) { clearAlignment(); numSeqs = seqVector->size() - 1; vector emptyVec; seqArray.push_back(emptyVec); // EMPTY sequence names.push_back(string("")); titles.push_back(string("")); sequenceIds.push_back(0); cout << "\nThere are " << numSeqs << " in the alignment obj\n"; for(int i = 1; i <= numSeqs; i++) { ostringstream name; seqArray.push_back((*seqVector)[i]); titles.push_back(string("")); sequenceIds.push_back(utilityObject->getUniqueSequenceIdentifier()); name << "name" << numSeqs; names.push_back(name.str()); } calculateMaxLengths(); seqWeight.resize(numSeqs + 1, 100); } }