/* Pollux Copyright (C) 2014 Eric Marinier This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program. If not, see . */ #include "ErrorCorrection.h" #include "Utility.h" #include "ErrorTyping.h" #include "Counting.h" #include #include #include #include "Reads.h" #include "Correction.h" #include "ErrorOutput.h" typedef enum { SUB_A = 0, SUB_T, SUB_C, SUB_G, INS, DEL_L_A, DEL_L_T, DEL_L_C, DEL_L_G, DEL_R_A, DEL_R_T, DEL_R_C, DEL_R_G, CorrectionType_Count // Needs to be last! Count of items in enum! } CorrectionType; unsigned int substitutionErrors = 0; unsigned int insertionErrors = 0; unsigned int deletionErrors = 0; unsigned int multipleErrors = 0; unsigned int homopolymerErrors = 0; #define HOMOPOLYMER_SIZE_RANGE 10 // [< -RANGE, -RANGE, .. , RANGE, > RANGE] #define HOMOPOLYMER_SIZE_OUTSIDE_NEGATIVE 0 // < -RANGE #define HOMOPOLYMER_SIZE_NEGATIVE_START 1 // -RANGE #define HOMOPOLYMER_SIZE_NEGATIVE_END (HOMOPOLYMER_SIZE_NEGATIVE_START + HOMOPOLYMER_SIZE_RANGE - 1) // (-1) #define HOMOPOLYMER_SIZE_POSITIVE_START (HOMOPOLYMER_SIZE_NEGATIVE_END + 1) // (1)) #define HOMOPOLYMER_SIZE_POSITIVE_END (HOMOPOLYMER_SIZE_POSITIVE_START + HOMOPOLYMER_SIZE_RANGE - 1) // + RANGE #define HOMOPOLYMER_SIZE_OUTSIDE_POSITIVE (HOMOPOLYMER_SIZE_POSITIVE_END + 1) // > + RANGE #define MINIMUM_HOMOPOLYMER_SIZE 1 unsigned int homopolymerSize[HOMOPOLYMER_SIZE_OUTSIDE_POSITIVE + 1] = {0}; void printCorrectionResults() { unsigned int homopolymerInsertions = 0; unsigned int homopolymerDeletions = 0; int currentSize; printf("\n"); printf("Corrected...\n"); printf("Reads with Multiple Corrections: %d\n", multipleErrors); printf("\n"); printf("Substitution Corrections: %d\n", substitutionErrors); printf("Single Insertion Corrections: %d\n", insertionErrors); printf("Single Deletion Corrections: %d\n", deletionErrors); printf("\n"); printf("Total Homopolymer Corrections: %d\n", homopolymerErrors); printf("\n"); printf("Breakdown:\n"); printf("< -%d: %d\n", HOMOPOLYMER_SIZE_RANGE, homopolymerSize[HOMOPOLYMER_SIZE_OUTSIDE_NEGATIVE]); for(int i = HOMOPOLYMER_SIZE_NEGATIVE_START; i <= HOMOPOLYMER_SIZE_NEGATIVE_END; i++) { currentSize = (i - HOMOPOLYMER_SIZE_RANGE - 1); homopolymerDeletions += homopolymerSize[i] * -currentSize; printf("%d : %d\n", currentSize, homopolymerSize[i]); } for(int i = HOMOPOLYMER_SIZE_POSITIVE_START; i <= HOMOPOLYMER_SIZE_POSITIVE_END; i++) { currentSize = (i - HOMOPOLYMER_SIZE_RANGE); homopolymerInsertions += homopolymerSize[i] * currentSize; printf("%d : %d\n", currentSize, homopolymerSize[i]); } printf("> %d: %d\n", HOMOPOLYMER_SIZE_RANGE, homopolymerSize[HOMOPOLYMER_SIZE_OUTSIDE_POSITIVE]); printf("\n"); printf("Total Insertions (Single + Homopolymer): %d\n", insertionErrors + homopolymerInsertions); printf("Total Deletions (Single + Homopolymer): %d\n", deletionErrors + homopolymerDeletions); } bool isHighToLow(unsigned int* kmers, unsigned int kmerLocation) { return (kmers[kmerLocation] > kmers[kmerLocation + 1]); } int getSequenceLocation(struct Sequence* sequence, int kmerLocation, KMerHashTable* kmers, unsigned int kmerSize) { // K-mers. int total = sequence->length - kmerSize + 1; unsigned int kmerCounts[total]; getKMerCounts(sequence->sequence, sequence->length, kmers, kmerSize, kmerCounts); int sequenceLocation; // Order of discrepancy? if(isHighToLow(kmerCounts, kmerLocation)) { // High -> Low sequenceLocation = kmerLocation + kmerSize; } else { // Low -> High sequenceLocation = kmerLocation; } return sequenceLocation; } char getAverageQuality(struct Sequence* sequence, int first, int second) { const char ERROR_VALUE = 33; int total = 0; int count = 0; // Within range: if(0 <= first && first < sequence->length) { total += (int)sequence->quality[first]; count++; } // Within range: if(0 <= second && second < sequence->length) { total += (int)sequence->quality[second]; count++; } if(count != 0) { return (char)(total / count); } else { return ERROR_VALUE; } } int evaluateCorrection(bool highToLow, unsigned int* kmerCounts, int total, int kmerLocation) { int count = 0; // HIGH -> LOW if(highToLow) { for(int i = kmerLocation; i < total - 1; i++) { if(isJump(kmerCounts[i], kmerCounts[i + 1])) { break; } else { count++; } } } // LOW -> HIGH else { for(int i = kmerLocation; i >= 0; i--) { if(isJump(kmerCounts[i], kmerCounts[i + 1])) { break; } else { count++; } } } return count; } bool doSubstitutionCorrection( struct Sequence* sequence, unsigned int kmerLocation, char substitution, KMerHashTable* kmers, unsigned int kmerSize) { int sequenceLocation = getSequenceLocation(sequence, kmerLocation, kmers, kmerSize); char originalBase = getBase(sequence->sequence, sequenceLocation); char originalQuality = sequence->quality[sequenceLocation]; char newQuality = getAverageQuality(sequence, sequenceLocation - 1, sequenceLocation + 1); // K-mers: int total = sequence->length - kmerSize + 1; unsigned int kmersInitial[total]; unsigned int kmersCorrection[total]; // Get initial counts. getKMerCounts(sequence->sequence, sequence->length, kmers, kmerSize, kmersInitial); // Try correction. changeBase(sequence, sequenceLocation, substitution, newQuality); // Get new counts. getKMerCounts(sequence->sequence, sequence->length, kmers, kmerSize, kmersCorrection); // Note changes: int kmersImproved = evaluateCorrection(isHighToLow(kmersInitial, kmerLocation), kmersCorrection, sequence->length - kmerSize + 1, kmerLocation); // Revert: changeBase(sequence, sequenceLocation, originalBase, originalQuality); return kmersImproved; } // Deletion Error: Need to insert a base. bool doDeletionCorrection(struct Sequence* sequence, unsigned int kmerLocation, char nucleotide, bool insertLeft, KMerHashTable* kmers, unsigned int kmerSize) { int sequenceLocation = getSequenceLocation(sequence, kmerLocation, kmers, kmerSize); char newQuality = getAverageQuality(sequence, sequenceLocation - 1, sequenceLocation + 1); // K-mers. int total = sequence->length - kmerSize + 1; unsigned int kmersInitial[total]; unsigned int kmersCorrection[total]; // Get initial counts. getKMerCounts(sequence->sequence, sequence->length, kmers, kmerSize, kmersInitial); // Try correction. if(insertLeft) { insertBase(sequence, sequenceLocation, nucleotide, newQuality); } else { insertBase(sequence, sequenceLocation + 1, nucleotide, newQuality); } // Get new counts. getKMerCounts(sequence->sequence, sequence->length, kmers, kmerSize, kmersCorrection); bool highToLow = isHighToLow(kmersInitial, kmerLocation); int kmersImproved; // Note changes: if(highToLow) { kmersImproved = evaluateCorrection(highToLow, kmersCorrection, sequence->length - kmerSize + 1, kmerLocation); } else { kmersImproved = evaluateCorrection(highToLow, kmersCorrection + 1, sequence->length - kmerSize + 1, kmerLocation); // (kmersCorrection + 1) because the 'good' bases has moved right // we don't want to look at garbage and think it's gold } // Revert: if(insertLeft) { deleteBase(sequence, sequenceLocation); } else { deleteBase(sequence, sequenceLocation + 1); } return kmersImproved - 1; } bool correctErrorAtLocationInsertion(struct Sequence* sequence, KMerHashTable* kmers, unsigned int kmerSize, int kmerLocation) { // Safety: if(sequence->length <= kmerSize + 1) { // We will be making the read even shorter. return false; } // Sequence: int sequenceLocation = getSequenceLocation(sequence, kmerLocation, kmers, kmerSize); char originalBase = getBase(sequence->sequence, sequenceLocation); char originalQuality = sequence->quality[sequenceLocation]; // K-mers. int total = sequence->length - kmerSize + 1; unsigned int kmersInitial[total]; // Original unsigned int kmersCorrection[total - 1]; // Original with deletion. (-1) // Get initial counts. getKMerCounts(sequence->sequence, sequence->length, kmers, kmerSize, kmersInitial); // Delete: deleteBase(sequence, sequenceLocation); // Get new counts. getKMerCounts(sequence->sequence, sequence->length, kmers, kmerSize, kmersCorrection); bool highToLow = isHighToLow(kmersInitial, kmerLocation); int kmersImproved; // Note changes: if(highToLow) { kmersImproved = evaluateCorrection(highToLow, kmersCorrection, sequence->length - kmerSize + 1, kmerLocation); } else { kmersImproved = evaluateCorrection(highToLow, kmersCorrection - 1, sequence->length - kmerSize + 1, kmerLocation); // (kmersCorrection - 1) because 'good' bases have fallen into the position } // Revert: insertBase(sequence, sequenceLocation, originalBase, originalQuality); return kmersImproved; } int getNextKMerDiscrepancy(double* kmerDiscrepancies, int total) { double score = -1; int kmerLocation = -1; for(int i = 0; i < total; i++) { if(kmerDiscrepancies[i] > 0 && kmerDiscrepancies[i] > score) { score = kmerDiscrepancies[i]; kmerLocation = i; } } return kmerLocation; } double scoreKMerDiscrepancy(int value1, int value2) { int high = getMax(value1, value2); int low = getMin(value1, value2); high = getMax(high, 1); low = getMax(low, 1); //double score = ((double)low * (double)low) / (double)high; double score = (double)high - (double)low; return score; } void findKMerDiscrepancies(struct Sequence* sequence, KMerHashTable* kmers, unsigned int kmerSize, double* kmerDiscrepancies) { // K-mers. int total = sequence->length - kmerSize + 1; unsigned int kmerCounts[total]; getKMerCounts(sequence->sequence, sequence->length, kmers, kmerSize, kmerCounts); // Analyze possible discrepancies for(int i = 0; i < total - 1; i++) { // Is there something we would consider a discrepancy? if(isJump(kmerCounts[i], kmerCounts[i + 1])) { kmerDiscrepancies[i] = scoreKMerDiscrepancy(kmerCounts[i], kmerCounts[i + 1]); } // Not a discrepancy. else { kmerDiscrepancies[i] = -1; } } } double getAverageKMerCount(struct Sequence* sequence, KMerHashTable* kmers, unsigned int kmerSize, int start, int end) { double average = 0; // K-mers: int total = sequence->length - kmerSize + 1; unsigned int kmerCounts[total]; getKMerCounts(sequence->sequence, sequence->length, kmers, kmerSize, kmerCounts); // Averaged k-mers: for(int i = start; i < end; i++) { average += kmerCounts[i]; } average = (double)average / (double)(end - start); return average; } bool correctErrorAtLocationHomopolymer(struct Sequence* sequence, KMerHashTable* kmers, unsigned int kmerSize, int kmerLocation) { bool highToLow; int initialHomopolymerLength; int currentHomopolymerLength; int homopolymerLeftmostNucleotide; double currentAverage; double maxAverage = 0; int bestLength; int start, end; // Used in averaging. // K-mers: int total = sequence->length - kmerSize + 1; unsigned int kmerCounts[total]; getKMerCounts(sequence->sequence, sequence->length, kmers, kmerSize, kmerCounts); int sequenceLocation = getSequenceLocation(sequence, kmerLocation, kmers, kmerSize); char newQuality = getAverageQuality(sequence, sequenceLocation - 1, sequenceLocation + 1); //TODO: THIS SHOULD BE TEMP // High -> Low if(isHighToLow(kmerCounts, kmerLocation)) { highToLow = true; homopolymerLeftmostNucleotide = getHomopolymerLeftmostNucleotide(sequence->sequence, sequenceLocation - 1); // Initialize Average (Single): start = (homopolymerLeftmostNucleotide - kmerSize + 1) + getHomopolymerLength(sequence->sequence, sequence->length, homopolymerLeftmostNucleotide); end = start + 1; maxAverage = getAverageKMerCount(sequence, kmers, kmerSize, start, end); } // Low -> High else { highToLow = false; homopolymerLeftmostNucleotide = getHomopolymerLeftmostNucleotide(sequence->sequence, sequenceLocation + 1); // Initialize Average (Single): start = homopolymerLeftmostNucleotide - 1; end = start + 1; maxAverage = getAverageKMerCount(sequence, kmers, kmerSize, start, end); } // Initialize: initialHomopolymerLength = getHomopolymerLength(sequence->sequence, sequence->length, homopolymerLeftmostNucleotide); // Homopolymer must be at least minimum size to try operating on it. if (initialHomopolymerLength < MINIMUM_HOMOPOLYMER_SIZE) { return false; } currentHomopolymerLength = initialHomopolymerLength; bestLength = initialHomopolymerLength; for (int i = getMax(initialHomopolymerLength / 2, 1); i <= initialHomopolymerLength * 2 && currentHomopolymerLength < kmerSize; i++) { setHomopolymerLength(sequence, homopolymerLeftmostNucleotide, i, newQuality); if(sequence->length <= kmerSize) { // The read is too short to gain any information. continue; } currentHomopolymerLength = getHomopolymerLength(sequence->sequence, sequence->length, homopolymerLeftmostNucleotide); // FIND K-MER RANGE: // Idea is to use as few k-mers as needed. // Have the k-mers be mostly the better part of the read. // High -> Low if(highToLow) { start = (homopolymerLeftmostNucleotide - kmerSize + 1) + (currentHomopolymerLength); end = start + 2; } // Low -> High else { start = homopolymerLeftmostNucleotide - 2; end = start + 2; } if(start < 0 || end > sequence->length - kmerSize + 1) { break; // We're outside the k-mer range and will always be. } currentAverage = getAverageKMerCount(sequence, kmers, kmerSize, start, end); if(currentAverage > maxAverage) { maxAverage = currentAverage; bestLength = i; } } setHomopolymerLength(sequence, homopolymerLeftmostNucleotide, bestLength, newQuality); // Did we find something better? if(bestLength != initialHomopolymerLength) { // Update information. sequence->corrections[sequence->numCorrections] = 'H'; sequence->homopolymerSize[sequence->numCorrections] = bestLength - initialHomopolymerLength; sequence->numCorrections = sequence->numCorrections + 1; return true; } else { return false; } } void assignCorrection(CorrectionType correctionType, struct Sequence* sequence, unsigned int kmerLocation, KMerHashTable* kmers, unsigned int kmerSize) { int sequenceLocation = getSequenceLocation(sequence, kmerLocation, kmers, kmerSize); char newQuality = getAverageQuality(sequence, sequenceLocation - 1, sequenceLocation + 1); switch(correctionType) { case SUB_A: changeBase(sequence, sequenceLocation, 'A', newQuality); break; case SUB_T: changeBase(sequence, sequenceLocation, 'T', newQuality); break; case SUB_C: changeBase(sequence, sequenceLocation, 'C', newQuality); break; case SUB_G: changeBase(sequence, sequenceLocation, 'G', newQuality); break; case INS: deleteBase(sequence, sequenceLocation); break; case DEL_L_A: insertBase(sequence, sequenceLocation, 'A', newQuality); break; case DEL_L_T: insertBase(sequence, sequenceLocation, 'T', newQuality); break; case DEL_L_C: insertBase(sequence, sequenceLocation, 'C', newQuality); break; case DEL_L_G: insertBase(sequence, sequenceLocation, 'G', newQuality); break; case DEL_R_A: insertBase(sequence, sequenceLocation + 1, 'A', newQuality); break; case DEL_R_T: insertBase(sequence, sequenceLocation + 1, 'T', newQuality); break; case DEL_R_C: insertBase(sequence, sequenceLocation + 1, 'C', newQuality); break; case DEL_R_G: insertBase(sequence, sequenceLocation + 1, 'G', newQuality); break; default: printf("UNRECOGNIZED ERROR CORRECTION TYPE!\n"); exit(0); } if(correctionType == SUB_A || correctionType == SUB_T || correctionType == SUB_C || correctionType == SUB_G) { sequence->corrections[sequence->numCorrections] = 'S'; } else if(correctionType == INS) { sequence->corrections[sequence->numCorrections] = 'I'; } else if(correctionType == DEL_L_A || correctionType == DEL_L_T || correctionType == DEL_L_C || correctionType == DEL_L_G || correctionType == DEL_R_A || correctionType == DEL_R_T || correctionType == DEL_R_C || correctionType == DEL_R_G) { sequence->corrections[sequence->numCorrections] = 'D'; } else { printf("UNRECOGNIZED ERROR CORRECTION TYPE!\n"); exit(0); } sequence->numCorrections = sequence->numCorrections + 1; } bool correctErrorAtLocation(struct Sequence* sequence, int kmerLocation, Correction* correction) { // KMers: KMerHashTable* kmers = correctionGetKMers(correction); unsigned int kmerSize = correctionGetKMerSize(correction); int total = sequence->length - kmerSize + 1; // Safety: if (kmerLocation <= 0 || kmerLocation >= (total - 2)) { // Out of bounds. return false; } int correctionAttempt[CorrectionType_Count]; // Initialize: for(int i = 0; i < CorrectionType_Count; i++) { correctionAttempt[i] = 0; } // Substitution Error if(correction->substitutions) { correctionAttempt[SUB_A] = doSubstitutionCorrection(sequence, kmerLocation, 'A', kmers, kmerSize); correctionAttempt[SUB_T] = doSubstitutionCorrection(sequence, kmerLocation, 'T', kmers, kmerSize); correctionAttempt[SUB_C] = doSubstitutionCorrection(sequence, kmerLocation, 'C', kmers, kmerSize); correctionAttempt[SUB_G] = doSubstitutionCorrection(sequence, kmerLocation, 'G', kmers, kmerSize); } // Insertion Error if(correction->insertions) { correctionAttempt[INS] = correctErrorAtLocationInsertion(sequence, kmers, kmerSize, kmerLocation); } // Deletion Error if(correction->deletions) { correctionAttempt[DEL_L_A] = doDeletionCorrection(sequence, kmerLocation, 'A', true, kmers, kmerSize); correctionAttempt[DEL_L_T] = doDeletionCorrection(sequence, kmerLocation, 'T', true, kmers, kmerSize); correctionAttempt[DEL_L_C] = doDeletionCorrection(sequence, kmerLocation, 'C', true, kmers, kmerSize); correctionAttempt[DEL_L_G] = doDeletionCorrection(sequence, kmerLocation, 'G', true, kmers, kmerSize); correctionAttempt[DEL_R_A] = doDeletionCorrection(sequence, kmerLocation, 'A', false, kmers, kmerSize); correctionAttempt[DEL_R_T] = doDeletionCorrection(sequence, kmerLocation, 'T', false, kmers, kmerSize); correctionAttempt[DEL_R_C] = doDeletionCorrection(sequence, kmerLocation, 'C', false, kmers, kmerSize); correctionAttempt[DEL_R_G] = doDeletionCorrection(sequence, kmerLocation, 'G', false, kmers, kmerSize); } // Find the best correction: CorrectionType bestCorrectionIndex = 0; int bestCorrectionValue = 0; for(int i = 0; i < CorrectionType_Count; i++) { if(correctionAttempt[i] > bestCorrectionValue) { bestCorrectionIndex = i; bestCorrectionValue = correctionAttempt[i]; } } if(bestCorrectionValue >= 2) { assignCorrection(bestCorrectionIndex, sequence, kmerLocation, kmers, kmerSize); return true; } // Homopolymers if(correction->homopolymers && correctErrorAtLocationHomopolymer(sequence, kmers, kmerSize, kmerLocation)) { return true; } return false; } void recordHomopolymerSize(int size) { if(size > HOMOPOLYMER_SIZE_RANGE) { homopolymerSize[HOMOPOLYMER_SIZE_RANGE * 2 + 1]++; } else if(size < -HOMOPOLYMER_SIZE_RANGE) { homopolymerSize[0]++; } else if(size > 0) { homopolymerSize[HOMOPOLYMER_SIZE_RANGE + size]++; } else if(size < 0) { homopolymerSize[(HOMOPOLYMER_SIZE_RANGE + 1) + size]++; } else { printf("Homopolymer size recording error!"); } } void applyCorrection(struct Sequence* sequence, struct read* read) { free(read->sequence); read->sequence = sequence->sequence; // NOT QUITE CORRECT! free(read->quality); read->quality = sequence->quality; read->length = sequence->length; // Update information. for(int i = 0; i < sequence->numCorrections; i++) { switch(sequence->corrections[i]) { case 'S': substitutionErrors++; break; case 'I': insertionErrors++; break; case 'D': deletionErrors++; break; // Homopolymers: case 'H': homopolymerErrors++; recordHomopolymerSize(sequence->homopolymerSize[i]); break; } } if(sequence->numCorrections > 1) { multipleErrors++; } free(sequence->corrections); free(sequence->homopolymerSize); } void revertCorrection(struct Sequence* sequence, struct read* read) { free(sequence->sequence); free(sequence->quality); free(sequence->corrections); free(sequence->homopolymerSize); } void initializeSequence(struct Sequence* sequence, struct read* read, Correction* correction, int maxCorrections) { // KMers: KMerHashTable* kmers = correctionGetKMers(correction); unsigned int kmerSize = correctionGetKMerSize(correction); // Creating temporary date structure. sequence->length = read->length; sequence->blocks = getNumMemoryBlocks(sequence->length); // Copy sequence: sequence->sequence = (unsigned long long int*) malloc(sequence->blocks * sizeof(unsigned long long int)); memcpy(sequence->sequence, read->sequence, sequence->blocks * sizeof(unsigned long long int)); // Copy quality scores: sequence->quality = (char*) malloc(strlen(read->quality) + 1); strcpy(sequence->quality, read->quality); // Correction information: sequence->numCorrections = 0; // Recorded corrections: sequence->corrections = (char*) malloc(maxCorrections * sizeof(char)); sequence->homopolymerSize = (int*) malloc(maxCorrections * sizeof(int)); // The size associated with the homopolymer correction record. } bool isHighQuality(struct read* read, Correction* correction) { // K-mers: KMerHashTable* kmers = correctionGetKMers(correction); unsigned int kmerSize = correctionGetKMerSize(correction); int total = read->length - kmerSize + 1; double numUniqueKMers = 0; unsigned int kmerCounts[total]; getKMerCounts(read->sequence, read->length, kmers, kmerSize, kmerCounts); // Scan k-mers: for(int i = 0; i < total; i++) { if(kmerCounts[i] == 1) { numUniqueKMers++; } } bool result; double fraction = (double)numUniqueKMers / (double)total; if (fraction < 0.5) { result = true; } else { result = false; } return result; } bool correctRead(struct read* read, Correction* correction) { // KMers: KMerHashTable* kmers = correctionGetKMers(correction); unsigned int kmerSize = correctionGetKMerSize(correction); int maxCorrections = getMax(MAX_CORRECTIONS, (int)(0.2 * read->length)); struct Sequence sequence; if(read->length <= kmerSize) { return false; } initializeSequence(&sequence, read, correction, maxCorrections); //-----------------------------// // Get initial discrepancies: int total = sequence.length - kmerSize + 1; double* discrepancies = (double*)malloc((total - 1) * sizeof(double*)); findKMerDiscrepancies(&sequence, kmers, kmerSize, discrepancies); int kmerLocation = getNextKMerDiscrepancy(discrepancies, total - 1); // No discrepancies? if(kmerLocation < 0) { // Read is high quality or low coverage. if(typeHighQuality(sequence.sequence, sequence.length, kmers, kmerSize, correction->lowKMerThreshold)) { sequence.type = HIGH_QUALITY; } else { sequence.type = LOW_COVERAGE; } // Differentiate between low coverage and bad. if(!isHighQuality(read, correction)) { read->type = BAD; } free(sequence.sequence); free(sequence.quality); free(discrepancies); return true; } // While there is a location of something to correct: while(kmerLocation >= 0 && kmerLocation < sequence.length) { // Safety -- In case we're stuck in a correction loop or read is terrible. if(sequence.numCorrections >= maxCorrections) { break; } if(correctErrorAtLocation(&sequence, kmerLocation, correction)) { // Correction successful! // Get new discrepancies: total = sequence.length - kmerSize + 1; free(discrepancies); discrepancies = (double*)malloc((total - 1) * sizeof(double*)); findKMerDiscrepancies(&sequence, kmers, kmerSize, discrepancies); } else { // Correction unsuccessful. discrepancies[kmerLocation] = -1; // Move to next discrepancy. } kmerLocation = getNextKMerDiscrepancy(discrepancies, total - 1); } // Were we successful? if(sequence.numCorrections < MAX_CORRECTIONS) { // Correction worked. applyCorrection(&sequence, read); } else { // Couldn't correct. revertCorrection(&sequence, read); } free(discrepancies); // Check to see if the read is considered bad. if(isHighQuality(read, correction)) { read->type = CORRECTED; } else { read->type = BAD; } return true; }