/*
Pollux
Copyright (C) 2014 Eric Marinier
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program. If not, see .
*/
#include "ErrorCorrection.h"
#include "Utility.h"
#include "ErrorTyping.h"
#include "Counting.h"
#include
#include
#include
#include "Reads.h"
#include "Correction.h"
#include "ErrorOutput.h"
typedef enum
{
SUB_A = 0,
SUB_T,
SUB_C,
SUB_G,
INS,
DEL_L_A,
DEL_L_T,
DEL_L_C,
DEL_L_G,
DEL_R_A,
DEL_R_T,
DEL_R_C,
DEL_R_G,
CorrectionType_Count // Needs to be last! Count of items in enum!
} CorrectionType;
unsigned int substitutionErrors = 0;
unsigned int insertionErrors = 0;
unsigned int deletionErrors = 0;
unsigned int multipleErrors = 0;
unsigned int homopolymerErrors = 0;
#define HOMOPOLYMER_SIZE_RANGE 10 // [< -RANGE, -RANGE, .. , RANGE, > RANGE]
#define HOMOPOLYMER_SIZE_OUTSIDE_NEGATIVE 0 // < -RANGE
#define HOMOPOLYMER_SIZE_NEGATIVE_START 1 // -RANGE
#define HOMOPOLYMER_SIZE_NEGATIVE_END (HOMOPOLYMER_SIZE_NEGATIVE_START + HOMOPOLYMER_SIZE_RANGE - 1) // (-1)
#define HOMOPOLYMER_SIZE_POSITIVE_START (HOMOPOLYMER_SIZE_NEGATIVE_END + 1) // (1))
#define HOMOPOLYMER_SIZE_POSITIVE_END (HOMOPOLYMER_SIZE_POSITIVE_START + HOMOPOLYMER_SIZE_RANGE - 1) // + RANGE
#define HOMOPOLYMER_SIZE_OUTSIDE_POSITIVE (HOMOPOLYMER_SIZE_POSITIVE_END + 1) // > + RANGE
#define MINIMUM_HOMOPOLYMER_SIZE 1
unsigned int homopolymerSize[HOMOPOLYMER_SIZE_OUTSIDE_POSITIVE + 1] = {0};
void printCorrectionResults()
{
unsigned int homopolymerInsertions = 0;
unsigned int homopolymerDeletions = 0;
int currentSize;
printf("\n");
printf("Corrected...\n");
printf("Reads with Multiple Corrections: %d\n", multipleErrors);
printf("\n");
printf("Substitution Corrections: %d\n", substitutionErrors);
printf("Single Insertion Corrections: %d\n", insertionErrors);
printf("Single Deletion Corrections: %d\n", deletionErrors);
printf("\n");
printf("Total Homopolymer Corrections: %d\n", homopolymerErrors);
printf("\n");
printf("Breakdown:\n");
printf("< -%d: %d\n", HOMOPOLYMER_SIZE_RANGE, homopolymerSize[HOMOPOLYMER_SIZE_OUTSIDE_NEGATIVE]);
for(int i = HOMOPOLYMER_SIZE_NEGATIVE_START; i <= HOMOPOLYMER_SIZE_NEGATIVE_END; i++)
{
currentSize = (i - HOMOPOLYMER_SIZE_RANGE - 1);
homopolymerDeletions += homopolymerSize[i] * -currentSize;
printf("%d : %d\n", currentSize, homopolymerSize[i]);
}
for(int i = HOMOPOLYMER_SIZE_POSITIVE_START; i <= HOMOPOLYMER_SIZE_POSITIVE_END; i++)
{
currentSize = (i - HOMOPOLYMER_SIZE_RANGE);
homopolymerInsertions += homopolymerSize[i] * currentSize;
printf("%d : %d\n", currentSize, homopolymerSize[i]);
}
printf("> %d: %d\n", HOMOPOLYMER_SIZE_RANGE, homopolymerSize[HOMOPOLYMER_SIZE_OUTSIDE_POSITIVE]);
printf("\n");
printf("Total Insertions (Single + Homopolymer): %d\n", insertionErrors + homopolymerInsertions);
printf("Total Deletions (Single + Homopolymer): %d\n", deletionErrors + homopolymerDeletions);
}
bool isHighToLow(unsigned int* kmers, unsigned int kmerLocation)
{
return (kmers[kmerLocation] > kmers[kmerLocation + 1]);
}
int getSequenceLocation(struct Sequence* sequence, int kmerLocation,
KMerHashTable* kmers, unsigned int kmerSize)
{
// K-mers.
int total = sequence->length - kmerSize + 1;
unsigned int kmerCounts[total];
getKMerCounts(sequence->sequence, sequence->length, kmers, kmerSize, kmerCounts);
int sequenceLocation;
// Order of discrepancy?
if(isHighToLow(kmerCounts, kmerLocation))
{
// High -> Low
sequenceLocation = kmerLocation + kmerSize;
}
else
{
// Low -> High
sequenceLocation = kmerLocation;
}
return sequenceLocation;
}
char getAverageQuality(struct Sequence* sequence, int first, int second)
{
const char ERROR_VALUE = 33;
int total = 0;
int count = 0;
// Within range:
if(0 <= first && first < sequence->length)
{
total += (int)sequence->quality[first];
count++;
}
// Within range:
if(0 <= second && second < sequence->length)
{
total += (int)sequence->quality[second];
count++;
}
if(count != 0)
{
return (char)(total / count);
}
else
{
return ERROR_VALUE;
}
}
int evaluateCorrection(bool highToLow, unsigned int* kmerCounts, int total, int kmerLocation)
{
int count = 0;
// HIGH -> LOW
if(highToLow)
{
for(int i = kmerLocation; i < total - 1; i++)
{
if(isJump(kmerCounts[i], kmerCounts[i + 1]))
{
break;
}
else
{
count++;
}
}
}
// LOW -> HIGH
else
{
for(int i = kmerLocation; i >= 0; i--)
{
if(isJump(kmerCounts[i], kmerCounts[i + 1]))
{
break;
}
else
{
count++;
}
}
}
return count;
}
bool doSubstitutionCorrection(
struct Sequence* sequence, unsigned int kmerLocation, char substitution,
KMerHashTable* kmers, unsigned int kmerSize)
{
int sequenceLocation = getSequenceLocation(sequence, kmerLocation, kmers, kmerSize);
char originalBase = getBase(sequence->sequence, sequenceLocation);
char originalQuality = sequence->quality[sequenceLocation];
char newQuality = getAverageQuality(sequence, sequenceLocation - 1, sequenceLocation + 1);
// K-mers:
int total = sequence->length - kmerSize + 1;
unsigned int kmersInitial[total];
unsigned int kmersCorrection[total];
// Get initial counts.
getKMerCounts(sequence->sequence, sequence->length, kmers, kmerSize, kmersInitial);
// Try correction.
changeBase(sequence, sequenceLocation, substitution, newQuality);
// Get new counts.
getKMerCounts(sequence->sequence, sequence->length, kmers, kmerSize, kmersCorrection);
// Note changes:
int kmersImproved = evaluateCorrection(isHighToLow(kmersInitial, kmerLocation),
kmersCorrection, sequence->length - kmerSize + 1, kmerLocation);
// Revert:
changeBase(sequence, sequenceLocation, originalBase, originalQuality);
return kmersImproved;
}
// Deletion Error: Need to insert a base.
bool doDeletionCorrection(struct Sequence* sequence, unsigned int kmerLocation,
char nucleotide, bool insertLeft,
KMerHashTable* kmers, unsigned int kmerSize)
{
int sequenceLocation = getSequenceLocation(sequence, kmerLocation, kmers, kmerSize);
char newQuality = getAverageQuality(sequence, sequenceLocation - 1, sequenceLocation + 1);
// K-mers.
int total = sequence->length - kmerSize + 1;
unsigned int kmersInitial[total];
unsigned int kmersCorrection[total];
// Get initial counts.
getKMerCounts(sequence->sequence, sequence->length, kmers, kmerSize, kmersInitial);
// Try correction.
if(insertLeft)
{
insertBase(sequence, sequenceLocation, nucleotide, newQuality);
}
else
{
insertBase(sequence, sequenceLocation + 1, nucleotide, newQuality);
}
// Get new counts.
getKMerCounts(sequence->sequence, sequence->length, kmers, kmerSize, kmersCorrection);
bool highToLow = isHighToLow(kmersInitial, kmerLocation);
int kmersImproved;
// Note changes:
if(highToLow)
{
kmersImproved = evaluateCorrection(highToLow, kmersCorrection, sequence->length - kmerSize + 1, kmerLocation);
}
else
{
kmersImproved = evaluateCorrection(highToLow, kmersCorrection + 1, sequence->length - kmerSize + 1, kmerLocation);
// (kmersCorrection + 1) because the 'good' bases has moved right
// we don't want to look at garbage and think it's gold
}
// Revert:
if(insertLeft)
{
deleteBase(sequence, sequenceLocation);
}
else
{
deleteBase(sequence, sequenceLocation + 1);
}
return kmersImproved - 1;
}
bool correctErrorAtLocationInsertion(struct Sequence* sequence,
KMerHashTable* kmers, unsigned int kmerSize, int kmerLocation)
{
// Safety:
if(sequence->length <= kmerSize + 1)
{
// We will be making the read even shorter.
return false;
}
// Sequence:
int sequenceLocation = getSequenceLocation(sequence, kmerLocation, kmers, kmerSize);
char originalBase = getBase(sequence->sequence, sequenceLocation);
char originalQuality = sequence->quality[sequenceLocation];
// K-mers.
int total = sequence->length - kmerSize + 1;
unsigned int kmersInitial[total]; // Original
unsigned int kmersCorrection[total - 1]; // Original with deletion. (-1)
// Get initial counts.
getKMerCounts(sequence->sequence, sequence->length, kmers, kmerSize, kmersInitial);
// Delete:
deleteBase(sequence, sequenceLocation);
// Get new counts.
getKMerCounts(sequence->sequence, sequence->length, kmers, kmerSize, kmersCorrection);
bool highToLow = isHighToLow(kmersInitial, kmerLocation);
int kmersImproved;
// Note changes:
if(highToLow)
{
kmersImproved = evaluateCorrection(highToLow, kmersCorrection, sequence->length - kmerSize + 1, kmerLocation);
}
else
{
kmersImproved = evaluateCorrection(highToLow, kmersCorrection - 1, sequence->length - kmerSize + 1, kmerLocation);
// (kmersCorrection - 1) because 'good' bases have fallen into the position
}
// Revert:
insertBase(sequence, sequenceLocation, originalBase, originalQuality);
return kmersImproved;
}
int getNextKMerDiscrepancy(double* kmerDiscrepancies, int total)
{
double score = -1;
int kmerLocation = -1;
for(int i = 0; i < total; i++)
{
if(kmerDiscrepancies[i] > 0 && kmerDiscrepancies[i] > score)
{
score = kmerDiscrepancies[i];
kmerLocation = i;
}
}
return kmerLocation;
}
double scoreKMerDiscrepancy(int value1, int value2)
{
int high = getMax(value1, value2);
int low = getMin(value1, value2);
high = getMax(high, 1);
low = getMax(low, 1);
//double score = ((double)low * (double)low) / (double)high;
double score = (double)high - (double)low;
return score;
}
void findKMerDiscrepancies(struct Sequence* sequence,
KMerHashTable* kmers, unsigned int kmerSize, double* kmerDiscrepancies)
{
// K-mers.
int total = sequence->length - kmerSize + 1;
unsigned int kmerCounts[total];
getKMerCounts(sequence->sequence, sequence->length, kmers, kmerSize, kmerCounts);
// Analyze possible discrepancies
for(int i = 0; i < total - 1; i++)
{
// Is there something we would consider a discrepancy?
if(isJump(kmerCounts[i], kmerCounts[i + 1]))
{
kmerDiscrepancies[i] = scoreKMerDiscrepancy(kmerCounts[i], kmerCounts[i + 1]);
}
// Not a discrepancy.
else
{
kmerDiscrepancies[i] = -1;
}
}
}
double getAverageKMerCount(struct Sequence* sequence,
KMerHashTable* kmers, unsigned int kmerSize, int start, int end)
{
double average = 0;
// K-mers:
int total = sequence->length - kmerSize + 1;
unsigned int kmerCounts[total];
getKMerCounts(sequence->sequence, sequence->length, kmers, kmerSize, kmerCounts);
// Averaged k-mers:
for(int i = start; i < end; i++)
{
average += kmerCounts[i];
}
average = (double)average / (double)(end - start);
return average;
}
bool correctErrorAtLocationHomopolymer(struct Sequence* sequence,
KMerHashTable* kmers, unsigned int kmerSize, int kmerLocation)
{
bool highToLow;
int initialHomopolymerLength;
int currentHomopolymerLength;
int homopolymerLeftmostNucleotide;
double currentAverage;
double maxAverage = 0;
int bestLength;
int start, end; // Used in averaging.
// K-mers:
int total = sequence->length - kmerSize + 1;
unsigned int kmerCounts[total];
getKMerCounts(sequence->sequence, sequence->length, kmers, kmerSize, kmerCounts);
int sequenceLocation = getSequenceLocation(sequence, kmerLocation, kmers, kmerSize);
char newQuality = getAverageQuality(sequence, sequenceLocation - 1, sequenceLocation + 1); //TODO: THIS SHOULD BE TEMP
// High -> Low
if(isHighToLow(kmerCounts, kmerLocation))
{
highToLow = true;
homopolymerLeftmostNucleotide = getHomopolymerLeftmostNucleotide(sequence->sequence, sequenceLocation - 1);
// Initialize Average (Single):
start = (homopolymerLeftmostNucleotide - kmerSize + 1) + getHomopolymerLength(sequence->sequence, sequence->length, homopolymerLeftmostNucleotide);
end = start + 1;
maxAverage = getAverageKMerCount(sequence, kmers, kmerSize, start, end);
}
// Low -> High
else
{
highToLow = false;
homopolymerLeftmostNucleotide = getHomopolymerLeftmostNucleotide(sequence->sequence, sequenceLocation + 1);
// Initialize Average (Single):
start = homopolymerLeftmostNucleotide - 1;
end = start + 1;
maxAverage = getAverageKMerCount(sequence, kmers, kmerSize, start, end);
}
// Initialize:
initialHomopolymerLength = getHomopolymerLength(sequence->sequence, sequence->length, homopolymerLeftmostNucleotide);
// Homopolymer must be at least minimum size to try operating on it.
if (initialHomopolymerLength < MINIMUM_HOMOPOLYMER_SIZE)
{
return false;
}
currentHomopolymerLength = initialHomopolymerLength;
bestLength = initialHomopolymerLength;
for (int i = getMax(initialHomopolymerLength / 2, 1);
i <= initialHomopolymerLength * 2 && currentHomopolymerLength < kmerSize; i++)
{
setHomopolymerLength(sequence, homopolymerLeftmostNucleotide, i, newQuality);
if(sequence->length <= kmerSize)
{
// The read is too short to gain any information.
continue;
}
currentHomopolymerLength = getHomopolymerLength(sequence->sequence, sequence->length, homopolymerLeftmostNucleotide);
// FIND K-MER RANGE:
// Idea is to use as few k-mers as needed.
// Have the k-mers be mostly the better part of the read.
// High -> Low
if(highToLow)
{
start = (homopolymerLeftmostNucleotide - kmerSize + 1) + (currentHomopolymerLength);
end = start + 2;
}
// Low -> High
else
{
start = homopolymerLeftmostNucleotide - 2;
end = start + 2;
}
if(start < 0 || end > sequence->length - kmerSize + 1)
{
break; // We're outside the k-mer range and will always be.
}
currentAverage = getAverageKMerCount(sequence, kmers, kmerSize, start, end);
if(currentAverage > maxAverage)
{
maxAverage = currentAverage;
bestLength = i;
}
}
setHomopolymerLength(sequence, homopolymerLeftmostNucleotide, bestLength, newQuality);
// Did we find something better?
if(bestLength != initialHomopolymerLength)
{
// Update information.
sequence->corrections[sequence->numCorrections] = 'H';
sequence->homopolymerSize[sequence->numCorrections] = bestLength - initialHomopolymerLength;
sequence->numCorrections = sequence->numCorrections + 1;
return true;
}
else
{
return false;
}
}
void assignCorrection(CorrectionType correctionType, struct Sequence* sequence,
unsigned int kmerLocation, KMerHashTable* kmers, unsigned int kmerSize)
{
int sequenceLocation = getSequenceLocation(sequence, kmerLocation, kmers, kmerSize);
char newQuality = getAverageQuality(sequence, sequenceLocation - 1, sequenceLocation + 1);
switch(correctionType)
{
case SUB_A:
changeBase(sequence, sequenceLocation, 'A', newQuality);
break;
case SUB_T:
changeBase(sequence, sequenceLocation, 'T', newQuality);
break;
case SUB_C:
changeBase(sequence, sequenceLocation, 'C', newQuality);
break;
case SUB_G:
changeBase(sequence, sequenceLocation, 'G', newQuality);
break;
case INS:
deleteBase(sequence, sequenceLocation);
break;
case DEL_L_A:
insertBase(sequence, sequenceLocation, 'A', newQuality);
break;
case DEL_L_T:
insertBase(sequence, sequenceLocation, 'T', newQuality);
break;
case DEL_L_C:
insertBase(sequence, sequenceLocation, 'C', newQuality);
break;
case DEL_L_G:
insertBase(sequence, sequenceLocation, 'G', newQuality);
break;
case DEL_R_A:
insertBase(sequence, sequenceLocation + 1, 'A', newQuality);
break;
case DEL_R_T:
insertBase(sequence, sequenceLocation + 1, 'T', newQuality);
break;
case DEL_R_C:
insertBase(sequence, sequenceLocation + 1, 'C', newQuality);
break;
case DEL_R_G:
insertBase(sequence, sequenceLocation + 1, 'G', newQuality);
break;
default:
printf("UNRECOGNIZED ERROR CORRECTION TYPE!\n");
exit(0);
}
if(correctionType == SUB_A || correctionType == SUB_T
|| correctionType == SUB_C || correctionType == SUB_G)
{
sequence->corrections[sequence->numCorrections] = 'S';
}
else if(correctionType == INS)
{
sequence->corrections[sequence->numCorrections] = 'I';
}
else if(correctionType == DEL_L_A || correctionType == DEL_L_T
|| correctionType == DEL_L_C || correctionType == DEL_L_G
|| correctionType == DEL_R_A || correctionType == DEL_R_T
|| correctionType == DEL_R_C || correctionType == DEL_R_G)
{
sequence->corrections[sequence->numCorrections] = 'D';
}
else
{
printf("UNRECOGNIZED ERROR CORRECTION TYPE!\n");
exit(0);
}
sequence->numCorrections = sequence->numCorrections + 1;
}
bool correctErrorAtLocation(struct Sequence* sequence, int kmerLocation,
Correction* correction)
{
// KMers:
KMerHashTable* kmers = correctionGetKMers(correction);
unsigned int kmerSize = correctionGetKMerSize(correction);
int total = sequence->length - kmerSize + 1;
// Safety:
if (kmerLocation <= 0 || kmerLocation >= (total - 2))
{
// Out of bounds.
return false;
}
int correctionAttempt[CorrectionType_Count];
// Initialize:
for(int i = 0; i < CorrectionType_Count; i++)
{
correctionAttempt[i] = 0;
}
// Substitution Error
if(correction->substitutions)
{
correctionAttempt[SUB_A] = doSubstitutionCorrection(sequence, kmerLocation, 'A', kmers, kmerSize);
correctionAttempt[SUB_T] = doSubstitutionCorrection(sequence, kmerLocation, 'T', kmers, kmerSize);
correctionAttempt[SUB_C] = doSubstitutionCorrection(sequence, kmerLocation, 'C', kmers, kmerSize);
correctionAttempt[SUB_G] = doSubstitutionCorrection(sequence, kmerLocation, 'G', kmers, kmerSize);
}
// Insertion Error
if(correction->insertions)
{
correctionAttempt[INS] = correctErrorAtLocationInsertion(sequence, kmers, kmerSize, kmerLocation);
}
// Deletion Error
if(correction->deletions)
{
correctionAttempt[DEL_L_A] = doDeletionCorrection(sequence, kmerLocation, 'A', true, kmers, kmerSize);
correctionAttempt[DEL_L_T] = doDeletionCorrection(sequence, kmerLocation, 'T', true, kmers, kmerSize);
correctionAttempt[DEL_L_C] = doDeletionCorrection(sequence, kmerLocation, 'C', true, kmers, kmerSize);
correctionAttempt[DEL_L_G] = doDeletionCorrection(sequence, kmerLocation, 'G', true, kmers, kmerSize);
correctionAttempt[DEL_R_A] = doDeletionCorrection(sequence, kmerLocation, 'A', false, kmers, kmerSize);
correctionAttempt[DEL_R_T] = doDeletionCorrection(sequence, kmerLocation, 'T', false, kmers, kmerSize);
correctionAttempt[DEL_R_C] = doDeletionCorrection(sequence, kmerLocation, 'C', false, kmers, kmerSize);
correctionAttempt[DEL_R_G] = doDeletionCorrection(sequence, kmerLocation, 'G', false, kmers, kmerSize);
}
// Find the best correction:
CorrectionType bestCorrectionIndex = 0;
int bestCorrectionValue = 0;
for(int i = 0; i < CorrectionType_Count; i++)
{
if(correctionAttempt[i] > bestCorrectionValue)
{
bestCorrectionIndex = i;
bestCorrectionValue = correctionAttempt[i];
}
}
if(bestCorrectionValue >= 2)
{
assignCorrection(bestCorrectionIndex, sequence, kmerLocation, kmers, kmerSize);
return true;
}
// Homopolymers
if(correction->homopolymers
&& correctErrorAtLocationHomopolymer(sequence, kmers, kmerSize, kmerLocation))
{
return true;
}
return false;
}
void recordHomopolymerSize(int size)
{
if(size > HOMOPOLYMER_SIZE_RANGE)
{
homopolymerSize[HOMOPOLYMER_SIZE_RANGE * 2 + 1]++;
}
else if(size < -HOMOPOLYMER_SIZE_RANGE)
{
homopolymerSize[0]++;
}
else if(size > 0)
{
homopolymerSize[HOMOPOLYMER_SIZE_RANGE + size]++;
}
else if(size < 0)
{
homopolymerSize[(HOMOPOLYMER_SIZE_RANGE + 1) + size]++;
}
else
{
printf("Homopolymer size recording error!");
}
}
void applyCorrection(struct Sequence* sequence, struct read* read)
{
free(read->sequence);
read->sequence = sequence->sequence; // NOT QUITE CORRECT!
free(read->quality);
read->quality = sequence->quality;
read->length = sequence->length;
// Update information.
for(int i = 0; i < sequence->numCorrections; i++)
{
switch(sequence->corrections[i])
{
case 'S': substitutionErrors++; break;
case 'I': insertionErrors++; break;
case 'D': deletionErrors++; break;
// Homopolymers:
case 'H': homopolymerErrors++;
recordHomopolymerSize(sequence->homopolymerSize[i]);
break;
}
}
if(sequence->numCorrections > 1)
{
multipleErrors++;
}
free(sequence->corrections);
free(sequence->homopolymerSize);
}
void revertCorrection(struct Sequence* sequence, struct read* read)
{
free(sequence->sequence);
free(sequence->quality);
free(sequence->corrections);
free(sequence->homopolymerSize);
}
void initializeSequence(struct Sequence* sequence, struct read* read,
Correction* correction, int maxCorrections)
{
// KMers:
KMerHashTable* kmers = correctionGetKMers(correction);
unsigned int kmerSize = correctionGetKMerSize(correction);
// Creating temporary date structure.
sequence->length = read->length;
sequence->blocks = getNumMemoryBlocks(sequence->length);
// Copy sequence:
sequence->sequence = (unsigned long long int*) malloc(sequence->blocks * sizeof(unsigned long long int));
memcpy(sequence->sequence, read->sequence, sequence->blocks * sizeof(unsigned long long int));
// Copy quality scores:
sequence->quality = (char*) malloc(strlen(read->quality) + 1);
strcpy(sequence->quality, read->quality);
// Correction information:
sequence->numCorrections = 0;
// Recorded corrections:
sequence->corrections = (char*) malloc(maxCorrections * sizeof(char));
sequence->homopolymerSize = (int*) malloc(maxCorrections * sizeof(int));
// The size associated with the homopolymer correction record.
}
bool isHighQuality(struct read* read, Correction* correction)
{
// K-mers:
KMerHashTable* kmers = correctionGetKMers(correction);
unsigned int kmerSize = correctionGetKMerSize(correction);
int total = read->length - kmerSize + 1;
double numUniqueKMers = 0;
unsigned int kmerCounts[total];
getKMerCounts(read->sequence, read->length, kmers, kmerSize, kmerCounts);
// Scan k-mers:
for(int i = 0; i < total; i++)
{
if(kmerCounts[i] == 1)
{
numUniqueKMers++;
}
}
bool result;
double fraction = (double)numUniqueKMers / (double)total;
if (fraction < 0.5)
{
result = true;
}
else
{
result = false;
}
return result;
}
bool correctRead(struct read* read, Correction* correction)
{
// KMers:
KMerHashTable* kmers = correctionGetKMers(correction);
unsigned int kmerSize = correctionGetKMerSize(correction);
int maxCorrections = getMax(MAX_CORRECTIONS, (int)(0.2 * read->length));
struct Sequence sequence;
if(read->length <= kmerSize)
{
return false;
}
initializeSequence(&sequence, read, correction, maxCorrections);
//-----------------------------//
// Get initial discrepancies:
int total = sequence.length - kmerSize + 1;
double* discrepancies = (double*)malloc((total - 1) * sizeof(double*));
findKMerDiscrepancies(&sequence, kmers, kmerSize, discrepancies);
int kmerLocation = getNextKMerDiscrepancy(discrepancies, total - 1);
// No discrepancies?
if(kmerLocation < 0)
{
// Read is high quality or low coverage.
if(typeHighQuality(sequence.sequence, sequence.length, kmers, kmerSize, correction->lowKMerThreshold))
{
sequence.type = HIGH_QUALITY;
}
else
{
sequence.type = LOW_COVERAGE;
}
// Differentiate between low coverage and bad.
if(!isHighQuality(read, correction))
{
read->type = BAD;
}
free(sequence.sequence);
free(sequence.quality);
free(discrepancies);
return true;
}
// While there is a location of something to correct:
while(kmerLocation >= 0 && kmerLocation < sequence.length)
{
// Safety -- In case we're stuck in a correction loop or read is terrible.
if(sequence.numCorrections >= maxCorrections)
{
break;
}
if(correctErrorAtLocation(&sequence, kmerLocation, correction))
{
// Correction successful!
// Get new discrepancies:
total = sequence.length - kmerSize + 1;
free(discrepancies);
discrepancies = (double*)malloc((total - 1) * sizeof(double*));
findKMerDiscrepancies(&sequence, kmers, kmerSize, discrepancies);
}
else
{
// Correction unsuccessful.
discrepancies[kmerLocation] = -1; // Move to next discrepancy.
}
kmerLocation = getNextKMerDiscrepancy(discrepancies, total - 1);
}
// Were we successful?
if(sequence.numCorrections < MAX_CORRECTIONS)
{
// Correction worked.
applyCorrection(&sequence, read);
}
else
{
// Couldn't correct.
revertCorrection(&sequence, read);
}
free(discrepancies);
// Check to see if the read is considered bad.
if(isHighQuality(read, correction))
{
read->type = CORRECTED;
}
else
{
read->type = BAD;
}
return true;
}