/*
Pollux
Copyright (C) 2014 Eric Marinier
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program. If not, see .
*/
#include "Globals.h"
#include "Utility.h"
#include "ErrorProcessing.h"
#include "ErrorOutput.h"
#include "ErrorTyping.h"
#include "ErrorCorrection.h"
#include "Reads.h"
#include "Correction.h"
#include
#include
#include
#include
#include
#include
#include
unsigned int KMER_SIZE = 31;
const int LEFT = 0;
const int RIGHT = 1;
static inline void printProgress(int x, int n, int r)
{
// Only update r times.
if (n/r == 0 || x % (n/r) != 0) return; // Mod 0 causes problems.
double result;
if(n / r != 1)
result = x / (n/r) * (100/r);
else
result = (x + 1) / ((double)n/(double)r) * ((double)100/(double)r);
printf("%d%% ", (int)result);
fflush(stdout);
}
void hashSequence(unsigned long long int* sequence, unsigned int sequenceLength,
KMerHashTable* kmers, unsigned int kmerSize)
{
unsigned long long int* reverse;
addKMersToTable(kmers, sequence, sequenceLength, kmerSize);
reverse = createReverseCompliment(sequence, sequenceLength);
addKMersToTable(kmers, reverse, sequenceLength, kmerSize);
free(reverse);
}
void hashReads(Correction* correction)
{
// Reads:
Reads** reads = correctionGetReads(correction);
unsigned int numReadSets = correctionGetNumReadSets(correction);
struct read* current;
// Read:
unsigned long long int* sequence;
unsigned int length;
// KMers:
KMerHashTable* kmers = correctionGetKMers(correction);
unsigned int kmerSize = correctionGetKMerSize(correction);
// Iterate over all files:
for(int file = 0; file < numReadSets; file++)
{
printf("Processing file %d/%d...\n", file + 1, numReadSets);
readsReset(reads[file]);
// Iterate over all reads:
for(int i = 0; readsHasNext(reads[file]); i++)
{
printProgress(i, readsGetCount(reads[file]), 20);
current = readsGetNext(reads[file]);
sequence = current->sequence;
length = current->length;
hashSequence(sequence, length, kmers, kmerSize);
}
printf("\n");
printf("Preprocessing k-mers...\n");
preprocessKMers(kmers, correction);
printf("Finished preprocessing k-mers!\n\n");
}
}
void checkDirectoryExistsAndCreate(char* directory)
{
const int MISSING = -1;
struct stat st = {0};
if (stat(directory, &st) == MISSING)
{
printf("Creating directory: %s\n", directory);
mkdir(directory, 0700);
}
}
void executePairedCorrection(Correction* correction,
FILE* leftCorrectedFile, FILE* rightCorrectedFile,
FILE* leftGarbageFile, FILE* rightGarbageFile, FILE* extraFile)
{
// Reads:
Reads** reads = correctionGetReads(correction);
// Correction Function:
CorrectionFunction correctionFunction = correctionGetFunction(correction);
int maxReads = getMax(readsGetCount(reads[LEFT]), readsGetCount(reads[RIGHT]));
int current = 0;
printf("Correcting paired files.\n");
readsReset(reads[LEFT]);
readsReset(reads[RIGHT]);
// Safety check:
if(!(readsHasNext(reads[LEFT]) && readsHasNext(reads[RIGHT])))
{
printf("No reads to correct!\n");
return;
}
struct read* leftRead = readsGetNext(reads[LEFT]);
struct read* rightRead = readsGetNext(reads[RIGHT]);
// Always correct the read immediately after loading.
correctionFunction(leftRead, correction);
correctionFunction(rightRead, correction);
while(readsHasNext(reads[LEFT]) && readsHasNext(reads[RIGHT]))
{
printProgress(current, maxReads, 20);
current++;
// The current pair matches.
// This is to accommodate read filtering which might have been done at
// the input level, which existed in the past.
if(leftRead->number == rightRead->number)
{
// Garbage the pair of reads only if BOTH are bad.
if (correction->filtering && leftRead->type == BAD && rightRead->type == BAD)
{
// write both to garbage pair files
outputRead(leftGarbageFile, leftRead);
outputRead(rightGarbageFile, rightRead);
}
else
{
// write both to corrected pair files
outputRead(leftCorrectedFile, leftRead);
outputRead(rightCorrectedFile, rightRead);
}
leftRead = readsGetNext(reads[LEFT]);
rightRead = readsGetNext(reads[RIGHT]);
correctionFunction(leftRead, correction);
correctionFunction(rightRead, correction);
}
// The left read is ahead of the right.
else if(leftRead->number > rightRead->number)
{
// write right to extra file
outputRead(extraFile, rightRead);
rightRead = readsGetNext(reads[RIGHT]);
correctionFunction(rightRead, correction);
}
// The right read is ahead of the left.
else if(leftRead->number < rightRead->number)
{
// write left to extra file
outputRead(extraFile, leftRead);
leftRead = readsGetNext(reads[LEFT]);
correctionFunction(leftRead, correction);
}
}
// Write leftover reads to extra:
while(readsHasNext(reads[LEFT]))
{
outputRead(extraFile, leftRead);
leftRead = readsGetNext(reads[LEFT]);
correctionFunction(leftRead, correction);
}
while(readsHasNext(reads[RIGHT]))
{
outputRead(extraFile, rightRead);
rightRead = readsGetNext(reads[RIGHT]);
correctionFunction(rightRead, correction);
}
printf("\n");
printCorrectionResults();
printf("\n");
}
int processSimpleCorrection(Correction* correction)
{
// Reads:
Reads** reads = correctionGetReads(correction);
unsigned int numReadSets = correctionGetNumReadSets(correction);
// Correction Function:
CorrectionFunction correctionFunction = correctionGetFunction(correction);
// Files:
char correctedFileName[1024];
char garbageFileName[1024];
FILE* correctedFile;
FILE* garbageFile;
char* outputDirectory = correctionGetOutputDirectory(correction);
char baseName[1024];
// Iterate over all files:
for(int file = 0; file < numReadSets; file++)
{
printf("Correcting file %d/%d...\n", file + 1, numReadSets);
// Prepare output:
strcpy(baseName, basename(readsGetFileName(reads[file])));
// We don't want the original to get changed.
strcpy(correctedFileName, outputDirectory);
strcpy(garbageFileName, outputDirectory);
if(correctedFileName[strlen(correctedFileName) - 1] != '/')
strcat(correctedFileName, "/");
if(garbageFileName[strlen(correctedFileName) - 1] != '/')
strcat(garbageFileName, "/");
strcat(correctedFileName, baseName);
strcat(correctedFileName, ".corrected");
correctedFile = fopen(correctedFileName, "w");
if(correction->filtering)
{
strcat(garbageFileName, baseName);
strcat(garbageFileName, ".low");
garbageFile = fopen(garbageFileName, "w");
}
// Safety check:
if(correctedFile == 0)
{
printf("Could not create file: %s\n", correctedFileName);
return 1;
}
if(correction->filtering)
{
if(garbageFile == 0)
{
printf("Could not create file: %s\n", garbageFileName);
return 1;
}
}
readsReset(reads[file]);
// Iterate over all reads:
for(int i = 0; readsHasNext(reads[file]); i++)
{
printProgress(i, readsGetCount(reads[file]), 20);
struct read* current = readsGetNext(reads[file]);
// Correction:
correctionFunction(current, correction);
// Output:
if (current->type != BAD || correction->filtering == false)
{
outputRead(correctedFile, current);
}
else
{
outputRead(garbageFile, current);
}
}
// Close file:
fclose(correctedFile);
if(correction->filtering)
{
fclose(garbageFile);
}
printf("\n");
printCorrectionResults();
printf("\n");
}
}
void processPairedCorrection(Correction* correction)
{
char leftCorrectedFileName[1024];
char rightCorrectedFileName[1024];
char leftGarbageFileName[1024];
char rightGarbageFileName[1024];
char extraFileName[1024];
char* outputDirectory = correctionGetOutputDirectory(correction);
char baseName[1024];
FILE* leftCorrectedFile;
FILE* rightCorrectedFile;
FILE* leftGarbageFile;
FILE* rightGarbageFile;
FILE* extraFile;
Reads** reads = correctionGetReads(correction);
// Left Corrected:
strcpy(baseName, basename(readsGetFileName(reads[LEFT])));
// We don't want the original to get changed.
strcpy(leftCorrectedFileName, outputDirectory);
if(leftCorrectedFileName[strlen(leftCorrectedFileName) - 1] != '/')
strcat(leftCorrectedFileName, "/");
strcat(leftCorrectedFileName, baseName);
strcat(leftCorrectedFileName, ".corrected");
leftCorrectedFile = fopen(leftCorrectedFileName, "w");
// Right Corrected:
strcpy(baseName, basename(readsGetFileName(reads[RIGHT])));
// We don't want the original to get changed.
strcpy(rightCorrectedFileName, outputDirectory);
if(rightCorrectedFileName[strlen(rightCorrectedFileName) - 1] != '/')
strcat(rightCorrectedFileName, "/");
strcat(rightCorrectedFileName, baseName);
strcat(rightCorrectedFileName, ".corrected");
rightCorrectedFile = fopen(rightCorrectedFileName, "w");
// Left Garbage:
if(correction->filtering)
{
strcpy(baseName, basename(readsGetFileName(reads[LEFT])));
// We don't want the original to get changed.
strcpy(leftGarbageFileName, outputDirectory);
if(leftGarbageFileName[strlen(leftGarbageFileName) - 1] != '/')
strcat(leftGarbageFileName, "/");
strcat(leftGarbageFileName, baseName);
strcat(leftGarbageFileName, ".low");
leftGarbageFile = fopen(leftGarbageFileName, "w");
// Right Garbage:
strcpy(baseName, basename(readsGetFileName(reads[RIGHT])));
// We don't want the original to get changed.
strcpy(rightGarbageFileName, outputDirectory);
if(rightGarbageFileName[strlen(rightGarbageFileName) - 1] != '/')
strcat(rightGarbageFileName, "/");
strcat(rightGarbageFileName, baseName);
strcat(rightGarbageFileName, ".low");
rightGarbageFile = fopen(rightGarbageFileName, "w");
}
// Extra:
strcpy(extraFileName, outputDirectory);
strcat(extraFileName, "/extra.corrected");
extraFile = fopen(extraFileName, "w");
executePairedCorrection(correction, leftCorrectedFile, rightCorrectedFile,
leftGarbageFile, rightGarbageFile, extraFile);
// Close files:
fclose(leftCorrectedFile);
fclose(rightCorrectedFile);
if(correction->filtering)
{
fclose(leftGarbageFile);
fclose(rightGarbageFile);
}
fclose(extraFile);
}
Correction* preprocessing(int numInputFiles, char* inputFileNames, char* outputDirectory)
{
unsigned int LOW_COVERAGE_THRESHOLD_DEFAULT = 3;
// DATA STRUCTURES:
KMerHashTable* kmers = newKMerHashTable();
Reads** reads = (Reads**)malloc(sizeof(Reads*) * numInputFiles);
Correction* correction = (Correction*)malloc(sizeof(Correction));
// OUTPUT DIRECTORY:
checkDirectoryExistsAndCreate(outputDirectory);
// CREATE READS:
printf("Creating read objects...\n");
for(int i = 0; i < numInputFiles; i++)
{
char* inputFileName = &(inputFileNames[i * 200]);
printf("Reading file: %s\n", inputFileName);
reads[i] = createReads(inputFileName);
}
printf("Finished creating read objects!\n\n");
// META OBJECT:
correction = createCorrection(reads, numInputFiles,
kmers, KMER_SIZE, LOW_COVERAGE_THRESHOLD_DEFAULT,
outputDirectory, NULL);
// CONSTRUCT KMERS:
printf("Constructing k-mers...\n");
hashReads(correction);
printf("Finished constructing k-mers!\n\n");
return correction;
}
int convertFASTQToFASTK(int numInputFiles, char* inputFileNames, char* outputDirectory)
{
printf("FASTK CONVERSION\n\n");
// META OBJECT:
Correction* correction = preprocessing(numInputFiles, inputFileNames, outputDirectory);
// # Reads:
Reads** reads = correctionGetReads(correction);
unsigned int numReadSets = correctionGetNumReadSets(correction);
// KMers:
KMerHashTable* kmers = correctionGetKMers(correction);
unsigned int kmerSize = correctionGetKMerSize(correction);
// Files:
char convertedFileName[1024];
FILE* convertedFile;
char baseName[1024];
// Iterate over all files:
for(int file = 0; file < numReadSets; file++)
{
printf("Converting file %d/%d...\n", file + 1, numReadSets);
// Prepare output:
strcpy(baseName, basename(readsGetFileName(reads[file])));
// We don't want the original to get changed.
strcpy(convertedFileName, outputDirectory);
if(convertedFileName[strlen(convertedFileName) - 1] != '/')
strcat(convertedFileName, "/");
strcat(convertedFileName, baseName);
strcat(convertedFileName, ".fastk");
convertedFile = fopen(convertedFileName, "w");
// Safety check:
if(convertedFile == 0)
{
printf("Could not create file: %s\n", convertedFileName);
return 1;
}
outputReadsFASTK(convertedFile, reads[file], kmers, kmerSize);
}
}
int processCorrection(int numInputFiles, char* inputFileNames, char* outputDirectory,
bool paired, bool substitutions, bool insertions, bool deletions, bool homopolymers,
bool filtering, bool qualityUpdating)
{
printf("ERROR CORRECTION\n\n");
// META OBJECT:
Correction* correction = preprocessing(numInputFiles, inputFileNames, outputDirectory);
// CORRECTION FUNCTION:
correction->correctionFunction = &correctRead;
// ENABLED CORRECTIONS:
correction->substitutions = substitutions;
correction->insertions = insertions;
correction->deletions = deletions;
correction->homopolymers = homopolymers;
correction->filtering = filtering;
correction->qualityUpdating = qualityUpdating;
// CORRECT READS:
printf("Correcting reads...\n");
if(paired)
{
processPairedCorrection(correction);
}
else
{
processSimpleCorrection(correction);
}
printf("Finished correcting reads!\n\n");
return 0;
}