/*
Pollux
Copyright (C) 2014 Eric Marinier
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program. If not, see .
*/
#include
#include
#include
#include "Reads.h"
#include "Encoding.h"
#include "Utility.h"
int BATCH_SIZE = 200000; // Number of reads loaded in memory.
int NUCLEOTIDE = 0; // [0, 1, 2, 4] : replaces N's deterministically
Reads* createReads(char* fileName)
{
Reads* reads = (Reads*)malloc(sizeof(Reads));
FILE* file = fopen(fileName, "r");
if (file == 0)
{
printf("Could not open file location: %s for reading.\n", fileName);
return 0;
}
char ch;
int lines = 0;
// Count the number of lines in the file:
while (EOF != (ch = getc(file)))
{
if ('\n' == ch) {
++lines;
}
}
fclose(file);
file = fopen(fileName, "r");
if (file == 0)
{
printf("Could not open file location: %s for reading.\n", fileName);
return 0;
}
reads->fileName = fileName;
reads->file = file;
reads->total = lines / 4;
reads->current = 0;
reads->readData = 0;
reads->ID = 0;
return reads;
}
void freeReads(Reads* reads)
{
// Do we already have reads open?
if(reads->readData != 0)
{
int limit;
// We don't want to free memory we haven't allocated.
// How many reads do we need to free in this block? The last block
// might not need all freed.
// The entire block needs to be freed.
// This is the case when we're not in the last, or the last block was
// entirely full.
if(reads->current < reads->total || reads->total % BATCH_SIZE == 0)
{
limit = BATCH_SIZE;
}
// Only part of the block needs to be freed.
else
{
limit = reads->total % BATCH_SIZE;
}
// Free all the open reads.
for(int i = 0; i < limit; i++)
{
struct read* current = &(reads->readData[i]);
free(current->seqName1);
free(current->sequence);
free(current->seqName2);
free(current->quality);
free(current->basecontig);
}
free(reads->readData);
}
}
char getNextReplacementNucleotide()
{
char result = 'A';
switch(NUCLEOTIDE)
{
case 0:
result = 'A';
break;
case 1:
result = 'C';
break;
case 2:
result = 'G';
break;
case 3:
result = 'T';
break;
}
NUCLEOTIDE = (NUCLEOTIDE + 1) % 4;
return result;
}
// Replaces internal N's with other nucleotides.
void replaceN(char* string)
{
for(int i = 0; i < strlen(string); i++)
{
if(string[i] == 'N' || string[i] == 'n')
{
string[i] = getNextReplacementNucleotide();
}
}
}
void deleteCharacter(char* string, int pos)
{
int length = strlen(string);
if (pos < 0 || pos >= length)
return;
for(int i = pos; i < length; i++)
{
string[i] = string[i + 1];
}
string[length] = '\0';
}
void trimNs(char* sequence, char* quality)
{
// Delete leading N's.
while(strlen(sequence) >= 1 &&
(sequence[0] == 'N' || sequence[0] == 'n'))
{
deleteCharacter(sequence, 0);
deleteCharacter(quality, 0);
}
// Delete trailing N's.
while(strlen(sequence) >= 1 &&
(sequence[strlen(sequence) - 1] == 'N' || sequence[strlen(sequence) - 1] == 'n'))
{
deleteCharacter(sequence, strlen(sequence) - 1);
deleteCharacter(quality, strlen(quality) - 1);
}
}
void trimSpaces(char* string)
{
// Delete leading spaces.
while(strlen(string) >= 1 && isspace(string[0]))
{
deleteCharacter(string, 0);
}
// Delete trailing N's.
while(strlen(string) >= 1 && isspace(string[strlen(string) - 1]))
{
deleteCharacter(string, strlen(string) - 1);
}
}
void loadReads(Reads* reads)
{
freeReads(reads); // FREE FIRST!
reads->readData = (struct read*)malloc(BATCH_SIZE * sizeof(struct read)); // LOAD SECOND!
for(int i = 0; i < BATCH_SIZE && (reads->current + i < reads->total); i++)
{
struct read* current = &(reads->readData[i]);
reads->ID = reads->ID + 1;
// TODO: THESE LENGTHS SHOULD BE VARIABLE!!!
char seqName1[2048];
char sequence[2048];
char seqName2[2048];
char quality[2048];
// Get the 4 lines of a FASTQ file:
if( fgets(seqName1, 2048, reads->file) == NULL ||
fgets(sequence, 2048, reads->file) == NULL ||
fgets(seqName2, 2048, reads->file) == NULL ||
fgets(quality, 2048, reads->file) == NULL )
{
printf("CRITICAL: FAILED TO READ INPUT!\n");
exit(1);
}
trimSpaces(sequence);
trimSpaces(quality);
trimNs(sequence, quality);
replaceN(sequence);
// Sequence name 1:
current->seqName1 = malloc(strlen(seqName1) + 1);
strcpy(current->seqName1, seqName1);
// Encode sequences:
encode_sequence(current, sequence);
// Sequence name 2:
current->seqName2 = malloc(strlen(seqName2) + 1);
strcpy(current->seqName2, seqName2);
//Quality:
current->quality = malloc(strlen(quality) + 1);
strcpy(current->quality, quality);
current->basecontig = NULL;
current->correct_pos = 0;
current->number = reads->ID;
}
}
struct read* readsGetNext(Reads* reads)
{
struct read* result;
// Do we need to load more reads?
if(reads->current % BATCH_SIZE == 0)
{
loadReads(reads);
}
result = &(reads->readData[reads->current % BATCH_SIZE]);
reads->current = reads->current + 1;
return result;
}
bool readsHasNext(Reads* reads)
{
return (reads->current < reads->total);
}
int readsReset(Reads* reads)
{
freeReads(reads);
fclose(reads->file);
reads->file = fopen(reads->fileName, "r");
if (reads->file == 0)
{
printf("Could not open file location: %s for reading.\n", reads->fileName);
return 1;
}
reads->current = 0;
reads->readData = 0;
reads->ID = 0;
return 0;
}
void readsDestroy(Reads* reads)
{
freeReads(reads);
fclose(reads->file);
free(reads);
}
int readsGetCount(Reads* reads)
{
return reads->total;
}
char* readsGetFileName(Reads* reads)
{
return reads->fileName;
}