/*
Pollux
Copyright (C) 2014 Eric Marinier
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program. If not, see .
*/
#include
#include
#include
#include
#include "Utility.h"
unsigned int getNumMemoryBlocks(unsigned int length)
{
const int NUCLEOTIDES_PER_BLOCK = 32;
unsigned int blocks;
// Determine number of blocks:
blocks = length / NUCLEOTIDES_PER_BLOCK; // Number of full blocks.
if(length % NUCLEOTIDES_PER_BLOCK > 0)
{
blocks++; // Partially filled block.
}
return blocks;
}
unsigned long long int getKMer(unsigned long long int* sequence,
unsigned int startNucleotidePosition,
unsigned int endNucleotidePosition)
{
const unsigned long long int MASK = 0xFFFFFFFFFFFFFFFF;
unsigned int startSequenceIndex = startNucleotidePosition / 32;
unsigned int endNucleotideIndex = (endNucleotidePosition - 1) / 32;
unsigned long long int startSequence = sequence[startSequenceIndex];
unsigned long long int endSequence = sequence[endNucleotideIndex];
unsigned long long int result;
result = startSequence << ((startNucleotidePosition * 2) % 64);
result = result & (MASK << ((startNucleotidePosition * 2) % 64));
// Do we need to grab the bits at the start of the second sequence and
// append it to the end of the result?
if(startSequenceIndex != endNucleotideIndex)
{
result = result + (endSequence >> (64 - (startNucleotidePosition * 2) % 64));
}
//Mask away the extra bits:
result = result & (MASK << (64 - (endNucleotidePosition - startNucleotidePosition) * 2));
return result;
}
unsigned long long int getReverse(unsigned long long int sequence)
{
unsigned long long int MASK = 0x3;
unsigned long long int nucleotide;
unsigned long long int reverse = 0x0;
for(int i = 0; i < 32; i++)
{
// Get the last nucleotide:
nucleotide = (sequence >> (i * 2)) & MASK;
// Add the nucleotide to the reverse:
reverse = reverse | (nucleotide << ((32 - i - 1) * 2));
}
return reverse;
}
unsigned long long int* createReverseCompliment(unsigned long long int* sequence,
unsigned int length)
{
unsigned int endNucleotidePosition = getMax(0, (length - 1));
unsigned int endNucleotideIndex = endNucleotidePosition / 32;
unsigned int extraBases = (32 - length) % 32;
unsigned long long int reverse;
unsigned long long int* reverseCompliment =
(unsigned long long int*)malloc((endNucleotideIndex + 1) * sizeof(unsigned long long int*));
for(int i = 0; i <= endNucleotideIndex; i++)
{
reverse = getReverse(sequence[endNucleotideIndex - i]);
reverseCompliment[i] = ~(reverse);
}
//Collapse:
for(int i = 0; i <= endNucleotideIndex; i++)
{
//Shuffle:
reverseCompliment[i] = reverseCompliment[i] << (extraBases * 2);
//Fill trailing bits if necessary:
if(i < endNucleotideIndex && extraBases > 0)
{
reverseCompliment[i] = reverseCompliment[i]
| (reverseCompliment[i + 1] >> ((32 - extraBases) * 2));
}
}
return reverseCompliment;
}
void printAsNucleotides(unsigned long long int* sequence,
unsigned int startNucleotidePosition,
unsigned int endNucleotidePosition)
{
writeAsNucleotides(stdout, sequence, startNucleotidePosition,
endNucleotidePosition);
}
void writeAsNucleotides(FILE* file, unsigned long long int* sequence,
unsigned int startNucleotidePosition,
unsigned int endNucleotidePosition)
{
const unsigned long long int MASK = 0x3;
// Iterate over all nucleotides:
for(int i = startNucleotidePosition; i < endNucleotidePosition; i++)
{
// Determine the current sequence:
unsigned long long int currentSequence = sequence[i / 32];
unsigned int position = (i * 2) % 64; // bit position w/i sequence
unsigned long long int nucleotide = (currentSequence >> (62 - position)) & MASK;
// Print appropriate character:
if(nucleotide == 0x0)
{
fprintf(file, "A");
}
else if(nucleotide == 0x1)
{
fprintf(file, "G");
}
else if(nucleotide == 0x2)
{
fprintf(file, "C");
}
else if(nucleotide == 0x3)
{
fprintf(file, "T");
}
// Error:
else
{
fprintf(file, "E");
}
}
}
void writeAsNucleotidesSpaced(FILE* file, unsigned long long int* sequence,
unsigned int startNucleotidePosition,
unsigned int endNucleotidePosition)
{
const unsigned long long int MASK = 0x3;
// Iterate over all nucleotides:
for(int i = startNucleotidePosition; i < endNucleotidePosition; i++)
{
// Determine the current sequence:
unsigned long long int currentSequence = sequence[i / 32];
unsigned int position = (i * 2) % 64; // bit position w/i sequence
unsigned long long int nucleotide = (currentSequence >> (62 - position)) & MASK;
// Print appropriate character:
if(nucleotide == 0x0)
{
fprintf(file, " A ");
}
else if(nucleotide == 0x1)
{
fprintf(file, " G ");
}
else if(nucleotide == 0x2)
{
fprintf(file, " C ");
}
else if(nucleotide == 0x3)
{
fprintf(file, " T ");
}
// Error:
else
{
fprintf(file, " E ");
}
}
}
void printValueAsNucleotides(unsigned long long int value)
{
writeValueAsNucleotides(stdout, value);
}
void writeValueAsNucleotides(FILE* file, unsigned long long int value)
{
unsigned long long int mask = 0x3;
unsigned long long int nucleotide;
for(int i = 0; i < 32; i++)
{
nucleotide = (value >> ((32 - i - 1) * 2)) & mask;
if(nucleotide == 0x0)
{
fprintf(file, "A");
}
else if(nucleotide == 0x1)
{
fprintf(file, "G");
}
else if(nucleotide == 0x2)
{
fprintf(file, "C");
}
else if(nucleotide == 0x3)
{
fprintf(file, "T");
}
else
{
fprintf(file, "[E]");
}
}
}
int getMax(int value1, int value2)
{
if(value1 > value2)
{
return value1;
}
else
{
return value2;
}
}
int getMin(int value1, int value2)
{
if(value1 < value2)
{
return value1;
}
else
{
return value2;
}
}
char getBase(unsigned long long int* nucleotideSequence,
unsigned int nucleotidePosition)
{
unsigned long long int mask = 0x3;
unsigned long long int result;
char base = 'E';
unsigned int sequenceIndex = nucleotidePosition / 32;
unsigned long long int sequence = nucleotideSequence[sequenceIndex];
unsigned int shift = (31 - (nucleotidePosition % 32)) * 2;
result = (sequence >> shift) & mask;
if(result == 0x0)
{
base = 'A';
}
else if(result == 0x1)
{
base = 'G';
}
else if(result == 0x2)
{
base = 'C';
}
else if(result == 0x3)
{
base = 'T';
}
return base;
}
void setQuality(struct Sequence* sequence, int location, char quality)
{
// Within range?
if(0 <= location && location < sequence->length)
{
sequence->quality[location] = quality;
}
}
void insertQuality(struct Sequence* sequence, int location, char quality)
{
// Within range?
if(0 <= location && location < sequence->length)
{
// Make room:
sequence->quality = (char*) realloc (sequence->quality, strlen(sequence->quality) + 2);
// Make a temporary string:
char * temp = (char*) malloc (strlen(sequence->quality) + 1);
strcpy(temp, &(sequence->quality[location])); // Copy end bit in.
// Push end bit over by one after the insert location:
strcpy(&(sequence->quality[location + 1]), temp);
// Make insertion:
sequence->quality[location] = quality;
// Remove temporary string:
free(temp);
}
}
void deleteQuality(struct Sequence* sequence, int location)
{
// Within range?
if(0 <= location && location < sequence->length)
{
// Make a temporary string:
char * temp = (char*) malloc (strlen(sequence->quality) + 1);
strcpy(temp, &(sequence->quality[location + 1])); // Copy end bit in.
// Remove:
strcpy(&(sequence->quality[location]), temp);
// Shrink:
// THE STRING HAS ALREADY BEEN SHRUNK!
// The terminating character will be in the correct place.
// Adjust the memory to match the new string length.
// strlen(x) + 1 is the length plus 1 for the terminating character.
sequence->quality = realloc (sequence->quality, strlen(sequence->quality) + 1);
// Remove temporary string:
free(temp);
}
}
void setBase(unsigned long long int* nucleotideSequence,
unsigned int nucleotidePosition, char nucleotide)
{
unsigned long long int mask = 0x3;
unsigned long long int base;
unsigned int sequenceIndex = nucleotidePosition / 32;
unsigned int shift = (31 - (nucleotidePosition % 32)) * 2;
if (nucleotide == 'A')
{
base = 0x0;
}
else if(nucleotide == 'G')
{
base = 0x1;
}
else if(nucleotide == 'C')
{
base = 0x2;
}
else if(nucleotide == 'T')
{
base = 0x3;
}
//white out the bits to be changed
nucleotideSequence[sequenceIndex] &= ~(mask << shift);
//write the new bits:
nucleotideSequence[sequenceIndex] |= (base << shift);
}
void changeBase(struct Sequence* sequence, unsigned int nucleotidePosition,
char nucleotide, char quality)
{
setBase(sequence->sequence, nucleotidePosition, nucleotide);
setQuality(sequence, nucleotidePosition, quality);
}
void deleteBase(struct Sequence* sequence, unsigned int nucleotidePosition)
{
const unsigned long long int MASK = 0xFFFFFFFFFFFFFFFF;
// Indices involved in deletion:
unsigned int startIndex = nucleotidePosition / 32;
unsigned int endNucleotidePosition = getMax(0, (sequence->length - 1));
unsigned int endIndex = endNucleotidePosition / 32;
unsigned long long int temp;
char base;
// Handle the block of memory where the deletion occurs:
if(nucleotidePosition % 32 != 31)
{
// Not needed if deleting last nucleotide in block.
temp = sequence->sequence[startIndex] & (MASK >> ((nucleotidePosition * 2 % 64) + 2));
sequence->sequence[startIndex] = sequence->sequence[startIndex] & ~(MASK >> (nucleotidePosition * 2 % 64));
sequence->sequence[startIndex] = sequence->sequence[startIndex] | (temp << 2);
}
base = getBase(sequence->sequence, ((startIndex + 1) * 32));
setBase(sequence->sequence, ((startIndex + 1) * 32 - 1), base);
// Shift the rest of the memory to the left by two:
for(int i = startIndex + 1; i < endIndex; i++)
{
sequence->sequence[i] = sequence->sequence[i] << 2;
base = getBase(sequence->sequence, ((i + 1) * 32));
setBase(sequence->sequence, ((i + 1) * 32 - 1), base);
}
if(startIndex != endIndex) // Don't want to shift twice!
{
// Shift the last block of memory:
sequence->sequence[endIndex] = sequence->sequence[endIndex] << 2;
}
// White out the last base:
setBase(sequence->sequence, endNucleotidePosition, 'A');
// Update the qualities:
// NOTE: Needs to be done BEFORE length changes.
deleteQuality(sequence, nucleotidePosition);
// Update length:
sequence->length = sequence->length - 1;
}
void insertBase(struct Sequence* sequence, unsigned int nucleotidePosition,
char nucleotide, char quality)
{
const unsigned long long int MASK = 0xFFFFFFFFFFFFFFFF;
char base;
unsigned int sequenceIndex = nucleotidePosition / 32;
int blocksNeeded = getNumMemoryBlocks(sequence->length + 1);
// Do we need to allocate another block?
if(blocksNeeded > sequence->blocks)
{
sequence->sequence = realloc(sequence->sequence, (blocksNeeded * sizeof(unsigned long long int)));
sequence->blocks = blocksNeeded;
}
// Push everything after insert block over once.
for(int i = sequence->blocks - 1; i > sequenceIndex; i--)
{
sequence->sequence[i] = sequence->sequence[i] >> 2;
base = getBase(sequence->sequence, (i * 32 - 1));
setBase(sequence->sequence, i * 32, base);
}
// Handle the block of memory where the insertion occurs:
unsigned long long int temp = sequence->sequence[sequenceIndex] & (MASK >> (nucleotidePosition * 2 % 64));
sequence->sequence[sequenceIndex] = sequence->sequence[sequenceIndex] & ~(MASK >> (nucleotidePosition * 2 % 64));
sequence->sequence[sequenceIndex] = sequence->sequence[sequenceIndex] | (temp >> 2);
// Set the base in the newly created gap.
setBase(sequence->sequence, nucleotidePosition, nucleotide);
// Update the sequence.
sequence->length = sequence->length + 1;
// Update the qualities:
// NOTE: Needs to be done AFTER length changes.
insertQuality(sequence, nucleotidePosition, quality);
}
int getHomopolymerLength(unsigned long long int* nucleotideSequence,
int sequenceLength, unsigned int nucleotidePosition)
{
char base = getBase(nucleotideSequence, nucleotidePosition);
int count = 1;
// Left
for(int i = nucleotidePosition - 1; i >= 0 && getBase(nucleotideSequence, i) == base; i--)
{
count++;
}
// Right
for(int i = nucleotidePosition + 1; i < sequenceLength && getBase(nucleotideSequence, i) == base; i++)
{
count++;
}
return count;
}
int getHomopolymerLeftmostNucleotide(unsigned long long int* nucleotideSequence,
unsigned int nucleotidePosition)
{
char base = getBase(nucleotideSequence, nucleotidePosition);
int leftmostNucleotide = nucleotidePosition;
// Left
for(int i = nucleotidePosition - 1; i >= 0 && getBase(nucleotideSequence, i) == base; i--)
{
leftmostNucleotide--;
}
return leftmostNucleotide;
}
void setHomopolymerLength(struct Sequence* sequence,
unsigned int nucleotidePosition, unsigned int size,
char quality)
{
int originalSize = getHomopolymerLength(sequence->sequence, sequence->length, nucleotidePosition);
int difference = originalSize - size;
char base = getBase(sequence->sequence, nucleotidePosition);
char current;
int position;
// Currently too long.
if(difference > 0)
{
current = base;
// Locate leftmost nucleotide in homopolymer.
for(position = nucleotidePosition;
position >= 0 && getBase(sequence->sequence, position) == base;
position--);
// Shift back one if necessary.
if(position < 0)
{
position = 0;
}
if(getBase(sequence->sequence, position) != base)
{
position++;
}
// Delete bases.
for(int i = difference; i > 0; i--)
{
deleteBase(sequence, position);
}
}
// Currently too short.
else if(difference < 0)
{
// Insert bases.
for(int i = difference; i < 0; i++)
{
insertBase(sequence, nucleotidePosition, base, quality);
}
}
}