/* Pollux Copyright (C) 2014 Eric Marinier This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program. If not, see . */ #include #include #include #include #include "Utility.h" unsigned int getNumMemoryBlocks(unsigned int length) { const int NUCLEOTIDES_PER_BLOCK = 32; unsigned int blocks; // Determine number of blocks: blocks = length / NUCLEOTIDES_PER_BLOCK; // Number of full blocks. if(length % NUCLEOTIDES_PER_BLOCK > 0) { blocks++; // Partially filled block. } return blocks; } unsigned long long int getKMer(unsigned long long int* sequence, unsigned int startNucleotidePosition, unsigned int endNucleotidePosition) { const unsigned long long int MASK = 0xFFFFFFFFFFFFFFFF; unsigned int startSequenceIndex = startNucleotidePosition / 32; unsigned int endNucleotideIndex = (endNucleotidePosition - 1) / 32; unsigned long long int startSequence = sequence[startSequenceIndex]; unsigned long long int endSequence = sequence[endNucleotideIndex]; unsigned long long int result; result = startSequence << ((startNucleotidePosition * 2) % 64); result = result & (MASK << ((startNucleotidePosition * 2) % 64)); // Do we need to grab the bits at the start of the second sequence and // append it to the end of the result? if(startSequenceIndex != endNucleotideIndex) { result = result + (endSequence >> (64 - (startNucleotidePosition * 2) % 64)); } //Mask away the extra bits: result = result & (MASK << (64 - (endNucleotidePosition - startNucleotidePosition) * 2)); return result; } unsigned long long int getReverse(unsigned long long int sequence) { unsigned long long int MASK = 0x3; unsigned long long int nucleotide; unsigned long long int reverse = 0x0; for(int i = 0; i < 32; i++) { // Get the last nucleotide: nucleotide = (sequence >> (i * 2)) & MASK; // Add the nucleotide to the reverse: reverse = reverse | (nucleotide << ((32 - i - 1) * 2)); } return reverse; } unsigned long long int* createReverseCompliment(unsigned long long int* sequence, unsigned int length) { unsigned int endNucleotidePosition = getMax(0, (length - 1)); unsigned int endNucleotideIndex = endNucleotidePosition / 32; unsigned int extraBases = (32 - length) % 32; unsigned long long int reverse; unsigned long long int* reverseCompliment = (unsigned long long int*)malloc((endNucleotideIndex + 1) * sizeof(unsigned long long int*)); for(int i = 0; i <= endNucleotideIndex; i++) { reverse = getReverse(sequence[endNucleotideIndex - i]); reverseCompliment[i] = ~(reverse); } //Collapse: for(int i = 0; i <= endNucleotideIndex; i++) { //Shuffle: reverseCompliment[i] = reverseCompliment[i] << (extraBases * 2); //Fill trailing bits if necessary: if(i < endNucleotideIndex && extraBases > 0) { reverseCompliment[i] = reverseCompliment[i] | (reverseCompliment[i + 1] >> ((32 - extraBases) * 2)); } } return reverseCompliment; } void printAsNucleotides(unsigned long long int* sequence, unsigned int startNucleotidePosition, unsigned int endNucleotidePosition) { writeAsNucleotides(stdout, sequence, startNucleotidePosition, endNucleotidePosition); } void writeAsNucleotides(FILE* file, unsigned long long int* sequence, unsigned int startNucleotidePosition, unsigned int endNucleotidePosition) { const unsigned long long int MASK = 0x3; // Iterate over all nucleotides: for(int i = startNucleotidePosition; i < endNucleotidePosition; i++) { // Determine the current sequence: unsigned long long int currentSequence = sequence[i / 32]; unsigned int position = (i * 2) % 64; // bit position w/i sequence unsigned long long int nucleotide = (currentSequence >> (62 - position)) & MASK; // Print appropriate character: if(nucleotide == 0x0) { fprintf(file, "A"); } else if(nucleotide == 0x1) { fprintf(file, "G"); } else if(nucleotide == 0x2) { fprintf(file, "C"); } else if(nucleotide == 0x3) { fprintf(file, "T"); } // Error: else { fprintf(file, "E"); } } } void writeAsNucleotidesSpaced(FILE* file, unsigned long long int* sequence, unsigned int startNucleotidePosition, unsigned int endNucleotidePosition) { const unsigned long long int MASK = 0x3; // Iterate over all nucleotides: for(int i = startNucleotidePosition; i < endNucleotidePosition; i++) { // Determine the current sequence: unsigned long long int currentSequence = sequence[i / 32]; unsigned int position = (i * 2) % 64; // bit position w/i sequence unsigned long long int nucleotide = (currentSequence >> (62 - position)) & MASK; // Print appropriate character: if(nucleotide == 0x0) { fprintf(file, " A "); } else if(nucleotide == 0x1) { fprintf(file, " G "); } else if(nucleotide == 0x2) { fprintf(file, " C "); } else if(nucleotide == 0x3) { fprintf(file, " T "); } // Error: else { fprintf(file, " E "); } } } void printValueAsNucleotides(unsigned long long int value) { writeValueAsNucleotides(stdout, value); } void writeValueAsNucleotides(FILE* file, unsigned long long int value) { unsigned long long int mask = 0x3; unsigned long long int nucleotide; for(int i = 0; i < 32; i++) { nucleotide = (value >> ((32 - i - 1) * 2)) & mask; if(nucleotide == 0x0) { fprintf(file, "A"); } else if(nucleotide == 0x1) { fprintf(file, "G"); } else if(nucleotide == 0x2) { fprintf(file, "C"); } else if(nucleotide == 0x3) { fprintf(file, "T"); } else { fprintf(file, "[E]"); } } } int getMax(int value1, int value2) { if(value1 > value2) { return value1; } else { return value2; } } int getMin(int value1, int value2) { if(value1 < value2) { return value1; } else { return value2; } } char getBase(unsigned long long int* nucleotideSequence, unsigned int nucleotidePosition) { unsigned long long int mask = 0x3; unsigned long long int result; char base = 'E'; unsigned int sequenceIndex = nucleotidePosition / 32; unsigned long long int sequence = nucleotideSequence[sequenceIndex]; unsigned int shift = (31 - (nucleotidePosition % 32)) * 2; result = (sequence >> shift) & mask; if(result == 0x0) { base = 'A'; } else if(result == 0x1) { base = 'G'; } else if(result == 0x2) { base = 'C'; } else if(result == 0x3) { base = 'T'; } return base; } void setQuality(struct Sequence* sequence, int location, char quality) { // Within range? if(0 <= location && location < sequence->length) { sequence->quality[location] = quality; } } void insertQuality(struct Sequence* sequence, int location, char quality) { // Within range? if(0 <= location && location < sequence->length) { // Make room: sequence->quality = (char*) realloc (sequence->quality, strlen(sequence->quality) + 2); // Make a temporary string: char * temp = (char*) malloc (strlen(sequence->quality) + 1); strcpy(temp, &(sequence->quality[location])); // Copy end bit in. // Push end bit over by one after the insert location: strcpy(&(sequence->quality[location + 1]), temp); // Make insertion: sequence->quality[location] = quality; // Remove temporary string: free(temp); } } void deleteQuality(struct Sequence* sequence, int location) { // Within range? if(0 <= location && location < sequence->length) { // Make a temporary string: char * temp = (char*) malloc (strlen(sequence->quality) + 1); strcpy(temp, &(sequence->quality[location + 1])); // Copy end bit in. // Remove: strcpy(&(sequence->quality[location]), temp); // Shrink: // THE STRING HAS ALREADY BEEN SHRUNK! // The terminating character will be in the correct place. // Adjust the memory to match the new string length. // strlen(x) + 1 is the length plus 1 for the terminating character. sequence->quality = realloc (sequence->quality, strlen(sequence->quality) + 1); // Remove temporary string: free(temp); } } void setBase(unsigned long long int* nucleotideSequence, unsigned int nucleotidePosition, char nucleotide) { unsigned long long int mask = 0x3; unsigned long long int base; unsigned int sequenceIndex = nucleotidePosition / 32; unsigned int shift = (31 - (nucleotidePosition % 32)) * 2; if (nucleotide == 'A') { base = 0x0; } else if(nucleotide == 'G') { base = 0x1; } else if(nucleotide == 'C') { base = 0x2; } else if(nucleotide == 'T') { base = 0x3; } //white out the bits to be changed nucleotideSequence[sequenceIndex] &= ~(mask << shift); //write the new bits: nucleotideSequence[sequenceIndex] |= (base << shift); } void changeBase(struct Sequence* sequence, unsigned int nucleotidePosition, char nucleotide, char quality) { setBase(sequence->sequence, nucleotidePosition, nucleotide); setQuality(sequence, nucleotidePosition, quality); } void deleteBase(struct Sequence* sequence, unsigned int nucleotidePosition) { const unsigned long long int MASK = 0xFFFFFFFFFFFFFFFF; // Indices involved in deletion: unsigned int startIndex = nucleotidePosition / 32; unsigned int endNucleotidePosition = getMax(0, (sequence->length - 1)); unsigned int endIndex = endNucleotidePosition / 32; unsigned long long int temp; char base; // Handle the block of memory where the deletion occurs: if(nucleotidePosition % 32 != 31) { // Not needed if deleting last nucleotide in block. temp = sequence->sequence[startIndex] & (MASK >> ((nucleotidePosition * 2 % 64) + 2)); sequence->sequence[startIndex] = sequence->sequence[startIndex] & ~(MASK >> (nucleotidePosition * 2 % 64)); sequence->sequence[startIndex] = sequence->sequence[startIndex] | (temp << 2); } base = getBase(sequence->sequence, ((startIndex + 1) * 32)); setBase(sequence->sequence, ((startIndex + 1) * 32 - 1), base); // Shift the rest of the memory to the left by two: for(int i = startIndex + 1; i < endIndex; i++) { sequence->sequence[i] = sequence->sequence[i] << 2; base = getBase(sequence->sequence, ((i + 1) * 32)); setBase(sequence->sequence, ((i + 1) * 32 - 1), base); } if(startIndex != endIndex) // Don't want to shift twice! { // Shift the last block of memory: sequence->sequence[endIndex] = sequence->sequence[endIndex] << 2; } // White out the last base: setBase(sequence->sequence, endNucleotidePosition, 'A'); // Update the qualities: // NOTE: Needs to be done BEFORE length changes. deleteQuality(sequence, nucleotidePosition); // Update length: sequence->length = sequence->length - 1; } void insertBase(struct Sequence* sequence, unsigned int nucleotidePosition, char nucleotide, char quality) { const unsigned long long int MASK = 0xFFFFFFFFFFFFFFFF; char base; unsigned int sequenceIndex = nucleotidePosition / 32; int blocksNeeded = getNumMemoryBlocks(sequence->length + 1); // Do we need to allocate another block? if(blocksNeeded > sequence->blocks) { sequence->sequence = realloc(sequence->sequence, (blocksNeeded * sizeof(unsigned long long int))); sequence->blocks = blocksNeeded; } // Push everything after insert block over once. for(int i = sequence->blocks - 1; i > sequenceIndex; i--) { sequence->sequence[i] = sequence->sequence[i] >> 2; base = getBase(sequence->sequence, (i * 32 - 1)); setBase(sequence->sequence, i * 32, base); } // Handle the block of memory where the insertion occurs: unsigned long long int temp = sequence->sequence[sequenceIndex] & (MASK >> (nucleotidePosition * 2 % 64)); sequence->sequence[sequenceIndex] = sequence->sequence[sequenceIndex] & ~(MASK >> (nucleotidePosition * 2 % 64)); sequence->sequence[sequenceIndex] = sequence->sequence[sequenceIndex] | (temp >> 2); // Set the base in the newly created gap. setBase(sequence->sequence, nucleotidePosition, nucleotide); // Update the sequence. sequence->length = sequence->length + 1; // Update the qualities: // NOTE: Needs to be done AFTER length changes. insertQuality(sequence, nucleotidePosition, quality); } int getHomopolymerLength(unsigned long long int* nucleotideSequence, int sequenceLength, unsigned int nucleotidePosition) { char base = getBase(nucleotideSequence, nucleotidePosition); int count = 1; // Left for(int i = nucleotidePosition - 1; i >= 0 && getBase(nucleotideSequence, i) == base; i--) { count++; } // Right for(int i = nucleotidePosition + 1; i < sequenceLength && getBase(nucleotideSequence, i) == base; i++) { count++; } return count; } int getHomopolymerLeftmostNucleotide(unsigned long long int* nucleotideSequence, unsigned int nucleotidePosition) { char base = getBase(nucleotideSequence, nucleotidePosition); int leftmostNucleotide = nucleotidePosition; // Left for(int i = nucleotidePosition - 1; i >= 0 && getBase(nucleotideSequence, i) == base; i--) { leftmostNucleotide--; } return leftmostNucleotide; } void setHomopolymerLength(struct Sequence* sequence, unsigned int nucleotidePosition, unsigned int size, char quality) { int originalSize = getHomopolymerLength(sequence->sequence, sequence->length, nucleotidePosition); int difference = originalSize - size; char base = getBase(sequence->sequence, nucleotidePosition); char current; int position; // Currently too long. if(difference > 0) { current = base; // Locate leftmost nucleotide in homopolymer. for(position = nucleotidePosition; position >= 0 && getBase(sequence->sequence, position) == base; position--); // Shift back one if necessary. if(position < 0) { position = 0; } if(getBase(sequence->sequence, position) != base) { position++; } // Delete bases. for(int i = difference; i > 0; i--) { deleteBase(sequence, position); } } // Currently too short. else if(difference < 0) { // Insert bases. for(int i = difference; i < 0; i++) { insertBase(sequence, nucleotidePosition, base, quality); } } }