/* Pollux Copyright (C) 2014 Eric Marinier This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program. If not, see . */ #include #include "KMerHashTable.h" #include "Utility.h" #include "Correction.h" static inline void printProgress(int x, int n, int r) { // Only update r times. if (n/r == 0 || x % (n/r) != 0) return; // Mod 0 causes problems. double result; if(n / r != 1) result = x / (n/r) * (100/r); else result = (x + 1) / ((double)n/(double)r) * ((double)100/(double)r); printf("%d%% ", (int)result); fflush(stdout); } void addKMersToTable(KMerHashTable* table, unsigned long long int* sequence, unsigned int sequenceLength, unsigned int kmerSize) { //Variables: unsigned long long int kmer; unsigned long long int count; for(int i = 0; i <= (int)sequenceLength - (int)kmerSize; i++) { // Get the next k-mer: kmer = getKMer(sequence, i, i + kmerSize); // Does the k-mer exist? count = (unsigned long long int)hash_table_lookup(table->table, (HashTableKey)(kmer)); // The k-mer does exist: if(count != 0) { // Update: KMerTableInsert(table, kmer, (count + 1)); } // The k-mer doesn't exist: else { // Initialize: KMerTableInsert(table, kmer, 1); } } } void preprocessKMers(KMerHashTable* kmerTable, Correction* correction) { // Data structures: HashTable* hashTable = kmerTable->table; HashTableIterator iterator; unsigned long long int MAX_KMER_COUNT = (1024 + 1); unsigned long long int counts[MAX_KMER_COUNT]; unsigned long long int kmer; unsigned long long int count; unsigned long long int current = 0; unsigned long long int total = hash_table_num_entries(hashTable); unsigned long long int unique = 0; hash_table_iterate(hashTable, &iterator); // Initialize Counts: for (int i = 0; i < MAX_KMER_COUNT; i++) { counts[i] = 0; } // Iterate Over K-Mers: while(hash_table_iter_has_more(&iterator)) { printProgress(current, total, 20); current++; kmer = (unsigned long long int)hash_table_iter_next_key(&iterator); count = (unsigned long long int)hash_table_lookup(hashTable, (HashTableKey)(kmer)); // Tally Counts: if(count <= MAX_KMER_COUNT) { counts[count] += 1; } // Remove Unique: if(count == 1) { hash_table_remove(hashTable, (HashTableKey)(kmer), false); unique++; } } // Unique K-Mer Information: printf("\n"); printf("Removed %llu unique k-mers from the set of %llu total k-mers.\n", unique, total); printf("Resizing...\n"); hash_table_resize(hashTable); printf("Finished resizing...\n"); unsigned int currentKMerCount = 1; // Loop until we find a low-to-high number transitions: while (currentKMerCount <= MAX_KMER_COUNT && counts[currentKMerCount] > counts[currentKMerCount + 1]) { currentKMerCount++; } if(currentKMerCount < MAX_KMER_COUNT) { correction->lowKMerThreshold = currentKMerCount; } printf("Low k-mer count value was observed to be %d.\n", currentKMerCount); } int KMerHashFunctionEquals(HashTableValue value1, HashTableValue value2) { return (value1 == value2); } unsigned long KMerHashFunctionHash(HashTableKey kmer) { return (unsigned long)kmer; } KMerHashTable* newKMerHashTable() { HashTable* hashTable = hash_table_new(KMerHashFunctionHash, KMerHashFunctionEquals); KMerHashTable* kmerTable; if((kmerTable = malloc(sizeof *kmerTable)) != NULL) { kmerTable->table = hashTable; } return kmerTable; } int KMerTableInsert(KMerHashTable* kmerTable, unsigned long long int kmer, unsigned long long int count) { return hash_table_insert(kmerTable->table, (HashTableKey)(kmer), (HashTableValue)(count)); } /** * * BE EXTREMELY CAREFUL USING THIS LOCALLY!! * * SAFER: count = (unsigned long long int)hash_table_lookup(kmerTable->table, (HashTableKey)(kmer)); * - Might miss singletons. * */ unsigned long long int KMerTableLookup(KMerHashTable* kmerTable, unsigned long long int kmer) { unsigned long long int count; HashTableValue value = hash_table_lookup(kmerTable->table, (HashTableKey)(kmer)); count = (unsigned long long int)value; if(count == 0) { count = 1; } return count; } unsigned int getMaxKMerCount(KMerHashTable* kmerTable) { HashTable* hashTable = kmerTable->table; HashTableIterator iterator; unsigned long long int kmer; unsigned long long int current = 0; unsigned long long int max = 0; hash_table_iterate(hashTable, &iterator); while(hash_table_iter_has_more(&iterator)) { kmer = (unsigned long long int)hash_table_iter_next_key(&iterator); current = KMerTableLookup(kmerTable, kmer); max = getMax(current, max); } return max; } unsigned int* createDistribution(KMerHashTable* kmerTable, unsigned int max) { // Data structures: HashTable* hashTable = kmerTable->table; HashTableIterator iterator; unsigned int* distribution = (unsigned int*)malloc((max + 1) * sizeof(unsigned int*)); unsigned long long int kmer; unsigned long long int current; // Initialize: for(int i = 0; i <= max; i++) { distribution[i] = 0; } hash_table_iterate(hashTable, &iterator); while(hash_table_iter_has_more(&iterator)) { kmer = (unsigned long long int)hash_table_iter_next_key(&iterator); current = KMerTableLookup(kmerTable, kmer); distribution[current]++; } return distribution; } unsigned int getNumRepeats(KMerHashTable* kmerTable) { // Variables: unsigned int repeats = 0; unsigned int max = getMaxKMerCount(kmerTable); // Data Structures: unsigned int* distribution = createDistribution(kmerTable, max); for(int i = 2; i <= max; i++) { repeats += distribution[i] * i; } free(distribution); return repeats; }