/* This file is part of Jellyfish.
Jellyfish is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
Jellyfish is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with Jellyfish. If not, see <http://www.gnu.org/licenses/>.
*/
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
namespace err = jellyfish::err;
using jellyfish::file_header;
using jellyfish::RectangularBinaryMatrix;
using jellyfish::mer_dna;
using jellyfish::cpp_array;
typedef std::auto_ptr binary_reader_ptr;
typedef std::auto_ptr text_reader_ptr;
// One input file: the stream it is read from and the header constructed
// from that stream.
struct file_info {
  // Opens `path` and builds the header from the freshly opened stream.
  // No error is reported here; merge_files() tests `is.good()` afterwards.
  file_info(const char* path)
    : is(path)
    , header(is)
  { }

  // Declaration order matters: members are initialized in this order and
  // `header` is constructed from `is`.
  std::ifstream is;
  file_header   header;
};
typedef std::auto_ptr matrix_ptr;
template
void do_merge(cpp_array& files, std::ostream& out, writer_type& writer,
uint64_t min, uint64_t max) {
cpp_array readers(files.size());
typedef jellyfish::mer_heap::heap heap_type;
typedef typename heap_type::const_item_t heap_item;
heap_type heap(files.size());
for(size_t i = 0; i < files.size(); ++i) {
readers.init(i, files[i].is, &files[i].header);
if(readers[i].next())
heap.push(readers[i]);
}
heap_item head = heap.head();
mer_dna key;
while(heap.is_not_empty()) {
key = head->key_;
uint64_t sum = 0;
do {
sum += head->val_;
heap.pop();
if(head->it_->next())
heap.push(*head->it_);
head = heap.head();
} while(head->key_ == key && heap.is_not_empty());
if(sum >= min && sum <= max)
writer.write(out, key, sum);
}
}
// Merge files. Throws an error if unsuccessful.
void merge_files(std::vector input_files,
const char* out_file,
file_header& out_header,
uint64_t min, uint64_t max) {
unsigned int key_len = 0;
size_t max_reprobe_offset = 0;
size_t size = 0;
unsigned int out_counter_len = std::numeric_limits::max();
std::string format;
matrix_ptr matrix;
cpp_array files(input_files.size());
// create an iterator for each hash file
for(size_t i = 0; i < files.size(); i++) {
files.init(i, input_files[i]);
if(!files[i].is.good())
throw MergeError(err::msg() << "Failed to open input file '" << input_files[i] << "'");
file_header& h = files[i].header;
if(i == 0) {
key_len = h.key_len();
max_reprobe_offset = h.max_reprobe_offset();
size = h.size();
matrix.reset(new RectangularBinaryMatrix(h.matrix()));
out_header.size(size);
out_header.key_len(key_len);
format = h.format();
out_header.matrix(*matrix);
out_header.max_reprobe(h.max_reprobe());
size_t reprobes[h.max_reprobe() + 1];
h.get_reprobes(reprobes);
out_header.set_reprobes(reprobes);
out_counter_len = std::min(out_counter_len, h.counter_len());
} else {
if(format != h.format())
throw MergeError(err::msg() << "Can't merge files with different formats (" << format << ", " << h.format() << ")");
if(h.key_len() != key_len)
throw MergeError(err::msg() << "Can't merge hashes of different key lengths (" << key_len << ", " << h.key_len() << ")");
if(h.max_reprobe_offset() != max_reprobe_offset)
throw MergeError("Can't merge hashes with different reprobing strategies");
if(h.size() != size)
throw MergeError(err::msg() << "Can't merge hash with different size (" << size << ", " << h.size() << ")");
if(h.matrix() != *matrix)
throw MergeError("Can't merge hash with different hash function");
}
}
mer_dna::k(key_len / 2);
std::ofstream out(out_file);
if(!out.good())
throw MergeError(err::msg() << "Can't open out file '" << out_file << "'");
out_header.format(format);
if(!format.compare(binary_dumper::format)) {
out_header.counter_len(out_counter_len);
out_header.write(out);
binary_writer writer(out_counter_len, key_len);
do_merge(files, out, writer, min, max);
} else if(!format.compare(text_dumper::format)) {
out_header.write(out);
text_writer writer;
do_merge(files, out, writer, min, max);
} else {
throw MergeError(err::msg() << "Unknown format '" << format << "'");
}
out.close();
}