/* This file is part of Jellyfish.
Jellyfish is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
Jellyfish is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with Jellyfish. If not, see .
*/
#ifndef __JELLYFISH_BINARY_DUMPER_HPP__
#define __JELLYFISH_BINARY_DUMPER_HPP__
#include
#include
#include
namespace jellyfish {
template
class binary_writer {
int val_len_;
Val max_val_;
int key_len_; // length of output key field in bytes
public:
binary_writer(int val_len, // length of value field in bytes
int key_len) : // length of key field in bits
val_len_(val_len),
max_val_(((Val)1 << (8 * val_len)) - 1),
key_len_(key_len / 8 + (key_len % 8 != 0))
{ }
int val_len() const { return val_len_; }
Val max_val() const { return max_val_; }
int key_len() const { return key_len_; }
void write(std::ostream& out, const Key& key, const Val val) {
out.write((const char*)key.data(), key_len_);
Val v = std::min(max_val_, val);
out.write((const char*)&v, val_len_);
}
};
/// Dump a hash array in sorted binary format. The key/value pairs are
/// written in a sorted list according to the hash function order. The
/// k-mer and count are written in binary, byte aligned.
template
class binary_dumper : public sorted_dumper, storage_t> {
typedef sorted_dumper, storage_t> super;
binary_writer writer;
public:
static const char* format;
binary_dumper(int val_len, // length of value field in bytes
int key_len, // length of key field in bits
int nb_threads, const char* file_prefix,
file_header* header = 0) :
super(nb_threads, file_prefix, header),
writer(val_len, key_len)
{ }
virtual void _dump(storage_t* ary) {
if(super::header_) {
super::header_->update_from_ary(*ary);
super::header_->format(format);
super::header_->counter_len(writer.val_len());
}
super::_dump(ary);
}
void write_key_value_pair(std::ostream& out, typename super::heap_item item) {
writer.write(out, item->key_, item->val_);
}
};
template
const char* jellyfish::binary_dumper::format = "binary/sorted";
/// Reader of the format written by binary_dumper. Behaves like an
/// iterator (has next() method which behaves similarly to the next()
/// method of the hash array).
/// The header should be of format binary/sorted, but no check is made.
template
class binary_reader {
std::istream& is_;
const int val_len_;
Key key_;
Val val_;
const RectangularBinaryMatrix m_;
const size_t size_mask_;
public:
binary_reader(std::istream& is, // stream containing data (past any header)
file_header* header) : // header which contains counter_len, matrix, size and key_len
is_(is), val_len_(header->counter_len()), key_(header->key_len() / 2),
m_(header->matrix()),
size_mask_(header->size() - 1)
{ }
const Key& key() const { return key_; }
const Val& val() const { return val_; }
size_t pos() const { return m_.times(key_) & size_mask_; }
bool next() {
key_.template read<1>(is_);
val_ = 0;
is_.read((char*)&val_, val_len_);
return is_.good();
}
};
template
class binary_query_base {
const char* const data_;
const unsigned int val_len_; // In bytes
const unsigned int key_len_; // In bytes
const RectangularBinaryMatrix m_;
const size_t mask_;
const size_t record_len_;
const size_t last_id_;
Key first_key_, last_key_;
mutable Key mid_key_;
uint64_t first_pos_, last_pos_;
public:
// key_len passed in bits
binary_query_base(const char* data, unsigned int key_len, unsigned int val_len, const RectangularBinaryMatrix& m, size_t mask,
size_t size) :
data_(data),
val_len_(val_len),
key_len_(key_len / 8 + (key_len % 8 != 0)),
m_(m),
mask_(mask),
record_len_(val_len + key_len_),
last_id_(size / record_len_),
first_key_(key_len / 2),
last_key_(key_len / 2),
mid_key_(key_len / 2)
{
if(size % record_len_ != 0)
throw std::length_error(err::msg() << "Size of database (" << size << ") must be a multiple of the length of a record ("
<< record_len_ << ")");
key_at(0, first_key_);
first_pos_ = key_pos(first_key_);
key_at(last_id_ - 1, last_key_);
last_pos_ = key_pos(last_key_);
}
bool val_id(const Key& key, Val* res, uint64_t* id) const {
if(last_id_ == 0) return false;
uint64_t first = 0;
uint64_t last = last_id_;
uint64_t first_pos = first_pos_;
uint64_t last_pos = last_pos_;
const uint64_t pos = key_pos(key);
uint64_t cid = 0;
if(key == first_key_) goto found;
cid = last_id_ - 1;
if(key == last_key_) goto found;
if(pos < first_pos_ || pos > last_pos_) return false;
// First a guided binary search
for(uint64_t diff = last - first; diff >= 8; diff = last - first) {
cid = first + lrint(diff * ((double)(pos - first_pos) / (double)(last_pos - first_pos)));
cid = std::max(first + 1, cid);
cid = std::min(cid, last - 1);
key_at(cid, mid_key_);
if(key == mid_key_) goto found;
uint64_t mid_pos = key_pos(mid_key_);
if(mid_pos > pos || (mid_pos == pos && mid_key_ > key)) {
last = cid;
last_pos = mid_pos;
} else {
first = cid;
first_pos = mid_pos;
}
}
// Then a linear search (avoids matrix computation)
for(cid = first + 1; cid < last; ++cid) {
key_at(cid, mid_key_);
if(key == mid_key_) goto found;
}
return false;
found:
val_at(cid, res);
*id = cid;
return true;
}
Val operator[](const Key& key) const {
Val res;
uint64_t id;
if(!val_id(key, &res, &id))
return 0;
return res;
}
inline Val check(const Key& key) const { return (*this)[key]; }
protected:
void key_at(size_t id, Key& key) const {
memcpy(key.data__(), data_ + id * record_len_, key_len_);
key.clean_msw();
}
void val_at(size_t id, Val* val) const {
*val = 0;
memcpy(val, data_ + id * record_len_ + key_len_, val_len_);
}
uint64_t key_pos(const Key& key) const {
return m_.times(key) & mask_;
}
};
}
#endif /* __JELLYFISH_BINARY_DUMPER_HPP__ */