/* This file is part of Jellyfish.
Jellyfish is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
Jellyfish is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with Jellyfish. If not, see .
*/
#ifndef __JELLYFISH_MER_DNA_HPP__
#define __JELLYFISH_MER_DNA_HPP__
#ifdef HAVE_CONFIG_H
#include
#endif
#include
#include
#include
#include
#include
#include
#include
#include
#ifdef HAVE_INT128
#include
#endif
namespace jellyfish { namespace mer_dna_ns {
#define R -1
#define I -2
#define O -3
#define A 0
#define C 1
#define G 2
#define T 3
static const int codes[256] = {
O, O, O, O, O, O, O, O, O, O, I, O, O, O, O, O,
O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O,
O, O, O, O, O, O, O, O, O, O, O, O, O, R, O, O,
O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O,
O, A, R, C, R, O, O, G, R, O, O, R, O, R, R, O,
O, O, R, R, T, O, R, R, R, R, O, O, O, O, O, O,
O, A, R, C, R, O, O, G, R, O, O, R, O, R, R, O,
O, O, R, R, T, O, R, R, R, R, O, O, O, O, O, O,
O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O,
O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O,
O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O,
O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O,
O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O,
O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O,
O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O,
O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O
};
#undef R
#undef I
#undef O
#undef A
#undef C
#undef G
#undef T
static const char rev_codes[4] = { 'A', 'C', 'G', 'T' };
extern const char* const error_different_k;
extern const char* const error_short_string;
// Checkered mask. cmask is every other bit on
// (0x55). cmask is two bits one, two bits off (0x33). Etc.
template
struct cmask {
static const U v =
(cmask::v << (2 * len)) | (((U)1 << len) - 1);
};
template
struct cmask {
static const U v = 0;
};
// Fast reverse complement of one word through bit tweedling.
inline uint32_t word_reverse_complement(uint32_t w) {
typedef uint64_t U;
w = ((w >> 2) & cmask::v) | ((w & cmask::v) << 2);
w = ((w >> 4) & cmask::v) | ((w & cmask::v) << 4);
w = ((w >> 8) & cmask::v) | ((w & cmask::v) << 8);
w = ( w >> 16 ) | ( w << 16);
return ((U)-1) - w;
}
inline uint64_t word_reverse_complement(uint64_t w) {
typedef uint64_t U;
w = ((w >> 2) & cmask::v) | ((w & cmask::v) << 2);
w = ((w >> 4) & cmask::v) | ((w & cmask::v) << 4);
w = ((w >> 8) & cmask::v) | ((w & cmask::v) << 8);
w = ((w >> 16) & cmask::v) | ((w & cmask::v) << 16);
w = ( w >> 32 ) | ( w << 32);
return ((U)-1) - w;
}
#ifdef HAVE_INT128
inline unsigned __int128 word_reverse_complement(unsigned __int128 w) {
typedef unsigned __int128 U;
w = ((w >> 2) & cmask::v) | ((w & cmask::v) << 2);
w = ((w >> 4) & cmask::v) | ((w & cmask::v) << 4);
w = ((w >> 8) & cmask::v) | ((w & cmask::v) << 8);
w = ((w >> 16) & cmask::v) | ((w & cmask::v) << 16);
w = ((w >> 32) & cmask::v) | ((w & cmask::v) << 32);
w = ( w >> 64 ) | ( w << 64);
return ((U)-1) - w;
}
#endif
template
class base_proxy {
public:
typedef T base_type;
base_proxy(base_type* w, unsigned int i) :
word_(w), i_(i) { }
base_proxy& operator=(char base) { return this->operator=(codes[(int)(unsigned char)base]); }
base_proxy& operator=(int code) {
base_type mask = (base_type)0x3 << i_;
*word_ = (*word_ & ~mask) | ((base_type)code << i_);
return *this;
}
int code() const { return (*word_ >> i_) & (base_type)0x3; }
operator char() const { return rev_codes[code()]; }
private:
base_type* const word_;
unsigned int i_;
};
// enum { CODE_A, CODE_C, CODE_G, CODE_T,
// CODE_RESET = -1, CODE_IGNORE = -2, CODE_COMMENT = -3 };
template
struct mer_dna_traits { };
template
class mer_base {
public:
typedef typename mer_dna_traits::base_type base_type;
enum { CODE_A, CODE_C, CODE_G, CODE_T,
CODE_RESET = -1, CODE_IGNORE = -2, CODE_COMMENT = -3 };
explicit mer_base(unsigned int k) :
_data(new base_type[derived::nb_words(k)])
{
memset(_data, '\0', nb_words(k) * sizeof(base_type));
}
mer_base(const mer_base &m) :
_data(new base_type[nb_words(static_cast(&m)->k())])
{
memcpy(_data, m._data, nb_words(static_cast(&m)->k()) * sizeof(base_type));
}
template
mer_base(const unsigned int k, const U& rhs) :
_data(new base_type[nb_words(k)])
{
for(unsigned int i = 0; i < k; ++i)
_data[i] = rhs[i];
clean_msw();
}
~mer_base() {
delete [] _data;
}
operator derived() { return *static_cast(this); }
operator const derived() const { return *static_cast(this); }
unsigned int k() const { return static_cast(this)->k(); }
/// Direct access to data. No bound or consistency check. Use with
/// caution!
// base_type operator[](unsigned int i) { return _data[i]; }
base_type word(unsigned int i) const { return _data[i]; }
base_type operator[](unsigned int i) const { return _data[i]; }
/// Direct access to the data array.
const base_type* data() const { return _data; }
/// Same as above, but can modify directly content. Use at your own
/// risk!
base_type& word__(unsigned int i) { return _data[i]; }
base_type* data__() { return _data; }
template
void read(std::istream& is) {
const unsigned int k = static_cast(this)->k();
const unsigned int l = k / (4 * alignment) + (k % (4 * alignment) != 0);
is.read((char*)_data, l);
}
bool operator==(const mer_base& rhs) const {
unsigned int i = nb_words() - 1;
bool res = (_data[i] & msw()) == (rhs._data[i] & msw());
while(res && i > 7) {
i -= 8;
res = res && (_data[i+7] == rhs._data[i+7]);
res = res && (_data[i+6] == rhs._data[i+6]);
res = res && (_data[i+5] == rhs._data[i+5]);
res = res && (_data[i+4] == rhs._data[i+4]);
res = res && (_data[i+3] == rhs._data[i+3]);
res = res && (_data[i+2] == rhs._data[i+2]);
res = res && (_data[i+1] == rhs._data[i+1]);
res = res && (_data[i] == rhs._data[i] );
}
switch(i) {
case 7: res = res && (_data[6] == rhs._data[6]);
case 6: res = res && (_data[5] == rhs._data[5]);
case 5: res = res && (_data[4] == rhs._data[4]);
case 4: res = res && (_data[3] == rhs._data[3]);
case 3: res = res && (_data[2] == rhs._data[2]);
case 2: res = res && (_data[1] == rhs._data[1]);
case 1: res = res && (_data[0] == rhs._data[0]);
}
return res;
}
bool operator!=(const mer_base& rhs) const { return !this->operator==(rhs); }
bool operator<(const mer_base& rhs) const {
unsigned int i = nb_words();
while(i >= 8) {
i -= 8;
if(_data[i+7] != rhs._data[i+7]) return _data[i+7] < rhs._data[i+7];
if(_data[i+6] != rhs._data[i+6]) return _data[i+6] < rhs._data[i+6];
if(_data[i+5] != rhs._data[i+5]) return _data[i+5] < rhs._data[i+5];
if(_data[i+4] != rhs._data[i+4]) return _data[i+4] < rhs._data[i+4];
if(_data[i+3] != rhs._data[i+3]) return _data[i+3] < rhs._data[i+3];
if(_data[i+2] != rhs._data[i+2]) return _data[i+2] < rhs._data[i+2];
if(_data[i+1] != rhs._data[i+1]) return _data[i+1] < rhs._data[i+1];
if(_data[i] != rhs._data[i]) return _data[i] < rhs._data[i];
}
switch(i) {
case 7: if(_data[6] != rhs._data[6]) return _data[6] < rhs._data[6];
case 6: if(_data[5] != rhs._data[5]) return _data[5] < rhs._data[5];
case 5: if(_data[4] != rhs._data[4]) return _data[4] < rhs._data[4];
case 4: if(_data[3] != rhs._data[3]) return _data[3] < rhs._data[3];
case 3: if(_data[2] != rhs._data[2]) return _data[2] < rhs._data[2];
case 2: if(_data[1] != rhs._data[1]) return _data[1] < rhs._data[1];
case 1: if(_data[0] != rhs._data[0]) return _data[0] < rhs._data[0];
}
return false;
}
bool operator<=(const mer_base& rhs) const {
return *this < rhs || *this == rhs;
}
bool operator>(const mer_base& rhs) const {
return !(*this <= rhs);
}
bool operator>=(const mer_base& rhs) const {
return !(*this < rhs);
}
base_proxy base(unsigned int i) { return base_proxy(_data + i / wbases, 2 * (i % wbases)); }
const base_proxy base(unsigned int i) const { return base_proxy(_data + i / wbases, 2 * (i % wbases)); }
// Make current k-mer all As.
void polyA() { memset(_data, 0x00, sizeof(base_type) * nb_words()); clean_msw(); }
void polyC() { memset(_data, 0x55, sizeof(base_type) * nb_words()); clean_msw(); }
void polyG() { memset(_data, 0xaa, sizeof(base_type) * nb_words()); clean_msw(); }
void polyT() { memset(_data, 0xff, sizeof(base_type) * nb_words()); clean_msw(); }
void randomize() {
for(unsigned int i = 0; i < nb_words(); ++i)
_data[i] = random_bits(wbits);
clean_msw();
}
bool is_homopolymer() const {
const base_type base = _data[0] & c3;
const unsigned int barrier = nb_words();
unsigned int i = 0;
for( ; i + 5 < barrier; i += 4) {
if(_data[i ] != ((_data[i ] << 2) | base)) return false;
if(_data[i + 1] != ((_data[i + 1] << 2) | base)) return false;
if(_data[i + 2] != ((_data[i + 2] << 2) | base)) return false;
if(_data[i + 3] != ((_data[i + 3] << 2) | base)) return false;
}
switch(nb_words() - i) {
case 5: if(_data[i] != ((_data[i] << 2) | base) ) return false; ++i;
case 4: if(_data[i] != ((_data[i] << 2) | base) ) return false; ++i;
case 3: if(_data[i] != ((_data[i] << 2) | base) ) return false; ++i;
case 2: if(_data[i] != ((_data[i] << 2) | base) ) return false; ++i;
case 1: if(_data[i] != (((_data[i] << 2) | base) & msw())) return false;
}
return true;
}
derived& operator=(const mer_base& rhs) {
memcpy(_data, rhs._data, nb_words() * sizeof(base_type));
return *static_cast(this);
}
derived& operator=(const char* s) {
if(strlen(s) < static_cast(this)->k())
throw std::length_error(error_short_string);
from_chars(s);
return *static_cast(this);
}
derived& operator=(const std::string& s) {
if(s.size() < static_cast(this)->k())
throw std::length_error(error_short_string);
from_chars(s.c_str());
return *static_cast(this);
}
// Shift the k-mer by 1 base, left or right. The char version take
// a base 'A', 'C', 'G', or 'T'. The base_type version takes a code
// in [0, 3] (not check of validity of argument, taken modulo
// 4). The return value is the base that was pushed off the side
// ('N' if the input character is not a valid base).
base_type shift_left(int c) {
const base_type r = (_data[nb_words()-1] >> lshift()) & c3;
const unsigned int barrier = nb_words() & (~c3);
base_type c2; // c2 and c1: carries
base_type c1 = (base_type)c & c3;
unsigned int i = 0;
for( ; i < barrier; i += 4) {
c2 = _data[i] >> wshift; _data[i] = (_data[i] << 2) | c1;
c1 = _data[i+1] >> wshift; _data[i+1] = (_data[i+1] << 2) | c2;
c2 = _data[i+2] >> wshift; _data[i+2] = (_data[i+2] << 2) | c1;
c1 = _data[i+3] >> wshift; _data[i+3] = (_data[i+3] << 2) | c2;
}
c2 = c1;
switch(nb_words() - i) {
case 3: c2 = _data[i] >> wshift; _data[i] = (_data[i] << 2) | c1; ++i;
case 2: c1 = _data[i] >> wshift; _data[i] = (_data[i] << 2) | c2; ++i;
case 1: _data[i] = (_data[i] << 2) | c1;
}
clean_msw();
return r;
}
base_type shift_right(int c) {
const base_type r = _data[0] & c3;
if(nb_words() > 1){
const unsigned int barrier = (nb_words() - 1) & (~c3);
unsigned int i = 0;
for( ; i < barrier; i += 4) {
_data[i] = (_data[i] >> 2) | (_data[i+1] << wshift);
_data[i+1] = (_data[i+1] >> 2) | (_data[i+2] << wshift);
_data[i+2] = (_data[i+2] >> 2) | (_data[i+3] << wshift);
_data[i+3] = (_data[i+3] >> 2) | (_data[i+4] << wshift);
}
switch(nb_words() - 1 - i) {
case 3: _data[i] = (_data[i] >> 2) | (_data[i+1] << wshift); ++i;
case 2: _data[i] = (_data[i] >> 2) | (_data[i+1] << wshift); ++i;
case 1: _data[i] = (_data[i] >> 2) | (_data[i+1] << wshift);
}
}
_data[nb_words() - 1] =
((_data[nb_words() - 1] & msw()) >> 2) | (((base_type)c & c3) << lshift());
return r;
}
// Non DNA codes are negative
inline static bool not_dna(int c) { return c < 0; }
inline static int code(char c) { return codes[(int)(unsigned char)c]; }
inline static char rev_code(int x) { return rev_codes[x]; }
static int complement(int x) { return (base_type)3 - x; }
static char complement(char c) {
switch(c) {
case 'A': case 'a': return 'T';
case 'C': case 'c': return 'G';
case 'G': case 'g': return 'C';
case 'T': case 't': return 'A';
}
return 'N';
}
char shift_left(char c) {
int x = code(c);
if(x == -1)
return 'N';
return rev_code(shift_left(x));
}
char shift_right(char c) {
int x = code(c);
if(x == -1)
return 'N';
return rev_code(shift_right(x));
}
void reverse_complement() {
base_type *low = _data;
base_type *high = _data + nb_words() - 1;
for( ; low < high; ++low, --high) {
base_type tmp = word_reverse_complement(*low);
*low = word_reverse_complement(*high);
*high = tmp;
}
if(low == high)
*low = word_reverse_complement(*low);
unsigned int rs = wbits - nb_msb();
if(rs > 0)
large_shift_right(rs);
}
void canonicalize() {
derived rc = this->get_reverse_complement();
if(rc < *this)
*this = rc;
}
derived get_reverse_complement() const {
derived res(*this);
res.reverse_complement();
return res;
}
derived get_canonical() const {
derived rc = this->get_reverse_complement();
return rc < *this ? rc : *this;
}
// Transfomr the k-mer into a C++ string.
std::string to_str() const {
std::string res(static_cast(this)->k(), '\0');
to_chars(res.begin());
return res;
}
// Transform the k-mer into a string. For the char * version,
// assume that the buffer is large enough to receive k+1
// characters (space for '\0' at end of string).
void to_str(char* s) const {
s = to_chars(s);
*s = '\0';
}
// Copy bases as char to the output iterator it. No '\0' is added
// or check made that there is enough space. The iterator pointed
// after the last base is returned.
template
OutputIterator to_chars(OutputIterator it) const {
int shift = lshift(); // Number of bits to shift to get base
for(int j = nb_words() - 1; j >= 0; --j) {
base_type w = _data[j];
for( ; shift >= 0; shift -= 2, ++it)
*it = rev_code((w >> shift) & c3);
shift = wshift;
}
return it;
}
// Get bits [start, start+len). start must be < 2k, len <=
// sizeof(base_type) and start+len < 2k. No checks
// performed. start and len are in bits, not bases.
base_type get_bits(unsigned int start, unsigned int len) const {
unsigned int q = start / wbits;
unsigned int r = start % wbits;
base_type res = _data[q] >> r;
if(len > wbits - r)
res |= _data[q + 1] << (wbits - r);
return len < (unsigned int)wbits ? res & (((base_type)1 << len) - 1) : res;
}
// Set bits [start, start+len). Same restriction as get_bits. In
// some rare cases, the value written can be larger than the bits
// occupied by the mer itself. The mer is then not valid if some MSB
// are set to 1.
template
void set_bits(unsigned int start, unsigned int len, base_type v) {
unsigned int q = start / wbits;
unsigned int r = start % wbits;
unsigned int left = wbits - r;
base_type mask;
if(len > left) {
mask = ((base_type)1 << r) - 1;
_data[q] = (_data[q] & mask) | (v << r);
mask = ((base_type)1 << (len - left)) - 1;
_data[q + 1] = (_data[q + 1] & ~mask) | (v >> (left));
} else {
mask = (len < (unsigned int)wbits ? (((base_type)1 << len) - 1) : (base_type)-1) << r;
_data[q] = (_data[q] & ~mask) | (v << r);
}
if(zero_msw)
clean_msw();
}
// Internal stuff
// Number of words in _data
inline static unsigned int nb_words(unsigned int k) { return (k / wbases) + (k % wbases != 0); }
inline unsigned int nb_words() const { return nb_words(static_cast(this)->k()); }
// Mask of highest word
inline base_type msw() const {
const base_type m = std::numeric_limits::max();
return m >> (wbits - nb_msb());
}
// Nb of bits used in highest word
inline unsigned int nb_msb() const {
base_type nb = (static_cast(this)->k() % wbases) * 2;
return nb ? nb : wbits;
}
// How much to shift last base in last word of _data
inline unsigned int lshift() const { return nb_msb() - 2; }
// Make sure the highest bits are all zero
inline void clean_msw() { _data[nb_words() - 1] &= msw(); }
template
bool from_chars(InputIterator it) {
int shift = lshift();
clean_msw();
for(int j = nb_words() - 1; j >= 0; --j) {
base_type& w = _data[j];
w = 0;
for( ; shift >= 0; shift -= 2, ++it) {
int c = code(*it);
if(not_dna(c))
return false;
w |= (base_type)c << shift;
}
shift = wshift;
}
return true;
}
protected:
static const base_type c3 = (base_type)0x3;
static const int wshift = sizeof(base_type) * 8 - 2; // left shift in 1 word
static const int wbases = 4 * sizeof(base_type); // bases in a word
static const int wbits = 8 * sizeof(base_type); // bits in a word
base_type * _data;
// Shift to the right by rs bits (Note bits, not bases)
void large_shift_right(unsigned int rs) {
if(nb_words() > 1) {
const unsigned int barrier = (nb_words() - 1) & (~c3);
const unsigned int ls = wbits - rs;
unsigned int i = 0;
for( ; i < barrier; i += 4) {
_data[i] = (_data[i] >> rs) | (_data[i+1] << ls);
_data[i+1] = (_data[i+1] >> rs) | (_data[i+2] << ls);
_data[i+2] = (_data[i+2] >> rs) | (_data[i+3] << ls);
_data[i+3] = (_data[i+3] >> rs) | (_data[i+4] << ls);
}
switch(nb_words() - 1 - i) {
case 3: _data[i] = (_data[i] >> rs) | (_data[i+1] << ls); ++i;
case 2: _data[i] = (_data[i] >> rs) | (_data[i+1] << ls); ++i;
case 1: _data[i] = (_data[i] >> rs) | (_data[i+1] << ls);
}
}
_data[nb_words() - 1] >>= rs;
clean_msw();
}
};
// Mer type where the length is kept in each mer object: allows to
// manipulate mers of different size within the same application.
template
class mer_base_dynamic : public mer_base > {
public:
typedef T base_type;
typedef mer_base > super;
explicit mer_base_dynamic(unsigned int k) : super(k), k_(k) { }
mer_base_dynamic(const mer_base_dynamic& rhs) : super(rhs), k_(rhs.k()) { }
mer_base_dynamic(unsigned int k, const char* s) : super(k), k_(k) {
super::from_chars(s);
}
explicit mer_base_dynamic(const char* s) : super(strlen(s)), k_(strlen(s)) {
super::from_chars(s);
}
explicit mer_base_dynamic(const std::string& s) : super(s.size()), k_(s.size()) {
super::from_chars(s.begin());
}
template
explicit mer_base_dynamic(unsigned int k, const U& rhs) : super(k, rhs), k_(k) { }
~mer_base_dynamic() { }
mer_base_dynamic& operator=(const mer_base_dynamic rhs) {
if(k_ != rhs.k_)
throw std::length_error(error_different_k);
super::operator=(rhs);
return *this;
}
unsigned int k() const { return k_; }
static unsigned int k(unsigned int k) { return k; }
private:
const unsigned int k_;
};
template
struct mer_dna_traits > {
typedef T base_type;
};
// Mer type where the length is a static variable: the mer size is
// fixed for all k-mers in the application.
//
// The CI (Class Index) template parameter allows to have more than one such
// class with different length in the same application. Each class has
// its own static variable associated with it.
template
class mer_base_static : public mer_base > {
public:
typedef T base_type;
typedef mer_base > super;
static const int class_index = CI;
mer_base_static() : super(k_) { }
explicit mer_base_static(unsigned int k) : super(k_) {
if(k != k_)
throw std::length_error(error_different_k);
}
mer_base_static(const mer_base_static& rhs) : super(rhs) { }
mer_base_static(unsigned int k, const char* s) : super(k_) {
super::from_chars(s);
}
explicit mer_base_static(const char* s) : super(k_) {
super::from_chars(s);
}
explicit mer_base_static(const std::string& s) : super(k_) {
super::from_chars(s.begin());
}
template
mer_base_static(unsigned int k, const U& rhs) : super(k_, rhs) {
if(k != k_)
throw std::length_error(error_different_k);
}
mer_base_static& operator=(const char* s) { return super::operator=(s); }
mer_base_static& operator=(const std::string& s) { return super::operator=(s); }
~mer_base_static() { }
static unsigned int k(); // { return k_; }
static unsigned int k(unsigned int k) { std::swap(k, k_); return k; }
private:
static unsigned int k_;
};
template
unsigned int mer_base_static::k_ = 22;
template
unsigned int mer_base_static::k() { return k_; }
template
const int mer_base_static::class_index;
template
struct mer_dna_traits > {
typedef T base_type;
};
typedef std::ostream_iterator ostream_char_iterator;
template
inline std::ostream& operator<<(std::ostream& os, const mer_base& mer) {
// char s[static_cast(mer).k() + 1];
char s[mer.k() + 1];
mer.to_str(s);
return os << s;
}
typedef std::istream_iterator istream_char_iterator;
template
inline std::istream& operator>>(std::istream& is, mer_base& mer) {
if(is.flags() & std::ios::skipws) {
while(isspace(is.peek())) { is.ignore(1); }
}
char buffer[mer.k() + 1];
is.read(buffer, mer.k());
if(is.gcount() != mer.k())
goto error;
buffer[mer.k()] = '\0';
if(!mer.from_chars(buffer))
goto error;
return is;
error:
is.setstate(std::ios::failbit);
return is;
}
} // namespace mer_dna_ns
typedef mer_dna_ns::mer_base_static mer_dna32;
typedef mer_dna_ns::mer_base_static mer_dna64;
#ifdef HAVE_INT128
typedef mer_dna_ns::mer_base_static mer_dna128;
#endif
typedef mer_dna64 mer_dna;
} // namespace jellyfish
#endif