/*
 * inc/seq_util.h
 *
 * This file is part of SOAPdenovo.
 *
 * Part of this file is referred to and modified from SparseAssembler
 * (See ).
 *
 */

#ifndef _SEQ_UTIL_H
#define _SEQ_UTIL_H

#include "stdinc.h"
#include "core.h"

extern inline void Init_Read ( string &seq, struct read_t &read );
extern inline uint64_t *str2bitsarr ( const char *c_str, int len, uint64_t *b_str, int arr_sz );
extern inline char *bitsarr2str ( uint64_t *b_seq, int len, char *c_str, int arr_sz );
extern inline void get_sub_arr ( uint64_t *bitsarr_in, int bitsarr_len, int begin_pos, int sub_sz, uint64_t *bitsarr_out );
extern inline void L_shift_NC ( uint64_t *bitsarr, int shift_sz, int arr_sz );
extern inline void R_shift_NC ( uint64_t *bitsarr, int shift_sz, int arr_sz );
extern inline int uint64_t_cmp ( uint64_t *A, uint64_t *B, int Kmer_arr_sz );
extern inline uint64_t *get_rev_comp_seq_arr ( uint64_t *seq_arr, int seq_size, int arr_sz );
extern inline uint64_t get_rev_comp_seq ( uint64_t seq, int seq_size );
extern inline uint64_t MurmurHash64A ( const void *key, int len, unsigned int seed );

// Pack a base string into the 2-bit representation stored in read_t.
inline void Init_Read ( string &seq, struct read_t &read )
{
    read.readLen = ( int ) seq.size();
    int Read_arr_sz = read.readLen / 32 + 1;
    int rem = read.readLen % 32;

    if ( rem == 0 ) { Read_arr_sz--; }

    str2bitsarr ( seq.c_str(), ( int ) seq.size(), read.read_bits, Read_arr_sz );
}

// Encode up to 32 bases per uint64_t (A/a/0=00, C/c/1=01, G/g/2=10, T/t/3=11),
// right-aligned in b_str. Encoding stops at the first unrecognized character.
inline uint64_t *str2bitsarr ( const char *c_str, int len, uint64_t *b_str, int arr_sz )
{
    for ( int k = 0; k < arr_sz; ++k ) { b_str[k] = 0; }

    int arr_sz_needed = len / 32 + 1;
    int rem = len % 32;

    if ( rem == 0 ) { arr_sz_needed--; }

    int beg_arr_idx = arr_sz - arr_sz_needed;

    if ( rem == 0 && arr_sz_needed > 0 ) { rem = 32; }

    for ( int k = 0; k < len; k++ )
    {
        if ( rem == 0 )
        {
            beg_arr_idx++;
            rem = 32;
        }

        switch ( c_str[k] )
        {
            case 'A': case 'a': case '0':
                b_str[beg_arr_idx] <<= 2;
                rem--;
                break;
            case 'C': case 'c': case '1':
                b_str[beg_arr_idx] <<= 2;
                ++b_str[beg_arr_idx];
                rem--;
                break;
            case 'G': case 'g': case '2':
                b_str[beg_arr_idx] <<= 1;
                ++b_str[beg_arr_idx];
                b_str[beg_arr_idx] <<= 1;
                rem--;
                break;
            case 'T': case 't': case '3':
                b_str[beg_arr_idx] <<= 1;
                ++b_str[beg_arr_idx];
                b_str[beg_arr_idx] <<= 1;
                ++b_str[beg_arr_idx];
                rem--;
                break;
            default:
                return b_str;
        }
    }

    return b_str;
}

// Decode a 2-bit packed sequence back into a null-terminated base string.
inline char *bitsarr2str ( uint64_t *b_seq, int len, char *c_str, int arr_sz )
{
    int tot_bits = arr_sz * 64;

    for ( int i = 0; i < len; ++i )
    {
        uint64_t temp, temp2[100];

        for ( int k = 0; k < arr_sz; ++k ) { temp2[k] = b_seq[k]; }

        L_shift_NC ( temp2, tot_bits - ( len - i ) * 2, arr_sz );
        R_shift_NC ( temp2, tot_bits - 2, arr_sz );
        temp = temp2[arr_sz - 1];

        switch ( temp )
        {
            case 0: c_str[i] = 'A'; break;
            case 1: c_str[i] = 'C'; break;
            case 2: c_str[i] = 'G'; break;
            case 3: c_str[i] = 'T'; break;
        }
    }

    c_str[len] = '\0';
    return c_str;
}
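/*
 * Illustrative usage sketch (not part of the original header): packing a short
 * base string with str2bitsarr and unpacking it again with bitsarr2str. The
 * helper name seq_util_pack_example is made up for the example, and the
 * surrounding #if 0 keeps it out of the build; buffer sizes follow the
 * 32-bases-per-uint64_t convention used above.
 */
#if 0
static void seq_util_pack_example()
{
    const char *seq = "ACGTACGT";       // 8 bases fit in a single uint64_t word
    uint64_t bits[1];
    char back[8 + 1];
    str2bitsarr ( seq, 8, bits, 1 );    // 2-bit encode, right-aligned in bits[0]
    bitsarr2str ( bits, 8, back, 1 );   // decode; back now holds "ACGTACGT"
}
#endif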
// Extract sub_sz bases starting at base begin_pos from a packed sequence of
// bitsarr_len bases, writing the result right-aligned into bitsarr_out.
inline void get_sub_arr ( uint64_t *bitsarr_in, int bitsarr_len, int begin_pos, int sub_sz, uint64_t *bitsarr_out )
{
    if ( bitsarr_len < sub_sz )
    {
        cout << "Error! Input kmer too short." << bitsarr_len << " " << sub_sz << endl;
        return;
    }

    int arr_sz_in = bitsarr_len / 32 + 1;
    int rem = bitsarr_len % 32;

    if ( rem == 0 ) { arr_sz_in--; }

    int arr_sz_out = sub_sz / 32 + 1;

    if ( sub_sz % 32 == 0 ) { arr_sz_out--; }

    uint64_t temp_arr[10];
    memset ( temp_arr, 0, sizeof ( temp_arr ) );
    memset ( bitsarr_out, 0, sizeof ( uint64_t ) * arr_sz_out );
    int rem2 = ( 32 - rem + begin_pos ) % 32;
    int block_beg = ( 32 - rem + begin_pos ) / 32;

    if ( rem == 0 ) { block_beg--; }

    int rem3 = ( 32 - rem + begin_pos + sub_sz ) % 32;
    int block_end = ( 32 - rem + begin_pos + sub_sz ) / 32;

    if ( rem3 != 0 ) { rem3 = 32 - rem3; }
    else { block_end--; }

    if ( rem == 0 ) { block_end--; }

    int orig_sz = ( block_end - block_beg + 1 );
    memcpy ( temp_arr, &bitsarr_in[block_beg], orig_sz * sizeof ( uint64_t ) );
    L_shift_NC ( temp_arr, rem2 * 2, orig_sz );
    R_shift_NC ( temp_arr, ( rem2 + rem3 ) % 32 * 2, arr_sz_out );
    memcpy ( bitsarr_out, temp_arr, arr_sz_out * sizeof ( uint64_t ) );
}

// Shift a multi-word bit array left by shift_sz bits
// (bitsarr[0] is treated as the most significant word).
inline void L_shift_NC ( uint64_t *bitsarr, int shift_sz, int arr_sz )
{
    uint64_t temp_arr[100];

    for ( int i = 0; i < arr_sz; ++i ) { temp_arr[i] = 0; }

    int jmp = shift_sz / 64;
    int offset = shift_sz % 64;

    for ( int i = 0; i < arr_sz; ++i )
    {
        if ( i + jmp + 1 < arr_sz )
        {
            uint64_t tt = 0;

            if ( offset == 0 ) { tt = 0; }
            else { tt = ( bitsarr[i + jmp + 1] >> ( 64 - offset ) ); }

            temp_arr[i] = ( ( bitsarr[i + jmp] << offset ) | tt );
        }

        if ( i + jmp + 1 == arr_sz ) { temp_arr[i] = bitsarr[i + jmp] << offset; }

        if ( i + jmp + 1 > arr_sz ) { temp_arr[i] = 0; }
    }

    for ( int i = 0; i < arr_sz; ++i ) { bitsarr[i] = temp_arr[i]; }
}

// Shift a multi-word bit array right by shift_sz bits
// (bitsarr[0] is treated as the most significant word).
inline void R_shift_NC ( uint64_t *bitsarr, int shift_sz, int arr_sz )
{
    uint64_t temp_arr[100];

    for ( int i = 0; i < arr_sz; ++i ) { temp_arr[i] = 0; }

    int jmp = shift_sz / 64;
    int offset = shift_sz % 64;

    if ( offset == 0 ) // shift by a whole number of words; avoids the undefined 64-bit shift
    {
        for ( int i = arr_sz - 1; i >= 0; --i )
        {
            if ( i - jmp > 0 ) { temp_arr[i] = bitsarr[i - jmp]; }

            if ( i - jmp == 0 ) { temp_arr[i] = bitsarr[i - jmp]; }

            if ( i - jmp < 0 ) { temp_arr[i] = 0; }
        }
    }
    else
    {
        for ( int i = arr_sz - 1; i >= 0; --i )
        {
            if ( i - jmp > 0 ) { temp_arr[i] = ( bitsarr[i - jmp] >> offset ) | ( bitsarr[i - jmp - 1] << ( 64 - offset ) ); }

            if ( i - jmp == 0 ) { temp_arr[i] = ( bitsarr[i - jmp] >> offset ); }

            if ( i - jmp < 0 ) { temp_arr[i] = 0; }
        }
    }

    for ( int i = 0; i < arr_sz; ++i ) { bitsarr[i] = temp_arr[i]; }
}

// Lexicographic comparison of two packed k-mers: returns 1 if A > B,
// -1 if A < B, and 0 if they are equal.
inline int uint64_t_cmp ( uint64_t *A, uint64_t *B, int Kmer_arr_sz )
{
    int flag = 0;

    for ( int jj = 0; jj < Kmer_arr_sz; ++jj )
    {
        if ( A[jj] > B[jj] ) { flag = 1; break; }

        if ( A[jj] < B[jj] ) { flag = -1; break; }

        if ( A[jj] == B[jj] ) { continue; }
    }

    return flag;
}
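/*
 * Illustrative usage sketch (not part of the original header): pulling two
 * 31-mers out of a packed 45-base read with get_sub_arr and comparing them
 * with uint64_t_cmp. The helper name seq_util_subkmer_example and the all-'A'
 * read are made up for the example; the #if 0 keeps it out of the build.
 */
#if 0
static void seq_util_subkmer_example()
{
    string read_str ( 45, 'A' );                        // 45-base read (all 'A' for simplicity)
    uint64_t read_bits[2], kmer_a[1], kmer_b[1];
    str2bitsarr ( read_str.c_str(), 45, read_bits, 2 ); // 45 bases need two uint64_t words
    get_sub_arr ( read_bits, 45, 0, 31, kmer_a );       // 31-mer starting at base 0
    get_sub_arr ( read_bits, 45, 10, 31, kmer_b );      // 31-mer starting at base 10
    int cmp = uint64_t_cmp ( kmer_a, kmer_b, 1 );       // 1, -1 or 0; here 0, both are poly-A
}
#endif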
// Reverse complement of a packed sequence spanning multiple words (for 63mer).
inline uint64_t *get_rev_comp_seq_arr ( uint64_t *seq_arr, int seq_size, int arr_sz )
{
    if ( seq_size < 32 && arr_sz == 2 )
    {
        seq_arr[1] = get_rev_comp_seq ( seq_arr[1], seq_size );

        if ( seq_arr[0] != 0 )
        {
            fprintf ( stderr, "ERROR: in get_rev_comp_seq_arr \n" );
            exit ( -1 );
        }

        return seq_arr;
    }

    int tot_bits = arr_sz * 64;

    for ( int i = 0; i < arr_sz; ++i )
    {
        // Complement, then reverse the 2-bit base codes within the word.
        seq_arr[i] = ~seq_arr[i];
        seq_arr[i] = ( ( seq_arr[i] & 0x3333333333333333 ) << 2 ) | ( ( seq_arr[i] & 0xCCCCCCCCCCCCCCCC ) >> 2 );
        seq_arr[i] = ( ( seq_arr[i] & 0x0F0F0F0F0F0F0F0F ) << 4 ) | ( ( seq_arr[i] & 0xF0F0F0F0F0F0F0F0 ) >> 4 );
        seq_arr[i] = ( ( seq_arr[i] & 0x00FF00FF00FF00FF ) << 8 ) | ( ( seq_arr[i] & 0xFF00FF00FF00FF00 ) >> 8 );
        seq_arr[i] = ( ( seq_arr[i] & 0x0000FFFF0000FFFF ) << 16 ) | ( ( seq_arr[i] & 0xFFFF0000FFFF0000 ) >> 16 );
        seq_arr[i] = ( ( seq_arr[i] & 0x00000000FFFFFFFF ) << 32 ) | ( ( seq_arr[i] & 0xFFFFFFFF00000000 ) >> 32 );
    }

    // Reverse the word order, then re-align the result to the low bits.
    int j = 0, k = arr_sz - 1;

    for ( ; j < k; ++j, --k )
    {
        uint64_t temp;
        temp = seq_arr[j];
        seq_arr[j] = seq_arr[k];
        seq_arr[k] = temp;
    }

    R_shift_NC ( seq_arr, tot_bits - ( seq_size * 2 ), arr_sz );
    return seq_arr;
}

// Reverse complement of a packed sequence that fits in a single uint64_t.
inline uint64_t get_rev_comp_seq ( uint64_t seq, int seq_size )
{
    seq = ~seq;
    seq = ( ( seq & 0x3333333333333333 ) << 2 ) | ( ( seq & 0xCCCCCCCCCCCCCCCC ) >> 2 );
    seq = ( ( seq & 0x0F0F0F0F0F0F0F0F ) << 4 ) | ( ( seq & 0xF0F0F0F0F0F0F0F0 ) >> 4 );
    seq = ( ( seq & 0x00FF00FF00FF00FF ) << 8 ) | ( ( seq & 0xFF00FF00FF00FF00 ) >> 8 );
    seq = ( ( seq & 0x0000FFFF0000FFFF ) << 16 ) | ( ( seq & 0xFFFF0000FFFF0000 ) >> 16 );
    seq = ( ( seq & 0x00000000FFFFFFFF ) << 32 ) | ( ( seq & 0xFFFFFFFF00000000 ) >> 32 );
    return seq >> ( 64 - ( seq_size * 2 ) );
}

// MurmurHash2, 64-bit version, for 64-bit platforms.
inline uint64_t MurmurHash64A ( const void *key, int len, unsigned int seed )
{
    const uint64_t m = 0xc6a4a7935bd1e995;
    const int r = 47;
    uint64_t h = seed ^ ( len * m );
    const uint64_t *data = ( const uint64_t * ) key;
    const uint64_t *end = data + ( len / 8 );

    while ( data != end )
    {
        uint64_t k = *data++;
        k *= m;
        k ^= k >> r;
        k *= m;
        h ^= k;
        h *= m;
    }

    // Mix in the remaining bytes; the fall-through is intentional.
    const unsigned char *data2 = ( const unsigned char * ) data;

    switch ( len & 7 )
    {
        case 7: h ^= uint64_t ( data2[6] ) << 48;
        case 6: h ^= uint64_t ( data2[5] ) << 40;
        case 5: h ^= uint64_t ( data2[4] ) << 32;
        case 4: h ^= uint64_t ( data2[3] ) << 24;
        case 3: h ^= uint64_t ( data2[2] ) << 16;
        case 2: h ^= uint64_t ( data2[1] ) << 8;
        case 1: h ^= uint64_t ( data2[0] );
            h *= m;
    }

    h ^= h >> r;
    h *= m;
    h ^= h >> r;
    return h;
}

// MurmurHash2: 64-bit hash built from two 32-bit halves, for 32-bit platforms.
inline uint64_t MurmurHash64B ( const void *key, int len, unsigned int seed )
{
    const unsigned int m = 0x5bd1e995;
    const int r = 24;
    unsigned int h1 = seed ^ len;
    unsigned int h2 = 0;
    const unsigned int *data = ( const unsigned int * ) key;

    while ( len >= 8 )
    {
        unsigned int k1 = *data++;
        k1 *= m;
        k1 ^= k1 >> r;
        k1 *= m;
        h1 *= m;
        h1 ^= k1;
        len -= 4;
        unsigned int k2 = *data++;
        k2 *= m;
        k2 ^= k2 >> r;
        k2 *= m;
        h2 *= m;
        h2 ^= k2;
        len -= 4;
    }

    if ( len >= 4 )
    {
        unsigned int k1 = *data++;
        k1 *= m;
        k1 ^= k1 >> r;
        k1 *= m;
        h1 *= m;
        h1 ^= k1;
        len -= 4;
    }

    // Mix in the remaining bytes; the fall-through is intentional.
    switch ( len )
    {
        case 3: h2 ^= ( ( unsigned char * ) data ) [2] << 16;
        case 2: h2 ^= ( ( unsigned char * ) data ) [1] << 8;
        case 1: h2 ^= ( ( unsigned char * ) data ) [0];
            h2 *= m;
    }

    h1 ^= h2 >> 18;
    h1 *= m;
    h2 ^= h1 >> 22;
    h2 *= m;
    h1 ^= h2 >> 17;
    h1 *= m;
    h2 ^= h1 >> 19;
    h2 *= m;
    uint64_t h = h1;
    h = ( h << 32 ) | h2;
    return h;
}

#endif