/*
* build_graph.cpp
*
* This file is part of SOAPdenovo.
*
* Part of this file is refered and modified from SparseAssembler
* (See ).
*
*/
#include "stdinc.h"
#include "core.h"
#include "global.h"
#include "seq_util.h"
#include "io_func.h"
#include "build_graph.h"
static void process_round1_threaded ( struct read_t *read, struct hashtable2 *ht, pthread_spinlock_t *locks, size_t *bucket_count, int K_size, int gap );
//static void process_round2_threaded(struct read_t *read,struct hashtable2 *ht,pthread_spinlock_t *locks,size_t *edge_cnt,int K_size,int gap);
//for debug
//static void process_round1_threaded_d(struct read_t *read, struct hashtable2 *ht,pthread_spinlock_t *locks,size_t *bucket_count,int K_size,int gap);
static void process_round2_threaded_d ( struct read_t *read, struct hashtable2 *ht, pthread_spinlock_t *locks, size_t *edge_cnt, int K_size, int gap );
/*************************************************
Function:
run_process_threaded
Description:
Builds the sparse de-Brujin graph from reads.
It calls function "process_round1_threaded" for round1 and
"process_round2_threaded_d" for round 2 building process.
Input:
1. ht: the graph hashtable
2. locks: the locks array for hashtable
3. K_size: kmer size
3. gap: the skipped distance
4. read_num: the number of reads for processing
5. thrd_num_s: the thread number for building sparse de-Brujin graph
6. thrd_id: current thread's id
7. round: current building round (1,2)
Output:
None.
Return:
None.
*************************************************/
void run_process_threaded ( struct hashtable2 *ht, pthread_spinlock_t *locks, int K_size, int gap, size_t read_num, int thrd_num_s, int thrd_id, int round )
{
read_t read_tmp;
for ( int i = thrd_id; i < read_num; i += thrd_num_s )
{
int bad_flag = 0;
filter_N ( seq_t[i], bad_flag );
if ( bad_flag )
{
seq_t[i].clear();
continue;
}
Init_Read ( seq_t[i], read_tmp );
if ( round == 1 )
{
//cout << "round 1:"<< seq_t[i] <readLen;
int OverlappingKmers = readLen - K_size + 1;
if ( gap >= OverlappingKmers )
{
return;
}
int Read_arr_sz = readLen / 32 + 1;
int rem = readLen % 32;
if ( rem == 0 )
{
Read_arr_sz--;
}
#ifdef _63MER_
int Kmer_arr_sz = 2;
int tot_bits = Read_arr_sz * 64;
#endif
#ifdef _127MER_
int Kmer_arr_sz = 4;
int tot_bits = Read_arr_sz * 128;
#endif
size_t ht_sz = ht->ht_sz;
bool flip[500], found[500];
size_t hash_idx[500];
memset ( flip, 0, sizeof ( flip ) );
memset ( found, 0, sizeof ( found ) );
kmer_t2 seq[500], f_seq[500];
memset ( seq, 0, sizeof ( seq ) );
uint64_t hv[500], temp_bits[500];
bucket2 **bktptr[500];
char c_str[500];
for ( int j = 0; j < OverlappingKmers; j++ )
{
get_sub_arr ( read->read_bits, read->readLen, j, K_size, seq[j].kmer );
#ifdef _63MER_
if ( K_size <= 31 ) //fix the represent bug
{
( seq[j].kmer ) [1] = ( seq[j].kmer ) [0];
( seq[j].kmer ) [0] = 0;
}
#endif
#ifdef _127MER_ //fix the represent bug
if ( K_size <= 31 ) //fix the represent bug
{
( seq[j].kmer ) [3] = ( seq[j].kmer ) [0];
( seq[j].kmer ) [0] = 0;
}
else if ( K_size <= 63 )
{
( seq[j].kmer ) [3] = ( seq[j].kmer ) [1];
( seq[j].kmer ) [2] = ( seq[j].kmer ) [0];
( seq[j].kmer ) [1] = 0;
( seq[j].kmer ) [0] = 0;
}
else if ( K_size <= 95 )
{
( seq[j].kmer ) [3] = ( seq[j].kmer ) [2];
( seq[j].kmer ) [2] = ( seq[j].kmer ) [1];
( seq[j].kmer ) [1] = ( seq[j].kmer ) [0];
( seq[j].kmer ) [0] = 0;
}
#endif
memcpy ( &f_seq[j], &seq[j], Kmer_arr_sz * sizeof ( uint64_t ) );
get_rev_comp_seq_arr ( ( f_seq[j].kmer ), K_size, Kmer_arr_sz ); //TODO ,add 127mer support
if ( uint64_t_cmp ( seq[j].kmer, f_seq[j].kmer, Kmer_arr_sz ) > 0 )
{
flip[j] = 1;
}
if ( flip[j] == 1 )
{
memcpy ( temp_bits, & ( seq[j].kmer ), Kmer_arr_sz * sizeof ( uint64_t ) );
memcpy ( & ( seq[j].kmer ), & ( f_seq[j].kmer ), Kmer_arr_sz * sizeof ( uint64_t ) );
memcpy ( & ( f_seq[j].kmer ), temp_bits, Kmer_arr_sz * sizeof ( uint64_t ) );
}
hv[j] = MurmurHash64A ( ( seq[j].kmer ), sizeof ( seq[j] ), 0 );
hash_idx[j] = ( size_t ) ( hv[j] % ht_sz );
bktptr[j] = & ( ht->store_pos[hash_idx[j]] );
}
int g, h;
g = 0;
for ( int k = 0; k < gap; ++k )
{
pthread_spin_lock ( &locks[hash_idx[k]] );
found[k] = look_up_in_a_list2_r1 ( &seq[k], ( struct bucket2_r1 ** * ) &bktptr[k] );
pthread_spin_unlock ( &locks[hash_idx[k]] );
if ( found[k] == 1 )
{
g = k;
break;
}
}
for ( int j = g; j < OverlappingKmers; )
{
h = gap;
for ( int k = 0; k < gap; ++k )
{
if ( ( j + k ) >= OverlappingKmers - 1 )
{
h = k + 1; //ÉáÆúµô×îºóÒ»¸ökmer
break;
}
pthread_spin_lock ( &locks[hash_idx[j + k]] );
found[j + k] = look_up_in_a_list2_r1 ( &seq[j + k], ( bucket2_r1 ** * ) &bktptr[j + k] ); //lock...
pthread_spin_unlock ( &locks[hash_idx[j + k]] );
if ( k > 0 && found[j + k] == 1 )
{
h = k;
break;
}
}
pthread_spin_lock ( &locks[hash_idx[j]] );
found[j] = look_up_in_a_list2_r1 ( &seq[j], ( bucket2_r1 ** * ) &bktptr[j] ); //lock...
//pthread_spin_unlock ( &locks[hash_idx[j]] );
if ( found[j] == 0 )
{
//pthread_spin_lock ( &locks[hash_idx[j]] );
* ( bktptr[j] ) = ( struct bucket2 * ) malloc ( sizeof ( struct bucket2_r1 ) ); //lock ...
memset ( * ( bktptr[j] ), 0, sizeof ( struct bucket2_r1 ) );
memcpy ( & ( ( ( struct bucket2_r1 * ) * ( bktptr[j] ) )->kmer_t2.kmer ), & ( seq[j].kmer ), Kmer_arr_sz * sizeof ( uint64_t ) );
( ( struct bucket2_r1 * ) * ( bktptr[j] ) )->kmer_info.cov1 = 0;
// the cvg is useless in round 1
//pthread_spin_unlock ( &locks[hash_idx[j]] );
( *bucket_count ) ++;
}
pthread_spin_unlock(&locks[hash_idx[j]]);
j = j + h;
if ( j >= OverlappingKmers )
{
break;
}
}
}
/*************************************************
Function:
process_round2_threaded_d
Description:
Processes one read in round2:
1. Chops read to kmers
2. Searches the selected sparse-kmers
3. Builds the kmer-edges (the connection between sparse-kmers)
Input:
1. read: a read
2. ht: the graph hashtable
3. locks: the locks array for hashtable
4. edge_cnt: useless
5. K_size: kmer size
6. gap: the skipped distance
Output:
None.
Return:
None.
*************************************************/
static void process_round2_threaded_d ( struct read_t *read, struct hashtable2 *ht, pthread_spinlock_t *locks, size_t *edge_cnt, int K_size, int gap )
{
static size_t i;
int readLen = read->readLen;
int OverlappingKmers = readLen - K_size + 1;
if ( gap >= OverlappingKmers )
{
return;
}
int Read_arr_sz = readLen / 32 + 1;
int rem = readLen % 32;
if ( rem == 0 )
{
Read_arr_sz--;
}
#ifdef _63MER_
int Kmer_arr_sz = 2;
int tot_bits = Read_arr_sz * 64;
#endif
#ifdef _127MER_
int Kmer_arr_sz = 4;
int tot_bits = Read_arr_sz * 128;
#endif
size_t ht_sz = ht->ht_sz;
bool flip[500], found[500];
size_t hash_idx[500];
memset ( flip, 0, sizeof ( flip ) );
memset ( found, 0, sizeof ( found ) );
kmer_t2 seq[500], f_seq[500];
uint64_t hv[500], temp_bits[500];
bucket2 **bktptr[500];
char c_str[500];
for ( int j = 0; j < OverlappingKmers; j++ )
{
get_sub_arr ( read->read_bits, read->readLen, j, K_size, seq[j].kmer );
#ifdef _63MER_
if ( K_size <= 31 ) //fix the represent bug
{
( seq[j].kmer ) [1] = ( seq[j].kmer ) [0];
( seq[j].kmer ) [0] = 0;
}
#endif
#ifdef _127MER_ //fix the represent bug
if ( K_size <= 31 ) //fix the represent bug
{
( seq[j].kmer ) [3] = ( seq[j].kmer ) [0];
( seq[j].kmer ) [0] = 0;
}
else if ( K_size <= 63 )
{
( seq[j].kmer ) [3] = ( seq[j].kmer ) [1];
( seq[j].kmer ) [2] = ( seq[j].kmer ) [0];
( seq[j].kmer ) [1] = 0;
( seq[j].kmer ) [0] = 0;
}
else if ( K_size <= 95 )
{
( seq[j].kmer ) [3] = ( seq[j].kmer ) [2];
( seq[j].kmer ) [2] = ( seq[j].kmer ) [1];
( seq[j].kmer ) [1] = ( seq[j].kmer ) [0];
( seq[j].kmer ) [0] = 0;
}
#endif
memcpy ( &f_seq[j], &seq[j], Kmer_arr_sz * sizeof ( uint64_t ) );
get_rev_comp_seq_arr ( ( f_seq[j].kmer ), K_size, Kmer_arr_sz );
if ( uint64_t_cmp ( seq[j].kmer, f_seq[j].kmer, Kmer_arr_sz ) > 0 )
{
flip[j] = 1;
}
if ( flip[j] == 1 )
{
memcpy ( temp_bits, & ( seq[j].kmer ), Kmer_arr_sz * sizeof ( uint64_t ) );
memcpy ( & ( seq[j].kmer ), & ( f_seq[j].kmer ), Kmer_arr_sz * sizeof ( uint64_t ) );
memcpy ( & ( f_seq[j].kmer ), temp_bits, Kmer_arr_sz * sizeof ( uint64_t ) );
}
hv[j] = MurmurHash64A ( ( seq[j].kmer ), sizeof ( seq[j] ), 0 );
hash_idx[j] = ( size_t ) ( hv[j] % ht_sz );
bktptr[j] = & ( ht->store_pos[hash_idx[j]] );
found[j] = look_up_in_a_list2 ( &seq[j], &bktptr[j] );
}
int last_found = -1;
int cur_found = -1;
int h = -1;
for ( int i = 0; i < OverlappingKmers; ++i )
{
if ( found[i] )
{
pthread_spin_lock ( &locks[hash_idx[i]] );
if ( ( * ( bktptr[i] ) )->kmer_info.cov1 < 0xffff )
{
( * ( bktptr[i] ) )->kmer_info.cov1++;
}
pthread_spin_unlock ( &locks[hash_idx[i]] );
cur_found = i;
if ( last_found != -1 )
{
if ( cur_found - last_found > gap )
{
fprintf ( stderr, "ERROR: cur_found - last_found > gap !\n" );
exit ( -1 );
}
//add edge ...
h = cur_found - last_found;
uint64_t left_bits;
get_sub_arr ( read->read_bits, read->readLen, last_found, h, &left_bits );
pthread_spin_lock ( &locks[hash_idx[cur_found]] ); //lock for cur_found node to add left edge
if ( flip[cur_found] == 0 )
{
struct edge_node **edge_node_p2p = & ( ( * ( bktptr[cur_found] ) )->kmer_info.left );
while ( ( *edge_node_p2p ) != NULL )
{
if ( ( *edge_node_p2p )->edge == ( uint64_t ) left_bits && ( ( *edge_node_p2p )->len + 1 ) == h )
{
if ( ( *edge_node_p2p )->edge_cov < 0x7f )
{
( *edge_node_p2p )->edge_cov++;
}
break;
}
edge_node_p2p = & ( ( *edge_node_p2p )->nxt_edge );
}
if ( ( *edge_node_p2p ) == NULL )
{
( *edge_node_p2p ) = ( struct edge_node * ) malloc ( sizeof ( struct edge_node ) );
( *edge_cnt ) ++;
memset ( *edge_node_p2p, 0, sizeof ( struct edge_node ) );
( *edge_node_p2p )->edge = ( uint64_t ) left_bits;
( *edge_node_p2p )->edge_cov = 1;
( *edge_node_p2p )->len = h - 1;
}
}
else
{
left_bits = get_rev_comp_seq ( left_bits, h );
struct edge_node **edge_node_p2p = & ( ( * ( bktptr[cur_found] ) )->kmer_info.right );
while ( ( *edge_node_p2p ) != NULL )
{
if ( ( *edge_node_p2p )->edge == ( uint64_t ) left_bits && ( ( *edge_node_p2p )->len + 1 ) == h )
{
if ( ( *edge_node_p2p )->edge_cov < 0x7f )
{
( *edge_node_p2p )->edge_cov++;
}
break;
}
edge_node_p2p = & ( ( *edge_node_p2p )->nxt_edge );
}
if ( ( *edge_node_p2p ) == NULL )
{
( *edge_node_p2p ) = ( struct edge_node * ) malloc ( sizeof ( struct edge_node ) );
( *edge_cnt ) ++;
memset ( *edge_node_p2p, 0, sizeof ( struct edge_node ) );
( *edge_node_p2p )->edge = ( uint64_t ) left_bits;
( *edge_node_p2p )->edge_cov = 1;
( *edge_node_p2p )->len = h - 1;
}
}
pthread_spin_unlock ( &locks[hash_idx[cur_found]] );
uint64_t right_bits;
get_sub_arr ( read->read_bits, read->readLen, last_found + K_size, h, &right_bits );
pthread_spin_lock ( &locks[hash_idx[last_found]] ); //lock ...
if ( flip[last_found] == 1 )
{
right_bits = get_rev_comp_seq ( right_bits, h );
struct edge_node **edge_node_p2p = & ( ( * ( bktptr[last_found] ) )->kmer_info.left );
while ( ( *edge_node_p2p ) != NULL )
{
if ( ( *edge_node_p2p )->edge == ( uint64_t ) right_bits && ( ( *edge_node_p2p )->len + 1 ) == h )
{
if ( ( *edge_node_p2p )->edge_cov < 0x7f )
{
( *edge_node_p2p )->edge_cov++;
}
break;
}
edge_node_p2p = & ( ( *edge_node_p2p )->nxt_edge );
}
if ( ( *edge_node_p2p ) == NULL )
{
( *edge_node_p2p ) = ( struct edge_node * ) malloc ( sizeof ( struct edge_node ) );
( *edge_cnt ) ++;
memset ( *edge_node_p2p, 0, sizeof ( struct edge_node ) );
( *edge_node_p2p )->edge = ( uint64_t ) right_bits;
( *edge_node_p2p )->edge_cov = 1;
( *edge_node_p2p )->len = h - 1;
}
}
else
{
struct edge_node **edge_node_p2p = & ( ( * ( bktptr[last_found] ) )->kmer_info.right );
while ( ( *edge_node_p2p ) != NULL )
{
if ( ( *edge_node_p2p )->edge == ( uint64_t ) right_bits && ( ( *edge_node_p2p )->len + 1 == h ) )
{
if ( ( *edge_node_p2p )->edge_cov < 0x7f )
{
( *edge_node_p2p )->edge_cov++;
}
break;
}
edge_node_p2p = & ( ( *edge_node_p2p )->nxt_edge );
}
if ( ( *edge_node_p2p ) == NULL )
{
( *edge_node_p2p ) = ( struct edge_node * ) malloc ( sizeof ( struct edge_node ) );
( *edge_cnt ) ++;
memset ( *edge_node_p2p, 0, sizeof ( struct edge_node ) );
( *edge_node_p2p )->edge = ( uint64_t ) right_bits;
( *edge_node_p2p )->edge_cov = 1;
( *edge_node_p2p )->len = h - 1;
}
}
pthread_spin_unlock ( &locks[hash_idx[last_found]] ); //lock ...
}
last_found = cur_found;
}
}
}
/*************************************************
Function:
SwitchBuckets
Description:
Switches struct bucket form round1 bucket to round2 bucket.
Input:
1. ht: the graph hashtable
2. K_size: the kmer size
Output:
None.
Return:
None.
*************************************************/
void SwitchBuckets ( hashtable2 *ht2, int K_size )
{
size_t ht_sz;
ht_sz = ht2->ht_sz;
bucket2_r1 *store_pos_o, *store_pos_t;
bucket2 *store_pos_n;
bucket2 **bktp2p;
for ( size_t i = 0; i < ht_sz; ++i )
{
bktp2p = & ( ht2->store_pos[i] );
store_pos_o = ( bucket2_r1 * ) ht2->store_pos[i];
while ( store_pos_o != NULL )
{
store_pos_n = ( bucket2 * ) malloc ( sizeof ( struct bucket2 ) );
memset ( store_pos_n, 0, sizeof ( bucket2 ) );
store_pos_n->kmer_t2 = store_pos_o->kmer_t2;
store_pos_n->kmer_info.cov1 = store_pos_o->kmer_info.cov1;
*bktp2p = store_pos_n;
bktp2p = & ( store_pos_n->nxt_bucket );
store_pos_t = store_pos_o;
store_pos_o = store_pos_o->nxt_bucket;
free ( store_pos_t );
}
}
}
/* c++ implementation ..
void SavingSparseKmerGraph2(hashtable2 *ht,char * outfile)
{
ofstream o_ht_idx,o_ht_content;
string ht_idx_name,ht_content_name,kmer_freq;
ht_idx_name.append(outfile).append(".ht_idx");
ht_content_name.append(outfile).append(".ht_content");
kmer_freq.append(outfile).append(".kmerFreq");
map cov_hist;
//string ht_idx_name="HT_idx.txt",ht_content_name="HT_content";
o_ht_idx.open(ht_idx_name.c_str(),ios_base::out|ios_base::binary);
o_ht_content.open(ht_content_name.c_str(),ios_base::out|ios_base::binary);
o_ht_idx<<"Hashtable size: "<ht_sz<ht_sz;++i)
{
size_t list_sz=0;
bktptr=ht->store_pos[i];
while(bktptr!=NULL)
{
if(o_ht_content.write((char*) bktptr,sizeof(struct bucket2)))
{
edge_ptr=bktptr->kmer_info.left;
while(edge_ptr!=NULL)
{
o_ht_content.write((char*) edge_ptr,sizeof(struct edge_node));
edge_ptr=edge_ptr->nxt_edge;
}
edge_ptr=bktptr->kmer_info.right;
while(edge_ptr!=NULL)
{
o_ht_content.write((char*) edge_ptr,sizeof(struct edge_node));
edge_ptr=edge_ptr->nxt_edge;
}
int cov=bktptr->kmer_info.cov1;
cov_hist[cov]++;
bktptr=bktptr->nxt_bucket;
list_sz++;
}
else
{cout<<"Write error!"<::iterator mit;
for(mit=cov_hist.begin();mit!=cov_hist.end();++mit)
{
o_cov<first<<" "<second<ht_sz );
bucket2 *bktptr = NULL;
struct edge_node *edge_ptr;
for ( size_t i = 0; i < ht->ht_sz; ++i )
{
size_t list_sz = 0;
bktptr = ht->store_pos[i];
while ( bktptr != NULL )
{
if ( fwrite ( ( char * ) bktptr, sizeof ( struct bucket2 ), 1, o_ht_content ) )
{
edge_ptr = bktptr->kmer_info.left;
while ( edge_ptr != NULL )
{
fwrite ( ( char * ) edge_ptr, sizeof ( struct edge_node ), 1, o_ht_content );
edge_ptr = edge_ptr->nxt_edge;
}
edge_ptr = bktptr->kmer_info.right;
while ( edge_ptr != NULL )
{
fwrite ( ( char * ) edge_ptr, sizeof ( struct edge_node ), 1, o_ht_content );
edge_ptr = edge_ptr->nxt_edge;
}
int cov = bktptr->kmer_info.cov1;
if ( cov >= 255 )
{
cov_hist[255]++;
}
else
{
cov_hist[cov]++;
}
bktptr = bktptr->nxt_bucket;
list_sz++;
}
else
{
cerr << "Write error!" << endl;
}
}
fprintf ( o_ht_idx, "%llu\n", list_sz );
}
for ( int i = 1; i < 256; i++ )
{
fprintf ( o_cov, "%d\t%llu\n", i, cov_hist[i] );
}
fclose ( o_ht_idx );
fclose ( o_ht_content );
fclose ( o_cov );
}
void LoadingSparseKmerGraph2 ( hashtable2 *ht, char *outfile )
{
string ht_idx_name, ht_content_name;
ht_idx_name.append ( outfile ).append ( ".ht_idx" );
ht_content_name.append ( outfile ).append ( ".ht_content" );
ifstream in_ht_idx ( ht_idx_name.c_str(), ios_base::in | ios_base::binary ), in_ht_content ( ht_content_name.c_str(), ios_base::in | ios_base::binary );
size_t ht_sz;
string s;
getline ( in_ht_idx, s );
getline ( in_ht_idx, s );
ht_sz = atoi ( s.c_str() ); //cerr<store_pos[i] );
*bktp2p = NULL;
for ( int j = 0; j < list_sz; ++j )
{
*bktp2p = ( struct bucket2 * ) malloc ( sizeof ( struct bucket2 ) );
if ( in_ht_content.read ( ( char * ) ( *bktp2p ), sizeof ( struct bucket2 ) ) )
{
( *bktp2p )->nxt_bucket = NULL;
( *bktp2p )->kmer_info.used = 0;
edge_p2p = & ( ( *bktp2p )->kmer_info.left );
while ( ( *edge_p2p ) != NULL )
{
( *edge_p2p ) = ( struct edge_node * ) malloc ( sizeof ( struct edge_node ) );
in_ht_content.read ( ( char * ) ( *edge_p2p ), sizeof ( struct edge_node ) );
edge_p2p = & ( ( *edge_p2p )->nxt_edge );
}
edge_p2p = & ( ( *bktp2p )->kmer_info.right );
while ( ( *edge_p2p ) != NULL )
{
( *edge_p2p ) = ( struct edge_node * ) malloc ( sizeof ( struct edge_node ) );
in_ht_content.read ( ( char * ) ( *edge_p2p ), sizeof ( struct edge_node ) );
edge_p2p = & ( ( *edge_p2p )->nxt_edge );
}
bktp2p = & ( ( *bktp2p )->nxt_bucket );
}
else
{
cerr << "Read error!" << endl;
}
}
}
}