//================================================================= // This file is a part of cdhit-dup. // By Limin Fu (phoolimin@gmail.com, lmfu@ucsd.edu) //================================================================= #include #include #include "minString.hxx" #include "minArray.hxx" #include "minMap.hxx" #include "bioSequence.hxx" using namespace Min; using namespace Bio; char base_mapping[128] = {0}; char rev_comp_mapping[128] = {0}; uint64_t base_powers[33]; FILE *fout_log = stdout; FILE *fout_rep = NULL; FILE *fout_clstr = NULL; uint64_t EncodeWord( const char *word, unsigned char W ) { unsigned char i; uint64_t *power = base_powers; uint64_t code = 0; for(i=0; i { int id; int abundance; int chiHead; int chiTail; public: SequenceCluster( Sequence *rep = NULL ){ id = 0; abundance = 0; chiHead = chiTail = 0; if( rep ) Append( ClusterMember( rep ) ); } String GetDescription( Sequence *seq, int deslen=0 ); int GetID()const{ return id; } void SetID( int i ){ id = i; } int GetAbundance()const{ return abundance; } void SetAbundance( int ab ){ abundance = ab; } int GetChimericParent1()const{ return chiHead; } int GetChimericParent2()const{ return chiTail; } void SetChimericParent( int head, int tail ){ chiHead = head; chiTail = tail; } void Write( FILE *fout = stdout, int id = 0, int deslen=0, const char *des=NULL ); bool operator<( const SequenceCluster & other ){ assert( Size() && other.Size() ); return (*this)[0].seq->Length() < other[0].seq->Length(); } }; String SequenceCluster::GetDescription( Sequence *seq, int deslen ) { String des = seq->Description(); int i = 0; if( des[i] == '>' || des[i] == '@' || des[i] == '+' ) i += 1; if( des[i] == ' ' || des[i] == '\t' ) i += 1; if( deslen == 0 || deslen > des.Size() ) deslen = des.Size(); while( i < deslen and ! isspace( des[i] ) ) i += 1; des.Resize( i ); return des; } void SequenceCluster::Write( FILE *fout, int id, int deslen, const char *cdes ) { //Array & seqs = *this; SequenceCluster & seqs = *this; String des = GetDescription( seqs[0].seq, deslen ); String & rep = seqs[0].seq->SequenceData(); int len2 = seqs[0].seq->Length(); int i = 0, n = Size(); fprintf( fout, ">Cluster %i%s\n", id, cdes ? cdes : "" ); fprintf( fout, "%i\t%int, >%s... *\n", 0, seqs[0].seq->Length(), des.Data() ); for(i=1; iSequenceData(); String des = GetDescription( seqs[i].seq, deslen ); int len = seqs[i].seq->Length(); int startQ = seqs[i].offset + 1; int startR = seqs[i].offset2 + 1; int endQ = len; int endR = len2; int mm = 0; if( (endQ - startQ) < (endR - startR) ){ endR = startR + endQ - startQ; }else{ endQ = startQ + endR - startR; } if( seqs[i].revcomp ){ startQ = len - startQ + 1; endQ = len - endQ + 1; } fprintf( fout, "%i\t%int, >%s... at %i:%i:%i:%i/%c/%.2f%%\n", i, len, des.Data(), startQ, endQ, startR, endR, seqs[i].revcomp ? '-' : '+', 100*(len-mm)/(float)len ); } } void WriteClusters( Array & clusters, const String & name = "temp.txt", int deslen = 0 ) { char cdes[200]; int i, n = clusters.Size(); int k1 = 0, k2 = 0; for(i=0; iPrint( fout_rep ); cluster.SetID( k1 ); cluster.Write( fout_clstr, k1++, deslen ); } } int HashingDepth( int len, int min ) { assert( len >= min ); return (int)sqrt( (len - min) / 10); } int HashingLength( int dep, int min ) { return min + 10 * dep * dep; } #define USE_WORD void ClusterOverlap( SequenceList & seqlist, Array & clusters, int min, float minper ) { String reverse; Array clusters2; #ifdef USE_WORD Hash > headHashes; Hash > tailHashes; Node > *node; #else Hash > headHashes; Hash > tailHashes; Node > *node; #endif int i, i2, j, k, m, N = seqlist.Count(); int WORD = min; #ifndef USE_WORD if( WORD > 30 ) WORD = 30; #endif for(i=0; iLength(); int min2 = len * minper; bool clustered = false; if( min2 < min ) min2 = min; String *ss = & seq->SequenceData(); reverse.ResetSize( len ); for(j=0; jData()[len - j - 1] ]; for(j=(len-WORD-1); j>=min2-WORD; j--){ for(i2=0; i2<2; i2++){ String *ss = & seq->SequenceData(); if( i2 ) ss = & reverse; //if( i2 ) break; #ifdef USE_WORD uint64_t hash = EncodeWord( ss->Data() + j, WORD ); #else unsigned int hash = MakeHash( *ss, j, WORD ); #endif node = tailHashes.Find( hash ); if( node == NULL ) continue; Array & clusts = node->value; for(int j2=0, m=clusts.Size(); j2SequenceData(); if( rep.Size() < (j + WORD) ) continue; const char *r = rep.Data() + (rep.Size() - j - WORD); const char *q = ss->Data(); int M = j + WORD; for(k=0; kSequenceData(); if( i2 ) ss = & reverse; //if( i2 ) break; #ifdef USE_WORD uint64_t hash = EncodeWord( ss->Data() + j, WORD ); #else unsigned int hash = MakeHash( *ss, j, WORD ); #endif node = headHashes.Find( hash ); if( node == NULL ) continue; Array & clusts = node->value; for(int j2=0, m=clusts.Size(); j2SequenceData(); if( rep.Size() < (len - j) ) continue; const char *r = rep.Data(); const char *q = ss->Data() + j; int M = len - j; for(k=0; kSequenceData().Data(), WORD ); uint64_t tail = EncodeWord( seq->SequenceData().Data() + (len - WORD), WORD ); #else unsigned int head = MakeHash( seq->SequenceData(), WORD ); unsigned int tail = MakeHash( seq->SequenceData(), (len - WORD), WORD ); #endif clusters2.Append( new SequenceCluster( seq ) ); headHashes[head].Append( clusters2.Back() ); tailHashes[tail].Append( clusters2.Back() ); } if( (i+1)%100000 == 0 ) fprintf( fout_log, "Clustered %9i sequences with %9i clusters ...\n", i+1, clusters2.Size() ); } for(i=0, m=clusters2.Size(); i clusters; fprintf( fout_log, "Start clustering duplicated sequences ...\n" ); ClusterOverlap( seqlist, clusters, minlen, minper ); fprintf( fout_log, "Number of clusters found: %i\n", clusters.Size() ); WriteClusters( clusters, output, deslen ); fprintf( fout_log, "Done!\n" ); fclose( fout_log ); fclose( fout_rep ); fclose( fout_clstr ); return 0; }