//================================================================= // This file is a part of cdhit-dup. // By Limin Fu (phoolimin@gmail.com, lmfu@ucsd.edu) //================================================================= #include #include #include "minString.hxx" #include "minArray.hxx" #include "minMap.hxx" #include "bioSequence.hxx" using namespace Min; using namespace Bio; class SequenceCluster : public Array { int id; int abundance; int chiHead; int chiTail; public: SequenceCluster( Sequence *rep = NULL ){ id = 0; abundance = 0; chiHead = chiTail = 0; if( rep ) Append( rep ); } String GetDescription( Sequence *seq, int deslen=0 ); int GetID()const{ return id; } void SetID( int i ){ id = i; } int GetAbundance()const{ return abundance; } void SetAbundance( int ab ){ abundance = ab; } int GetChimericParent1()const{ return chiHead; } int GetChimericParent2()const{ return chiTail; } void SetChimericParent( int head, int tail ){ chiHead = head; chiTail = tail; } void Write( FILE *fout = stdout, int id = 0, int deslen=0, const char *des=NULL ); bool operator<( const SequenceCluster & other ){ assert( Size() && other.Size() ); return (*this)[0]->Length() < other[0]->Length(); } }; String SequenceCluster::GetDescription( Sequence *seq, int deslen ) { String des = seq->Description(); int i = 0; if( des[i] == '>' || des[i] == '@' || des[i] == '+' ) i += 1; if( des[i] == ' ' || des[i] == '\t' ) i += 1; if( deslen == 0 || deslen > des.Size() ) deslen = des.Size(); while( i < deslen and ! isspace( des[i] ) ) i += 1; des.Resize( i ); return des; } void SequenceCluster::Write( FILE *fout, int id, int deslen, const char *cdes ) { //Array & seqs = *this; SequenceCluster & seqs = *this; String des = GetDescription( seqs[0], deslen ); int i = 0, n = Size(); fprintf( fout, ">Cluster %i%s\n", id, cdes ? cdes : "" ); fprintf( fout, "%i\t%int, >%s... *\n", 0, seqs[0]->Length(), des.Data() ); for(i=1; iLength(); int mm = CountMismatch( seqs[0]->SequenceData(), seqs[i]->SequenceData() ); fprintf( fout, "%i\t%int, >%s... at 1:%i:1:%i/+/%.2f%%\n", i, len, des.Data(), len, len, 100*(len-mm)/(float)len ); } } void WriteClusters( Array & clusters, const String & name = "temp.txt", int deslen = 0 ) { String cfile = name + ".clstr"; String cfile2 = name + "2.clstr"; FILE *fout1 = fopen( name.Data(), "w" ); FILE *fout2 = fopen( cfile.Data(), "w" ); FILE *fout3 = fopen( cfile2.Data(), "w" ); char cdes[200]; int i, n = clusters.Size(); int k1 = 0, k2 = 0; for(i=0; iPrint( fout1 ); cluster.SetID( k1 ); cluster.Write( fout2, k1++, deslen ); }else{ head = clusters[head].GetID(); tail = clusters[tail].GetID(); sprintf( cdes, " chimeric_parent1=%i,chimeric_parent2=%i", head, tail ); cluster.Write( fout3, k2++, deslen, cdes ); } } fclose( fout1 ); fclose( fout2 ); fclose( fout3 ); } //liwz //skip fout1 since R1 being modified due to padding void WriteClusters_clstronly( Array & clusters, const String & name = "temp.txt", int deslen = 0 ) { String cfile = name + ".clstr"; String cfile2 = name + "2.clstr"; // FILE *fout1 = fopen( name.Data(), "w" ); FILE *fout2 = fopen( cfile.Data(), "w" ); FILE *fout3 = fopen( cfile2.Data(), "w" ); char cdes[200]; int i, n = clusters.Size(); int k1 = 0, k2 = 0; for(i=0; iPrint( fout1 ); cluster.SetID( k1 ); cluster.Write( fout2, k1++, deslen ); }else{ head = clusters[head].GetID(); tail = clusters[tail].GetID(); sprintf( cdes, " chimeric_parent1=%i,chimeric_parent2=%i", head, tail ); cluster.Write( fout3, k2++, deslen, cdes ); } } // fclose( fout1 ); fclose( fout2 ); fclose( fout3 ); } void WriteClusters_seqonly( Array & clusters, const String & name = "temp.txt", int deslen = 0 ) { FILE *fout1 = fopen( name.Data(), "w" ); int i, n = clusters.Size(); for(i=0; iPrint( fout1 ); } } fclose( fout1 ); } void SortByAbundance( Array & clusters ) { int i, max = 0, min = 0; int N = clusters.Size(); if( N <= 1 ) return; max = min = clusters[0].GetAbundance(); for(i=1; i max ) max = ab; if( ab < min ) min = ab; } int M = max - min + 1; Min::Array count( M, 0 ); // count for each size = max_len - i Min::Array accum( M, 0 ); // count for all size > max_len - i Min::Array offset( M, 0 ); // offset from accum[i] when filling sorting Array sorted( N ); for (i=0; i= min ); return (int)sqrt( (len - min) / 10); } int HashingLength( int dep, int min ) { return min + 10 * dep * dep; } struct ChimericSource { int index; int head; int tail; ChimericSource( int i=0, int h=0, int t=0 ){ index = i; head = h; tail = t; } }; struct HashHit { int index; int offset; HashHit( int i = 0, int o = 0 ){ index = i; offset = o; } }; #define MAX_OFFSETS 5 struct HashHit3 { int index; int size; int offsets[MAX_OFFSETS]; int counts[MAX_OFFSETS]; // number of word hits with the corresponding offsets; HashHit3( int i = 0, int o = 0 ){ memset( offsets, 0, MAX_OFFSETS*sizeof(int) ); memset( counts, 0, MAX_OFFSETS*sizeof(int) ); index = i; size = 1; offsets[0] = o; counts[0] = 1; } void Update( int o ){ int imin = 0, min = counts[0]; for(int i=0; i max ){ max = counts[i]; imax = i; } } //printf( "BestOffset: %i %i %i %i\n", max, imax, size, offsets[imax] ); return offsets[imax]; } }; typedef Hash > HashHitTable; void UpdateHashTables( Array > & middles, String & seq, int id, int shared, int min, int primer ) { unsigned int hash = MakeHash( seq, shared ); int j, k; for(j=primer; (j+shared)<=seq.Size(); j+=shared){ int dep = j ? 0 : HashingDepth( seq.Size()-j, min ); for(k=0; k<=dep; k++){ hash = MakeHash( seq, j, HashingLength( k, min ) ); middles[j][k][hash].Append( id ); } } } void ClusterDuplicate( SequenceList & seqlist, Array & clusters, bool mlen, int errors = 0, float errors2 = 0.0 ) { int primer = 0; int maxmm = errors; unsigned int hash; int i, j, k, K, K2, M, N = seqlist.Count(); int shared, min, count = 0; int max = seqlist.MaxLength(); Hash > wholes; Array > > > middles; Node > *node; String & first = seqlist[0]->SequenceData(); primer = first.Size(); for(i=1; iSequenceData(), 0, primer, 0 ); if( primer < 15 ){ primer = 0; break; } } if( errors == 0 ) maxmm = max * errors2; printf( "primer = %i\n", primer ); shared = (seqlist.MinLength() - primer) / (maxmm + 1); if( shared < 30 ) shared = 30; min = shared; middles.Resize( max ); for(i=0; (i+shared)<=max; i++) middles[i].Resize( HashingDepth( max - i, min ) + 1 ); for(i=0; iSequenceData(); int qlen = seq.Size(); bool clustered = false; maxmm = errors; if( errors == 0 ) maxmm = qlen * errors2; if( i && (i%10000 == 0) ) printf( "Clustered %9i sequences with %9i clusters ...\n", i, (int)clusters.Size() ); hash = MakeHash( seq ); node = wholes.Find( hash ); if( node != NULL ){ Array & hits = node->value; for(j=0, M=hits.Size(); jSequenceData(); if( qlen == rep.Size() && Compare( seq, rep ) == 0 ){ clust.Append( seqlist[i] ); clustered = true; break; } } if( clustered ) continue; for(j=0, M=hits.Size(); jSequenceData(); if( mlen && rep.Size() != qlen ) continue; K = ComparePrefix( seq, rep, maxmm ); if( K == qlen ){ clust.Append( seqlist[i] ); clustered = true; break; } } if( clustered ) continue; } int dep = HashingDepth( qlen - primer, min ); int hashlen = HashingLength( dep, min ); hash = MakeHash( seq, primer, hashlen ); node = middles[primer][ dep ].Find( hash ); if( node ){ Array & hits = node->value; for(j=0, M=hits.Size(); jSequenceData(); if( mlen && rep.Size() != qlen ) continue; K = ComparePrefix( seq, rep, maxmm ); if( K == qlen ){ clustered = true; clust.Append( seqlist[i] ); break; } } } //seqlists[i]->Print(); int start = primer; dep = 0;//HashingDepth( shared, min ); while( maxmm == 0 && clustered == false && (start+2*shared) <= qlen && start <= primer + (maxmm+1) * shared ){ hash = MakeHash( seq, start, shared ); node = middles[start][ dep ].Find( hash ); start += shared; if( node == NULL ) continue; Array & hits = node->value; for(j=0, M=hits.Size(); jSequenceData(); if( mlen && rep.Size() != qlen ) continue; K = ComparePrefix( seq, rep, maxmm ); if( K == qlen ){ clustered = true; clust.Append( seqlist[i] ); break; } } } if( clustered == false ){ hash = MakeHash( seq ); UpdateHashTables( middles, seq, clusters.Size(), shared, min, primer ); wholes[ hash ].Append( clusters.Size() ); clusters.Append( SequenceCluster( seqlist[i] ) ); } } } void PrintChimeric( FILE *fout, Sequence *SQ, Sequence *SA, Sequence *SB, int IX, int GAP = 0 ) { String A( SA->SequenceData() ), B( SB->SequenceData() ), Q( SQ->SequenceData() ); fprintf( fout, "\nQuery = %s\n", SQ->Description().Data() ); fprintf( fout, "Parent A = %s\n", SA->Description().Data() ); fprintf( fout, "Parent B = %s\n", SB->Description().Data() ); fprintf( fout, "Crossover = %i\n", IX ); //if( offset != 0 ) fprintf( fout, "Offset = %i\n", offset ); //fprintf( fout, "LQA = %i\n", LQA ); //fprintf( fout, "LQB = %i\n", LQB ); //fprintf( fout, "RQA = %i\n", RQA ); //fprintf( fout, "RQB = %i\n", RQB ); if( GAP > 0 ){ while( GAP-- > 0 ) B.Insert( '-', IX ); }else if( GAP < 0 ){ while( GAP++ < 0 ){ A.Insert( '-', IX ); Q.Insert( '-', IX ); } } A.Insert( "|X|", IX ); Q.Insert( "|X|", IX ); B.Insert( "|X|", IX ); int I, J, W = 80; int LA = A.Size(); int LB = B.Size(); int LQ = Q.Size(); int L = LA; if( L < LB ) L = LB; if( L < LQ ) L = LQ; for(I=0; ISequenceData() ), B( SB->SequenceData() ), Q( SQ->SequenceData() ); fprintf( fout, "\nQuery = %s\n", SQ->Description().Data() ); fprintf( fout, "Parent A = %s\n", SA->Description().Data() ); fprintf( fout, "Parent B = %s\n", SB->Description().Data() ); fprintf( fout, "Crossover = %i\n", IX ); //if( offset != 0 ) fprintf( fout, "Offset = %i\n", offset ); //fprintf( fout, "LQA = %i\n", LQA ); //fprintf( fout, "LQB = %i\n", LQB ); //fprintf( fout, "RQA = %i\n", RQA ); //fprintf( fout, "RQB = %i\n", RQB ); if( offset > 0 ){ while( offset-- > 0 ){ A.Insert( '-', 0 ); Q.Insert( '-', 0 ); } }else if( offset < 0 ){ while( offset++ < 0 ) B.Insert( '-', 0 ); } A.Insert( "|X|", IX ); Q.Insert( "|X|", IX ); B.Insert( "|X|", IX ); int I, J, W = 80; int LA = A.Size(); int LB = B.Size(); int LQ = Q.Size(); int L = LA; if( L < LB ) L = LB; if( L < LQ ) L = LQ; for(I=0; Ioffset = offset; this->count = count; } bool operator<( const OffsetCount & other )const{ if( count != other.count ) return count > other.count; return offset < other.offset; } }; void DetectChimeric( Array & clusters, Array & chistat, int max, int shared, float percent, float abratio = 1, int minabu = 2 ) { int primer = 20; unsigned int hash; int i, j, k, K, K2, M, N = clusters.Size(); int min, count = 0, maxmm = 2, maxmm2 = maxmm + maxmm; Hash > hashTable; Node > *node; Hash chimap; Array hitMapping; Array hashes; Array hitList; Array parentAList; Array parentBList; Array offsetCounts; FILE *debug = NULL; if( N <= 2 ) return; if( shared < 20 ) shared = 20; min = shared; //debug = fopen( "debug.txt", "w+" ); hitList.Resize( N ); hitMapping.Resize( N ); String & first = clusters[0][0]->SequenceData(); primer = first.Size(); for(i=1; iSequenceData(), 0, primer, 0 ); if( primer < 15 ){ primer = 0; break; } } printf( "primer = %i\n", primer ); printf( "Searching for chimeric clusters ...\n" ); for(i=0; iSequenceData(); int qlen = seq.Size(); int qn = clusters[i].GetAbundance(); int maxmm = (int)(0.01*qlen*percent); int maxmm2 = (int)(0.015*qlen*percent); if( i && i % 1000 ==0 ) printf( "Checked %9i clusters, detected %9i chimeric clusters\n", i, (int)chistat.Size() ); if( qn < minabu ) break; hashes.ResetSize( primer ); for(j=primer; j<=qlen-shared; j++) hashes.Append( MakeHash( seq, j, shared ) ); if( qlen < (2*shared + primer) ){ for(j=primer,k=hashes.Size(); jPrint(); HashHit3 *hit = & hitList[0]; for(j=0,M=hitList.Size(); jindex] = 0; hitList.ResetSize( 0 ); bool detected = false; bool duplicate = false; int start = primer; int dep = 0; int proto = -1; int protomm = max; while( (start + shared) <= qlen ){ hash = hashes[start]; node = hashTable.Find( hash ); start += 1; if( node == NULL ) continue; Array & hits = node->value; for(j=0, M=hits.Size(); j & hits = hitList; for(j=0, M=hits.Size(); jSequenceData(); if( clusters[hit].GetAbundance() < qn*abratio ) continue; offsetCounts.ResetSize( 0 ); for(k=0; k= (primer + shared) ) parentAList.Append( ParentInfo( hit, 0, head, 0 ) ); int best = offsetCounts[0].offset; for(k=0; k<2; k++){ // check the best two offsets: best = offsetCounts[k].offset; if( qlen + best < rep.Size() ){ tail = CompareSuffix( seq, qlen-1, rep, qlen+best-1, 0, maxmm ); if( tail >= shared ){ parentBList.Append( ParentInfo( hit, qlen - tail + best, tail, best ) ); break; } }else{ tail = CompareSuffix( seq, rep.Size()-best-1, rep, rep.Size()-1, 0, maxmm ); if( tail >= shared ){ parentBList.Append( ParentInfo( hit, rep.Size() - tail, tail, best ) ); break; } } tail = 0; } if( (head + tail) >= qlen ){ int X = (head + qlen - tail) / 2; int mm = CountMismatch2( seq, rep, 0, X, maxmm2 ); mm += CountMismatch3( seq, X, rep, X+best, qlen, maxmm2 ); //printf( "\noffset = %5i; mm = %5i\n", best, mm ); //PrintChimeric2( stdout, clusters[i][0], clusters[hit][0], clusters[hit][0], X, best ); if( mm <= maxmm2 /*&& rep.Size() >= qlen*/ ){ if( mm < protomm ){ protomm = mm; proto = hit; } } }else{ int mm = CountMismatch2( seq, rep, 0, qlen, maxmm2 ); if( mm <= maxmm2 /*&& rep.Size() >= qlen*/ ){ if( mm < protomm ){ protomm = mm; proto = hit; } } } } if( protomm <= maxmm ){ if( chimap.Find( proto ) ){ ChimericSource cs = chistat[chimap[proto]]; chimap[i] = chistat.Size(); chistat.Append( ChimericSource( i, cs.head, cs.tail ) ); } continue; } //printf( "debug: %9i %9i %9i\n", (int)hitList.Size(), (int)parentAList.Size(), (int)parentBList.Size() ); parentBList.QuickSort(); for(j=0, M=parentAList.Size(); jSequenceData(); for(k=0; kSequenceData(); if( infoA.index == infoB.index ) continue; if( (infoB.start - infoB.offset) > infoA.len ) break; int XA = (infoA.len + infoB.start - infoB.offset) / 2; int XB = XA + infoB.offset; int LQA = CountMismatch( seq, repA, 0, XA ); int RQA = CountMismatch( seq, repA, XA, qlen ); int LQB = CountMismatch3( seq, 0, repB, 0, XA, qlen ); int RQB = CountMismatch3( seq, XA, repB, XB, qlen - XA, qlen ); if( proto < 0 ) protomm = maxmm2; if( LQB > (protomm + maxmm) && RQA > (protomm + maxmm) && (LQA + RQB) <= protomm ) { //PrintChimeric2( stdout, clusters[i][0], clusters[infoA.index][0], clusters[infoB.index][0], XA, infoB.offset ); detected = true; chimap[i] = chistat.Size(); chistat.Append( ChimericSource( i, infoA.index, infoB.index ) ); count += 1; break; } } if( detected ) break; } //if( (i+1) % 1000 ==0 ) printf( "Checked %9i clusters, detected %9i chimeric clusters\n", i+1, chistat.Size() ); //if( detected == false && duplicate == false ){ if( detected == false ){ for(j=primer; j max2 ? max1 : max2; if( uselen <= 0 ) uselen = max; if( uselen > max ) uselen = max; N = seqlist.Count(); for(k=0; kQualityScore().Size(); String & s1 = first->SequenceData(); String & s2 = second->SequenceData(); String & qs1 = first->QualityScore(); String & qs2 = second->QualityScore(); int n1 = s1.Size(); int n2 = s2.Size(); int i, j, m = 1; if( n1 < uselen ){ s1.Resize( uselen, 'N' ); qs1.Resize( uselen, 0 ); } if( n2 < uselen ){ s2.Resize( uselen, 'N' ); qs2.Resize( uselen, 0 ); seqlist2_modified = 1; } seqlist[k]->SequenceData().Chop( n1 - uselen ); seqlist[k]->SequenceData() += s2.Data(); seqlist[k]->SequenceData().Chop( n2 - uselen ); if( qs ){ seqlist[k]->QualityScore().Chop( n1 - uselen ); seqlist[k]->QualityScore() += qs2.Data(); seqlist[k]->QualityScore().Chop( n2 - uselen ); } } seqlist_modified = 1; }else if( uselen > 0 ){ N = seqlist.Count(); for(k=0; kQualityScore().Size(); String & s1 = first->SequenceData(); String & qs1 = first->QualityScore(); int n1 = s1.Size(); int i, j, m = 1; if( n1 < uselen ) continue; seqlist_modified = 1; seqlist[k]->SequenceData().Chop( n1 - uselen ); if( qs ) seqlist[k]->QualityScore().Chop( n1 - uselen ); } } if( seqlist.Count() == 0 ){ printf( "Abort: no sequence available for the analysis!\n" ); return 1; } if( seqlist.MaxLength() != seqlist.MinLength() ){ seqlist.SortByLength(); printf( "Sorted by length ...\n" ); } Array clusters; Array chistat; printf( "Start clustering duplicated sequences ...\n" ); ClusterDuplicate( seqlist, clusters, matchLength, errors, errors2 ); printf( "Number of reads: %i\n", seqlist.Count() ); printf( "Number of clusters found: %i\n", clusters.Size() ); for(i=0, m=clusters.Size(); iDescription(); int ab = 1; int pos = des.Find( "_abundance_" ); if( pos ) ab = strtol( des.Data() + pos + 10, NULL, 10 ); clusters[i].SetAbundance( ab ); } } SortByAbundance( clusters ); DetectChimeric( clusters, chistat, seqlist.MaxLength(), shared, percent, abratio, abundance ); for(i=0, m=0; i seqlist N = seqlist.Count(); for(k=0; kQualityScore().Size(); seqlist[k]->SequenceData().Reset(); seqlist[k]->SequenceData() += seqlist2[k]->SequenceData(); seqlist[k]->Description().Reset(); seqlist[k]->Description() += seqlist2[k]->Description(); if( qs ){ seqlist[k]->QualityScore().Reset(); seqlist[k]->QualityScore() += seqlist2[k]->QualityScore(); } seqlist_modified = 1; } // write R2 printf( "Write R2 reads\n" ); WriteClusters_seqonly( clusters, output2, deslen ); } if (seqlist_modified){ // since seqlist was nodified, re-read from file, this time use seqlist2 as storage seqlist2.Clear(); if( not seqlist2.ReadFastAQ( input ) ){ printf( "File openning failed: %s\n", input.Data() ); return 1; } // copy seqlist2 => seqlist N = seqlist.Count(); for(k=0; kQualityScore().Size(); seqlist[k]->SequenceData().Reset(); seqlist[k]->SequenceData() += seqlist2[k]->SequenceData(); seqlist[k]->Description().Reset(); seqlist[k]->Description() += seqlist2[k]->Description(); if( qs ){ seqlist[k]->QualityScore().Reset(); seqlist[k]->QualityScore() += seqlist2[k]->QualityScore(); } } // overwrite R1 if R1 was modified printf( "Write R1 reads\n" ); WriteClusters_seqonly( clusters, output, deslen ); } printf( "Done!\n" ); return 0; }