/* * localAsm.c * * Copyright (c) 2008-2016 Ruibang Luo . * * This file is part of SOAPdenovo. * * SOAPdenovo is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * SOAPdenovo is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with SOAPdenovo. If not, see . * */ #include "stdinc.h" #include "newhash.h" #include "kmerhash.h" #include "extfunc.h" #include "extvab.h" #define CTGendLen 35 // shouldn't larger than max_read_len #define UPlimit 5000 #define MaxRouteNum 10 static void kmerSet_mark ( KmerSet *set ); static void trace4Repeat ( Kmer currW, int steps, int min, int max, int *num_route, KmerSet *kset, Kmer kmerDest, int overlap, Kmer WORDF, int *traceCounter, int maxRoute, kmer_t **soFarNode, short *multiOccu1, short *multiOccu2, int *routeLens, char **foundRoutes, char *soFarSeq, long long *soFarLinks, double *avgLinks ); #ifdef MER127 static Kmer prevKmerLocal ( Kmer next, char ch, int overlap ) { Kmer word = KmerRightBitMoveBy2 ( next ); if ( 2 * ( overlap - 1 ) < 64 ) { word.low2 |= ( ( ( ubyte8 ) ch ) << 2 * ( overlap - 1 ) ); } if ( 2 * ( overlap - 1 ) >= 64 && 2 * ( overlap - 1 ) < 128 ) { word.high2 |= ( ( ubyte8 ) ch ) << ( 2 * ( overlap - 1 ) - 64 ); } if ( 2 * ( overlap - 1 ) >= 128 && 2 * ( overlap - 1 ) < 192 ) { word.low1 |= ( ( ubyte8 ) ch ) << ( 2 * ( overlap - 1 ) - 128 ); } if ( 2 * ( overlap - 1 ) >= 192 && 2 * ( overlap - 1 ) < 256 ) { word.high1 |= ( ( ubyte8 ) ch ) << ( 2 * ( overlap - 1 ) - 192 ); } return word; } static Kmer nextKmerLocal ( Kmer prev, char ch, Kmer WordFilter ) { Kmer word = KmerLeftBitMoveBy2 ( prev ); word = KmerAnd ( word, WordFilter ); word.low2 |= ch; return word; } #else static Kmer prevKmerLocal ( Kmer next, char ch, int overlap ) { Kmer word = KmerRightBitMoveBy2 ( next ); if ( 2 * ( overlap - 1 ) < 64 ) { word.low |= ( ( ( ubyte8 ) ch ) << 2 * ( overlap - 1 ) ); } else { word.high |= ( ( ubyte8 ) ch ) << ( 2 * ( overlap - 1 ) - 64 ); } return word; } static Kmer nextKmerLocal ( Kmer prev, char ch, Kmer WordFilter ) { Kmer word = KmerLeftBitMoveBy2 ( prev ); word = KmerAnd ( word, WordFilter ); word.low |= ch; return word; } #endif static void singleKmer ( int t, KmerSet *kset, int flag, Kmer *kmerBuffer, char *prevcBuffer, char *nextcBuffer ) { kmer_t *pos; put_kmerset ( kset, kmerBuffer[t], prevcBuffer[t], nextcBuffer[t], &pos ); if ( pos->inEdge == flag ) { return; } else if ( pos->inEdge == 0 ) { pos->inEdge = flag; } else if ( pos->inEdge == 1 && flag == 2 ) { pos->inEdge = 3; } else if ( pos->inEdge == 2 && flag == 1 ) { pos->inEdge = 3; } } static void putKmer2DBgraph ( KmerSet *kset, int flag, int kmer_c, Kmer *kmerBuffer, char *prevcBuffer, char *nextcBuffer ) { int t; for ( t = 0; t < kmer_c; t++ ) { singleKmer ( t, kset, flag, kmerBuffer, prevcBuffer, nextcBuffer ); } } static void getSeqFromRead ( READNEARBY read, char *src_seq ) { int len_seq = read.len; int j; char *tightSeq = ( char * ) darrayGet ( readSeqInGap, read.seqStarter ); for ( j = 0; j < len_seq; j++ ) { src_seq[j] = getCharInTightString ( tightSeq, j ); } } #ifdef MER127 static void chopKmer4Ctg ( Kmer *kmerCtg, int lenCtg, int overlap, char *src_seq, Kmer WORDF ) { int index, j; Kmer word; word.high1 = word.low1 = word.high2 = word.low2 = 0; for ( index = 0; index < overlap; index++ ) { word = KmerLeftBitMoveBy2 ( word ); word.low2 |= src_seq[index]; } index = 0; kmerCtg[index++] = word; for ( j = 1; j <= lenCtg - overlap; j++ ) { word = nextKmerLocal ( word, src_seq[j - 1 + overlap], WORDF ); kmerCtg[index++] = word; } } #else static void chopKmer4Ctg ( Kmer *kmerCtg, int lenCtg, int overlap, char *src_seq, Kmer WORDF ) { int index, j; Kmer word; word.high = word.low = 0; for ( index = 0; index < overlap; index++ ) { word = KmerLeftBitMoveBy2 ( word ); word.low |= src_seq[index]; } index = 0; kmerCtg[index++] = word; for ( j = 1; j <= lenCtg - overlap; j++ ) { word = nextKmerLocal ( word, src_seq[j - 1 + overlap], WORDF ); kmerCtg[index++] = word; } } #endif static void chopKmer4read ( int len_seq, int overlap, char *src_seq, char *bal_seq, Kmer *kmerBuffer, char *prevcBuffer, char *nextcBuffer, int *kmer_c, Kmer WORDF ) { int j, bal_j; Kmer word, bal_word; int index; char InvalidCh = 4; if ( len_seq < overlap + 1 ) { *kmer_c = 0; return; } #ifdef MER127 word.high1 = word.low1 = word.high2 = word.low2 = 0; for ( index = 0; index < overlap; index++ ) { word = KmerLeftBitMoveBy2 ( word ); word.low2 |= src_seq[index]; } #else word.high = word.low = 0; for ( index = 0; index < overlap; index++ ) { word = KmerLeftBitMoveBy2 ( word ); word.low |= src_seq[index]; } #endif reverseComplementSeq ( src_seq, len_seq, bal_seq ); // complementary node bal_word = reverseComplement ( word, overlap ); bal_j = len_seq - 0 - overlap; // 0; index = 0; if ( KmerSmaller ( word, bal_word ) ) { kmerBuffer[index] = word; prevcBuffer[index] = InvalidCh; nextcBuffer[index++] = src_seq[0 + overlap]; } else { kmerBuffer[index] = bal_word; prevcBuffer[index] = bal_seq[bal_j - 1]; nextcBuffer[index++] = InvalidCh; } for ( j = 1; j <= len_seq - overlap; j++ ) { word = nextKmerLocal ( word, src_seq[j - 1 + overlap], WORDF ); bal_j = len_seq - j - overlap; // j; bal_word = prevKmerLocal ( bal_word, bal_seq[bal_j], overlap ); if ( KmerSmaller ( word, bal_word ) ) { kmerBuffer[index] = word; prevcBuffer[index] = src_seq[j - 1]; if ( j < len_seq - overlap ) { nextcBuffer[index++] = src_seq[j + overlap]; } else { nextcBuffer[index++] = InvalidCh; } //printf("%dth: %p with %p\n",kmer_c-1,word,hashBanBuffer[kmer_c-1]); } else { // complementary node kmerBuffer[index] = bal_word; if ( bal_j > 0 ) { prevcBuffer[index] = bal_seq[bal_j - 1]; } else { prevcBuffer[index] = InvalidCh; } nextcBuffer[index++] = bal_seq[bal_j + overlap]; //printf("%dth: %p with %p\n",kmer_c-1,bal_word,hashBanBuffer[kmer_c-1]); } } *kmer_c = index; } static void headTightStr ( char *tightStr, int length, int start, int headLen, int revS, char *src_seq ) { int i, index = 0; if ( !revS ) { for ( i = start; i < start + headLen; i++ ) { src_seq[index++] = getCharInTightString ( tightStr, i ); } } else { for ( i = length - 1 - start; i >= length - headLen - start; i-- ) { src_seq[index++] = int_comp ( getCharInTightString ( tightStr, i ) ); } } } static int getSeqFromCtg ( CTGinSCAF *ctg, boolean fromHead, unsigned int len, int originOverlap, char *src_seq ) { unsigned int ctgId = ctg->ctgID; unsigned int bal_ctg = getTwinCtg ( ctgId ); if ( contig_array[ctgId].length < 1 ) { return 0; } unsigned int length = contig_array[ctgId].length + originOverlap; len = len < length ? len : length; if ( fromHead ) { if ( contig_array[ctgId].seq ) { headTightStr ( contig_array[ctgId].seq, length, 0, len, 0, src_seq ); } else { headTightStr ( contig_array[bal_ctg].seq, length, 0, len, 1, src_seq ); } } else { if ( contig_array[ctgId].seq ) { headTightStr ( contig_array[ctgId].seq, length, length - len, len, 0, src_seq ); } else { headTightStr ( contig_array[bal_ctg].seq, length, length - len, len, 1, src_seq ); } } return len; } static KmerSet *readsInGap2DBgraph ( READNEARBY *rdArray, int num, CTGinSCAF *ctg1, CTGinSCAF *ctg2, int originOverlap, Kmer *kmerCtg1, Kmer *kmerCtg2, int overlap, Kmer WordFilter ) { int kmer_c; Kmer *kmerBuffer; char *nextcBuffer, *prevcBuffer; int i; int buffer_size = maxReadLen > CTGendLen ? maxReadLen : CTGendLen; KmerSet *kmerS = NULL; int lenCtg1; int lenCtg2; char *bal_seq; char *src_seq; src_seq = ( char * ) ckalloc ( buffer_size * sizeof ( char ) ); bal_seq = ( char * ) ckalloc ( buffer_size * sizeof ( char ) ); lenCtg1 = getSeqFromCtg ( ctg1, 0, CTGendLen, originOverlap, src_seq ); lenCtg2 = getSeqFromCtg ( ctg2, 1, CTGendLen, originOverlap, src_seq ); if ( lenCtg1 <= overlap || lenCtg2 <= overlap ) { free ( ( void * ) src_seq ); free ( ( void * ) bal_seq ); return kmerS; } kmerBuffer = ( Kmer * ) ckalloc ( buffer_size * sizeof ( Kmer ) ); prevcBuffer = ( char * ) ckalloc ( buffer_size * sizeof ( char ) ); nextcBuffer = ( char * ) ckalloc ( buffer_size * sizeof ( char ) ); kmerS = init_kmerset ( 1024, 0.77f ); for ( i = 0; i < num; i++ ) { getSeqFromRead ( rdArray[i], src_seq ); chopKmer4read ( rdArray[i].len, overlap, src_seq, bal_seq, kmerBuffer, prevcBuffer, nextcBuffer, &kmer_c, WordFilter ); putKmer2DBgraph ( kmerS, 0, kmer_c, kmerBuffer, prevcBuffer, nextcBuffer ); } lenCtg1 = getSeqFromCtg ( ctg1, 0, CTGendLen, originOverlap, src_seq ); chopKmer4Ctg ( kmerCtg1, lenCtg1, overlap, src_seq, WordFilter ); chopKmer4read ( lenCtg1, overlap, src_seq, bal_seq, kmerBuffer, prevcBuffer, nextcBuffer, &kmer_c, WordFilter ); putKmer2DBgraph ( kmerS, 1, kmer_c, kmerBuffer, prevcBuffer, nextcBuffer ); lenCtg2 = getSeqFromCtg ( ctg2, 1, CTGendLen, originOverlap, src_seq ); chopKmer4Ctg ( kmerCtg2, lenCtg2, overlap, src_seq, WordFilter ); chopKmer4read ( lenCtg2, overlap, src_seq, bal_seq, kmerBuffer, prevcBuffer, nextcBuffer, &kmer_c, WordFilter ); putKmer2DBgraph ( kmerS, 2, kmer_c, kmerBuffer, prevcBuffer, nextcBuffer ); /* if(ctg1->ctgID==3733&&ctg2->ctgID==3067){ for(i=0;i= 32 && overlap < 64 ) { bit4 = 32; bit3 = overlap - 32; bit2 = 0; bit1 = 0; } if ( overlap >= 64 && overlap < 96 ) { bit4 = 32; bit3 = 32; bit2 = overlap - 64; bit1 = 0; } if ( overlap >= 96 && overlap < 128 ) { bit4 = 32; bit3 = 32; bit2 = 32; bit1 = overlap - 96; } for ( i = bit1 - 1; i >= 0; i-- ) { ch = kmer.high1 & 0x3; kmer.high1 >>= 2; kmerSeq[i] = ch; } for ( i = bit2 - 1; i >= 0; i-- ) { ch = kmer.low1 & 0x3; kmer.low1 >>= 2; kmerSeq[i + bit1] = ch; } for ( i = bit3 - 1; i >= 0; i-- ) { ch = kmer.high2 & 0x3; kmer.high2 >>= 2; kmerSeq[i + bit1 + bit2] = ch; } for ( i = bit4 - 1; i >= 0; i-- ) { ch = kmer.low2 & 0x3; kmer.low2 >>= 2; kmerSeq[i + bit1 + bit2 + bit3] = ch; } for ( i = 0; i < overlap; i++ ) { fprintf ( fp, "%c", int2base ( ( int ) kmerSeq[i] ) ); } } #else static void printKmerSeqLocal ( FILE *fp, Kmer kmer, int overlap ) { int i, bit1, bit2; char ch; char kmerSeq[64]; bit2 = overlap > 32 ? 32 : overlap; bit1 = overlap > 32 ? overlap - 32 : 0; for ( i = bit1 - 1; i >= 0; i-- ) { ch = kmer.high & 0x3; kmer.high >>= 2; kmerSeq[i] = ch; } for ( i = bit2 - 1; i >= 0; i-- ) { ch = kmer.low & 0x3; kmer.low >>= 2; kmerSeq[i + bit1] = ch; } for ( i = 0; i < overlap; i++ ) { fprintf ( fp, "%c", int2base ( ( int ) kmerSeq[i] ) ); } } #endif static void kmerSet_mark ( KmerSet *set ) { int i, in_num, out_num, cvgSingle; kmer_t *rs; long long counter = 0, linear = 0; Kmer word; set->iter_ptr = 0; while ( set->iter_ptr < set->size ) { if ( !is_kmer_entity_null ( set->flags, set->iter_ptr ) ) { in_num = out_num = 0; rs = set->array + set->iter_ptr; word = rs->seq; for ( i = 0; i < 4; i++ ) { cvgSingle = get_kmer_left_cov ( *rs, i ); if ( cvgSingle > 0 ) { in_num++; } cvgSingle = get_kmer_right_cov ( *rs, i ); if ( cvgSingle > 0 ) { out_num++; } } if ( rs->single ) { counter++; } if ( in_num == 1 && out_num == 1 ) { rs->linear = 1; linear++; } } set->iter_ptr++; } //printf("Allocated %ld node, %ld single nodes, %ld linear\n",(long)count_kmerset(set),counter,linear); } static kmer_t *searchNode ( Kmer word, KmerSet *kset, int overlap ) { Kmer bal_word = reverseComplement ( word, overlap ); kmer_t *node; boolean found; if ( KmerSmaller ( word, bal_word ) ) { found = search_kmerset ( kset, word, &node ); } else { found = search_kmerset ( kset, bal_word, &node ); } if ( found ) { return node; } else { return NULL; } } static int searchKmerOnCtg ( Kmer currW, Kmer *kmerDest, int num ) { int i; for ( i = 0; i < num; i++ ) { if ( KmerEqual ( currW, kmerDest[i] ) ) { return i; } } return -1; } // pick on from n items randomly static int nPick1 ( int *array, int n ) { int m, i; m = n - 1; //(int)(drand48()*n); int value = array[m]; for ( i = m; i < n - 1; i++ ) { array[i] = array[i + 1]; } return value; } static void traceAlongDBgraph ( Kmer currW, int steps, int min, int max, int *num_route, KmerSet *kset, Kmer *kmerDest, int num, int overlap, Kmer WORDF, char **foundRoutes, int *routeEndOnCtg2, int *routeLens, char *soFarSeq, int *traceCounter, int maxRoute, kmer_t **soFarNode, boolean *multiOccu, long long *soFarLinks, double *avgLinks ) { ( *traceCounter ) ++; if ( *traceCounter > UPlimit ) { /* if(overlap==19&&kmerDest[0]==pubKmer) printf("UPlimit\n"); */ return; } if ( steps > max || *num_route >= maxRoute ) { /* if(overlap==19&&kmerDest[0]==pubKmer) printf("max steps/maxRoute\n"); */ return; } Kmer word = reverseComplement ( currW, overlap ); boolean isSmaller = KmerSmaller ( currW, word ); int i; char ch; unsigned char links; if ( isSmaller ) { word = currW; } kmer_t *node; boolean found = search_kmerset ( kset, word, &node ); // #ifdef DEBUG if ( !found ) { fprintf ( stderr, "%s Trace: can't find kmer ", __FUNCTION__ ); PrintKmer ( stderr, word ); fprintf ( stderr, " (input " ); PrintKmer ( stderr, currW ); fprintf ( stderr, ") at step %d.\n", steps ); /* #ifdef MER127 fprintf (stderr, "%s Trace: can't find kmer %llx %llx %llx %llx (input %llx %llx %llx %llx) at step %d.\n", __FUNCTION__, word.high1, word.low1, word.high2, word.low2, currW.high1, currW.low1, currW.high2, currW.low2, steps ); #else printf ( "Trace: can't find kmer %llx %llx (input %llx %llx) at step %d\n", word.high, word.low, currW.high, currW.low, steps ); #endif */ return; } // #else // if (!found) return; // #endif if ( node->twin > 1 ) { return; } if ( soFarNode ) { soFarNode[steps] = node; } if ( steps > 0 ) { soFarSeq[steps - 1] = lastCharInKmer ( currW ); } int index, end; int linkCounter = *soFarLinks; if ( steps >= min && node->inEdge > 1 && ( end = searchKmerOnCtg ( currW, kmerDest, num ) ) >= 0 ) { index = *num_route; if ( steps > 0 ) { avgLinks[index] = ( double ) linkCounter / steps; } else { avgLinks[index] = 0; } //find node that appears more than once in the path multiOccu[index] = 0; for ( i = 0; i < steps + 1; i++ ) { soFarNode[i]->deleted = 0; } for ( i = 0; i < steps + 1; i++ ) { if ( soFarNode[i]->deleted ) { multiOccu[index] = 1; break; } soFarNode[i]->deleted = 1; } routeEndOnCtg2[index] = end; routeLens[index] = steps; char *array = foundRoutes[index]; for ( i = 0; i < steps; i++ ) { array[i] = soFarSeq[i]; } if ( i < max ) { array[i] = 4; } //indicate the end of the sequence *num_route = ++index; return; } steps++; if ( isSmaller ) { int array[] = { 0, 1, 2, 3 }; for ( i = 4; i > 0; i-- ) { ch = nPick1 ( array, i ); links = get_kmer_right_cov ( *node, ch ); if ( !links ) { continue; } *soFarLinks = linkCounter + links; word = nextKmerLocal ( currW, ch, WORDF ); traceAlongDBgraph ( word, steps, min, max, num_route, kset, kmerDest, num, overlap, WORDF, foundRoutes, routeEndOnCtg2, routeLens, soFarSeq, traceCounter, maxRoute, soFarNode, multiOccu, soFarLinks, avgLinks ); } } else { int array[] = { 0, 1, 2, 3 }; for ( i = 4; i > 0; i-- ) { ch = nPick1 ( array, i ); links = get_kmer_left_cov ( *node, ch ); if ( !links ) { continue; } *soFarLinks = linkCounter + links; word = nextKmerLocal ( currW, int_comp ( ch ), WORDF ); traceAlongDBgraph ( word, steps, min, max, num_route, kset, kmerDest, num, overlap, WORDF, foundRoutes, routeEndOnCtg2, routeLens, soFarSeq, traceCounter, maxRoute, soFarNode, multiOccu, soFarLinks, avgLinks ); } } } static int searchFgap ( KmerSet *kset, CTGinSCAF *ctg1, CTGinSCAF *ctg2, Kmer *kmerCtg1, Kmer *kmerCtg2, unsigned int origOverlap, int overlap, DARRAY *gapSeqArray, int len1, int len2, Kmer WordFilter, int *offset1, int *offset2, char *seqGap, int *cut1, int *cut2 ) { int i; int ret = 0; kmer_t *node, **soFarNode; int num_route; int gapLen = ctg2->start - ctg1->end - origOverlap + overlap; int min = gapLen - GLDiff > 0 ? gapLen - GLDiff : 0; //0531 int max = gapLen + GLDiff < 10 ? 10 : gapLen + GLDiff; char **foundRoutes; char *soFarSeq; int traceCounter; int *routeEndOnCtg2; int *routeLens; boolean *multiOccu; long long soFarLinks; double *avgLinks; //mask linear internal linear kmer on contig1 end routeEndOnCtg2 = ( int * ) ckalloc ( MaxRouteNum * sizeof ( int ) ); routeLens = ( int * ) ckalloc ( MaxRouteNum * sizeof ( int ) ); multiOccu = ( boolean * ) ckalloc ( MaxRouteNum * sizeof ( boolean ) ); short *MULTI1 = ( short * ) ckalloc ( MaxRouteNum * sizeof ( short ) ); short *MULTI2 = ( short * ) ckalloc ( MaxRouteNum * sizeof ( short ) ); soFarSeq = ( char * ) ckalloc ( max * sizeof ( char ) ); soFarNode = ( kmer_t ** ) ckalloc ( ( max + 1 ) * sizeof ( kmer_t * ) ); foundRoutes = ( char ** ) ckalloc ( MaxRouteNum * sizeof ( char * ) );; avgLinks = ( double * ) ckalloc ( MaxRouteNum * sizeof ( double ) );; for ( i = 0; i < MaxRouteNum; i++ ) { foundRoutes[i] = ( char * ) ckalloc ( max * sizeof ( char ) ); } for ( i = len1 - 1; i >= 0; i-- ) { num_route = traceCounter = soFarLinks = 0; int steps = 0; traceAlongDBgraph ( kmerCtg1[i], steps, min, max, &num_route, kset, kmerCtg2, len2, overlap, WordFilter, foundRoutes, routeEndOnCtg2, routeLens, soFarSeq, &traceCounter, MaxRouteNum, soFarNode, multiOccu, &soFarLinks, avgLinks ); if ( num_route > 0 ) { int m, minEnd = routeEndOnCtg2[0]; for ( m = 0; m < num_route; m++ ) { if ( routeLens[m] < 0 ) { continue; } if ( routeEndOnCtg2[m] < minEnd ) { minEnd = routeEndOnCtg2[m]; } } /* else if(minFreq>1){ for(m=0;m3) break; printf("%c",int2base((int)foundRoutes[m][j])); } printf(": %4.2f\n",avgLinks[m]); } } */ num_route = traceCounter = soFarLinks = 0; steps = 0; trace4Repeat ( kmerCtg1[i], steps, min, max, &num_route, kset, kmerCtg2[minEnd], overlap, WordFilter, &traceCounter, MaxRouteNum, soFarNode, MULTI1, MULTI2, routeLens, foundRoutes, soFarSeq, &soFarLinks, avgLinks ); int j, best = 0; int maxLen = routeLens[0]; double maxLink = avgLinks[0]; char *pt; boolean repeat = 0, sameLen = 1; int leftMost = max, rightMost = max; #ifdef DEBUG if ( num_route < 1 ) { fprintf ( stderr, "After trace4Repeat: non route was found.\n" ); continue; } #else if ( num_route < 1 ) { continue; } #endif if ( num_route > 1 ) { // if multi paths are found, we check on the repeatative occurrences and links/length for ( m = 0; m < num_route; m++ ) { if ( routeLens[m] < 0 ) { continue; } if ( MULTI1[m] >= 0 && MULTI2[m] >= 0 ) { repeat = 1; leftMost = leftMost > MULTI1[m] ? MULTI1[m] : leftMost; rightMost = rightMost > MULTI2[m] ? MULTI2[m] : rightMost; } if ( routeLens[m] != maxLen ) { sameLen = 0; } if ( routeLens[m] < maxLen ) { maxLen = routeLens[m]; } if ( avgLinks[m] > maxLink ) { maxLink = avgLinks[m]; best = m; } } } if ( repeat ) { *offset1 = *offset2 = *cut1 = *cut2 = 0; int index = 0; char ch; for ( j = 0; j < leftMost; j++ ) { if ( routeLens[0] < j + overlap + 1 ) { break; } else { ch = foundRoutes[0][j]; } for ( m = 1; m < num_route; m++ ) { if ( routeLens[m] < 0 ) { continue; } if ( ch != foundRoutes[m][j] ) { break; } } if ( m == num_route ) { seqGap[index++] = ch; } else { break; } } *offset1 = index; index = 0; for ( j = 0; j < rightMost; j++ ) { if ( routeLens[0] - overlap - 1 < j ) { break; } else { ch = foundRoutes[0][routeLens[0] - overlap - 1 - j]; } for ( m = 1; m < num_route; m++ ) { if ( routeLens[m] < 0 ) { continue; } if ( ch != foundRoutes[m][routeLens[m] - overlap - 1 - j] ) { break; } } if ( m == num_route ) { index++; } else { break; } } *offset2 = index; for ( j = 0; j < *offset2; j++ ) { seqGap[*offset1 + *offset2 - 1 - j] = foundRoutes[0][routeLens[0] - overlap - 1 - j]; } if ( *offset1 > 0 || *offset2 > 0 ) { *cut1 = len1 - i - 1; *cut2 = minEnd; //fprintf(stderr,"\n"); for ( m = 0; m < num_route; m++ ) { for ( j = 0; j < max; j++ ) { if ( foundRoutes[m][j] > 3 ) { break; } //fprintf(stderr,"%c",int2base((int)foundRoutes[m][j])); } //fprintf(stderr,": %4.2f\n",avgLinks[m]); } /* fprintf(stderr,">Gap (%d + %d) (%d + %d)\n",*offset1,*offset2,*cut1,*cut2); for(index=0;index<*offset1+*offset2;index++) fprintf(stderr,"%c",int2base(seqGap[index])); fprintf(stderr,"\n"); */ } ret = 3; break; } if ( overlap + ( len1 - i - 1 ) + minEnd - routeLens[best] > ( int ) origOverlap ) { continue; } ctg1->gapSeqOffset = gapSeqArray->item_c; ctg1->gapSeqLen = routeLens[best]; if ( !darrayPut ( gapSeqArray, ctg1->gapSeqOffset + maxLen / 4 ) ) { continue; } pt = ( char * ) darrayPut ( gapSeqArray, ctg1->gapSeqOffset ); /* printKmerSeqLocal(stderr,kmerCtg1[i],overlap); fprintf(stderr,"-"); */ for ( j = 0; j < max; j++ ) { if ( foundRoutes[best][j] > 3 ) { break; } writeChar2tightString ( foundRoutes[best][j], pt, j ); //fprintf(stderr,"%c",int2base((int)foundRoutes[best][j])); } //fprintf(stderr,": GAPSEQ %d + %d, avglink %4.2f\n",len1-i-1,minEnd,avgLinks[best]); ctg1->cutTail = len1 - i - 1; ctg2->cutHead = overlap + minEnd; ctg2->scaftig_start = 0; ret = 1; break; /* }if(num_route>1){ ret = 2; break; */ } else //mark node which leads to dead end { node = searchNode ( kmerCtg1[i], kset, overlap ); if ( node ) { node->twin = 2; } } } for ( i = 0; i < MaxRouteNum; i++ ) { free ( ( void * ) foundRoutes[i] ); } free ( ( void * ) soFarSeq ); free ( ( void * ) soFarNode ); free ( ( void * ) multiOccu ); free ( ( void * ) MULTI1 ); free ( ( void * ) MULTI2 ); free ( ( void * ) foundRoutes ); free ( ( void * ) routeEndOnCtg2 ); free ( ( void * ) routeLens ); return ret; } static void trace4Repeat ( Kmer currW, int steps, int min, int max, int *num_route, KmerSet *kset, Kmer kmerDest, int overlap, Kmer WORDF, int *traceCounter, int maxRoute, kmer_t **soFarNode, short *multiOccu1, short *multiOccu2, int *routeLens, char **foundRoutes, char *soFarSeq, long long *soFarLinks, double *avgLinks ) { ( *traceCounter ) ++; if ( *traceCounter > UPlimit ) { return; } if ( steps > max || *num_route >= maxRoute ) { return; } Kmer word = reverseComplement ( currW, overlap ); boolean isSmaller = KmerSmaller ( currW, word ); char ch; unsigned char links; int index, i; if ( isSmaller ) { word = currW; } kmer_t *node; boolean found = search_kmerset ( kset, word, &node ); // #ifdef DEBUG if ( !found ) { fprintf ( stderr, "%s Trace: can't find kmer ", __FUNCTION__ ); PrintKmer ( stderr, word ); fprintf ( stderr, " (input " ); PrintKmer ( stderr, currW ); fprintf ( stderr, ") at step %d.\n", steps ); /* #ifdef MER127 printf ( "%s Trace: can't find kmer %llx %llx %llx %llx (input %llx %llx %llx %llx) at step %d\n", __FUNCTION__, word.high1, word.low1, word.high2, word.low2, currW.high1, currW.low1, currW.high2, currW.low2, steps ); #else printf ( "Trace: can't find kmer %llx %llx (input %llx %llx) at step %d\n", word.high, word.low, currW.high, currW.low, steps ); #endif */ return; } // #else // if (!found) return; // #endif if ( soFarNode ) { soFarNode[steps] = node; } if ( soFarSeq && steps > 0 ) { soFarSeq[steps - 1] = lastCharInKmer ( currW ); } int linkCounter; if ( soFarLinks ) { linkCounter = *soFarLinks; } if ( steps >= min && KmerEqual ( currW, kmerDest ) ) { index = *num_route; if ( avgLinks && steps > 0 ) { avgLinks[index] = ( double ) linkCounter / steps; } else if ( avgLinks ) { avgLinks[index] = 0; } //find node that appears more than once in the path if ( multiOccu1 && multiOccu2 ) { for ( i = 0; i < steps + 1; i++ ) { soFarNode[i]->deleted = 0; } int rightMost = 0; boolean MULTI = 0; for ( i = 0; i < steps + 1; i++ ) { if ( soFarNode[i]->deleted ) { rightMost = rightMost < i - 1 ? i - 1 : rightMost; MULTI = 1; } soFarNode[i]->deleted = 1; } if ( !MULTI ) { multiOccu1[index] = multiOccu2[index] = -1; } else { multiOccu2[index] = steps - 2 - rightMost < 0 ? 0 : steps - 2 - rightMost; //[0 steps-2] for ( i = 0; i < steps + 1; i++ ) { soFarNode[i]->deleted = 0; } int leftMost = steps - 2; for ( i = steps; i >= 0; i-- ) { if ( soFarNode[i]->deleted ) { leftMost = leftMost > i - 1 ? i - 1 : leftMost; } soFarNode[i]->deleted = 1; } multiOccu1[index] = leftMost < 0 ? 0 : leftMost; //[0 steps-2] } } if ( routeLens ) { routeLens[index] = steps; } if ( soFarSeq ) { char *array = foundRoutes[index]; for ( i = 0; i < steps; i++ ) { array[i] = soFarSeq[i]; } if ( i < max ) { array[i] = 4; } //indicate the end of the sequence } *num_route = ++index; } steps++; if ( isSmaller ) { int array[] = { 0, 1, 2, 3 }; for ( i = 4; i > 0; i-- ) { ch = nPick1 ( array, i ); links = get_kmer_right_cov ( *node, ch ); if ( !links ) { continue; } if ( soFarLinks ) { *soFarLinks = linkCounter + links; } word = nextKmerLocal ( currW, ch, WORDF ); trace4Repeat ( word, steps, min, max, num_route, kset, kmerDest, overlap, WORDF, traceCounter, maxRoute, soFarNode, multiOccu1, multiOccu2, routeLens, foundRoutes, soFarSeq, soFarLinks, avgLinks ); } } else { int array[] = { 0, 1, 2, 3 }; for ( i = 4; i > 0; i-- ) { ch = nPick1 ( array, i ); links = get_kmer_left_cov ( *node, ch ); if ( !links ) { continue; } if ( soFarLinks ) { *soFarLinks = linkCounter + links; } word = nextKmerLocal ( currW, int_comp ( ch ), WORDF ); trace4Repeat ( word, steps, min, max, num_route, kset, kmerDest, overlap, WORDF, traceCounter, maxRoute, soFarNode, multiOccu1, multiOccu2, routeLens, foundRoutes, soFarSeq, soFarLinks, avgLinks ); } } } //found repeat node on contig ends static void maskRepeatNode ( KmerSet *kset, Kmer *kmerCtg1, Kmer *kmerCtg2, int overlap, int len1, int len2, int max, Kmer WordFilter ) { int i; int num_route, steps; int min = 1, maxRoute = 1; int traceCounter; Kmer word, bal_word; kmer_t *node; boolean found; int counter = 0; for ( i = 0; i < len1; i++ ) { word = kmerCtg1[i]; bal_word = reverseComplement ( word, overlap ); if ( KmerLarger ( word, bal_word ) ) { word = bal_word; } found = search_kmerset ( kset, word, &node ); if ( !found || node->linear ) { //printf("Found no node for kmer %llx\n",word); continue; } num_route = traceCounter = 0; steps = 0; trace4Repeat ( word, steps, min, max, &num_route, kset, word, overlap, WordFilter, &traceCounter, maxRoute, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL ); if ( num_route < 1 ) { continue; } counter++; node->checked = 1; } for ( i = 0; i < len2; i++ ) { word = kmerCtg2[i]; bal_word = reverseComplement ( word, overlap ); if ( KmerLarger ( word, bal_word ) ) { word = bal_word; } found = search_kmerset ( kset, word, &node ); if ( !found || node->linear ) { //printf("Found no node for kmer %llx\n",word); continue; } num_route = traceCounter = 0; steps = 0; trace4Repeat ( word, steps, min, max, &num_route, kset, word, overlap, WordFilter, &traceCounter, maxRoute, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL ); if ( num_route < 1 ) { continue; } counter++; node->checked = 1; } //printf("MR: %d(%d)\n",counter,len1+len2); } /* static boolean chopReadFillGap(int len_seq,int overlap,char *src_seq, char *bal_seq, KmerSet *kset,Kmer WORDF,int *start,int *end,boolean *bal, Kmer *KmerCtg1,int len1,Kmer *KmerCtg2,int len2,int *index1,int *index2) { int index,j=0,bal_j; Kmer word,bal_word; int flag=0,bal_flag=0; int ctg1start,bal_ctg1start,ctg2end,bal_ctg2end; int seqStart,bal_start,seqEnd,bal_end; kmer_t *node; boolean found; if(len_seqlinear&&!node->checked){ if(!flag&&node->inEdge==1){ ctg1start = searchKmerOnCtg(word,KmerCtg1,len1); if(ctg1start>0){ flag = 1; seqStart = j + overlap-1; } } if(!bal_flag&&node->inEdge==2){ bal_ctg2end = searchKmerOnCtg(bal_word,KmerCtg2,len2); if(bal_ctg2end>0){ bal_flag = 2; bal_end = bal_j+overlap-1; } } } for(j = 1; j <= len_seq - overlap; j ++) { word = nextKmerLocal(word,src_seq[j-1+overlap],WORDF); bal_j = len_seq-j-overlap; // j; bal_word = prevKmerLocal(bal_word,bal_seq[bal_j],overlap); if(wordlinear&&!node->checked){ if(!flag&&node->inEdge==1){ ctg1start = searchKmerOnCtg(word,KmerCtg1,len1); if(ctg1start>0){ flag = 1; seqStart = j + overlap-1; } }else if(flag==1&&node->inEdge==1){ index = searchKmerOnCtg(word,KmerCtg1,len1); if(index>ctg1start){ // choose hit closer to gap ctg1start = index; seqStart = j + overlap-1; } }else if(flag==1&&node->inEdge==2){ ctg2end = searchKmerOnCtg(word,KmerCtg2,len2); if(ctg2end>0){ flag = 3; seqEnd = j+overlap-1; break; } } if(!bal_flag&&node->inEdge==2){ bal_ctg2end = searchKmerOnCtg(bal_word,KmerCtg2,len2); if(bal_ctg2end>0){ bal_flag = 2; bal_end = bal_j+overlap-1; } }else if(bal_flag==2&&node->inEdge==2){ index = searchKmerOnCtg(bal_word,KmerCtg2,len2); if(indexinEdge==1){ bal_ctg1start = searchKmerOnCtg(bal_word,KmerCtg1,len1); if(bal_ctg1start>0){ bal_flag = 3; bal_start = bal_j+overlap-1; break; } } } } if(flag==3){ *start = seqStart; *end = seqEnd; *bal = 0; *index1 = ctg1start; *index2 = ctg2end; return 1; }else if(bal_flag==3){ *start = bal_start; *end = bal_end; *bal = 1; *index1 = bal_ctg1start; *index2 = bal_ctg2end; return 1; } return 0; } static boolean readsCrossGap(READNEARBY *rdArray, int num, int originOverlap,DARRAY *gapSeqArray, Kmer *kmerCtg1,Kmer *kmerCtg2,int overlap,int len1,int len2, CTGinSCAF *ctg1,CTGinSCAF *ctg2,KmerSet *kmerS,Kmer WordFilter,int min,int max) { int i,j,start,end,startOnCtg1,endOnCtg2; char *bal_seq; char *src_seq; char *pt; boolean bal,ret=0,FILL; src_seq = (char *)ckalloc(maxReadLen*sizeof(char)); bal_seq = (char *)ckalloc(maxReadLen*sizeof(char)); for(i=0;imax) continue; fprintf(stderr,"Read across\n"); //printf("Filled: K %d, ctg1 %d ctg2 %d,start %d end %d\n",overlap,startOnCtg1,endOnCtg2,start,end); if(overlap+(len1-startOnCtg1-1)+endOnCtg2-(end-start)>(int)originOverlap) continue; // contig1 and contig2 could not overlap more than origOverlap bases ctg1->gapSeqOffset = gapSeqArray->item_c; ctg1->gapSeqLen = end-start; if(!darrayPut(gapSeqArray,ctg1->gapSeqOffset+(end-start)/4)) continue; pt = (char *)darrayPut(gapSeqArray,ctg1->gapSeqOffset); for(j=start+1;j<=end;j++){ if(bal) writeChar2tightString(bal_seq[j],pt,j-start-1); else writeChar2tightString(src_seq[j],pt,j-start-1); } ctg1->cutTail = len1-startOnCtg1-1; ctg2->cutHead = overlap + endOnCtg2; ctg2->scaftig_start = 0; ret = 1; break; } free((void*)src_seq); free((void*)bal_seq); return ret; } */ static void kmerSet_markTandem ( KmerSet *set, Kmer WordFilter, int overlap ); static boolean readsCrossGap ( READNEARBY *rdArray, int num, int originOverlap, DARRAY *gapSeqArray, Kmer *kmerCtg1, Kmer *kmerCtg2, int overlap, CTGinSCAF *ctg1, CTGinSCAF *ctg2, KmerSet *kmerS, Kmer WordFilter, int min, int max, int offset1, int offset2, char *seqGap, char *seqCtg1, char *seqCtg2, int cut1, int cut2 ); int localGraph ( READNEARBY *rdArray, int num, CTGinSCAF *ctg1, CTGinSCAF *ctg2, int origOverlap, Kmer *kmerCtg1, Kmer *kmerCtg2, int overlap, DARRAY *gapSeqArray, char *seqCtg1, char *seqCtg2, char *seqGap ) { /**************** put kmer in DBgraph ****************/ KmerSet *kmerSet; Kmer WordFilter = createFilter ( overlap ); /* if(ctg1->ctgID==56410&&ctg2->ctgID==61741) printf("Extract %d reads for gap [%d %d]\n",num,ctg1->ctgID,ctg2->ctgID); */ kmerSet = readsInGap2DBgraph ( rdArray, num, ctg1, ctg2, origOverlap, kmerCtg1, kmerCtg2, overlap, WordFilter ); if ( !kmerSet ) { //printf("no kmer found\n"); return 0; } time_t tt; time ( &tt ); //srand48 ( ( int ) tt ); /* int i,j; for(i=0;i<2;i++){ int array[] = {0,1,2,3}; for(j=4;j>0;j--) fprintf(stderr,"%d ", nPick1(array,j)); } fprintf(stderr,"\n"); */ /***************** search path to connect contig ends ********/ int gapLen = ctg2->start - ctg1->end - origOverlap + overlap; int min = gapLen - GLDiff > 0 ? gapLen - GLDiff : 0; int max = gapLen + GLDiff < 10 ? 10 : gapLen + GLDiff; //count kmer number for contig1 and contig2 ends int len1, len2; len1 = CTGendLen < contig_array[ctg1->ctgID].length + origOverlap ? CTGendLen : contig_array[ctg1->ctgID].length + origOverlap; len2 = CTGendLen < contig_array[ctg2->ctgID].length + origOverlap ? CTGendLen : contig_array[ctg2->ctgID].length + origOverlap; len1 -= overlap - 1; len2 -= overlap - 1; //int pathNum = 2; int offset1 = 0, offset2 = 0, cut1 = 0, cut2 = 0; int pathNum = searchFgap ( kmerSet, ctg1, ctg2, kmerCtg1, kmerCtg2, origOverlap, overlap, gapSeqArray, len1, len2, WordFilter, &offset1, &offset2, seqGap, &cut1, &cut2 ); //printf("SF: %d K %d\n",pathNum,overlap); if ( pathNum == 0 ) { free_kmerset ( kmerSet ); return 0; } else if ( pathNum == 1 ) { free_kmerset ( kmerSet ); return 1; } /* else{ printf("ret %d\n",pathNum); free_kmerset(kmerSet); return 0; } */ /******************* cross the gap by single reads *********/ //kmerSet_markTandem(kmerSet,WordFilter,overlap); maskRepeatNode ( kmerSet, kmerCtg1, kmerCtg2, overlap, len1, len2, max, WordFilter ); boolean found = readsCrossGap ( rdArray, num, origOverlap, gapSeqArray, kmerCtg1, kmerCtg2, overlap, ctg1, ctg2, kmerSet, WordFilter, min, max, offset1, offset2, seqGap, seqCtg1, seqCtg2, cut1, cut2 ); if ( found ) { //fprintf(stderr,"read across\n"); free_kmerset ( kmerSet ); return found; } else { free_kmerset ( kmerSet ); return 0; } } static void kmerSet_markTandem ( KmerSet *set, Kmer WordFilter, int overlap ) { kmer_t *rs; long long counter = 0; int num_route, steps; int min = 1, max = overlap, maxRoute = 1; int traceCounter; set->iter_ptr = 0; while ( set->iter_ptr < set->size ) { if ( !is_kmer_entity_null ( set->flags, set->iter_ptr ) ) { rs = set->array + set->iter_ptr; if ( rs->inEdge > 0 ) { set->iter_ptr++; continue; } num_route = traceCounter = 0; steps = 0; trace4Repeat ( rs->seq, steps, min, max, &num_route, set, rs->seq, overlap, WordFilter, &traceCounter, maxRoute, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL ); if ( num_route < 1 ) { set->iter_ptr++; continue; } /* printKmerSeqLocal(stderr,rs->seq,overlap); fprintf(stderr, "\n"); */ rs->checked = 1; counter++; } set->iter_ptr++; } } /******************* the following is for read-crossing gaps *************************/ #define MAXREADLENGTH 100 static const int INDEL = 0; static const int SIM[4][4] = { {1, 0, 0, 0}, {0, 1, 0, 0}, {0, 0, 1, 0}, {0, 0, 0, 1} }; static char fastSequence[MAXREADLENGTH]; static char slowSequence[MAXREADLENGTH]; static int Fmatrix[MAXREADLENGTH + 1][MAXREADLENGTH + 1]; static int slowToFastMapping[MAXREADLENGTH + 1]; static int fastToSlowMapping[MAXREADLENGTH + 1]; static int max ( int A, int B, int C ) { A = A >= B ? A : B; return ( A >= C ? A : C ); } static int compareSequences ( char *sequence1, char *sequence2, int length1, int length2 ) { if ( length1 < 1 || length2 < 1 || length1 > MAXREADLENGTH || length2 > MAXREADLENGTH ) { return 0; } int i, j; int Choice1, Choice2, Choice3; int maxScore; for ( i = 0; i <= length1; i++ ) { Fmatrix[i][0] = 0; } for ( j = 0; j <= length2; j++ ) { Fmatrix[0][j] = 0; } for ( i = 1; i <= length1; i++ ) { for ( j = 1; j <= length2; j++ ) { Choice1 = Fmatrix[i - 1][j - 1] + SIM[ ( int ) sequence1[i - 1]][ ( int ) sequence2[j - 1]]; Choice2 = Fmatrix[i - 1][j] + INDEL; Choice3 = Fmatrix[i][j - 1] + INDEL; Fmatrix[i][j] = max ( Choice1, Choice2, Choice3 ); } } maxScore = Fmatrix[length1][length2]; return maxScore; } static void mapSlowOntoFast ( int slowSeqLength, int fastSeqLength ) { int slowIndex = slowSeqLength; int fastIndex = fastSeqLength; int fastn, slown; if ( slowIndex == 0 ) { slowToFastMapping[0] = fastIndex; while ( fastIndex >= 0 ) { fastToSlowMapping[fastIndex--] = 0; } return; } if ( fastIndex == 0 ) { while ( slowIndex >= 0 ) { slowToFastMapping[slowIndex--] = 0; } fastToSlowMapping[0] = slowIndex; return; } while ( slowIndex > 0 && fastIndex > 0 ) { fastn = ( int ) fastSequence[fastIndex - 1]; //getCharInTightString(fastSequence,fastIndex-1); slown = ( int ) slowSequence[slowIndex - 1]; //getCharInTightString(slowSequence,slowIndex-1); if ( Fmatrix[fastIndex][slowIndex] == Fmatrix[fastIndex - 1][slowIndex - 1] + SIM[fastn][slown] ) { fastToSlowMapping[--fastIndex] = --slowIndex; slowToFastMapping[slowIndex] = fastIndex; } else if ( Fmatrix[fastIndex][slowIndex] == Fmatrix[fastIndex - 1][slowIndex] + INDEL ) { fastToSlowMapping[--fastIndex] = slowIndex - 1; } else if ( Fmatrix[fastIndex][slowIndex] == Fmatrix[fastIndex][slowIndex - 1] + INDEL ) { slowToFastMapping[--slowIndex] = fastIndex - 1; } else { fprintf ( stderr, "CompareSequence: Error trace.\n" ); abort (); } } while ( slowIndex > 0 ) { slowToFastMapping[--slowIndex] = -1; } while ( fastIndex > 0 ) { fastToSlowMapping[--fastIndex] = -1; } slowToFastMapping[slowSeqLength] = fastSeqLength; fastToSlowMapping[fastSeqLength] = slowSeqLength; } static boolean chopReadFillGap ( int len_seq, int overlap, char *src_seq, char *bal_seq, KmerSet *kset, Kmer WORDF, int *start, int *end, boolean *bal, Kmer *KmerCtg1, int len1, Kmer *KmerCtg2, int len2, int *index1, int *index2 ) { int index, j = 0, bal_j; Kmer word, bal_word; int flag = 0, bal_flag = 0; int ctg1start, bal_ctg1start, ctg2end, bal_ctg2end; int seqStart, bal_start, seqEnd, bal_end; kmer_t *node; boolean found; if ( len_seq < overlap + 1 ) { return 0; } #ifdef MER127 word.high1 = word.low1 = word.high2 = word.low2 = 0; for ( index = 0; index < overlap; index++ ) { word = KmerLeftBitMoveBy2 ( word ); word.low2 |= src_seq[index]; } #else word.high = word.low = 0; for ( index = 0; index < overlap; index++ ) { word = KmerLeftBitMoveBy2 ( word ); word.low |= src_seq[index]; } #endif reverseComplementSeq ( src_seq, len_seq, bal_seq ); // complementary node bal_word = reverseComplement ( word, overlap ); bal_j = len_seq - 0 - overlap; // 0; flag = bal_flag = 0; if ( KmerSmaller ( word, bal_word ) ) { found = search_kmerset ( kset, word, &node ); } else { found = search_kmerset ( kset, bal_word, &node ); } //if ( !found ) { printf ( "chopReadFillGap 1292 not found!\n" ); } if ( found && !node->linear && !node->checked ) { if ( !flag && node->inEdge == 1 ) { ctg1start = searchKmerOnCtg ( word, KmerCtg1, len1 ); if ( ctg1start >= 0 ) { flag = 1; seqStart = j + overlap - 1; } } if ( !bal_flag && node->inEdge == 2 ) { bal_ctg2end = searchKmerOnCtg ( bal_word, KmerCtg2, len2 ); if ( bal_ctg2end >= 0 ) { bal_flag = 2; bal_end = bal_j + overlap - 1; } } } for ( j = 1; j <= len_seq - overlap; j++ ) { word = nextKmerLocal ( word, src_seq[j - 1 + overlap], WORDF ); bal_j = len_seq - j - overlap; // j; bal_word = prevKmerLocal ( bal_word, bal_seq[bal_j], overlap ); if ( KmerSmaller ( word, bal_word ) ) { found = search_kmerset ( kset, word, &node ); } else { found = search_kmerset ( kset, bal_word, &node ); } //if ( !found ) { printf ( "chopReadFillGap 1321 not found!\n" ); } if ( found && !node->linear && !node->checked ) { if ( !flag && node->inEdge == 1 ) { ctg1start = searchKmerOnCtg ( word, KmerCtg1, len1 ); if ( ctg1start >= 0 ) { flag = 1; seqStart = j + overlap - 1; } } else if ( flag == 1 && node->inEdge == 1 ) { index = searchKmerOnCtg ( word, KmerCtg1, len1 ); if ( index >= 0 && index > ctg1start ) // choose hit closer to gap { ctg1start = index; seqStart = j + overlap - 1; } } else if ( flag == 1 && node->inEdge == 2 ) { ctg2end = searchKmerOnCtg ( word, KmerCtg2, len2 ); if ( ctg2end >= 0 ) { flag = 3; seqEnd = j + overlap - 1; break; } } if ( !bal_flag && node->inEdge == 2 ) { bal_ctg2end = searchKmerOnCtg ( bal_word, KmerCtg2, len2 ); if ( bal_ctg2end >= 0 ) { bal_flag = 2; bal_end = bal_j + overlap - 1; } } else if ( bal_flag == 2 && node->inEdge == 2 ) { index = searchKmerOnCtg ( bal_word, KmerCtg2, len2 ); if ( index >= 0 && index < bal_ctg2end ) // choose hit closer to gap { bal_ctg2end = index; bal_end = bal_j + overlap - 1; } } else if ( bal_flag == 2 && node->inEdge == 1 ) { bal_ctg1start = searchKmerOnCtg ( bal_word, KmerCtg1, len1 ); if ( bal_ctg1start >= 0 ) { bal_flag = 3; bal_start = bal_j + overlap - 1; break; } } } } if ( flag == 3 ) { *start = seqStart; *end = seqEnd; *bal = 0; *index1 = ctg1start; *index2 = ctg2end; return 1; } else if ( bal_flag == 3 ) { *start = bal_start; *end = bal_end; *bal = 1; *index1 = bal_ctg1start; *index2 = bal_ctg2end; return 1; } return 0; } static int cutSeqFromTightStr ( char *tightStr, int length, int start, int end, int revS, char *src_seq ) { int i, index = 0; end = end < length ? end : length - 1; start = start >= 0 ? start : 0; if ( !revS ) { for ( i = start; i <= end; i++ ) { src_seq[index++] = getCharInTightString ( tightStr, i ); } } else { for ( i = length - 1 - start; i >= length - end - 1; i-- ) { src_seq[index++] = int_comp ( getCharInTightString ( tightStr, i ) ); } } return end - start + 1; } static int cutSeqFromCtg ( unsigned int ctgID, int start, int end, char *sequence, int originOverlap ) { unsigned int bal_ctg = getTwinCtg ( ctgID ); if ( contig_array[ctgID].length < 1 ) { return 0; } int length = contig_array[ctgID].length + originOverlap; if ( contig_array[ctgID].seq ) { return cutSeqFromTightStr ( contig_array[ctgID].seq, length, start, end, 0, sequence ); } else { return cutSeqFromTightStr ( contig_array[bal_ctg].seq, length, start, end, 1, sequence ); } } static int cutSeqFromRead ( char *src_seq, int length, int start, int end, char *sequence ) { if ( end >= length ) { fprintf ( stderr, "The index is bigger than the length: end %d length %d.\n", end, length ); } end = end < length ? end : length - 1; start = start >= 0 ? start : 0; int i; for ( i = start; i <= end; i++ ) { sequence[i - start] = src_seq[i]; } return end - start + 1; } void printSeq ( FILE *fo, char *seq, int len ) { int i; for ( i = 0; i < len; i++ ) { fprintf ( fo, "%c", int2base ( ( int ) seq[i] ) ); } fprintf ( fo, "\n" ); } static boolean readsCrossGap ( READNEARBY *rdArray, int num, int originOverlap, DARRAY *gapSeqArray, Kmer *kmerCtg1, Kmer *kmerCtg2, int overlap, CTGinSCAF *ctg1, CTGinSCAF *ctg2, KmerSet *kmerS, Kmer WordFilter, int min, int max, int offset1, int offset2, char *seqGap, char *seqCtg1, char *seqCtg2, int cut1, int cut2 ) { int i, j, start, end, startOnCtg1, endOnCtg2; char *bal_seq; char *src_seq; char *pt; boolean bal, ret = 0, FILL; double maxScore = 0.0; int maxIndex; int lenCtg1, lenCtg2; //build sequences on left and right of the uncertain region int buffer_size = maxReadLen > 100 ? maxReadLen : 100; int length = contig_array[ctg1->ctgID].length + originOverlap; if ( buffer_size > offset1 ) { lenCtg1 = cutSeqFromCtg ( ctg1->ctgID, length - cut1 - ( buffer_size - offset1 ), length - 1 - cut1, seqCtg1, originOverlap ); for ( i = 0; i < offset1; i++ ) { seqCtg1[lenCtg1 + i] = seqGap[i]; } lenCtg1 += offset1; } else { for ( i = offset1 - buffer_size; i < offset1; i++ ) { seqCtg1[i + buffer_size - offset1] = seqGap[i]; } lenCtg1 = buffer_size; } length = contig_array[ctg2->ctgID].length + originOverlap; if ( buffer_size > offset2 ) { lenCtg2 = cutSeqFromCtg ( ctg2->ctgID, cut2, buffer_size - offset2 - 1 + cut2, & ( seqCtg2[offset2] ), originOverlap ); for ( i = 0; i < offset2; i++ ) { seqCtg2[i] = seqGap[i + offset1]; } lenCtg2 += offset2; } else { for ( i = 0; i < buffer_size; i++ ) { seqCtg2[i] = seqGap[i + offset1]; } lenCtg2 = buffer_size; } /* if(offset1>0||offset2>0){ for(i=0;i max ) { continue; } if ( overlap + ( len1 - startOnCtg1 - 1 ) + endOnCtg2 - ( end - start ) > ( int ) originOverlap ) { continue; } // contig1 and contig2 could not overlap more than origOverlap bases START[i] = start; END[i] = end; INDEX1[i] = startOnCtg1; INDEX2[i] = endOnCtg2; BAL[i] = bal; int matchLen = 2 * overlap < ( end - start + overlap ) ? 2 * overlap : ( end - start + overlap ); int match; int alignLen = matchLen; //compare the left of hit kmer on ctg1 //int ctgLeft = (contig_array[ctg1->ctgID].length+originOverlap)-(len1+overlap-1)+startOnCtg1; int ctgLeft = ( lenCtg1 ) - ( len1 + overlap - 1 ) + startOnCtg1; int readLeft = start - overlap + 1; int cmpLen = ctgLeft < readLeft ? ctgLeft : readLeft; cmpLen = cmpLen <= MAXREADLENGTH ? cmpLen : MAXREADLENGTH; //cutSeqFromCtg(ctg1->ctgID,ctgLeft-cmpLen,ctgLeft-1,fastSequence,originOverlap); cutSeqFromRead ( seqCtg1, lenCtg1, ctgLeft - cmpLen, ctgLeft - 1, fastSequence ); if ( !bal ) { cutSeqFromRead ( src_seq, rdArray[i].len, readLeft - cmpLen, readLeft - 1, slowSequence ); } else { cutSeqFromRead ( bal_seq, rdArray[i].len, readLeft - cmpLen, readLeft - 1, slowSequence ); } match = compareSequences ( fastSequence, slowSequence, cmpLen, cmpLen ); alignLen += cmpLen; matchLen += match; //compare the right of hit kmer on ctg1 int ctgRight = len1 - startOnCtg1 - 1; cmpLen = ctgRight < ( rdArray[i].len - start - 1 ) ? ctgRight : ( rdArray[i].len - start - 1 ); cmpLen = cmpLen <= MAXREADLENGTH ? cmpLen : MAXREADLENGTH; //cutSeqFromCtg(ctg1->ctgID,ctgLeft+overlap,ctgLeft+overlap+cmpLen-1,fastSequence,originOverlap); cutSeqFromRead ( seqCtg1, lenCtg1, ctgLeft + overlap, ctgLeft + overlap + cmpLen - 1, fastSequence ); if ( !bal ) { cutSeqFromRead ( src_seq, rdArray[i].len, start + 1, start + cmpLen, slowSequence ); } else { cutSeqFromRead ( bal_seq, rdArray[i].len, start + 1, start + cmpLen, slowSequence ); } match = compareSequences ( fastSequence, slowSequence, cmpLen, cmpLen ); //fprintf(stderr,"%d -- %d\n",match,cmpLen); alignLen += cmpLen; matchLen += match; //compare the left of hit kmer on ctg2 ctgLeft = endOnCtg2; readLeft = end - overlap + 1; cmpLen = ctgLeft < readLeft ? ctgLeft : readLeft; cmpLen = ctgLeft <= MAXREADLENGTH ? ctgLeft : MAXREADLENGTH; //cutSeqFromCtg(ctg2->ctgID,endOnCtg2-cmpLen,endOnCtg2-1,fastSequence,originOverlap); cutSeqFromRead ( seqCtg2, lenCtg2, endOnCtg2 - cmpLen, endOnCtg2 - 1, fastSequence ); if ( !bal ) { cutSeqFromRead ( src_seq, rdArray[i].len, readLeft - cmpLen, readLeft - 1, slowSequence ); } else { cutSeqFromRead ( bal_seq, rdArray[i].len, readLeft - cmpLen, readLeft - 1, slowSequence ); } match = compareSequences ( fastSequence, slowSequence, cmpLen, cmpLen ); alignLen += cmpLen; matchLen += match; //compare the right of hit kmer on ctg2 //ctgRight = contig_array[ctg2->ctgID].length+originOverlap-endOnCtg2-overlap; ctgRight = lenCtg2 - endOnCtg2 - overlap; cmpLen = ctgRight < ( rdArray[i].len - end - 1 ) ? ctgRight : ( rdArray[i].len - end - 1 ); cmpLen = cmpLen <= MAXREADLENGTH ? cmpLen : MAXREADLENGTH; //cutSeqFromCtg(ctg2->ctgID,endOnCtg2+overlap,endOnCtg2+overlap+cmpLen-1,fastSequence,originOverlap); cutSeqFromRead ( seqCtg2, lenCtg2, endOnCtg2 + overlap, endOnCtg2 + overlap + cmpLen - 1, fastSequence ); if ( !bal ) { cutSeqFromRead ( src_seq, rdArray[i].len, end + 1, end + cmpLen, slowSequence ); } else { cutSeqFromRead ( bal_seq, rdArray[i].len, end + 1, end + cmpLen, slowSequence ); } match = compareSequences ( fastSequence, slowSequence, cmpLen, cmpLen ); alignLen += cmpLen; matchLen += match; /* if(cmpLen>0&&match!=cmpLen+overlap){ printSeq(stderr,fastSequence,cmpLen+overlap); printSeq(stderr,slowSequence,cmpLen+overlap); printKmerSeqLocal(stderr,kmerCtg2[endOnCtg2],overlap); fprintf(stderr,": %d(%d)\n",bal,endOnCtg2); }else if(cmpLen>0&&match==cmpLen+overlap) fprintf(stderr,"Perfect\n"); */ double score = ( double ) matchLen / alignLen; if ( maxScore < score ) { maxScore = score; //fprintf(stderr,"%4.2f (%d/%d)\n",maxScore,matchLen,alignLen); maxIndex = i; } SCORE[i] = score; } /* if(maxScore>0.0) fprintf(stderr,"SCORE: %4.2f\n",maxScore); */ if ( maxScore > 0.9 ) { /* for(i=0;i 0 ? offset1 - ( len1 - INDEX1[maxIndex] - 1 ) : 0; int rightRemain = offset2 - ( overlap + INDEX2[maxIndex] ) > 0 ? offset2 - ( overlap + INDEX2[maxIndex] ) : 0; ctg1->gapSeqOffset = gapSeqArray->item_c; ctg1->gapSeqLen = END[maxIndex] - START[maxIndex] + leftRemain + rightRemain; if ( darrayPut ( gapSeqArray, ctg1->gapSeqOffset + ( END[maxIndex] - START[maxIndex] + leftRemain + rightRemain ) / 4 ) ) { pt = ( char * ) darrayPut ( gapSeqArray, ctg1->gapSeqOffset ); for ( j = 0; j < leftRemain; j++ ) //get the left side of the gap region from search { writeChar2tightString ( seqGap[j], pt, j ); //fprintf(stderr,"%c",int2base(seqGap[j])); } for ( j = START[maxIndex] + 1; j <= END[maxIndex]; j++ ) { if ( BAL[maxIndex] ) { writeChar2tightString ( bal_seq[j], pt, j - START[maxIndex] - 1 + leftRemain ); //fprintf(stderr,"%c",int2base(bal_seq[j])); } else { writeChar2tightString ( src_seq[j], pt, j - START[maxIndex] - 1 + leftRemain ); //fprintf(stderr,"%c",int2base(src_seq[j])); } } for ( j = offset2 - rightRemain; j < offset2; j++ ) //get the right side of the gap region from search { writeChar2tightString ( seqGap[j + leftRemain], pt, j + END[maxIndex] - START[maxIndex] + leftRemain ); //fprintf(stderr,"%c",int2base(seqGap[j+leftRemain])); } /* fprintf(stderr,": GAPSEQ (%d+%d)(%d+%d)(%d+%d)(%d+%d) B %d\n",offset1,offset2,cut1,cut2, len1-INDEX1[maxIndex]-1,INDEX2[maxIndex],START[maxIndex],END[maxIndex],BAL[maxIndex]); */ ctg1->cutTail = len1 - INDEX1[maxIndex] - 1 - offset1 + cut1 > cut1 ? len1 - INDEX1[maxIndex] - 1 - offset1 + cut1 : cut1; ctg2->cutHead = overlap + INDEX2[maxIndex] - offset2 + cut2 > cut2 ? overlap + INDEX2[maxIndex] - offset2 + cut2 : cut2; ctg2->scaftig_start = 0; ret = 1; } } free ( ( void * ) START ); free ( ( void * ) END ); free ( ( void * ) INDEX1 ); free ( ( void * ) INDEX2 ); free ( ( void * ) SCORE ); free ( ( void * ) BAL ); free ( ( void * ) src_seq ); free ( ( void * ) bal_seq ); return ret; }