/* * localAsm.c * * Copyright (c) 2011-2013 BGI-Shenzhen . * * This file is part of SOAPdenovo-Trans. * * SOAPdenovo-Trans is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * SOAPdenovo-Trans is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with SOAPdenovo-Trans. If not, see . * */ #include "stdinc.h" #include "newhash.h" #include "extfunc.h" #include "extvab.h" #define CTGendLen 35 // shouldn't larger than max_read_len #define UPlimit 5000 #define MaxRouteNum 10 static void kmerSet_mark (KmerSet * set); static void trace4Repeat (Kmer currW, int steps, int min, int max, int *num_route,KmerSet * kset, Kmer kmerDest, int overlap, Kmer WORDF,int *traceCounter, int maxRoute, kmer_t ** soFarNode, short *multiOccu1, short *multiOccu2, int *routeLens, char **foundRoutes, char *soFarSeq, long long *soFarLinks, double *avgLinks); static Kmer prevKmerLocal (Kmer next, char ch, int overlap) { #ifdef MER127 Kmer word = KmerRightBitMoveBy2 (next); if (2 * (overlap - 1) < 64) { word.low2 |= (((ubyte8) ch) << 2 * (overlap - 1)); } if (2 * (overlap - 1) >= 64 && 2 * (overlap - 1) < 128) { word.high2 |= ((ubyte8) ch) << (2 * (overlap - 1) - 64); } if (2 * (overlap - 1) >= 128 && 2 * (overlap - 1) < 192) { word.low1 |= ((ubyte8) ch) << (2 * (overlap - 1) - 128); } if (2 * (overlap - 1) >= 192 && 2 * (overlap - 1) < 256) { word.high1 |= ((ubyte8) ch) << (2 * (overlap - 1) - 192); } #endif #ifdef MER63 Kmer word = KmerRightBitMoveBy2 (next); if(2*(overlap-1)<64) word.low |= (((ubyte8)ch) << 2*(overlap-1)); else word.high |= ((ubyte8)ch) << (2*(overlap-1)-64); #endif #ifdef MER31 Kmer word = next; word >>= 2; word += ((Kmer) ch) << 2 * (overlap - 1); #endif return word; } static Kmer nextKmerLocal (Kmer prev, char ch, Kmer WordFilter) { Kmer word = KmerLeftBitMoveBy2 (prev); word = KmerAnd (word, WordFilter); #ifdef MER127 word.low2 |= ch; #endif #ifdef MER63 word.low |= ch; #endif #ifdef MER31 word |= ch; #endif return word; } static void singleKmer (int t, KmerSet * kset, int flag, Kmer * kmerBuffer, char *prevcBuffer, char *nextcBuffer) { kmer_t *pos; put_kmerset (kset, kmerBuffer[t], prevcBuffer[t], nextcBuffer[t], &pos); if (pos->inEdge == flag) { return; } else if (pos->inEdge == 0) { pos->inEdge = flag; } else if (pos->inEdge == 1 && flag == 2) { pos->inEdge = 3; } else if (pos->inEdge == 2 && flag == 1) { pos->inEdge = 3; } } static void putKmer2DBgraph (KmerSet * kset, int flag, int kmer_c, Kmer * kmerBuffer, char *prevcBuffer, char *nextcBuffer) { int t; for (t = 0; t < kmer_c; t++) { singleKmer (t, kset, flag, kmerBuffer, prevcBuffer, nextcBuffer); } } static void getSeqFromRead (READNEARBY read, char *src_seq) { int len_seq = read.len; int j; char *tightSeq = (char *) darrayGet (readSeqInGap, read.seqStarter); for (j = 0; j < len_seq; j++) { src_seq[j] = getCharInTightString (tightSeq, j); } } static void chopKmer4Ctg (Kmer * kmerCtg, int lenCtg, int overlap, char *src_seq, Kmer WORDF) { int index, j; Kmer word; word=kmerZero; for (index = 0; index < overlap; index++) { word = KmerLeftBitMoveBy2 (word); #ifdef MER127 word.low2 |= src_seq[index]; #endif #ifdef MER63 word.low |= src_seq[index]; #endif #ifdef MER31 word |= src_seq[index]; #endif } index = 0; kmerCtg[index++] = word; for (j = 1; j <= lenCtg - overlap; j++) { word = nextKmerLocal (word, src_seq[j - 1 + overlap], WORDF); kmerCtg[index++] = word; } } static void chopKmer4read (int len_seq, int overlap, char *src_seq, char *bal_seq, Kmer * kmerBuffer, char *prevcBuffer, char *nextcBuffer, int *kmer_c, Kmer WORDF) { int j, bal_j; Kmer word, bal_word; int index; char InvalidCh = 4; if (len_seq < overlap + 1) { *kmer_c = 0; return; } word=kmerZero; for (index = 0; index < overlap; index++) { word = KmerLeftBitMoveBy2 (word); #ifdef MER127 word.low2 |= src_seq[index]; #endif #ifdef MER63 word.low |= src_seq[index]; #endif #ifdef MER31 word|= src_seq[index]; #endif } reverseComplementSeq (src_seq, len_seq, bal_seq); // complementary node bal_word = reverseComplement (word, overlap); bal_j = len_seq - 0 - overlap; // 0; index = 0; if (KmerSmaller (word, bal_word)) { kmerBuffer[index] = word; prevcBuffer[index] = InvalidCh; nextcBuffer[index++] = src_seq[0 + overlap]; } else { kmerBuffer[index] = bal_word; prevcBuffer[index] = bal_seq[bal_j - 1]; nextcBuffer[index++] = InvalidCh; } for (j = 1; j <= len_seq - overlap; j++) { word = nextKmerLocal (word, src_seq[j - 1 + overlap], WORDF); bal_j = len_seq - j - overlap; // j; bal_word = prevKmerLocal (bal_word, bal_seq[bal_j], overlap); if (KmerSmaller (word, bal_word)) { kmerBuffer[index] = word; prevcBuffer[index] = src_seq[j - 1]; if (j < len_seq - overlap) { nextcBuffer[index++] = src_seq[j + overlap]; } else { nextcBuffer[index++] = InvalidCh; } //printf("%dth: %p with %p\n",kmer_c-1,word,hashBanBuffer[kmer_c-1]); } else { // complementary node kmerBuffer[index] = bal_word; if (bal_j > 0) { prevcBuffer[index] = bal_seq[bal_j - 1]; } else { prevcBuffer[index] = InvalidCh; } nextcBuffer[index++] = bal_seq[bal_j + overlap]; //printf("%dth: %p with %p\n",kmer_c-1,bal_word,hashBanBuffer[kmer_c-1]); } } *kmer_c = index; } static void headTightStr (char *tightStr, int length, int start, int headLen, int revS, char *src_seq) { int i, index = 0; if (!revS) { for (i = start; i < start + headLen; i++) { src_seq[index++] = getCharInTightString (tightStr, i); } } else { for (i = length - 1 - start; i >= length - headLen - start; i--) { src_seq[index++] = int_comp (getCharInTightString (tightStr, i)); } } } static int getSeqFromCtg (CTGinSCAF * ctg, boolean fromHead, unsigned int len, int originOverlap, char *src_seq) { unsigned int ctgId = ctg->ctgID; unsigned int bal_ctg = getTwinCtg (ctgId); if (contig_array[ctgId].length < 1) { return 0; } unsigned int length = contig_array[ctgId].length + originOverlap; len = len < length ? len : length; if (fromHead) { if (contig_array[ctgId].seq) { headTightStr (contig_array[ctgId].seq, length, 0, len, 0, src_seq); } else { headTightStr (contig_array[bal_ctg].seq, length, 0, len, 1, src_seq); } } else { if (contig_array[ctgId].seq) { headTightStr (contig_array[ctgId].seq, length, length - len, len, 0, src_seq); } else { headTightStr (contig_array[bal_ctg].seq, length, length - len, len, 1, src_seq); } } return len; } static KmerSet *readsInGap2DBgraph (READNEARBY * rdArray, int num, CTGinSCAF * ctg1, CTGinSCAF * ctg2, int originOverlap, Kmer * kmerCtg1, Kmer * kmerCtg2, int overlap, Kmer WordFilter) { int kmer_c; Kmer *kmerBuffer; char *nextcBuffer, *prevcBuffer; int i; int buffer_size = maxReadLen > CTGendLen ? maxReadLen : CTGendLen; KmerSet *kmerS = NULL; int lenCtg1; int lenCtg2; char *bal_seq; char *src_seq; src_seq = (char *) ckalloc (buffer_size * sizeof (char)); bal_seq = (char *) ckalloc (buffer_size * sizeof (char)); lenCtg1 = getSeqFromCtg (ctg1, 0, CTGendLen, originOverlap, src_seq); lenCtg2 = getSeqFromCtg (ctg2, 1, CTGendLen, originOverlap, src_seq); if (lenCtg1 <= overlap || lenCtg2 <= overlap) { free ((void *) src_seq); free ((void *) bal_seq); return kmerS; } kmerBuffer = (Kmer *) ckalloc (buffer_size * sizeof (Kmer)); prevcBuffer = (char *) ckalloc (buffer_size * sizeof (char)); nextcBuffer = (char *) ckalloc (buffer_size * sizeof (char)); kmerS = init_kmerset (1024, 0.77f); for (i = 0; i < num; i++) { getSeqFromRead (rdArray[i], src_seq); chopKmer4read (rdArray[i].len, overlap, src_seq, bal_seq, kmerBuffer, prevcBuffer, nextcBuffer, &kmer_c, WordFilter); putKmer2DBgraph (kmerS, 0, kmer_c, kmerBuffer, prevcBuffer, nextcBuffer); } lenCtg1 = getSeqFromCtg (ctg1, 0, CTGendLen, originOverlap, src_seq); chopKmer4Ctg (kmerCtg1, lenCtg1, overlap, src_seq, WordFilter); chopKmer4read (lenCtg1, overlap, src_seq, bal_seq, kmerBuffer, prevcBuffer, nextcBuffer, &kmer_c, WordFilter); putKmer2DBgraph (kmerS, 1, kmer_c, kmerBuffer, prevcBuffer, nextcBuffer); lenCtg2 = getSeqFromCtg (ctg2, 1, CTGendLen, originOverlap, src_seq); chopKmer4Ctg (kmerCtg2, lenCtg2, overlap, src_seq, WordFilter); chopKmer4read (lenCtg2, overlap, src_seq, bal_seq, kmerBuffer, prevcBuffer, nextcBuffer, &kmer_c, WordFilter); putKmer2DBgraph (kmerS, 2, kmer_c, kmerBuffer, prevcBuffer, nextcBuffer); /* if(ctg1->ctgID==3733&&ctg2->ctgID==3067){ for(i=0;i31kmer static void printKmerSeqLocal (FILE * fp, Kmer kmer, int overlap) { int i, bit1, bit2, bit3, bit4; char ch; char kmerSeq[128]; if (overlap < 32) { bit4 = overlap; bit3 = 0; bit2 = 0; bit1 = 0; } if (overlap >= 32 && overlap < 64) { bit4 = 32; bit3 = overlap - 32; bit2 = 0; bit1 = 0; } if (overlap >= 64 && overlap < 96) { bit4 = 32; bit3 = 32; bit2 = overlap - 64; bit1 = 0; } if (overlap >= 96 && overlap < 128) { bit4 = 32; bit3 = 32; bit2 = 32; bit1 = overlap - 96; } for (i = bit1 - 1; i >= 0; i--) { ch = kmer.high1 & 0x3; kmer.high1 >>= 2; kmerSeq[i] = ch; } for (i = bit2 - 1; i >= 0; i--) { ch = kmer.low1 & 0x3; kmer.low1 >>= 2; kmerSeq[i + bit1] = ch; } for (i = bit3 - 1; i >= 0; i--) { ch = kmer.high2 & 0x3; kmer.high2 >>= 2; kmerSeq[i + bit1 + bit2] = ch; } for (i = bit4 - 1; i >= 0; i--) { ch = kmer.low2 & 0x3; kmer.low2 >>= 2; kmerSeq[i + bit1 + bit2 + bit3] = ch; } for (i = 0; i < overlap; i++) { fprintf (fp, "%c", int2base ((int) kmerSeq[i])); } }*/ static void kmerSet_mark (KmerSet * set) { int i, in_num, out_num, cvgSingle; kmer_t *rs; long long counter = 0, linear = 0; Kmer word; set->iter_ptr = 0; while (set->iter_ptr < set->size) { if (!is_kmer_entity_null (set->flags, set->iter_ptr)) { in_num = out_num = 0; rs = set->array + set->iter_ptr; word = rs->seq; for (i = 0; i < 4; i++) { cvgSingle = get_kmer_left_cov (*rs, i); if (cvgSingle > 0) { in_num++; } cvgSingle = get_kmer_right_cov (*rs, i); if (cvgSingle > 0) { out_num++; } } if (rs->single) { counter++; } if (in_num == 1 && out_num == 1) { rs->linear = 1; linear++; } } set->iter_ptr++; } //printf("Allocated %ld node, %ld single nodes, %ld linear\n",(long)count_kmerset(set),counter,linear); } static kmer_t *searchNode (Kmer word, KmerSet * kset, int overlap) { Kmer bal_word = reverseComplement (word, overlap); kmer_t *node; boolean found; if (KmerSmaller (word, bal_word)) { found = search_kmerset (kset, word, &node); } else { found = search_kmerset (kset, bal_word, &node); } if (found) { return node; } else { return NULL; } } static int searchKmerOnCtg (Kmer currW, Kmer * kmerDest, int num) { int i; for (i = 0; i < num; i++) { if (KmerEqual (currW, kmerDest[i])) { return i; } } return -1; } // pick on from n items randomly static int nPick1 (int *array, int n) { int m, i; m = n - 1; //(int)(drand48()*n); int value = array[m]; for (i = m; i < n - 1; i++) { array[i] = array[i + 1]; } return value; } static void traceAlongDBgraph (Kmer currW, int steps, int min, int max, int *num_route,KmerSet * kset, Kmer * kmerDest, int num, int overlap, Kmer WORDF,char **foundRoutes, int *routeEndOnCtg2, int *routeLens, char *soFarSeq, int *traceCounter, int maxRoute, kmer_t ** soFarNode, boolean * multiOccu, long long *soFarLinks, double *avgLinks) { (*traceCounter)++; if (*traceCounter > UPlimit) { /* if(overlap==19&&kmerDest[0]==pubKmer) printf("UPlimit\n"); */ return; } if (steps > max || *num_route >= maxRoute) { /* if(overlap==19&&kmerDest[0]==pubKmer) printf("max steps/maxRoute\n"); */ return; } Kmer word = reverseComplement (currW, overlap); boolean isSmaller = KmerSmaller (currW, word); int i; char ch; unsigned char links; if (isSmaller) { word = currW; } kmer_t *node; boolean found = search_kmerset (kset, word, &node); // #ifdef DEBUG if (!found) { #ifdef MER127 printf ( "%s Trace: can't find kmer %llx %llx %llx %llx (input %llx %llx %llx %llx) at step %d\n", __FUNCTION__, word.high1, word.low1, word.high2, word.low2, currW.high1, currW.low1, currW.high2, currW.low2, steps ); #endif #ifdef MER63 printf ( "%s Trace: can't find kmer %llx %llx (input %llx %llx ) at step %d\n", __FUNCTION__, word.high,word.low, currW.high,currW.low, steps ); #endif #ifdef MER31 printf ( "%s Trace: can't find kmer %llx (input %llx ) at step %d\n", __FUNCTION__, word, currW, steps ); #endif return; } // #else // if (!found) return; // #endif if (node->twin > 1) { return; } if (soFarNode) { soFarNode[steps] = node; } if (steps > 0) { soFarSeq[steps - 1] = lastCharInKmer (currW); } int index, end; int linkCounter = *soFarLinks; if (steps >= min && node->inEdge > 1 && (end = searchKmerOnCtg (currW, kmerDest, num)) >= 0) { index = *num_route; if (steps > 0) { avgLinks[index] = (double) linkCounter / steps; } else { avgLinks[index] = 0; } //find node that appears more than once in the path multiOccu[index] = 0; for (i = 0; i < steps + 1; i++) { soFarNode[i]->deleted = 0; } for (i = 0; i < steps + 1; i++) { if (soFarNode[i]->deleted) { multiOccu[index] = 1; break; } soFarNode[i]->deleted = 1; } routeEndOnCtg2[index] = end; routeLens[index] = steps; char *array = foundRoutes[index]; for (i = 0; i < steps; i++) { array[i] = soFarSeq[i]; } if (i < max) { array[i] = 4; } //indicate the end of the sequence *num_route = ++index; return; } steps++; if (isSmaller) { int array[] = { 0, 1, 2, 3 }; for (i = 4; i > 0; i--) { ch = nPick1 (array, i); links = get_kmer_right_cov (*node, ch); if (!links) { continue; } *soFarLinks = linkCounter + links; word = nextKmerLocal (currW, ch, WORDF); traceAlongDBgraph (word, steps, min, max, num_route,kset, kmerDest, num, overlap, WORDF, foundRoutes, routeEndOnCtg2, routeLens, soFarSeq, traceCounter, maxRoute, soFarNode, multiOccu, soFarLinks, avgLinks); } } else { int array[] = { 0, 1, 2, 3 }; for (i = 4; i > 0; i--) { ch = nPick1 (array, i); links = get_kmer_left_cov (*node, ch); if (!links) { continue; } *soFarLinks = linkCounter + links; word = nextKmerLocal (currW, int_comp (ch), WORDF); traceAlongDBgraph (word, steps, min, max, num_route,kset, kmerDest, num, overlap, WORDF, foundRoutes, routeEndOnCtg2, routeLens, soFarSeq, traceCounter, maxRoute, soFarNode, multiOccu, soFarLinks, avgLinks); } } } static int searchFgap (KmerSet * kset, CTGinSCAF * ctg1, CTGinSCAF * ctg2, Kmer * kmerCtg1,Kmer * kmerCtg2, unsigned int origOverlap, int overlap, DARRAY * gapSeqArray, int len1, int len2, Kmer WordFilter, int *offset1, int *offset2, char *seqGap, int *cut1, int *cut2) { int i; int ret = 0; kmer_t *node, **soFarNode; int num_route; int gapLen = ctg2->start - ctg1->end - origOverlap + overlap; int min = gapLen - GLDiff > 0 ? gapLen - GLDiff : 0; //0531 int max = gapLen + GLDiff < 10 ? 10 : gapLen + GLDiff; char **foundRoutes; char *soFarSeq; int traceCounter; int *routeEndOnCtg2; int *routeLens; boolean *multiOccu; long long soFarLinks; double *avgLinks; //mask linear internal linear kmer on contig1 end routeEndOnCtg2 = (int *) ckalloc (MaxRouteNum * sizeof (int)); routeLens = (int *) ckalloc (MaxRouteNum * sizeof (int)); multiOccu = (boolean *) ckalloc (MaxRouteNum * sizeof (boolean)); short *MULTI1 = (short *) ckalloc (MaxRouteNum * sizeof (short)); short *MULTI2 = (short *) ckalloc (MaxRouteNum * sizeof (short)); soFarSeq = (char *) ckalloc (max * sizeof (char)); soFarNode = (kmer_t **) ckalloc ((max + 1) * sizeof (kmer_t *)); foundRoutes = (char **) ckalloc (MaxRouteNum * sizeof (char *));; avgLinks = (double *) ckalloc (MaxRouteNum * sizeof (double));; for (i = 0; i < MaxRouteNum; i++) { foundRoutes[i] = (char *) ckalloc (max * sizeof (char)); } for (i = len1 - 1; i >= 0; i--) { num_route = traceCounter = soFarLinks = 0; int steps = 0; traceAlongDBgraph (kmerCtg1[i], steps, min, max, &num_route,kset, kmerCtg2, len2, overlap, WordFilter, foundRoutes, routeEndOnCtg2, routeLens, soFarSeq, &traceCounter, MaxRouteNum, soFarNode, multiOccu, &soFarLinks, avgLinks); if (num_route > 0) { int m, minEnd = routeEndOnCtg2[0]; for (m = 0; m < num_route; m++) { if (routeLens[m] < 0) { continue; } if (routeEndOnCtg2[m] < minEnd) { minEnd = routeEndOnCtg2[m]; } } /* else if(minFreq>1){ for(m=0;m3) break; printf("%c",int2base((int)foundRoutes[m][j])); } printf(": %4.2f\n",avgLinks[m]); } } */ num_route = traceCounter = soFarLinks = 0; steps = 0; trace4Repeat (kmerCtg1[i], steps, min, max, &num_route,kset, kmerCtg2[minEnd], overlap, WordFilter, &traceCounter, MaxRouteNum, soFarNode, MULTI1, MULTI2, routeLens, foundRoutes, soFarSeq, &soFarLinks, avgLinks); int j, best = 0; int maxLen = routeLens[0]; double maxLink = avgLinks[0]; char *pt; boolean repeat = 0, sameLen = 1; int leftMost = max, rightMost = max; #ifdef DEBUG if (num_route < 1) { fprintf (stderr, "After trace4Repeat: non route was found\n"); continue; } #else if (num_route < 1) continue; #endif if (num_route > 1) { // if multi paths are found, we check on the repeatative occurrences and links/length for (m = 0; m < num_route; m++) { if (routeLens[m] < 0) { continue; } if (MULTI1[m] >= 0 && MULTI2[m] >= 0) { repeat = 1; leftMost = leftMost > MULTI1[m] ? MULTI1[m] : leftMost; rightMost = rightMost > MULTI2[m] ? MULTI2[m] : rightMost; } if (routeLens[m] != maxLen) { sameLen = 0; } if (routeLens[m] < maxLen) { maxLen = routeLens[m]; } if (avgLinks[m] > maxLink) { maxLink = avgLinks[m]; best = m; } } } if (repeat) { *offset1 = *offset2 = *cut1 = *cut2 = 0; int index = 0; char ch; for (j = 0; j < leftMost; j++) { if (routeLens[0] < j + overlap + 1) { break; } else { ch = foundRoutes[0][j]; } for (m = 1; m < num_route; m++) { if (routeLens[m] < 0) { continue; } if (ch != foundRoutes[m][j]) { break; } } if (m == num_route) { seqGap[index++] = ch; } else { break; } } *offset1 = index; index = 0; for (j = 0; j < rightMost; j++) { if (routeLens[0] - overlap - 1 < j) { break; } else { ch = foundRoutes[0][routeLens[0] - overlap - 1 - j]; } for (m = 1; m < num_route; m++) { if (routeLens[m] < 0) { continue; } if (ch != foundRoutes[m][routeLens[m] - overlap - 1 - j]) { break; } } if (m == num_route) { index++; } else { break; } } *offset2 = index; for (j = 0; j < *offset2; j++) { seqGap[*offset1 + *offset2 - 1 - j] = foundRoutes[0][routeLens[0] - overlap - 1 - j]; } if (*offset1 > 0 || *offset2 > 0) { *cut1 = len1 - i - 1; *cut2 = minEnd; //fprintf(stderr,"\n"); for (m = 0; m < num_route; m++) { for (j = 0; j < max; j++) { if (foundRoutes[m][j] > 3) { break; } //fprintf(stderr,"%c",int2base((int)foundRoutes[m][j])); } //fprintf(stderr,": %4.2f\n",avgLinks[m]); } /* fprintf(stderr,">Gap (%d + %d) (%d + %d)\n",*offset1,*offset2,*cut1,*cut2); for(index=0;index<*offset1+*offset2;index++) fprintf(stderr,"%c",int2base(seqGap[index])); fprintf(stderr,"\n"); */ } ret = 3; break; } if (overlap + (len1 - i - 1) + minEnd - routeLens[best] > (int) origOverlap) { continue; } ctg1->gapSeqOffset = gapSeqArray->item_c; ctg1->gapSeqLen = routeLens[best]; if (!darrayPut (gapSeqArray, ctg1->gapSeqOffset + maxLen / 4)) { continue; } pt = (char *) darrayPut (gapSeqArray, ctg1->gapSeqOffset); /* printKmerSeqLocal(stderr,kmerCtg1[i],overlap); fprintf(stderr,"-"); */ for (j = 0; j < max; j++) { if (foundRoutes[best][j] > 3) { break; } writeChar2tightString (foundRoutes[best][j], pt, j); //fprintf(stderr,"%c",int2base((int)foundRoutes[best][j])); } //fprintf(stderr,": GAPSEQ %d + %d, avglink %4.2f\n",len1-i-1,minEnd,avgLinks[best]); ctg1->cutTail = len1 - i - 1; ctg2->cutHead = overlap + minEnd; ctg2->scaftig_start = 0; ret = 1; break; /* }if(num_route>1){ ret = 2; break; */ } else //mark node which leads to dead end { node = searchNode (kmerCtg1[i], kset, overlap); if (node) { node->twin = 2; } } } for (i = 0; i < MaxRouteNum; i++) { free ((void *) foundRoutes[i]); } free ((void *) soFarSeq); free ((void *) soFarNode); free ((void *) multiOccu); free ((void *) MULTI1); free ((void *) MULTI2); free ((void *) foundRoutes); free ((void *) routeEndOnCtg2); free ((void *) routeLens); return ret; } static void trace4Repeat (Kmer currW, int steps, int min, int max, int *num_route,KmerSet * kset, Kmer kmerDest, int overlap, Kmer WORDF, int *traceCounter, int maxRoute, kmer_t ** soFarNode, short *multiOccu1, short *multiOccu2, int *routeLens, char **foundRoutes, char *soFarSeq, long long *soFarLinks, double *avgLinks) { (*traceCounter)++; if (*traceCounter > UPlimit) { return; } if (steps > max || *num_route >= maxRoute) { return; } Kmer word = reverseComplement (currW, overlap); boolean isSmaller = KmerSmaller (currW, word); char ch; unsigned char links; int index, i; if (isSmaller) { word = currW; } kmer_t *node; boolean found = search_kmerset (kset, word, &node); if (!found) { #ifdef MER127 printf ( "%s Trace: can't find kmer %llx %llx %llx %llx (input %llx %llx %llx %llx) at step %d\n", __FUNCTION__, word.high1, word.low1, word.high2, word.low2, currW.high1, currW.low1, currW.high2, currW.low2, steps ); #endif #ifdef MER63 printf ( "%s Trace: can't find kmer %llx %llx (input %llx %llx ) at step %d\n", __FUNCTION__, word.high,word.low, currW.high,currW.low, steps ); #endif #ifdef MER31 printf ( "%s Trace: can't find kmer %llx (input %llx ) at step %d\n", __FUNCTION__, word, currW, steps ); #endif return; } // #else // if (!found) return; // #endif if (soFarNode) { soFarNode[steps] = node; } if (soFarSeq && steps > 0) { soFarSeq[steps - 1] = lastCharInKmer (currW); } int linkCounter; if (soFarLinks) { linkCounter = *soFarLinks; } if (steps >= min && KmerEqual (currW, kmerDest)) { index = *num_route; if (avgLinks && steps > 0) { avgLinks[index] = (double) linkCounter / steps; } else if (avgLinks) { avgLinks[index] = 0; } //find node that appears more than once in the path if (multiOccu1 && multiOccu2) { for (i = 0; i < steps + 1; i++) { soFarNode[i]->deleted = 0; } int rightMost = 0; boolean MULTI = 0; for (i = 0; i < steps + 1; i++) { if (soFarNode[i]->deleted) { rightMost = rightMost < i - 1 ? i - 1 : rightMost; MULTI = 1; } soFarNode[i]->deleted = 1; } if (!MULTI) { multiOccu1[index] = multiOccu2[index] = -1; } else { multiOccu2[index] = steps - 2 - rightMost < 0 ? 0 : steps - 2 - rightMost; //[0 steps-2] for (i = 0; i < steps + 1; i++) { soFarNode[i]->deleted = 0; } int leftMost = steps - 2; for (i = steps; i >= 0; i--) { if (soFarNode[i]->deleted) { leftMost = leftMost > i - 1 ? i - 1 : leftMost; } soFarNode[i]->deleted = 1; } multiOccu1[index] = leftMost < 0 ? 0 : leftMost; //[0 steps-2] } } if (routeLens) { routeLens[index] = steps; } if (soFarSeq) { char *array = foundRoutes[index]; for (i = 0; i < steps; i++) { array[i] = soFarSeq[i]; } if (i < max) { array[i] = 4; } //indicate the end of the sequence } *num_route = ++index; } steps++; if (isSmaller) { int array[] = { 0, 1, 2, 3 }; for (i = 4; i > 0; i--) { ch = nPick1 (array, i); links = get_kmer_right_cov (*node, ch); if (!links) { continue; } if (soFarLinks) { *soFarLinks = linkCounter + links; } word = nextKmerLocal (currW, ch, WORDF); trace4Repeat (word, steps, min, max, num_route,kset, kmerDest, overlap, WORDF, traceCounter, maxRoute, soFarNode, multiOccu1, multiOccu2, routeLens, foundRoutes, soFarSeq, soFarLinks, avgLinks); } } else { int array[] = { 0, 1, 2, 3 }; for (i = 4; i > 0; i--) { ch = nPick1 (array, i); links = get_kmer_left_cov (*node, ch); if (!links) { continue; } if (soFarLinks) { *soFarLinks = linkCounter + links; } word = nextKmerLocal (currW, int_comp (ch), WORDF); trace4Repeat (word, steps, min, max, num_route, kset, kmerDest, overlap, WORDF, traceCounter, maxRoute, soFarNode, multiOccu1, multiOccu2, routeLens, foundRoutes, soFarSeq, soFarLinks, avgLinks); } } } //found repeat node on contig ends static void maskRepeatNode (KmerSet * kset, Kmer * kmerCtg1, Kmer * kmerCtg2, int overlap, int len1, int len2, int max, Kmer WordFilter) { int i; int num_route, steps; int min = 1, maxRoute = 1; int traceCounter; Kmer word, bal_word; kmer_t *node; boolean found; int counter = 0; for (i = 0; i < len1; i++) { word = kmerCtg1[i]; bal_word = reverseComplement (word, overlap); if (KmerLarger (word, bal_word)) { word = bal_word; } found = search_kmerset (kset, word, &node); if (!found || node->linear) { //printf("Found no node for kmer %llx\n",word); continue; } num_route = traceCounter = 0; steps = 0; trace4Repeat (word, steps, min, max, &num_route, kset, word, overlap, WordFilter, &traceCounter, maxRoute, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL); if (num_route < 1) { continue; } counter++; node->checked = 1; } for (i = 0; i < len2; i++) { word = kmerCtg2[i]; bal_word = reverseComplement (word, overlap); if (KmerLarger (word, bal_word)) { word = bal_word; } found = search_kmerset (kset, word, &node); if (!found || node->linear) { //printf("Found no node for kmer %llx\n",word); continue; } num_route = traceCounter = 0; steps = 0; trace4Repeat (word, steps, min, max, &num_route, kset, word, overlap, WordFilter, &traceCounter, maxRoute, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL); if (num_route < 1) { continue; } counter++; node->checked = 1; } //printf("MR: %d(%d)\n",counter,len1+len2); } /* static boolean chopReadFillGap(int len_seq,int overlap,char *src_seq, char *bal_seq, KmerSet *kset,Kmer WORDF,int *start,int *end,boolean *bal, Kmer *KmerCtg1,int len1,Kmer *KmerCtg2,int len2,int *index1,int *index2) { int index,j=0,bal_j; Kmer word,bal_word; int flag=0,bal_flag=0; int ctg1start,bal_ctg1start,ctg2end,bal_ctg2end; int seqStart,bal_start,seqEnd,bal_end; kmer_t *node; boolean found; if(len_seqlinear&&!node->checked){ if(!flag&&node->inEdge==1){ ctg1start = searchKmerOnCtg(word,KmerCtg1,len1); if(ctg1start>0){ flag = 1; seqStart = j + overlap-1; } } if(!bal_flag&&node->inEdge==2){ bal_ctg2end = searchKmerOnCtg(bal_word,KmerCtg2,len2); if(bal_ctg2end>0){ bal_flag = 2; bal_end = bal_j+overlap-1; } } } for(j = 1; j <= len_seq - overlap; j ++) { word = nextKmerLocal(word,src_seq[j-1+overlap],WORDF); bal_j = len_seq-j-overlap; // j; bal_word = prevKmerLocal(bal_word,bal_seq[bal_j],overlap); if(wordlinear&&!node->checked){ if(!flag&&node->inEdge==1){ ctg1start = searchKmerOnCtg(word,KmerCtg1,len1); if(ctg1start>0){ flag = 1; seqStart = j + overlap-1; } }else if(flag==1&&node->inEdge==1){ index = searchKmerOnCtg(word,KmerCtg1,len1); if(index>ctg1start){ // choose hit closer to gap ctg1start = index; seqStart = j + overlap-1; } }else if(flag==1&&node->inEdge==2){ ctg2end = searchKmerOnCtg(word,KmerCtg2,len2); if(ctg2end>0){ flag = 3; seqEnd = j+overlap-1; break; } } if(!bal_flag&&node->inEdge==2){ bal_ctg2end = searchKmerOnCtg(bal_word,KmerCtg2,len2); if(bal_ctg2end>0){ bal_flag = 2; bal_end = bal_j+overlap-1; } }else if(bal_flag==2&&node->inEdge==2){ index = searchKmerOnCtg(bal_word,KmerCtg2,len2); if(indexinEdge==1){ bal_ctg1start = searchKmerOnCtg(bal_word,KmerCtg1,len1); if(bal_ctg1start>0){ bal_flag = 3; bal_start = bal_j+overlap-1; break; } } } } if(flag==3){ *start = seqStart; *end = seqEnd; *bal = 0; *index1 = ctg1start; *index2 = ctg2end; return 1; }else if(bal_flag==3){ *start = bal_start; *end = bal_end; *bal = 1; *index1 = bal_ctg1start; *index2 = bal_ctg2end; return 1; } return 0; } static boolean readsCrossGap(READNEARBY *rdArray, int num, int originOverlap,DARRAY *gapSeqArray, Kmer *kmerCtg1,Kmer *kmerCtg2,int overlap,int len1,int len2, CTGinSCAF *ctg1,CTGinSCAF *ctg2,KmerSet *kmerS,Kmer WordFilter,int min,int max) { int i,j,start,end,startOnCtg1,endOnCtg2; char *bal_seq; char *src_seq; char *pt; boolean bal,ret=0,FILL; src_seq = (char *)ckalloc(maxReadLen*sizeof(char)); bal_seq = (char *)ckalloc(maxReadLen*sizeof(char)); for(i=0;imax) continue; fprintf(stderr,"Read across\n"); //printf("Filled: K %d, ctg1 %d ctg2 %d,start %d end %d\n",overlap,startOnCtg1,endOnCtg2,start,end); if(overlap+(len1-startOnCtg1-1)+endOnCtg2-(end-start)>(int)originOverlap) continue; // contig1 and contig2 could not overlap more than origOverlap bases ctg1->gapSeqOffset = gapSeqArray->item_c; ctg1->gapSeqLen = end-start; if(!darrayPut(gapSeqArray,ctg1->gapSeqOffset+(end-start)/4)) continue; pt = (char *)darrayPut(gapSeqArray,ctg1->gapSeqOffset); for(j=start+1;j<=end;j++){ if(bal) writeChar2tightString(bal_seq[j],pt,j-start-1); else writeChar2tightString(src_seq[j],pt,j-start-1); } ctg1->cutTail = len1-startOnCtg1-1; ctg2->cutHead = overlap + endOnCtg2; ctg2->scaftig_start = 0; ret = 1; break; } free((void*)src_seq); free((void*)bal_seq); return ret; } */ static void kmerSet_markTandem (KmerSet * set, Kmer WordFilter, int overlap); static boolean readsCrossGap (READNEARBY * rdArray, int num, int originOverlap, DARRAY * gapSeqArray, Kmer * kmerCtg1, Kmer * kmerCtg2, int overlap, CTGinSCAF * ctg1, CTGinSCAF * ctg2, KmerSet * kmerS, Kmer WordFilter, int min, int max, int offset1, int offset2, char *seqGap, char *seqCtg1, char *seqCtg2, int cut1, int cut2); int localGraph (READNEARBY * rdArray, int num, CTGinSCAF * ctg1, CTGinSCAF * ctg2, int origOverlap, Kmer * kmerCtg1, Kmer * kmerCtg2, int overlap, DARRAY * gapSeqArray, char *seqCtg1, char *seqCtg2, char *seqGap) { /**************** put kmer in DBgraph ****************/ KmerSet *kmerSet; Kmer WordFilter = createFilter (overlap); /* if(ctg1->ctgID==56410&&ctg2->ctgID==61741) printf("Extract %d reads for gap [%d %d]\n",num,ctg1->ctgID,ctg2->ctgID); */ kmerSet = readsInGap2DBgraph (rdArray, num, ctg1, ctg2, origOverlap, kmerCtg1, kmerCtg2, overlap, WordFilter); if (!kmerSet) { //printf("no kmer found\n"); return 0; } time_t tt; time (&tt); //srand48 ( ( int ) tt ); /* int i,j; for(i=0;i<2;i++){ int array[] = {0,1,2,3}; for(j=4;j>0;j--) fprintf(stderr,"%d ", nPick1(array,j)); } fprintf(stderr,"\n"); */ /***************** search path to connect contig ends ********/ int gapLen = ctg2->start - ctg1->end - origOverlap + overlap; int min = gapLen - GLDiff > 0 ? gapLen - GLDiff : 0; int max = gapLen + GLDiff < 10 ? 10 : gapLen + GLDiff; //count kmer number for contig1 and contig2 ends int len1, len2; len1 = CTGendLen < contig_array[ctg1->ctgID].length + origOverlap ? CTGendLen : contig_array[ctg1->ctgID].length + origOverlap; len2 = CTGendLen < contig_array[ctg2->ctgID].length + origOverlap ? CTGendLen : contig_array[ctg2->ctgID].length + origOverlap; len1 -= overlap - 1; len2 -= overlap - 1; //int pathNum = 2; int offset1 = 0, offset2 = 0, cut1 = 0, cut2 = 0; int pathNum = searchFgap (kmerSet, ctg1, ctg2, kmerCtg1, kmerCtg2, origOverlap, overlap, gapSeqArray, len1, len2, WordFilter, &offset1, &offset2, seqGap, &cut1, &cut2); //printf("SF: %d K %d\n",pathNum,overlap); if (pathNum == 0) { free_kmerset (kmerSet); return 0; } else if (pathNum == 1) { free_kmerset (kmerSet); return 1; } /* else{ printf("ret %d\n",pathNum); free_kmerset(kmerSet); return 0; } */ /******************* cross the gap by single reads *********/ //kmerSet_markTandem(kmerSet,WordFilter,overlap); maskRepeatNode (kmerSet, kmerCtg1, kmerCtg2, overlap, len1, len2, max, WordFilter); boolean found = readsCrossGap (rdArray, num, origOverlap, gapSeqArray, kmerCtg1, kmerCtg2, overlap, ctg1, ctg2, kmerSet, WordFilter, min, max, offset1, offset2, seqGap, seqCtg1, seqCtg2, cut1, cut2); if (found) { //fprintf(stderr,"read across\n"); free_kmerset (kmerSet); return found; } else { free_kmerset (kmerSet); return 0; } } static void kmerSet_markTandem (KmerSet * set, Kmer WordFilter, int overlap) { kmer_t *rs; long long counter = 0; int num_route, steps; int min = 1, max = overlap, maxRoute = 1; int traceCounter; set->iter_ptr = 0; while (set->iter_ptr < set->size) { if (!is_kmer_entity_null (set->flags, set->iter_ptr)) { rs = set->array + set->iter_ptr; if (rs->inEdge > 0) { set->iter_ptr++; continue; } num_route = traceCounter = 0; steps = 0; trace4Repeat (rs->seq, steps, min, max, &num_route, set, rs->seq, overlap, WordFilter, &traceCounter, maxRoute, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL); if (num_route < 1) { set->iter_ptr++; continue; } /* printKmerSeqLocal(stderr,rs->seq,overlap); fprintf(stderr, "\n"); */ rs->checked = 1; counter++; } set->iter_ptr++; } } /******************* the following is for read-crossing gaps *************************/ #define MAXREADLENGTH 100 static const int INDEL = 0; static const int SIM[4][4] = { {1, 0, 0, 0}, {0, 1, 0, 0}, {0, 0, 1, 0}, {0, 0, 0, 1} }; static char fastSequence[MAXREADLENGTH]; static char slowSequence[MAXREADLENGTH]; static int Fmatrix[MAXREADLENGTH + 1][MAXREADLENGTH + 1]; static int slowToFastMapping[MAXREADLENGTH + 1]; static int fastToSlowMapping[MAXREADLENGTH + 1]; static int max (int A, int B, int C) { A = A >= B ? A : B; return (A >= C ? A : C); } static int compareSequences (char *sequence1, char *sequence2, int length1, int length2) { if (length1 < 1 || length2 < 1 || length1 > MAXREADLENGTH || length2 > MAXREADLENGTH) { return 0; } int i, j; int Choice1, Choice2, Choice3; int maxScore; for (i = 0; i <= length1; i++) { Fmatrix[i][0] = 0; } for (j = 0; j <= length2; j++) { Fmatrix[0][j] = 0; } for (i = 1; i <= length1; i++) { for (j = 1; j <= length2; j++) { Choice1 = Fmatrix[i - 1][j - 1] + SIM[(int) sequence1[i - 1]][(int) sequence2[j - 1]]; Choice2 = Fmatrix[i - 1][j] + INDEL; Choice3 = Fmatrix[i][j - 1] + INDEL; Fmatrix[i][j] = max (Choice1, Choice2, Choice3); } } maxScore = Fmatrix[length1][length2]; return maxScore; } static void mapSlowOntoFast (int slowSeqLength, int fastSeqLength) { int slowIndex = slowSeqLength; int fastIndex = fastSeqLength; int fastn, slown; if (slowIndex == 0) { slowToFastMapping[0] = fastIndex; while (fastIndex >= 0) { fastToSlowMapping[fastIndex--] = 0; } return; } if (fastIndex == 0) { while (slowIndex >= 0) { slowToFastMapping[slowIndex--] = 0; } fastToSlowMapping[0] = slowIndex; return; } while (slowIndex > 0 && fastIndex > 0) { fastn = (int) fastSequence[fastIndex - 1]; //getCharInTightString(fastSequence,fastIndex-1); slown = (int) slowSequence[slowIndex - 1]; //getCharInTightString(slowSequence,slowIndex-1); if (Fmatrix[fastIndex][slowIndex] == Fmatrix[fastIndex - 1][slowIndex - 1] + SIM[fastn][slown]) { fastToSlowMapping[--fastIndex] = --slowIndex; slowToFastMapping[slowIndex] = fastIndex; } else if (Fmatrix[fastIndex][slowIndex] == Fmatrix[fastIndex - 1][slowIndex] + INDEL) { fastToSlowMapping[--fastIndex] = slowIndex - 1; } else if (Fmatrix[fastIndex][slowIndex] == Fmatrix[fastIndex][slowIndex - 1] + INDEL) { slowToFastMapping[--slowIndex] = fastIndex - 1; } else { printf ("compareSequence: Error trace\n"); fflush (stdout); abort (); } } while (slowIndex > 0) { slowToFastMapping[--slowIndex] = -1; } while (fastIndex > 0) { fastToSlowMapping[--fastIndex] = -1; } slowToFastMapping[slowSeqLength] = fastSeqLength; fastToSlowMapping[fastSeqLength] = slowSeqLength; } static boolean chopReadFillGap (int len_seq, int overlap, char *src_seq, char *bal_seq, KmerSet * kset, Kmer WORDF, int *start, int *end, boolean * bal, Kmer * KmerCtg1, int len1, Kmer * KmerCtg2, int len2, int *index1, int *index2) { int index, j = 0, bal_j; Kmer word, bal_word; int flag = 0, bal_flag = 0; int ctg1start, bal_ctg1start, ctg2end, bal_ctg2end; int seqStart, bal_start, seqEnd, bal_end; kmer_t *node; boolean found; if (len_seq < overlap + 1) { return 0; } word=kmerZero; for (index = 0; index < overlap; index++) { word = KmerLeftBitMoveBy2 (word); #ifdef MER127 word.low2 |= src_seq[index]; #endif #ifdef MER63 word.low |= src_seq[index]; #endif #ifdef MER31 word |= src_seq[index]; #endif } reverseComplementSeq (src_seq, len_seq, bal_seq); // complementary node bal_word = reverseComplement (word, overlap); bal_j = len_seq - 0 - overlap; // 0; flag = bal_flag = 0; if (KmerSmaller (word, bal_word)) { found = search_kmerset (kset, word, &node); } else { found = search_kmerset (kset, bal_word, &node); } //if ( !found ) { printf ( "chopReadFillGap 1292 not found!\n" ); } if (found && !node->linear && !node->checked) { if (!flag && node->inEdge == 1) { ctg1start = searchKmerOnCtg (word, KmerCtg1, len1); if (ctg1start >= 0) { flag = 1; seqStart = j + overlap - 1; } } if (!bal_flag && node->inEdge == 2) { bal_ctg2end = searchKmerOnCtg (bal_word, KmerCtg2, len2); if (bal_ctg2end >= 0) { bal_flag = 2; bal_end = bal_j + overlap - 1; } } } for (j = 1; j <= len_seq - overlap; j++) { word = nextKmerLocal (word, src_seq[j - 1 + overlap], WORDF); bal_j = len_seq - j - overlap; // j; bal_word = prevKmerLocal (bal_word, bal_seq[bal_j], overlap); if (KmerSmaller (word, bal_word)) { found = search_kmerset (kset, word, &node); } else { found = search_kmerset (kset, bal_word, &node); } //if ( !found ) { printf ( "chopReadFillGap 1321 not found!\n" ); } if (found && !node->linear && !node->checked) { if (!flag && node->inEdge == 1) { ctg1start = searchKmerOnCtg (word, KmerCtg1, len1); if (ctg1start >= 0) { flag = 1; seqStart = j + overlap - 1; } } else if (flag == 1 && node->inEdge == 1) { index = searchKmerOnCtg (word, KmerCtg1, len1); if (index >= 0 && index > ctg1start) // choose hit closer to gap { ctg1start = index; seqStart = j + overlap - 1; } } else if (flag == 1 && node->inEdge == 2) { ctg2end = searchKmerOnCtg (word, KmerCtg2, len2); if (ctg2end >= 0) { flag = 3; seqEnd = j + overlap - 1; break; } } if (!bal_flag && node->inEdge == 2) { bal_ctg2end = searchKmerOnCtg (bal_word, KmerCtg2, len2); if (bal_ctg2end >= 0) { bal_flag = 2; bal_end = bal_j + overlap - 1; } } else if (bal_flag == 2 && node->inEdge == 2) { index = searchKmerOnCtg (bal_word, KmerCtg2, len2); if (index >= 0 && index < bal_ctg2end) // choose hit closer to gap { bal_ctg2end = index; bal_end = bal_j + overlap - 1; } } else if (bal_flag == 2 && node->inEdge == 1) { bal_ctg1start = searchKmerOnCtg (bal_word, KmerCtg1, len1); if (bal_ctg1start >= 0) { bal_flag = 3; bal_start = bal_j + overlap - 1; break; } } } } if (flag == 3) { *start = seqStart; *end = seqEnd; *bal = 0; *index1 = ctg1start; *index2 = ctg2end; return 1; } else if (bal_flag == 3) { *start = bal_start; *end = bal_end; *bal = 1; *index1 = bal_ctg1start; *index2 = bal_ctg2end; return 1; } return 0; } static int cutSeqFromTightStr (char *tightStr, int length, int start, int end, int revS, char *src_seq) { int i, index = 0; end = end < length ? end : length - 1; start = start >= 0 ? start : 0; if (!revS) { for (i = start; i <= end; i++) { src_seq[index++] = getCharInTightString (tightStr, i); } } else { for (i = length - 1 - start; i >= length - end - 1; i--) { src_seq[index++] = int_comp (getCharInTightString (tightStr, i)); } } return end - start + 1; } static int cutSeqFromCtg (unsigned int ctgID, int start, int end, char *sequence, int originOverlap) { unsigned int bal_ctg = getTwinCtg (ctgID); if (contig_array[ctgID].length < 1) { return 0; } int length = contig_array[ctgID].length + originOverlap; if (contig_array[ctgID].seq) { return cutSeqFromTightStr (contig_array[ctgID].seq, length, start, end, 0, sequence); } else { return cutSeqFromTightStr (contig_array[bal_ctg].seq, length, start, end, 1, sequence); } } static int cutSeqFromRead (char *src_seq, int length, int start, int end, char *sequence) { if (end >= length) { printf ("******: end %d length %d\n", end, length); } end = end < length ? end : length - 1; start = start >= 0 ? start : 0; int i; for (i = start; i <= end; i++) { sequence[i - start] = src_seq[i]; } return end - start + 1; } static void printSeq (FILE * fo, char *seq, int len) { int i; for (i = 0; i < len; i++) { fprintf (fo, "%c", int2base ((int) seq[i])); } fprintf (fo, "\n"); } static boolean readsCrossGap (READNEARBY * rdArray, int num, int originOverlap, DARRAY * gapSeqArray, Kmer * kmerCtg1, Kmer * kmerCtg2, int overlap, CTGinSCAF * ctg1, CTGinSCAF * ctg2, KmerSet * kmerS, Kmer WordFilter, int min, int max, int offset1, int offset2, char *seqGap, char *seqCtg1, char *seqCtg2, int cut1, int cut2) { int i, j, start, end, startOnCtg1, endOnCtg2; char *bal_seq; char *src_seq; char *pt; boolean bal, ret = 0, FILL; double maxScore = 0.0; int maxIndex; int lenCtg1, lenCtg2; //build sequences on left and right of the uncertain region int buffer_size = maxReadLen > 100 ? maxReadLen : 100; int length = contig_array[ctg1->ctgID].length + originOverlap; if (buffer_size > offset1) { lenCtg1 = cutSeqFromCtg (ctg1->ctgID, length - cut1 - (buffer_size - offset1), length - 1 - cut1, seqCtg1, originOverlap); for (i = 0; i < offset1; i++) { seqCtg1[lenCtg1 + i] = seqGap[i]; } lenCtg1 += offset1; } else { for (i = offset1 - buffer_size; i < offset1; i++) { seqCtg1[i + buffer_size - offset1] = seqGap[i]; } lenCtg1 = buffer_size; } length = contig_array[ctg2->ctgID].length + originOverlap; if (buffer_size > offset2) { lenCtg2 = cutSeqFromCtg (ctg2->ctgID, cut2, buffer_size - offset2 - 1 + cut2, &(seqCtg2[offset2]), originOverlap); for (i = 0; i < offset2; i++) { seqCtg2[i] = seqGap[i + offset1]; } lenCtg2 += offset2; } else { for (i = 0; i < buffer_size; i++) { seqCtg2[i] = seqGap[i + offset1]; } lenCtg2 = buffer_size; } /* if(offset1>0||offset2>0){ for(i=0;i max) { continue; } if (overlap + (len1 - startOnCtg1 - 1) + endOnCtg2 - (end - start) > (int) originOverlap) { continue; } // contig1 and contig2 could not overlap more than origOverlap bases START[i] = start; END[i] = end; INDEX1[i] = startOnCtg1; INDEX2[i] = endOnCtg2; BAL[i] = bal; int matchLen = 2 * overlap < (end - start + overlap) ? 2 * overlap : (end - start + overlap); int match; int alignLen = matchLen; //compare the left of hit kmer on ctg1 //int ctgLeft = (contig_array[ctg1->ctgID].length+originOverlap)-(len1+overlap-1)+startOnCtg1; int ctgLeft = (lenCtg1) - (len1 + overlap - 1) + startOnCtg1; int readLeft = start - overlap + 1; int cmpLen = ctgLeft < readLeft ? ctgLeft : readLeft; cmpLen = cmpLen <= MAXREADLENGTH ? cmpLen : MAXREADLENGTH; //cutSeqFromCtg(ctg1->ctgID,ctgLeft-cmpLen,ctgLeft-1,fastSequence,originOverlap); cutSeqFromRead (seqCtg1, lenCtg1, ctgLeft - cmpLen, ctgLeft - 1, fastSequence); if (!bal) { cutSeqFromRead (src_seq, rdArray[i].len, readLeft - cmpLen, readLeft - 1, slowSequence); } else { cutSeqFromRead (bal_seq, rdArray[i].len, readLeft - cmpLen, readLeft - 1, slowSequence); } match = compareSequences (fastSequence, slowSequence, cmpLen, cmpLen); alignLen += cmpLen; matchLen += match; //compare the right of hit kmer on ctg1 int ctgRight = len1 - startOnCtg1 - 1; cmpLen = ctgRight < (rdArray[i].len - start - 1) ? ctgRight : (rdArray[i].len - start - 1); cmpLen = cmpLen <= MAXREADLENGTH ? cmpLen : MAXREADLENGTH; //cutSeqFromCtg(ctg1->ctgID,ctgLeft+overlap,ctgLeft+overlap+cmpLen-1,fastSequence,originOverlap); cutSeqFromRead (seqCtg1, lenCtg1, ctgLeft + overlap, ctgLeft + overlap + cmpLen - 1, fastSequence); if (!bal) { cutSeqFromRead (src_seq, rdArray[i].len, start + 1, start + cmpLen, slowSequence); } else { cutSeqFromRead (bal_seq, rdArray[i].len, start + 1, start + cmpLen, slowSequence); } match = compareSequences (fastSequence, slowSequence, cmpLen, cmpLen); //fprintf(stderr,"%d -- %d\n",match,cmpLen); alignLen += cmpLen; matchLen += match; //compare the left of hit kmer on ctg2 ctgLeft = endOnCtg2; readLeft = end - overlap + 1; cmpLen = ctgLeft < readLeft ? ctgLeft : readLeft; cmpLen = ctgLeft <= MAXREADLENGTH ? ctgLeft : MAXREADLENGTH; //cutSeqFromCtg(ctg2->ctgID,endOnCtg2-cmpLen,endOnCtg2-1,fastSequence,originOverlap); cutSeqFromRead (seqCtg2, lenCtg2, endOnCtg2 - cmpLen, endOnCtg2 - 1, fastSequence); if (!bal) { cutSeqFromRead (src_seq, rdArray[i].len, readLeft - cmpLen, readLeft - 1, slowSequence); } else { cutSeqFromRead (bal_seq, rdArray[i].len, readLeft - cmpLen, readLeft - 1, slowSequence); } match = compareSequences (fastSequence, slowSequence, cmpLen, cmpLen); alignLen += cmpLen; matchLen += match; //compare the right of hit kmer on ctg2 //ctgRight = contig_array[ctg2->ctgID].length+originOverlap-endOnCtg2-overlap; ctgRight = lenCtg2 - endOnCtg2 - overlap; cmpLen = ctgRight < (rdArray[i].len - end - 1) ? ctgRight : (rdArray[i].len - end - 1); cmpLen = cmpLen <= MAXREADLENGTH ? cmpLen : MAXREADLENGTH; //cutSeqFromCtg(ctg2->ctgID,endOnCtg2+overlap,endOnCtg2+overlap+cmpLen-1,fastSequence,originOverlap); cutSeqFromRead (seqCtg2, lenCtg2, endOnCtg2 + overlap, endOnCtg2 + overlap + cmpLen - 1, fastSequence); if (!bal) { cutSeqFromRead (src_seq, rdArray[i].len, end + 1, end + cmpLen, slowSequence); } else { cutSeqFromRead (bal_seq, rdArray[i].len, end + 1, end + cmpLen, slowSequence); } match = compareSequences (fastSequence, slowSequence, cmpLen, cmpLen); alignLen += cmpLen; matchLen += match; /* if(cmpLen>0&&match!=cmpLen+overlap){ printSeq(stderr,fastSequence,cmpLen+overlap); printSeq(stderr,slowSequence,cmpLen+overlap); printKmerSeqLocal(stderr,kmerCtg2[endOnCtg2],overlap); fprintf(stderr,": %d(%d)\n",bal,endOnCtg2); }else if(cmpLen>0&&match==cmpLen+overlap) fprintf(stderr,"Perfect\n"); */ double score = (double) matchLen / alignLen; if (maxScore < score) { maxScore = score; //fprintf(stderr,"%4.2f (%d/%d)\n",maxScore,matchLen,alignLen); maxIndex = i; } SCORE[i] = score; } /* if(maxScore>0.0) fprintf(stderr,"SCORE: %4.2f\n",maxScore); */ if (maxScore > 0.9) { /* for(i=0;i 0 ? offset1 - (len1 - INDEX1[maxIndex] - 1) : 0; int rightRemain = offset2 - (overlap + INDEX2[maxIndex]) > 0 ? offset2 - (overlap + INDEX2[maxIndex]) : 0; ctg1->gapSeqOffset = gapSeqArray->item_c; ctg1->gapSeqLen = END[maxIndex] - START[maxIndex] + leftRemain + rightRemain; if (darrayPut (gapSeqArray, ctg1->gapSeqOffset + (END[maxIndex] - START[maxIndex] + leftRemain + rightRemain) / 4)) { pt = (char *) darrayPut (gapSeqArray, ctg1->gapSeqOffset); for (j = 0; j < leftRemain; j++) //get the left side of the gap region from search { writeChar2tightString (seqGap[j], pt, j); //fprintf(stderr,"%c",int2base(seqGap[j])); } for (j = START[maxIndex] + 1; j <= END[maxIndex]; j++) { if (BAL[maxIndex]) { writeChar2tightString (bal_seq[j], pt, j - START[maxIndex] - 1 + leftRemain); //fprintf(stderr,"%c",int2base(bal_seq[j])); } else { writeChar2tightString (src_seq[j], pt, j - START[maxIndex] - 1 + leftRemain); //fprintf(stderr,"%c",int2base(src_seq[j])); } } for (j = offset2 - rightRemain; j < offset2; j++) //get the right side of the gap region from search { writeChar2tightString (seqGap[j + leftRemain], pt, j + END[maxIndex] - START[maxIndex] + leftRemain); //fprintf(stderr,"%c",int2base(seqGap[j+leftRemain])); } /* fprintf(stderr,": GAPSEQ (%d+%d)(%d+%d)(%d+%d)(%d+%d) B %d\n",offset1,offset2,cut1,cut2, len1-INDEX1[maxIndex]-1,INDEX2[maxIndex],START[maxIndex],END[maxIndex],BAL[maxIndex]); */ ctg1->cutTail = len1 - INDEX1[maxIndex] - 1 - offset1 + cut1 > cut1 ? len1 - INDEX1[maxIndex] - 1 - offset1 + cut1 : cut1; ctg2->cutHead = overlap + INDEX2[maxIndex] - offset2 + cut2 > cut2 ? overlap + INDEX2[maxIndex] - offset2 + cut2 : cut2; ctg2->scaftig_start = 0; ret = 1; } } free ((void *) START); free ((void *) END); free ((void *) INDEX1); free ((void *) INDEX2); free ((void *) SCORE); free ((void *) BAL); free ((void *) src_seq); free ((void *) bal_seq); return ret; }