#include "stdinc.h" #include "newhash.h" #include "extfunc.h" #include "extvab.h" #define CTGendLen 35 // shouldn't larger than max_read_len #define UPlimit 5000 #define MaxRouteNum 10 static Kmer pubKmer = 0x1b4d65165b; static void kmerSet_mark(KmerSet *set); static void trace4Repeat(Kmer currW, int steps, int min, int max, int *num_route, KmerSet *kset, Kmer kmerDest, int overlap, Kmer WORDF, int *traceCounter, int maxRoute, kmer_t **soFarNode, short *multiOccu1, short *multiOccu2, int *routeLens, char **foundRoutes, char *soFarSeq, long long *soFarLinks, double *avgLinks); static Kmer prevKmerLocal(Kmer next, char ch, int overlap) { Kmer word = next; word >>= 2; word += ((Kmer)ch) << 2 * (overlap - 1); return word; } static Kmer nextKmerLocal(Kmer prev, char ch, Kmer WordFilter) { Kmer word = prev; word <<= 2; word &= WordFilter; word += ch; return word; } static void singleKmer(int t, KmerSet *kset, int flag, Kmer *kmerBuffer, char *prevcBuffer, char *nextcBuffer) { kmer_t *pos; put_kmerset(kset, kmerBuffer[t], prevcBuffer[t], nextcBuffer[t], &pos); if(pos->inEdge == flag) return; else if(pos->inEdge == 0) pos->inEdge = flag; else if(pos->inEdge == 1 && flag == 2) pos->inEdge = 3; else if(pos->inEdge == 2 && flag == 1) pos->inEdge = 3; } static void putKmer2DBgraph(KmerSet *kset, int flag, int kmer_c, Kmer *kmerBuffer, char *prevcBuffer, char *nextcBuffer) { int t; for(t = 0; t < kmer_c; t++) singleKmer(t, kset, flag, kmerBuffer, prevcBuffer, nextcBuffer); } static void getSeqFromRead(READNEARBY read, char *src_seq) { int len_seq = read.len; int j; char *tightSeq = (char *)darrayGet(readSeqInGap, read.seqStarter); for(j = 0; j < len_seq; j++) src_seq[j] = getCharInTightString(tightSeq, j); } static void chopKmer4Ctg(Kmer *kmerCtg, int lenCtg, int overlap, char *src_seq, Kmer WORDF) { int index, j; Kmer word = 0; for (index = 0; index < overlap; index++) { word <<= 2; word += src_seq[index]; } index = 0; kmerCtg[index++] = word; for(j = 1; j <= lenCtg - overlap; j ++) { word = nextKmerLocal(word, src_seq[j - 1 + overlap], WORDF); kmerCtg[index++] = word; } } static void chopKmer4read(int len_seq, int overlap, char *src_seq, char *bal_seq, Kmer *kmerBuffer, char *prevcBuffer, char *nextcBuffer, int *kmer_c, Kmer WORDF) { int j, bal_j; Kmer word, bal_word; int index; char InvalidCh = 4; if(len_seq < overlap + 1) { *kmer_c = 0; return; } word = 0; for (index = 0; index < overlap; index++) { word <<= 2; word += src_seq[index]; } reverseComplementSeq(src_seq, len_seq, bal_seq); // complementary node bal_word = reverseComplement(word, overlap); bal_j = len_seq - 0 - overlap; // 0; index = 0; if(word < bal_word) { kmerBuffer[index] = word; prevcBuffer[index] = InvalidCh; nextcBuffer[index++] = src_seq[0 + overlap]; } else { kmerBuffer[index] = bal_word; prevcBuffer[index] = bal_seq[bal_j - 1]; nextcBuffer[index++] = InvalidCh; } for(j = 1; j <= len_seq - overlap; j ++) { word = nextKmerLocal(word, src_seq[j - 1 + overlap], WORDF); bal_j = len_seq - j - overlap; // j; bal_word = prevKmerLocal(bal_word, bal_seq[bal_j], overlap); if(word < bal_word) { kmerBuffer[index] = word; prevcBuffer[index] = src_seq[j - 1]; if(j < len_seq - overlap) nextcBuffer[index++] = src_seq[j + overlap]; else nextcBuffer[index++] = InvalidCh; //printf("%dth: %p with %p\n",kmer_c-1,word,hashBanBuffer[kmer_c-1]); } else { // complementary node kmerBuffer[index] = bal_word; if(bal_j > 0) prevcBuffer[index] = bal_seq[bal_j - 1]; else prevcBuffer[index] = InvalidCh; nextcBuffer[index++] = bal_seq[bal_j + overlap]; //printf("%dth: %p with %p\n",kmer_c-1,bal_word,hashBanBuffer[kmer_c-1]); } } *kmer_c = index; } static void headTightStr(char *tightStr, int length, int start, int headLen, int revS, char *src_seq) { int i, index = 0; if(!revS) { for(i = start; i < start + headLen; i++) src_seq[index++] = getCharInTightString(tightStr, i); } else { for(i = length - 1 - start; i >= length - headLen - start; i--) src_seq[index++] = int_comp(getCharInTightString(tightStr, i)); } } static int getSeqFromCtg(CTGinSCAF *ctg, boolean fromHead, unsigned int len, int originOverlap, char *src_seq) { unsigned int ctgId = ctg->ctgID; unsigned int bal_ctg = getTwinCtg(ctgId); if(contig_array[ctgId].length < 1) return 0; unsigned int length = contig_array[ctgId].length + originOverlap; len = len < length ? len : length; if(fromHead) { if(contig_array[ctgId].seq) headTightStr(contig_array[ctgId].seq, length, 0, len, 0, src_seq); else headTightStr(contig_array[bal_ctg].seq, length, 0, len, 1, src_seq); } else { if(contig_array[ctgId].seq) headTightStr(contig_array[ctgId].seq, length, length - len, len, 0, src_seq); else headTightStr(contig_array[bal_ctg].seq, length, length - len, len, 1, src_seq); } return len; } static KmerSet *readsInGap2DBgraph(READNEARBY *rdArray, int num, CTGinSCAF *ctg1, CTGinSCAF *ctg2, int originOverlap, Kmer *kmerCtg1, Kmer *kmerCtg2, int overlap, Kmer WordFilter) { int kmer_c; Kmer *kmerBuffer; char *nextcBuffer, *prevcBuffer; int i; int buffer_size = maxReadLen > CTGendLen ? maxReadLen : CTGendLen; KmerSet *kmerS = NULL; int lenCtg1; int lenCtg2; char *bal_seq; char *src_seq; src_seq = (char *)ckalloc(buffer_size * sizeof(char)); bal_seq = (char *)ckalloc(buffer_size * sizeof(char)); kmerBuffer = (Kmer *)ckalloc(buffer_size * sizeof(Kmer)); prevcBuffer = (char *)ckalloc(buffer_size * sizeof(char)); nextcBuffer = (char *)ckalloc(buffer_size * sizeof(char)); kmerS = init_kmerset(1024, 0.77f); for(i = 0; i < num; i++) { getSeqFromRead(rdArray[i], src_seq); chopKmer4read(rdArray[i].len, overlap, src_seq, bal_seq, kmerBuffer, prevcBuffer, nextcBuffer, &kmer_c, WordFilter); putKmer2DBgraph(kmerS, 0, kmer_c, kmerBuffer, prevcBuffer, nextcBuffer); } lenCtg1 = getSeqFromCtg(ctg1, 0, CTGendLen, originOverlap, src_seq); chopKmer4Ctg(kmerCtg1, lenCtg1, overlap, src_seq, WordFilter); chopKmer4read(lenCtg1, overlap, src_seq, bal_seq, kmerBuffer, prevcBuffer, nextcBuffer, &kmer_c, WordFilter); putKmer2DBgraph(kmerS, 1, kmer_c, kmerBuffer, prevcBuffer, nextcBuffer); lenCtg2 = getSeqFromCtg(ctg2, 1, CTGendLen, originOverlap, src_seq); chopKmer4Ctg(kmerCtg2, lenCtg2, overlap, src_seq, WordFilter); chopKmer4read(lenCtg2, overlap, src_seq, bal_seq, kmerBuffer, prevcBuffer, nextcBuffer, &kmer_c, WordFilter); putKmer2DBgraph(kmerS, 2, kmer_c, kmerBuffer, prevcBuffer, nextcBuffer); /* if(ctg1->ctgID==3733&&ctg2->ctgID==3067){ for(i=0;i= 0; i--) { ch = kmer & 3; kmer >>= 2; kmerSeq[i] = ch; } for(i = 0; i < overlap; i++) fprintf(fo, "%c", int2base((int)kmerSeq[i])); } static void kmerSet_mark(KmerSet *set) { int i, in_num, out_num, cvgSingle; kmer_t *rs; long long counter = 0, linear = 0; Kmer word; set->iter_ptr = 0; while(set->iter_ptr < set->size) { if(!is_kmer_entity_null(set->flags, set->iter_ptr)) { in_num = out_num = 0; rs = set->array + set->iter_ptr; word = rs->seq; for(i = 0; i < 4; i++) { cvgSingle = get_kmer_left_cov(*rs, i); if(cvgSingle > 0) { in_num++; } cvgSingle = get_kmer_right_cov(*rs, i); if(cvgSingle > 0) { out_num++; } } if(rs->single) { counter++; } if(in_num == 1 && out_num == 1) { rs->linear = 1; linear++; } } set->iter_ptr ++; } //printf("Allocated %ld node, %ld single nodes, %ld linear\n",(long)count_kmerset(set),counter,linear); } static kmer_t *searchNode(Kmer word, KmerSet *kset, int overlap) { Kmer bal_word = reverseComplement(word, overlap); kmer_t *node; boolean found; if(word < bal_word) found = search_kmerset(kset, word, &node); else found = search_kmerset(kset, bal_word, &node); if(found) return node; else return NULL; } static int searchKmerOnCtg(Kmer currW, Kmer *kmerDest, int num) { int i; for(i = 0; i < num; i++) { if(currW == kmerDest[i]) { return i; } } return -1; } // pick on from n items randomly static int nPick1(int *array, int n) { int m, i; m = n - 1; //(int)(drand48()*n); int value = array[m]; for(i = m; i < n - 1; i++) array[i] = array[i + 1]; return value; } static void traceAlongDBgraph(Kmer currW, int steps, int min, int max, int *num_route, KmerSet *kset, Kmer *kmerDest, int num, int overlap, Kmer WORDF, char **foundRoutes, int *routeEndOnCtg2, int *routeLens, char *soFarSeq, int *traceCounter, int maxRoute, kmer_t **soFarNode, boolean *multiOccu, long long *soFarLinks, double *avgLinks) { (*traceCounter)++; if(*traceCounter > UPlimit) { /* if(overlap==19&&kmerDest[0]==pubKmer) printf("UPlimit\n"); */ return; } if(steps > max || *num_route >= maxRoute) { /* if(overlap==19&&kmerDest[0]==pubKmer) printf("max steps/maxRoute\n"); */ return; } Kmer word = reverseComplement(currW, overlap); boolean isSmaller = currW < word; int i; char ch; unsigned char links; if(isSmaller) word = currW; kmer_t *node; boolean found = search_kmerset(kset, word, &node); if(!found) { printf("Trace: can't find kmer %llx (rc %llx, input %llx) at step %d\n", word, reverseComplement(word, overlap), currW, steps); return; } if(node->twin > 1) return; if(soFarNode) soFarNode[steps] = node; if(steps > 0) soFarSeq[steps - 1] = currW & 0x03; int index, end; int linkCounter = *soFarLinks; if(steps >= min && node->inEdge > 1 && (end = searchKmerOnCtg(currW, kmerDest, num)) >= 0) { index = *num_route; if(steps > 0) avgLinks[index] = (double)linkCounter / steps; else avgLinks[index] = 0; //find node that appears more than once in the path multiOccu[index] = 0; for(i = 0; i < steps + 1; i++) soFarNode[i]->deleted = 0; for(i = 0; i < steps + 1; i++) { if(soFarNode[i]->deleted) { multiOccu[index] = 1; break; } soFarNode[i]->deleted = 1; } routeEndOnCtg2[index] = end; routeLens[index] = steps; char *array = foundRoutes[index]; for(i = 0; i < steps; i++) array[i] = soFarSeq[i]; if(i < max) array[i] = 4; //indicate the end of the sequence *num_route = ++index; return; } steps++; if(isSmaller) { int array[] = {0, 1, 2, 3}; for(i = 4; i > 0; i--) { ch = nPick1(array, i); links = get_kmer_right_cov(*node, ch); if(!links) continue; *soFarLinks = linkCounter + links; word = nextKmerLocal(currW, ch, WORDF); traceAlongDBgraph(word, steps, min, max, num_route, kset, kmerDest, num, overlap, WORDF, foundRoutes, routeEndOnCtg2, routeLens, soFarSeq, traceCounter, maxRoute, soFarNode, multiOccu, soFarLinks, avgLinks); } } else { int array[] = {0, 1, 2, 3}; for(i = 4; i > 0; i--) { ch = nPick1(array, i); links = get_kmer_left_cov(*node, ch); if(!links) continue; *soFarLinks = linkCounter + links; word = nextKmerLocal(currW, int_comp(ch), WORDF); traceAlongDBgraph(word, steps, min, max, num_route, kset, kmerDest, num, overlap, WORDF, foundRoutes, routeEndOnCtg2, routeLens, soFarSeq, traceCounter, maxRoute, soFarNode, multiOccu, soFarLinks, avgLinks); } } } static int searchFgap(KmerSet *kset, CTGinSCAF *ctg1, CTGinSCAF *ctg2, Kmer *kmerCtg1, Kmer *kmerCtg2, unsigned int origOverlap, int overlap, DARRAY *gapSeqArray, int len1, int len2, Kmer WordFilter, int *offset1, int *offset2, char *seqGap, int *cut1, int *cut2) { int i; int ret = 0; kmer_t *node, **soFarNode; int num_route; int gapLen = ctg2->start - ctg1->end - origOverlap + overlap; int min = gapLen - GLDiff > 0 ? gapLen - GLDiff : 0; //0531 int max = gapLen + GLDiff < 10 ? 10 : gapLen + GLDiff; char **foundRoutes; char *soFarSeq; int traceCounter; int *routeEndOnCtg2; int *routeLens; boolean *multiOccu; long long soFarLinks; double *avgLinks; //mask linear internal linear kmer on contig1 end routeEndOnCtg2 = (int *)ckalloc(MaxRouteNum * sizeof(int)); routeLens = (int *)ckalloc(MaxRouteNum * sizeof(int)); multiOccu = (boolean *)ckalloc(MaxRouteNum * sizeof(boolean)); short *MULTI1 = (short *)ckalloc(MaxRouteNum * sizeof(short)); short *MULTI2 = (short *)ckalloc(MaxRouteNum * sizeof(short)); soFarSeq = (char *)ckalloc(max * sizeof(char)); soFarNode = (kmer_t **)ckalloc((max + 1) * sizeof(kmer_t *)); foundRoutes = (char **)ckalloc(MaxRouteNum * sizeof(char *));; avgLinks = (double *)ckalloc(MaxRouteNum * sizeof(double));; for(i = 0; i < MaxRouteNum; i++) foundRoutes[i] = (char *)ckalloc(max * sizeof(char)); for(i = len1 - 1; i >= 0; i--) { num_route = traceCounter = soFarLinks = 0; int steps = 0; traceAlongDBgraph(kmerCtg1[i], steps, min, max, &num_route, kset, kmerCtg2, len2, overlap, WordFilter, foundRoutes, routeEndOnCtg2, routeLens, soFarSeq, &traceCounter, MaxRouteNum, soFarNode, multiOccu, &soFarLinks, avgLinks); if(num_route > 0) { int m, minEnd = routeEndOnCtg2[0]; for(m = 0; m < num_route; m++) { if(routeLens[m] < 0) continue; if(routeEndOnCtg2[m] < minEnd) minEnd = routeEndOnCtg2[m]; } /* else if(minFreq>1){ for(m=0;m3) break; printf("%c",int2base((int)foundRoutes[m][j])); } printf(": %4.2f\n",avgLinks[m]); } } */ num_route = traceCounter = soFarLinks = 0; steps = 0; trace4Repeat(kmerCtg1[i], steps, min, max, &num_route, kset, kmerCtg2[minEnd], overlap, WordFilter, &traceCounter, MaxRouteNum, soFarNode, MULTI1, MULTI2, routeLens, foundRoutes, soFarSeq, &soFarLinks, avgLinks); int j, best = 0; int maxLen = routeLens[0]; double maxLink = avgLinks[0]; char *pt; boolean repeat = 0, sameLen = 1; int leftMost = max, rightMost = max; if(num_route < 1) { fprintf(stderr, "After trace4Repeat: non route was found\n"); continue; } if(num_route > 1) { // if multi paths are found, we check on the repeatative occurrences and links/length for(m = 0; m < num_route; m++) { if(routeLens[m] < 0) continue; if(MULTI1[m] >= 0 && MULTI2[m] >= 0) { repeat = 1; leftMost = leftMost > MULTI1[m] ? MULTI1[m] : leftMost; rightMost = rightMost > MULTI2[m] ? MULTI2[m] : rightMost; } if(routeLens[m] != maxLen) sameLen = 0; if(routeLens[m] < maxLen) maxLen = routeLens[m]; if(avgLinks[m] > maxLink) { maxLink = avgLinks[m]; best = m; } } } if(repeat) { *offset1 = *offset2 = *cut1 = *cut2 = 0; int index = 0; char ch; for(j = 0; j < leftMost; j++) { if(routeLens[0] < j + overlap + 1) break; else ch = foundRoutes[0][j]; for(m = 1; m < num_route; m++) { if(routeLens[m] < 0) continue; if(ch != foundRoutes[m][j]) break; } if(m == num_route) seqGap[index++] = ch; else break; } *offset1 = index; index = 0; for(j = 0; j < rightMost; j++) { if(routeLens[0] - overlap - 1 < j) break; else ch = foundRoutes[0][routeLens[0] - overlap - 1 - j]; for(m = 1; m < num_route; m++) { if(routeLens[m] < 0) continue; if(ch != foundRoutes[m][routeLens[m] - overlap - 1 - j]) break; } if(m == num_route) index++; else break; } *offset2 = index; for(j = 0; j < *offset2; j++) seqGap[*offset1 + *offset2 - 1 - j] = foundRoutes[0][routeLens[0] - overlap - 1 - j]; if(*offset1 > 0 || *offset2 > 0) { *cut1 = len1 - i - 1; *cut2 = minEnd; //fprintf(stderr,"\n"); for(m = 0; m < num_route; m++) { for(j = 0; j < max; j++) { if(foundRoutes[m][j] > 3) break; //fprintf(stderr,"%c",int2base((int)foundRoutes[m][j])); } //fprintf(stderr,": %4.2f\n",avgLinks[m]); } /* fprintf(stderr,">Gap (%d + %d) (%d + %d)\n",*offset1,*offset2,*cut1,*cut2); for(index=0;index<*offset1+*offset2;index++) fprintf(stderr,"%c",int2base(seqGap[index])); fprintf(stderr,"\n"); */ } ret = 3; break; } if(overlap + (len1 - i - 1) + minEnd - routeLens[best] > (int)origOverlap) continue; ctg1->gapSeqOffset = gapSeqArray->item_c; ctg1->gapSeqLen = routeLens[best]; if(!darrayPut(gapSeqArray, ctg1->gapSeqOffset + maxLen / 4)) continue; pt = (char *)darrayPut(gapSeqArray, ctg1->gapSeqOffset); /* printKmer(stderr,kmerCtg1[i],overlap); fprintf(stderr,"-"); */ for(j = 0; j < max; j++) { if(foundRoutes[best][j] > 3) break; writeChar2tightString(foundRoutes[best][j], pt, j); //fprintf(stderr,"%c",int2base((int)foundRoutes[best][j])); } //fprintf(stderr,": GAPSEQ %d + %d, avglink %4.2f\n",len1-i-1,minEnd,avgLinks[best]); ctg1->cutTail = len1 - i - 1; ctg2->cutHead = overlap + minEnd; ctg2->scaftig_start = 0; ret = 1; break; /* }if(num_route>1){ ret = 2; break; */ } else //mark node which leads to dead end { node = searchNode(kmerCtg1[i], kset, overlap); if(node) node->twin = 2; } } for(i = 0; i < MaxRouteNum; i++) free((void *)foundRoutes[i]); free((void *)soFarSeq); free((void *)soFarNode); free((void *)multiOccu); free((void *)MULTI1); free((void *)MULTI2); free((void *)foundRoutes); free((void *)routeEndOnCtg2); free((void *)routeLens); return ret; } static void trace4Repeat(Kmer currW, int steps, int min, int max, int *num_route, KmerSet *kset, Kmer kmerDest, int overlap, Kmer WORDF, int *traceCounter, int maxRoute, kmer_t **soFarNode, short *multiOccu1, short *multiOccu2, int *routeLens, char **foundRoutes, char *soFarSeq, long long *soFarLinks, double *avgLinks) { (*traceCounter)++; if(*traceCounter > UPlimit) return; if(steps > max || *num_route >= maxRoute) return; Kmer word = reverseComplement(currW, overlap); boolean isSmaller = currW < word; char ch; unsigned char links; int index, i; if(isSmaller) word = currW; kmer_t *node; boolean found = search_kmerset(kset, word, &node); if(!found) { printf("Trace: can't find kmer %llx (rc %llx, input %llx) at step %d\n", word, reverseComplement(word, overlap), currW, steps); return; } if(soFarNode) soFarNode[steps] = node; if(soFarSeq && steps > 0) soFarSeq[steps - 1] = currW & 0x03; int linkCounter; if(soFarLinks) linkCounter = *soFarLinks; if(steps >= min && currW == kmerDest) { index = *num_route; if(avgLinks && steps > 0) avgLinks[index] = (double)linkCounter / steps; else if(avgLinks) avgLinks[index] = 0; //find node that appears more than once in the path if(multiOccu1 && multiOccu2) { for(i = 0; i < steps + 1; i++) soFarNode[i]->deleted = 0; int rightMost = 0; boolean MULTI = 0; for(i = 0; i < steps + 1; i++) { if(soFarNode[i]->deleted) { rightMost = rightMost < i - 1 ? i - 1 : rightMost; MULTI = 1; } soFarNode[i]->deleted = 1; } if(!MULTI) multiOccu1[index] = multiOccu2[index] = -1; else { multiOccu2[index] = steps - 2 - rightMost < 0 ? 0 : steps - 2 - rightMost; //[0 steps-2] for(i = 0; i < steps + 1; i++) soFarNode[i]->deleted = 0; int leftMost = steps - 2; for(i = steps; i >= 0; i--) { if(soFarNode[i]->deleted) leftMost = leftMost > i - 1 ? i - 1 : leftMost; soFarNode[i]->deleted = 1; } multiOccu1[index] = leftMost < 0 ? 0 : leftMost; //[0 steps-2] } } if(routeLens) routeLens[index] = steps; if(soFarSeq) { char *array = foundRoutes[index]; for(i = 0; i < steps; i++) array[i] = soFarSeq[i]; if(i < max) array[i] = 4; //indicate the end of the sequence } *num_route = ++index; } steps++; if(isSmaller) { int array[] = {0, 1, 2, 3}; for(i = 4; i > 0; i--) { ch = nPick1(array, i); links = get_kmer_right_cov(*node, ch); if(!links) continue; if(soFarLinks) *soFarLinks = linkCounter + links; word = nextKmerLocal(currW, ch, WORDF); trace4Repeat(word, steps, min, max, num_route, kset, kmerDest, overlap, WORDF, traceCounter, maxRoute, soFarNode, multiOccu1, multiOccu2, routeLens, foundRoutes, soFarSeq, soFarLinks, avgLinks); } } else { int array[] = {0, 1, 2, 3}; for(i = 4; i > 0; i--) { ch = nPick1(array, i); links = get_kmer_left_cov(*node, ch); if(!links) continue; if(soFarLinks) *soFarLinks = linkCounter + links; word = nextKmerLocal(currW, int_comp(ch), WORDF); trace4Repeat(word, steps, min, max, num_route, kset, kmerDest, overlap, WORDF, traceCounter, maxRoute, soFarNode, multiOccu1, multiOccu2, routeLens, foundRoutes, soFarSeq, soFarLinks, avgLinks); } } } //found repeat node on contig ends static void maskRepeatNode(KmerSet *kset, Kmer *kmerCtg1, Kmer *kmerCtg2, int overlap, int len1, int len2, int max, Kmer WordFilter) { int i; int num_route, steps; int min = 1, maxRoute = 1; int traceCounter; Kmer word, bal_word; kmer_t *node; boolean found; int counter = 0; for(i = 0; i < len1; i++) { word = kmerCtg1[i]; bal_word = reverseComplement(word, overlap); if(word > bal_word) word = bal_word; found = search_kmerset(kset, word, &node); if(!found || node->linear) { //printf("Found no node for kmer %llx\n",word); continue; } num_route = traceCounter = 0; steps = 0; trace4Repeat(word, steps, min, max, &num_route, kset, word, overlap, WordFilter, &traceCounter, maxRoute, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL); if(num_route < 1) continue; counter++; node->checked = 1; } for(i = 0; i < len2; i++) { word = kmerCtg2[i]; bal_word = reverseComplement(word, overlap); if(word > bal_word) word = bal_word; found = search_kmerset(kset, word, &node); if(!found || node->linear) { //printf("Found no node for kmer %llx\n",word); continue; } num_route = traceCounter = 0; steps = 0; trace4Repeat(word, steps, min, max, &num_route, kset, word, overlap, WordFilter, &traceCounter, maxRoute, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL); if(num_route < 1) continue; counter++; node->checked = 1; } //printf("MR: %d(%d)\n",counter,len1+len2); } /* static boolean chopReadFillGap(int len_seq,int overlap,char *src_seq, char *bal_seq, KmerSet *kset,Kmer WORDF,int *start,int *end,boolean *bal, Kmer *KmerCtg1,int len1,Kmer *KmerCtg2,int len2,int *index1,int *index2) { int index,j=0,bal_j; Kmer word,bal_word; int flag=0,bal_flag=0; int ctg1start,bal_ctg1start,ctg2end,bal_ctg2end; int seqStart,bal_start,seqEnd,bal_end; kmer_t *node; boolean found; if(len_seqlinear&&!node->checked){ if(!flag&&node->inEdge==1){ ctg1start = searchKmerOnCtg(word,KmerCtg1,len1); if(ctg1start>0){ flag = 1; seqStart = j + overlap-1; } } if(!bal_flag&&node->inEdge==2){ bal_ctg2end = searchKmerOnCtg(bal_word,KmerCtg2,len2); if(bal_ctg2end>0){ bal_flag = 2; bal_end = bal_j+overlap-1; } } } for(j = 1; j <= len_seq - overlap; j ++) { word = nextKmerLocal(word,src_seq[j-1+overlap],WORDF); bal_j = len_seq-j-overlap; // j; bal_word = prevKmerLocal(bal_word,bal_seq[bal_j],overlap); if(wordlinear&&!node->checked){ if(!flag&&node->inEdge==1){ ctg1start = searchKmerOnCtg(word,KmerCtg1,len1); if(ctg1start>0){ flag = 1; seqStart = j + overlap-1; } }else if(flag==1&&node->inEdge==1){ index = searchKmerOnCtg(word,KmerCtg1,len1); if(index>ctg1start){ // choose hit closer to gap ctg1start = index; seqStart = j + overlap-1; } }else if(flag==1&&node->inEdge==2){ ctg2end = searchKmerOnCtg(word,KmerCtg2,len2); if(ctg2end>0){ flag = 3; seqEnd = j+overlap-1; break; } } if(!bal_flag&&node->inEdge==2){ bal_ctg2end = searchKmerOnCtg(bal_word,KmerCtg2,len2); if(bal_ctg2end>0){ bal_flag = 2; bal_end = bal_j+overlap-1; } }else if(bal_flag==2&&node->inEdge==2){ index = searchKmerOnCtg(bal_word,KmerCtg2,len2); if(indexinEdge==1){ bal_ctg1start = searchKmerOnCtg(bal_word,KmerCtg1,len1); if(bal_ctg1start>0){ bal_flag = 3; bal_start = bal_j+overlap-1; break; } } } } if(flag==3){ *start = seqStart; *end = seqEnd; *bal = 0; *index1 = ctg1start; *index2 = ctg2end; return 1; }else if(bal_flag==3){ *start = bal_start; *end = bal_end; *bal = 1; *index1 = bal_ctg1start; *index2 = bal_ctg2end; return 1; } return 0; } static boolean readsCrossGap(READNEARBY *rdArray, int num, int originOverlap,DARRAY *gapSeqArray, Kmer *kmerCtg1,Kmer *kmerCtg2,int overlap,int len1,int len2, CTGinSCAF *ctg1,CTGinSCAF *ctg2,KmerSet *kmerS,Kmer WordFilter,int min,int max) { int i,j,start,end,startOnCtg1,endOnCtg2; char *bal_seq; char *src_seq; char *pt; boolean bal,ret=0,FILL; src_seq = (char *)ckalloc(maxReadLen*sizeof(char)); bal_seq = (char *)ckalloc(maxReadLen*sizeof(char)); for(i=0;imax) continue; fprintf(stderr,"Read across\n"); //printf("Filled: K %d, ctg1 %d ctg2 %d,start %d end %d\n",overlap,startOnCtg1,endOnCtg2,start,end); if(overlap+(len1-startOnCtg1-1)+endOnCtg2-(end-start)>(int)originOverlap) continue; // contig1 and contig2 could not overlap more than origOverlap bases ctg1->gapSeqOffset = gapSeqArray->item_c; ctg1->gapSeqLen = end-start; if(!darrayPut(gapSeqArray,ctg1->gapSeqOffset+(end-start)/4)) continue; pt = (char *)darrayPut(gapSeqArray,ctg1->gapSeqOffset); for(j=start+1;j<=end;j++){ if(bal) writeChar2tightString(bal_seq[j],pt,j-start-1); else writeChar2tightString(src_seq[j],pt,j-start-1); } ctg1->cutTail = len1-startOnCtg1-1; ctg2->cutHead = overlap + endOnCtg2; ctg2->scaftig_start = 0; ret = 1; break; } free((void*)src_seq); free((void*)bal_seq); return ret; } */ static void kmerSet_markTandem(KmerSet *set, Kmer WordFilter, int overlap); static boolean readsCrossGap(READNEARBY *rdArray, int num, int originOverlap, DARRAY *gapSeqArray, Kmer *kmerCtg1, Kmer *kmerCtg2, int overlap, CTGinSCAF *ctg1, CTGinSCAF *ctg2, KmerSet *kmerS, Kmer WordFilter, int min, int max, int offset1, int offset2, char *seqGap, char *seqCtg1, char *seqCtg2, int cut1, int cut2); int localGraph(READNEARBY *rdArray, int num, CTGinSCAF *ctg1, CTGinSCAF *ctg2, int origOverlap, Kmer *kmerCtg1, Kmer *kmerCtg2, int overlap, DARRAY *gapSeqArray, char *seqCtg1, char *seqCtg2, char *seqGap) { /**************** put kmer in DBgraph ****************/ KmerSet *kmerSet; Kmer WordFilter = (((Kmer) 1) << (2 * overlap)) - 1; /* if(ctg1->ctgID==56410&&ctg2->ctgID==61741) printf("Extract %d reads for gap [%d %d]\n",num,ctg1->ctgID,ctg2->ctgID); */ kmerSet = readsInGap2DBgraph(rdArray, num, ctg1, ctg2, origOverlap, kmerCtg1, kmerCtg2, overlap, WordFilter); time_t tt; time(&tt); // srand48((int)tt); /* int i,j; for(i=0;i<2;i++){ int array[] = {0,1,2,3}; for(j=4;j>0;j--) fprintf(stderr,"%d ", nPick1(array,j)); } fprintf(stderr,"\n"); */ /***************** search path to connect contig ends ********/ int gapLen = ctg2->start - ctg1->end - origOverlap + overlap; int min = gapLen - GLDiff > 0 ? gapLen - GLDiff : 0; int max = gapLen + GLDiff < 10 ? 10 : gapLen + GLDiff; //count kmer number for contig1 and contig2 ends int len1, len2; len1 = CTGendLen < contig_array[ctg1->ctgID].length + origOverlap ? CTGendLen : contig_array[ctg1->ctgID].length + origOverlap; len2 = CTGendLen < contig_array[ctg2->ctgID].length + origOverlap ? CTGendLen : contig_array[ctg2->ctgID].length + origOverlap; len1 -= overlap - 1; len2 -= overlap - 1; //int pathNum = 2; int offset1 = 0, offset2 = 0, cut1 = 0, cut2 = 0; int pathNum = searchFgap(kmerSet, ctg1, ctg2, kmerCtg1, kmerCtg2, origOverlap, overlap, gapSeqArray, len1, len2, WordFilter, &offset1, &offset2, seqGap, &cut1, &cut2); //printf("SF: %d K %d\n",pathNum,overlap); if(pathNum == 0) { free_kmerset(kmerSet); return 0; } else if(pathNum == 1) { free_kmerset(kmerSet); return 1; }/* else{ printf("ret %d\n",pathNum); free_kmerset(kmerSet); return 0; } */ /******************* cross the gap by single reads *********/ //kmerSet_markTandem(kmerSet,WordFilter,overlap); maskRepeatNode(kmerSet, kmerCtg1, kmerCtg2, overlap, len1, len2, max, WordFilter); boolean found = readsCrossGap(rdArray, num, origOverlap, gapSeqArray, kmerCtg1, kmerCtg2, overlap, ctg1, ctg2, kmerSet, WordFilter, min, max, offset1, offset2, seqGap, seqCtg1, seqCtg2, cut1, cut2); if(found) { //fprintf(stderr,"read across\n"); free_kmerset(kmerSet); return found; } else { free_kmerset(kmerSet); return 0; } } static void kmerSet_markTandem(KmerSet *set, Kmer WordFilter, int overlap) { kmer_t *rs; long long counter = 0; int num_route, steps; int min = 1, max = overlap, maxRoute = 1; int traceCounter; set->iter_ptr = 0; while(set->iter_ptr < set->size) { if(!is_kmer_entity_null(set->flags, set->iter_ptr)) { rs = set->array + set->iter_ptr; if(rs->inEdge > 0) { set->iter_ptr ++; continue; } num_route = traceCounter = 0; steps = 0; trace4Repeat(rs->seq, steps, min, max, &num_route, set, rs->seq, overlap, WordFilter, &traceCounter, maxRoute, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL); if(num_route < 1) { set->iter_ptr ++; continue; } /* printKmer(stderr,rs->seq,overlap); fprintf(stderr, "\n"); */ rs->checked = 1; counter++; } set->iter_ptr ++; } } /******************* the following is for read-crossing gaps *************************/ #define MAXREADLENGTH 100 static const int INDEL = 0; static const int SIM[4][4] = { {1, 0, 0, 0}, {0, 1, 0, 0}, {0, 0, 1, 0}, {0, 0, 0, 1} }; static char fastSequence[MAXREADLENGTH]; static char slowSequence[MAXREADLENGTH]; static int Fmatrix[MAXREADLENGTH + 1][MAXREADLENGTH + 1]; static int slowToFastMapping[MAXREADLENGTH + 1]; static int fastToSlowMapping[MAXREADLENGTH + 1]; static int max(int A, int B, int C) { A = A >= B ? A : B; return (A >= C ? A : C); } static int compareSequences(char *sequence1, char *sequence2, int length1, int length2) { if(length1 < 1 || length2 < 1 || length1 > MAXREADLENGTH || length2 > MAXREADLENGTH) return 0; int i, j; int Choice1, Choice2, Choice3; int maxScore; for (i = 0; i <= length1; i++) Fmatrix[i][0] = 0; for (j = 0; j <= length2; j++) Fmatrix[0][j] = 0; for (i = 1; i <= length1; i++) { for (j = 1; j <= length2; j++) { Choice1 = Fmatrix[i - 1][j - 1] + SIM[(int) sequence1[i - 1]] [(int) sequence2[j - 1]]; Choice2 = Fmatrix[i - 1][j] + INDEL; Choice3 = Fmatrix[i][j - 1] + INDEL; Fmatrix[i][j] = max(Choice1, Choice2, Choice3); } } maxScore = Fmatrix[length1][length2]; return maxScore; } static void mapSlowOntoFast(int slowSeqLength, int fastSeqLength) { int slowIndex = slowSeqLength; int fastIndex = fastSeqLength; int fastn, slown; if (slowIndex == 0) { slowToFastMapping[0] = fastIndex; while (fastIndex >= 0) fastToSlowMapping[fastIndex--] = 0; return; } if (fastIndex == 0) { while (slowIndex >= 0) slowToFastMapping[slowIndex--] = 0; fastToSlowMapping[0] = slowIndex; return; } while (slowIndex > 0 && fastIndex > 0) { fastn = (int) fastSequence[fastIndex - 1]; //getCharInTightString(fastSequence,fastIndex-1); slown = (int) slowSequence[slowIndex - 1]; //getCharInTightString(slowSequence,slowIndex-1); if (Fmatrix[fastIndex][slowIndex] == Fmatrix[fastIndex - 1][slowIndex - 1] + SIM[fastn][slown]) { fastToSlowMapping[--fastIndex] = --slowIndex; slowToFastMapping[slowIndex] = fastIndex; } else if (Fmatrix[fastIndex][slowIndex] == Fmatrix[fastIndex - 1][slowIndex] + INDEL) fastToSlowMapping[--fastIndex] = slowIndex - 1; else if (Fmatrix[fastIndex][slowIndex] == Fmatrix[fastIndex][slowIndex - 1] + INDEL) slowToFastMapping[--slowIndex] = fastIndex - 1; else { printf("compareSequence: Error trace\n"); fflush(stdout); abort(); } } while (slowIndex > 0) slowToFastMapping[--slowIndex] = -1; while (fastIndex > 0) fastToSlowMapping[--fastIndex] = -1; slowToFastMapping[slowSeqLength] = fastSeqLength; fastToSlowMapping[fastSeqLength] = slowSeqLength; } static boolean chopReadFillGap(int len_seq, int overlap, char *src_seq, char *bal_seq, KmerSet *kset, Kmer WORDF, int *start, int *end, boolean *bal, Kmer *KmerCtg1, int len1, Kmer *KmerCtg2, int len2, int *index1, int *index2) { int index, j = 0, bal_j; Kmer word, bal_word; int flag = 0, bal_flag = 0; int ctg1start, bal_ctg1start, ctg2end, bal_ctg2end; int seqStart, bal_start, seqEnd, bal_end; kmer_t *node; boolean found; if(len_seq < overlap + 1) { return 0; } word = 0; for (index = 0; index < overlap; index++) { word <<= 2; word += src_seq[index]; } reverseComplementSeq(src_seq, len_seq, bal_seq); // complementary node bal_word = reverseComplement(word, overlap); bal_j = len_seq - 0 - overlap; // 0; flag = bal_flag = 0; if(word < bal_word) { found = search_kmerset(kset, word, &node); } else { found = search_kmerset(kset, bal_word, &node); } if(found && !node->linear && !node->checked) { if(!flag && node->inEdge == 1) { ctg1start = searchKmerOnCtg(word, KmerCtg1, len1); if(ctg1start >= 0) { flag = 1; seqStart = j + overlap - 1; } } if(!bal_flag && node->inEdge == 2) { bal_ctg2end = searchKmerOnCtg(bal_word, KmerCtg2, len2); if(bal_ctg2end >= 0) { bal_flag = 2; bal_end = bal_j + overlap - 1; } } } for(j = 1; j <= len_seq - overlap; j ++) { word = nextKmerLocal(word, src_seq[j - 1 + overlap], WORDF); bal_j = len_seq - j - overlap; // j; bal_word = prevKmerLocal(bal_word, bal_seq[bal_j], overlap); if(word < bal_word) { found = search_kmerset(kset, word, &node); } else { found = search_kmerset(kset, bal_word, &node); } if(found && !node->linear && !node->checked) { if(!flag && node->inEdge == 1) { ctg1start = searchKmerOnCtg(word, KmerCtg1, len1); if(ctg1start >= 0) { flag = 1; seqStart = j + overlap - 1; } } else if(flag == 1 && node->inEdge == 1) { index = searchKmerOnCtg(word, KmerCtg1, len1); if(index >= 0 && index > ctg1start) // choose hit closer to gap { ctg1start = index; seqStart = j + overlap - 1; } } else if(flag == 1 && node->inEdge == 2) { ctg2end = searchKmerOnCtg(word, KmerCtg2, len2); if(ctg2end >= 0) { flag = 3; seqEnd = j + overlap - 1; break; } } if(!bal_flag && node->inEdge == 2) { bal_ctg2end = searchKmerOnCtg(bal_word, KmerCtg2, len2); if(bal_ctg2end >= 0) { bal_flag = 2; bal_end = bal_j + overlap - 1; } } else if(bal_flag == 2 && node->inEdge == 2) { index = searchKmerOnCtg(bal_word, KmerCtg2, len2); if(index >= 0 && index < bal_ctg2end) // choose hit closer to gap { bal_ctg2end = index; bal_end = bal_j + overlap - 1; } } else if(bal_flag == 2 && node->inEdge == 1) { bal_ctg1start = searchKmerOnCtg(bal_word, KmerCtg1, len1); if(bal_ctg1start >= 0) { bal_flag = 3; bal_start = bal_j + overlap - 1; break; } } } } if(flag == 3) { *start = seqStart; *end = seqEnd; *bal = 0; *index1 = ctg1start; *index2 = ctg2end; return 1; } else if(bal_flag == 3) { *start = bal_start; *end = bal_end; *bal = 1; *index1 = bal_ctg1start; *index2 = bal_ctg2end; return 1; } return 0; } static int cutSeqFromTightStr(char *tightStr, int length, int start, int end, int revS, char *src_seq) { int i, index = 0; end = end < length ? end : length - 1; start = start >= 0 ? start : 0; if(!revS) { for(i = start; i <= end; i++) src_seq[index++] = getCharInTightString(tightStr, i); } else { for(i = length - 1 - start; i >= length - end - 1; i--) src_seq[index++] = int_comp(getCharInTightString(tightStr, i)); } return end - start + 1; } static int cutSeqFromCtg(unsigned int ctgID, int start, int end, char *sequence, int originOverlap) { unsigned int bal_ctg = getTwinCtg(ctgID); if(contig_array[ctgID].length < 1) return 0; int length = contig_array[ctgID].length + originOverlap; if(contig_array[ctgID].seq) return cutSeqFromTightStr(contig_array[ctgID].seq, length, start, end, 0, sequence); else return cutSeqFromTightStr(contig_array[bal_ctg].seq, length, start, end, 1, sequence); } static int cutSeqFromRead(char *src_seq, int length, int start, int end, char *sequence) { if(end >= length) printf("******: end %d length %d\n", end, length); end = end < length ? end : length - 1; start = start >= 0 ? start : 0; int i; for(i = start; i <= end; i++) sequence[i - start] = src_seq[i]; return end - start + 1; } static void printSeq(FILE *fo, char *seq, int len) { int i; for(i = 0; i < len; i++) fprintf(fo, "%c", int2base((int)seq[i])); fprintf(fo, "\n"); } static boolean readsCrossGap(READNEARBY *rdArray, int num, int originOverlap, DARRAY *gapSeqArray, Kmer *kmerCtg1, Kmer *kmerCtg2, int overlap, CTGinSCAF *ctg1, CTGinSCAF *ctg2, KmerSet *kmerS, Kmer WordFilter, int min, int max, int offset1, int offset2, char *seqGap, char *seqCtg1, char *seqCtg2, int cut1, int cut2) { int i, j, start, end, startOnCtg1, endOnCtg2; char *bal_seq; char *src_seq; char *pt; boolean bal, ret = 0, FILL; double maxScore = 0.0; int maxIndex; int lenCtg1, lenCtg2; //build sequences on left and right of the uncertain region int buffer_size = maxReadLen > 100 ? maxReadLen : 100; int length = contig_array[ctg1->ctgID].length + originOverlap; if(buffer_size > offset1) { lenCtg1 = cutSeqFromCtg(ctg1->ctgID, length - cut1 - (buffer_size - offset1), length - 1 - cut1, seqCtg1, originOverlap); for(i = 0; i < offset1; i++) seqCtg1[lenCtg1 + i] = seqGap[i]; lenCtg1 += offset1; } else { for(i = offset1 - buffer_size; i < offset1; i++) seqCtg1[i + buffer_size - offset1] = seqGap[i]; lenCtg1 = buffer_size; } length = contig_array[ctg2->ctgID].length + originOverlap; if(buffer_size > offset2) { lenCtg2 = cutSeqFromCtg(ctg2->ctgID, cut2, buffer_size - offset2 - 1 + cut2, &(seqCtg2[offset2]), originOverlap); for(i = 0; i < offset2; i++) seqCtg2[i] = seqGap[i + offset1]; lenCtg2 += offset2; } else { for(i = 0; i < buffer_size; i++) seqCtg2[i] = seqGap[i + offset1]; lenCtg2 = buffer_size; } /* if(offset1>0||offset2>0){ for(i=0;i max) continue; if(overlap + (len1 - startOnCtg1 - 1) + endOnCtg2 - (end - start) > (int)originOverlap) continue; // contig1 and contig2 could not overlap more than origOverlap bases START[i] = start; END[i] = end; INDEX1[i] = startOnCtg1; INDEX2[i] = endOnCtg2; BAL[i] = bal; int matchLen = 2 * overlap < (end - start + overlap) ? 2 * overlap : (end - start + overlap); int match; int alignLen = matchLen; //compare the left of hit kmer on ctg1 //int ctgLeft = (contig_array[ctg1->ctgID].length+originOverlap)-(len1+overlap-1)+startOnCtg1; int ctgLeft = (lenCtg1) - (len1 + overlap - 1) + startOnCtg1; int readLeft = start - overlap + 1; int cmpLen = ctgLeft < readLeft ? ctgLeft : readLeft; cmpLen = cmpLen <= MAXREADLENGTH ? cmpLen : MAXREADLENGTH; //cutSeqFromCtg(ctg1->ctgID,ctgLeft-cmpLen,ctgLeft-1,fastSequence,originOverlap); cutSeqFromRead(seqCtg1, lenCtg1, ctgLeft - cmpLen, ctgLeft - 1, fastSequence); if(!bal) cutSeqFromRead(src_seq, rdArray[i].len, readLeft - cmpLen, readLeft - 1, slowSequence); else cutSeqFromRead(bal_seq, rdArray[i].len, readLeft - cmpLen, readLeft - 1, slowSequence); match = compareSequences(fastSequence, slowSequence, cmpLen, cmpLen); alignLen += cmpLen; matchLen += match; //compare the right of hit kmer on ctg1 int ctgRight = len1 - startOnCtg1 - 1; cmpLen = ctgRight < (rdArray[i].len - start - 1) ? ctgRight : (rdArray[i].len - start - 1); cmpLen = cmpLen <= MAXREADLENGTH ? cmpLen : MAXREADLENGTH; //cutSeqFromCtg(ctg1->ctgID,ctgLeft+overlap,ctgLeft+overlap+cmpLen-1,fastSequence,originOverlap); cutSeqFromRead(seqCtg1, lenCtg1, ctgLeft + overlap, ctgLeft + overlap + cmpLen - 1, fastSequence); if(!bal) cutSeqFromRead(src_seq, rdArray[i].len, start + 1, start + cmpLen, slowSequence); else cutSeqFromRead(bal_seq, rdArray[i].len, start + 1, start + cmpLen, slowSequence); match = compareSequences(fastSequence, slowSequence, cmpLen, cmpLen); //fprintf(stderr,"%d -- %d\n",match,cmpLen); alignLen += cmpLen; matchLen += match; //compare the left of hit kmer on ctg2 ctgLeft = endOnCtg2; readLeft = end - overlap + 1; cmpLen = ctgLeft < readLeft ? ctgLeft : readLeft; cmpLen = ctgLeft <= MAXREADLENGTH ? ctgLeft : MAXREADLENGTH; //cutSeqFromCtg(ctg2->ctgID,endOnCtg2-cmpLen,endOnCtg2-1,fastSequence,originOverlap); cutSeqFromRead(seqCtg2, lenCtg2, endOnCtg2 - cmpLen, endOnCtg2 - 1, fastSequence); if(!bal) cutSeqFromRead(src_seq, rdArray[i].len, readLeft - cmpLen, readLeft - 1, slowSequence); else cutSeqFromRead(bal_seq, rdArray[i].len, readLeft - cmpLen, readLeft - 1, slowSequence); match = compareSequences(fastSequence, slowSequence, cmpLen, cmpLen); alignLen += cmpLen; matchLen += match; //compare the right of hit kmer on ctg2 //ctgRight = contig_array[ctg2->ctgID].length+originOverlap-endOnCtg2-overlap; ctgRight = lenCtg2 - endOnCtg2 - overlap; cmpLen = ctgRight < (rdArray[i].len - end - 1) ? ctgRight : (rdArray[i].len - end - 1); cmpLen = cmpLen <= MAXREADLENGTH ? cmpLen : MAXREADLENGTH; //cutSeqFromCtg(ctg2->ctgID,endOnCtg2+overlap,endOnCtg2+overlap+cmpLen-1,fastSequence,originOverlap); cutSeqFromRead(seqCtg2, lenCtg2, endOnCtg2 + overlap, endOnCtg2 + overlap + cmpLen - 1, fastSequence); if(!bal) cutSeqFromRead(src_seq, rdArray[i].len, end + 1, end + cmpLen, slowSequence); else cutSeqFromRead(bal_seq, rdArray[i].len, end + 1, end + cmpLen, slowSequence); match = compareSequences(fastSequence, slowSequence, cmpLen, cmpLen); alignLen += cmpLen; matchLen += match; /* if(cmpLen>0&&match!=cmpLen+overlap){ printSeq(stderr,fastSequence,cmpLen+overlap); printSeq(stderr,slowSequence,cmpLen+overlap); printKmer(stderr,kmerCtg2[endOnCtg2],overlap); fprintf(stderr,": %d(%d)\n",bal,endOnCtg2); }else if(cmpLen>0&&match==cmpLen+overlap) fprintf(stderr,"Perfect\n"); */ double score = (double)matchLen / alignLen; if(maxScore < score) { maxScore = score; //fprintf(stderr,"%4.2f (%d/%d)\n",maxScore,matchLen,alignLen); maxIndex = i; } SCORE[i] = score; } /* if(maxScore>0.0) fprintf(stderr,"SCORE: %4.2f\n",maxScore); */ if(maxScore > 0.9) { /* for(i=0;i 0 ? offset1 - (len1 - INDEX1[maxIndex] - 1) : 0; int rightRemain = offset2 - (overlap + INDEX2[maxIndex]) > 0 ? offset2 - (overlap + INDEX2[maxIndex]) : 0; ctg1->gapSeqOffset = gapSeqArray->item_c; ctg1->gapSeqLen = END[maxIndex] - START[maxIndex] + leftRemain + rightRemain; if(darrayPut(gapSeqArray, ctg1->gapSeqOffset + (END[maxIndex] - START[maxIndex] + leftRemain + rightRemain) / 4)) { pt = (char *)darrayPut(gapSeqArray, ctg1->gapSeqOffset); for(j = 0; j < leftRemain; j++) //get the left side of the gap region from search { writeChar2tightString(seqGap[j], pt, j); fprintf(stderr, "%c", int2base(seqGap[j])); } for(j = START[maxIndex] + 1; j <= END[maxIndex]; j++) { if(BAL[maxIndex]) { writeChar2tightString(bal_seq[j], pt, j - START[maxIndex] - 1 + leftRemain); fprintf(stderr, "%c", int2base(bal_seq[j])); } else { writeChar2tightString(src_seq[j], pt, j - START[maxIndex] - 1 + leftRemain); fprintf(stderr, "%c", int2base(src_seq[j])); } } for(j = offset2 - rightRemain; j < offset2; j++) //get the right side of the gap region from search { writeChar2tightString(seqGap[j + leftRemain], pt, j + END[maxIndex] - START[maxIndex] + leftRemain); fprintf(stderr, "%c", int2base(seqGap[j + leftRemain])); } fprintf(stderr, ": GAPSEQ (%d+%d)(%d+%d)(%d+%d)(%d+%d) B %d\n", offset1, offset2, cut1, cut2, len1 - INDEX1[maxIndex] - 1, INDEX2[maxIndex], START[maxIndex], END[maxIndex], BAL[maxIndex]); ctg1->cutTail = len1 - INDEX1[maxIndex] - 1 - offset1 + cut1 > cut1 ? len1 - INDEX1[maxIndex] - 1 - offset1 + cut1 : cut1; ctg2->cutHead = overlap + INDEX2[maxIndex] - offset2 + cut2 > cut2 ? overlap + INDEX2[maxIndex] - offset2 + cut2 : cut2; ctg2->scaftig_start = 0; ret = 1; } } free((void *)START); free((void *)END); free((void *)INDEX1); free((void *)INDEX2); free((void *)SCORE); free((void *)BAL); free((void *)src_seq); free((void *)bal_seq); return ret; }