/* * prlReadFillGap.c * * Copyright (c) 2011-2013 BGI-Shenzhen . * * This file is part of SOAPdenovo-Trans. * * SOAPdenovo-Trans is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * SOAPdenovo-Trans is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with SOAPdenovo-Trans. If not, see . * */ #include "stdinc.h" #include "newhash.h" #include "extfunc.h" #include "extvab.h" #define RDBLOCKSIZE 50 #define CTGappend 50 static Kmer MAXKMER; static int Ncounter; static int allGaps; // for multi threads static int *counters; static pthread_mutex_t mutex; static int scafBufSize = 100; static boolean *flagBuf; static unsigned char *thrdNoBuf; static STACK **ctgStackBuffer; static int scafCounter; static int scafInBuf; static unsigned int *loci_id; static unsigned int *loci_count; static char ** curr_type; static void MarkCtgOccu (unsigned int ctg); /* static void printRead(int len,char *seq) { int j; fprintf(stderr,">read\n"); for(j=0;jlen = len; rd->dis = pos; rd->seqStarter = starter; } static void convertIndex () { int *length_array = (int *) ckalloc ((num_ctg + 1) * sizeof (int)); unsigned int i; for (i = 1; i <= num_ctg; i++) { length_array[i] = 0; } for (i = 1; i <= num_ctg; i++) { if (index_array[i] > 0) { length_array[index_array[i]] = i; } } for (i = 1; i <= num_ctg; i++) { index_array[i] = length_array[i]; } //contig i with new index: index_array[i] free ((void *) length_array); } static long long getRead1by1 (FILE * fp, DARRAY * readSeqInGap) { long long readCounter = 0; if (!fp) { return readCounter; } int len, pos; unsigned int ctgID; long long starter; char *pt; char *freadBuf = (char *) ckalloc ((maxReadLen / 4 + 1) * sizeof (char)); while (fread (&len, sizeof (int), 1, fp) == 1) { if (fread (&ctgID, sizeof (int), 1, fp) != 1) { break; } if (fread (&pos, sizeof (int), 1, fp) != 1) { break; } if (fread (freadBuf, sizeof (char), len / 4 + 1, fp) != (unsigned) (len / 4 + 1)) { break; } //put seq to dynamic array starter = readSeqInGap->item_c; if (!darrayPut (readSeqInGap, starter + len / 4)) // make sure there's room for this seq { break; } pt = (char *) darrayPut (readSeqInGap, starter); bcopy (freadBuf, pt, len / 4 + 1); attach1read2contig (ctgID, len, pos, starter); readCounter++; } free ((void *) freadBuf); return readCounter; } // Darray *readSeqInGap static boolean loadReads4gap (char *graphfile) { FILE *fp, *fp2; char name[1024]; long long readCounter; sprintf (name, "%s.readInGap", graphfile); fp = fopen (name, "rb"); sprintf (name, "%s.longReadInGap", graphfile); fp2 = fopen (name, "rb"); if (!fp && !fp2) { return 0; } if (!orig2new) { convertIndex (); orig2new = 1; } readSeqInGap = (DARRAY *) createDarray (1000000, sizeof (char)); if (fp) { readCounter = getRead1by1 (fp, readSeqInGap); printf ("Loaded %lld reads from %s.readInGap\n", readCounter, graphfile); fclose (fp); } if (fp2) { readCounter = getRead1by1 (fp2, readSeqInGap); printf ("Loaded %lld reads from %s.LongReadInGap\n", readCounter, graphfile); fclose (fp2); } return 1; } static void debugging1 () { unsigned int i; if (orig2new) { unsigned int *length_array = (unsigned int *) ckalloc ((num_ctg + 1) * sizeof (unsigned int)); //use length_array to change info in index_array for (i = 1; i <= num_ctg; i++) { length_array[i] = 0; } for (i = 1; i <= num_ctg; i++) { if (index_array[i] > 0) { length_array[index_array[i]] = i; } } for (i = 1; i <= num_ctg; i++) { index_array[i] = length_array[i]; } //contig i with original index: index_array[i] orig2new = 0; } READNEARBY *rd; int j; char *pt; for (i = 1; i <= num_ctg; i++) { if (!contig_array[i].closeReads) { continue; } if (index_array[i] != 735) { continue; } printf ("contig %d, len %d: \n", index_array[i], contig_array[i].length); stackBackup (contig_array[i].closeReads); while ((rd = (READNEARBY *) stackPop (contig_array[i].closeReads)) != NULL) { printf ("%d\t%d\t%lld\t", rd->dis, rd->len, rd->seqStarter); pt = (char *) darrayGet (readSeqInGap, rd->seqStarter); for (j = 0; j < rd->len; j++) { printf ("%c", int2base ((int) getCharInTightString (pt, j))); } printf ("\n"); } stackRecover (contig_array[i].closeReads); } } static void initiateCtgInScaf (CTGinSCAF * actg) { actg->cutTail = 0; actg->cutHead = overlaplen; actg->gapSeqLen = 0; } static int procGap (char *line, STACK * ctgsStack) { char *tp; int length, i, seg; unsigned int ctg; CTGinSCAF *ctgPt; tp = strtok (line, " "); tp = strtok (NULL, " "); //length length = atoi (tp); tp = strtok (NULL, " "); //seg seg = atoi (tp); if (!seg) { return length; } for (i = 0; i < seg; i++) { tp = strtok (NULL, " "); ctg = atoi (tp); MarkCtgOccu (ctg); ctgPt = (CTGinSCAF *) stackPush (ctgsStack); initiateCtgInScaf (ctgPt); ctgPt->ctgID = ctg; ctgPt->start = 0; ctgPt->end = 0; ctgPt->scaftig_start = 0; ctgPt->mask = 1; } return length; } static void debugging2 (int index, STACK * ctgsStack) { CTGinSCAF *actg; stackBackup (ctgsStack); printf (">scaffold%d\t%d 0.0\n", index, ctgsStack->item_c); while ((actg = stackPop (ctgsStack)) != NULL) { printf ("%d\t%d\t%d\t%d\n", actg->ctgID, actg->start, actg->end, actg->scaftig_start); } stackRecover (ctgsStack); } static int cmp_reads (const void *a, const void *b) { READNEARBY *A, *B; A = (READNEARBY *) a; B = (READNEARBY *) b; if (A->dis > B->dis) { return 1; } else if (A->dis == B->dis) { return 0; } else { return -1; } } static void cutRdArray (READNEARBY * rdArray, int gapStart, int gapEnd, int *count, int arrayLen, READNEARBY * cutArray) { int i; int num = 0; for (i = 0; i < arrayLen; i++) { if (rdArray[i].dis > gapEnd) { break; } if ((rdArray[i].dis + rdArray[i].len) >= gapStart) { cutArray[num].dis = rdArray[i].dis; cutArray[num].len = rdArray[i].len; cutArray[num++].seqStarter = rdArray[i].seqStarter; } } *count = num; } static void outputTightStr (FILE * fp, char *tightStr, int start, int length, int outputlen, int revS, int *col) { int i; int end; int column = *col; if (!revS) { end = start + outputlen <= length ? start + outputlen : length; for (i = start; i < end; i++) { fprintf (fp, "%c", int2base ((int) getCharInTightString (tightStr, i))); if ((++column) % 100 == 0) { //column = 0; fprintf (fp, "\n"); } } } else { end = length - start - outputlen - 1 >= 0 ? length - start - outputlen : 0; for (i = length - 1 - start; i >= end; i--) { fprintf (fp, "%c", int2compbase ((int) getCharInTightString (tightStr, i))); if ((++column) % 100 == 0) { fprintf (fp, "\n"); //column = 0; } } } *col = column; } static void outputTightStrLowerCase (FILE * fp, char *tightStr, int start, int length, int outputlen, int revS, int *col) { int i; int end; int column = *col; if (!revS) { end = start + outputlen <= length ? start + outputlen : length; for (i = start; i < end; i++) { fprintf (fp, "%c", "actg"[(int) getCharInTightString (tightStr, i)]); if ((++column) % 100 == 0) { //column = 0; fprintf (fp, "\n"); } } } else { end = length - start - outputlen - 1 >= 0 ? length - start - outputlen : 0; for (i = length - 1 - start; i >= end; i--) { fprintf (fp, "%c", "tgac"[(int) getCharInTightString (tightStr, i)]); if ((++column) % 100 == 0) { fprintf (fp, "\n"); //column = 0; } } } *col = column; } static void outputNs (FILE * fp, int gapN, int *col) { int i, column = *col; for (i = 0; i < gapN; i++) { fprintf (fp, "N"); if ((++column) % 100 == 0) { //column = 0; fprintf (fp, "\n"); } } *col = column; } static void outputGapInfo (unsigned int ctg1, unsigned int ctg2) { unsigned int bal_ctg1 = getTwinCtg (ctg1); unsigned int bal_ctg2 = getTwinCtg (ctg2); if (isLargerThanTwin (ctg1)) { fprintf (stderr, "%d\t", index_array[bal_ctg1]); } else { fprintf (stderr, "%d\t", index_array[ctg1]); } if (isLargerThanTwin (ctg2)) { fprintf (stderr, "%d\n", index_array[bal_ctg2]); } else { fprintf (stderr, "%d\n", index_array[ctg2]); } } static void output1gap (FILE * fo, int scafIndex, CTGinSCAF * prevCtg, CTGinSCAF * actg, DARRAY * gapSeqArray) { unsigned int ctg1, bal_ctg1, length1; int start1, outputlen1; unsigned int ctg2, bal_ctg2, length2; int start2, outputlen2; char *pt; int column = 0; ctg1 = prevCtg->ctgID; bal_ctg1 = getTwinCtg (ctg1); start1 = prevCtg->cutHead; length1 = contig_array[ctg1].length + overlaplen; if (length1 - prevCtg->cutTail - start1 > CTGappend) { outputlen1 = CTGappend; start1 = length1 - prevCtg->cutTail - outputlen1; } else { outputlen1 = length1 - prevCtg->cutTail - start1; } ctg2 = actg->ctgID; bal_ctg2 = getTwinCtg (ctg2); start2 = actg->cutHead; length2 = contig_array[ctg2].length + overlaplen; if (length2 - actg->cutTail - start2 > CTGappend) { outputlen2 = CTGappend; } else { outputlen2 = length2 - actg->cutTail - start2; } if (isLargerThanTwin (ctg1)) { fprintf (fo, ">S%d_C%d_L%d_G%d", scafIndex, index_array[bal_ctg1], outputlen1, prevCtg->gapSeqLen); } else { fprintf (fo, ">S%d_C%d_L%d_G%d", scafIndex, index_array[ctg1], outputlen1, prevCtg->gapSeqLen); } if (isLargerThanTwin (ctg2)) { fprintf (fo, "_C%d_L%d\n", index_array[bal_ctg2], outputlen2); } else { fprintf (fo, "_C%d_L%d\n", index_array[ctg2], outputlen2); } if (contig_array[ctg1].seq) { outputTightStr (fo, contig_array[ctg1].seq, start1, length1, outputlen1, 0, &column); } else if (contig_array[bal_ctg1].seq) { outputTightStr (fo, contig_array[bal_ctg1].seq, start1, length1, outputlen1, 1, &column); } pt = (char *) darrayPut (gapSeqArray, prevCtg->gapSeqOffset); outputTightStrLowerCase (fo, pt, 0, prevCtg->gapSeqLen, prevCtg->gapSeqLen, 0, &column); if (contig_array[ctg2].seq) { outputTightStr (fo, contig_array[ctg2].seq, start2, length2, outputlen2, 0, &column); } else if (contig_array[bal_ctg2].seq) { outputTightStr (fo, contig_array[bal_ctg2].seq, start2, length2, outputlen2, 1, &column); } fprintf (fo, "\n"); } static void outputGapSeq (FILE * fo, int index, STACK * ctgsStack, DARRAY * gapSeqArray) { CTGinSCAF *actg, *prevCtg = NULL; stackRecover (ctgsStack); fprintf (fo, ">scaffold%d\n", index); while ((actg = stackPop (ctgsStack)) != NULL) { if (prevCtg) { if (actg->scaftig_start) { fprintf (fo, "0\t%d\t%d\n", prevCtg->mask, actg->mask); } else { fprintf (fo, "1\t%d\t%d\n", prevCtg->mask, actg->mask); } } /* if(prevCtg&&prevCtg->gapSeqLen>0) output1gap(fo,index,prevCtg,actg,gapSeqArray); */ prevCtg = actg; } } static void outputScafSeq (FILE * fo, FILE * foc, FILE * fo3, int index, STACK * ctgsStack, DARRAY * gapSeqArray, unsigned int locus_id,unsigned int locus_count,char * type) { CTGinSCAF *actg, *prevCtg = NULL; unsigned int ctg, bal_ctg, ctg_out, length; int start, outputlen, gapN; char *pt; int column = 0; long long cvgSum = 0; int lenSum = 0; int ctg_start_pos = 0, lu_len = 0, lu_end = 0; char strand; stackRecover (ctgsStack); while ((actg = stackPop (ctgsStack)) != NULL) { if (!(contig_array[actg->ctgID].cvg > 0)) { continue; } lenSum += contig_array[actg->ctgID].length; cvgSum += contig_array[actg->ctgID].length * contig_array[actg->ctgID].cvg; } if (lenSum > 0) { fprintf (fo, ">scaffold%d Locus_%d_%d %4.1f %s\n", index, locus_id,locus_count ,(double) cvgSum / lenSum,type); } else { fprintf (fo, ">scaffold%d Locus_%d_%d 0.0 %s\n", index, locus_id,locus_count,type); } fprintf (foc, ">scaffold%d Locus_%d_%d\n", index, locus_id,locus_count); stackRecover (ctgsStack); lenSum = 0; while ((actg = stackPop (ctgsStack)) != NULL) { ctg = actg->ctgID; bal_ctg = getTwinCtg (ctg); length = contig_array[ctg].length + overlaplen; if (prevCtg && actg->scaftig_start) { gapN = actg->start - prevCtg->start - contig_array[prevCtg->ctgID].length; gapN = gapN > 0 ? gapN : 1; outputNs (fo, gapN, &column); lenSum++; fprintf (fo3, "scaffold%d\t%d\t%d\t%d\tN\t%d\tfragment\tyes\n", index, ctg_start_pos+1,ctg_start_pos + gapN,lenSum,gapN); ctg_start_pos += gapN; //outputGapInfo(prevCtg->ctgID,ctg); Ncounter++; } if (!prevCtg) { start = 0; } else { start = actg->cutHead; } outputlen = length - start - actg->cutTail; if (contig_array[ctg].seq) { outputTightStr (fo, contig_array[ctg].seq, start, length, outputlen, 0, &column); lu_end = start + outputlen > length ? length : start + outputlen; lu_len = lu_end - start; strand = '+'; fprintf (foc, "%d\t", ctg); lenSum++; fprintf (fo3, "scaffold%d\t%d\t%d\t%d\tW\t%d\t%d\t%d\t+\n",index,ctg_start_pos+1,ctg_start_pos+lu_len,lenSum,ctg,start+1,lu_end); } else if (contig_array[bal_ctg].seq) { outputTightStr (fo, contig_array[bal_ctg].seq, start, length, outputlen, 1, &column); lu_end = length - start - outputlen - 1 >= 0 ? length - start - outputlen : 0; lu_len = length - start - lu_end; strand = '-'; fprintf (foc, "%d\t", bal_ctg); lenSum++; fprintf (fo3, "scaffold%d\t%d\t%d\t%d\tW\t%d\t%d\t%d\t-\n",index,ctg_start_pos+1,ctg_start_pos+lu_len,lenSum,bal_ctg,lu_end+1,length-start); } fprintf (foc, "%d\t%c\t%d\n", ctg_start_pos, strand, lu_len); ctg_start_pos += lu_len; if (actg->gapSeqLen < 1) { prevCtg = actg; continue; } pt = (char *) darrayPut (gapSeqArray, actg->gapSeqOffset); outputTightStrLowerCase (fo, pt, 0, actg->gapSeqLen, actg->gapSeqLen, 0, &column); lenSum++; fprintf (fo3, "scaffold%d\t%d\t%d\t%d\tN\t%d\tfragment\tyes\n",index,ctg_start_pos+1,ctg_start_pos+actg->gapSeqLen,lenSum,actg->gapSeqLen); ctg_start_pos += actg->gapSeqLen; prevCtg = actg; } fprintf (fo, "\n"); } static void fill1scaf (int index, STACK * ctgsStack, int thrdID); static void check1scaf (int t, int thrdID) { if (flagBuf[t]) { return; } boolean late = 0; pthread_mutex_lock (&mutex); if (!flagBuf[t]) { flagBuf[t] = 1; thrdNoBuf[t] = thrdID; } else { late = 1; } pthread_mutex_unlock (&mutex); if (late) { return; } counters[thrdID]++; fill1scaf (scafCounter + t + 1, ctgStackBuffer[t], thrdID); } static void fill1scaf (int index, STACK * ctgsStack, int thrdID) { CTGinSCAF *actg, *prevCtg = NULL; READNEARBY *rdArray, *rdArray4gap, *rd; int numRd = 0, count, maxGLen = 0; unsigned int ctg, bal_ctg; STACK *rdStack; while ((actg = stackPop (ctgsStack)) != NULL) { if (prevCtg) { maxGLen = maxGLen < (actg->start - prevCtg->end) ? (actg->start - prevCtg->end) : maxGLen; } ctg = actg->ctgID; bal_ctg = getTwinCtg (ctg); if (actg->mask) { prevCtg = actg; continue; } if (contig_array[ctg].closeReads) { numRd += contig_array[ctg].closeReads->item_c; } else if (contig_array[bal_ctg].closeReads) { numRd += contig_array[bal_ctg].closeReads->item_c; } prevCtg = actg; } if (numRd < 1) { return; } rdArray = (READNEARBY *) ckalloc (numRd * sizeof (READNEARBY)); rdArray4gap = (READNEARBY *) ckalloc (numRd * sizeof (READNEARBY)); //fprintf(stderr,"scaffold%d reads4gap %d\n",index,numRd); // collect reads appended to contigs in this scaffold int numRd2 = 0; stackRecover (ctgsStack); while ((actg = stackPop (ctgsStack)) != NULL) { ctg = actg->ctgID; bal_ctg = getTwinCtg (ctg); if (actg->mask) { continue; } if (contig_array[ctg].closeReads) { rdStack = contig_array[ctg].closeReads; } else if (contig_array[bal_ctg].closeReads) { rdStack = contig_array[bal_ctg].closeReads; } else { continue; } stackBackup (rdStack); while ((rd = (READNEARBY *) stackPop (rdStack)) != NULL) { rdArray[numRd2].len = rd->len; rdArray[numRd2].seqStarter = rd->seqStarter; if (isSmallerThanTwin (ctg)) { rdArray[numRd2++].dis = actg->start - overlaplen + rd->dis; } else rdArray[numRd2++].dis = actg->start - overlaplen + contig_array[ctg].length - rd->len - rd->dis; } stackRecover (rdStack); } if (numRd2 != numRd) { printf ("##reads numbers doesn't match, %d vs %d when scaffold %d\n", numRd, numRd2, index); } qsort (rdArray, numRd, sizeof (READNEARBY), cmp_reads); //fill gap one by one int gapStart, gapEnd; int numIn = 0; boolean flag; int buffer_size = maxReadLen > 100 ? maxReadLen : 100; int maxGSLen = maxGLen + GLDiff < 10 ? 10 : maxGLen + GLDiff; //fprintf(stderr,"maxGlen %d, maxGSlen %d\n",maxGLen,maxGSLen); char *seqGap = (char *) ckalloc (maxGSLen * sizeof (char)); // temp array for gap sequence Kmer *kmerCtg1 = (Kmer *) ckalloc (buffer_size * sizeof (Kmer)); Kmer *kmerCtg2 = (Kmer *) ckalloc (buffer_size * sizeof (Kmer)); char *seqCtg1 = (char *) ckalloc (buffer_size * sizeof (char)); char *seqCtg2 = (char *) ckalloc (buffer_size * sizeof (char)); prevCtg = NULL; stackRecover (ctgsStack); while ((actg = stackPop (ctgsStack)) != NULL) { if (!prevCtg || !actg->scaftig_start) { prevCtg = actg; continue; } gapStart = prevCtg->end - 100; gapEnd = actg->start - overlaplen + 100; cutRdArray (rdArray, gapStart, gapEnd, &count, numRd, rdArray4gap); numIn += count; /* if(!count){ prevCtg = actg; continue; } */ int overlap; for (overlap = overlaplen; overlap > 14; overlap -= 2) { flag = localGraph (rdArray4gap, count, prevCtg, actg, overlaplen, kmerCtg1, kmerCtg2, overlap, darrayBuf[thrdID], seqCtg1, seqCtg2, seqGap); //free_kmerset(kmerSet); if (flag == 1) { /* fprintf(stderr,"Between ctg %d and %d, Found with %d\n",prevCtg->ctgID ,actg->ctgID,overlap); */ break; } } /* if(count==0) printf("Gap closed without reads\n"); if(!flag) fprintf(stderr,"Between ctg %d and %d, NO routes found\n",prevCtg->ctgID,actg->ctgID); */ prevCtg = actg; } //fprintf(stderr,"____scaffold%d reads in gap %d\n",index,numIn); free ((void *) seqGap); free ((void *) kmerCtg1); free ((void *) kmerCtg2); free ((void *) seqCtg1); free ((void *) seqCtg2); free ((void *) rdArray); free ((void *) rdArray4gap); } static void reverseStack (STACK * dStack, STACK * sStack) { CTGinSCAF *actg, *ctgPt; emptyStack (dStack); while ((actg = (CTGinSCAF *) stackPop (sStack)) != NULL) { ctgPt = (CTGinSCAF *) stackPush (dStack); ctgPt->ctgID = actg->ctgID; ctgPt->start = actg->start; ctgPt->end = actg->end; ctgPt->scaftig_start = actg->scaftig_start; ctgPt->mask = actg->mask; ctgPt->cutHead = actg->cutHead; ctgPt->cutTail = actg->cutTail; ctgPt->gapSeqLen = actg->gapSeqLen; ctgPt->gapSeqOffset = actg->gapSeqOffset; } stackBackup (dStack); } static Kmer tightStr2Kmer (char *tightStr, int start, int length, int revS) { int i; Kmer word; word=kmerZero; if (!revS) { if (start + overlaplen > length) { printf ("tightStr2Kmer A: no enough bases for kmer\n"); return word; } for (i = start; i < start + overlaplen; i++) { word = KmerLeftBitMoveBy2 (word); #ifdef MER127 word.low2 |= getCharInTightString (tightStr, i); #endif #ifdef MER63 word.low |= getCharInTightString (tightStr, i); #endif #ifdef MER31 word |=getCharInTightString (tightStr, i); #endif } } else { if (length - start - overlaplen < 0) { printf ("tightStr2Kmer B: no enough bases for kmer\n"); return word; } for (i = length - 1 - start; i > length - 1 - start - overlaplen; i--) { word = KmerLeftBitMoveBy2 (word); #ifdef MER127 word.low2 |= int_comp (getCharInTightString (tightStr, i)); #endif #ifdef MER63 word.low |= int_comp (getCharInTightString (tightStr, i)); #endif #ifdef MER31 word |= int_comp (getCharInTightString (tightStr, i)); #endif } } return word; } static Kmer maxKmer () { Kmer word; word=kmerZero; int i; for (i = 0; i < overlaplen; i++) { word = KmerLeftBitMoveBy2 (word); #ifdef MER127 word.low2 |= 0x3; #endif #ifdef MER63 word.low|= 0x3; #endif #ifdef MER31 word |= 0x3; #endif } return word; } static int contigCatch (unsigned int prev_ctg, unsigned int ctg) { if (contig_array[prev_ctg].length == 0 || contig_array[ctg].length == 0) { return 0; } Kmer kmerAtEnd, kmerAtStart; Kmer MaxKmer; unsigned int bal_ctg1 = getTwinCtg (prev_ctg); unsigned int bal_ctg2 = getTwinCtg (ctg); int i, start; int len1 = contig_array[prev_ctg].length + overlaplen; int len2 = contig_array[ctg].length + overlaplen; start = contig_array[prev_ctg].length; if (contig_array[prev_ctg].seq) { kmerAtEnd = tightStr2Kmer (contig_array[prev_ctg].seq, start, len1, 0); } else { kmerAtEnd = tightStr2Kmer (contig_array[bal_ctg1].seq, start, len1, 1); } start = 0; if (contig_array[ctg].seq) { kmerAtStart = tightStr2Kmer (contig_array[ctg].seq, start, len2, 0); } else { kmerAtStart = tightStr2Kmer (contig_array[bal_ctg2].seq, start, len2, 1); } MaxKmer = MAXKMER; for (i = 0; i < 10; i++) { if (KmerEqual (kmerAtStart, kmerAtEnd)) { break; } MaxKmer = KmerRightBitMoveBy2 (MaxKmer); kmerAtEnd = KmerAnd (kmerAtEnd, MaxKmer); kmerAtStart = KmerRightBitMoveBy2 (kmerAtStart); } if (i < 10) { return overlaplen - i; } else { return 0; } } static void initStackBuf (STACK ** ctgStackBuffer, int scafBufSize) { int i; for (i = 0; i < scafBufSize; i++) { flagBuf[i] = 1; ctgStackBuffer[i] = (STACK *) createStack (100, sizeof (CTGinSCAF)); } } static void freeStackBuf (STACK ** ctgStackBuffer, int scafBufSize) { int i; for (i = 0; i < scafBufSize; i++) { freeStack (ctgStackBuffer[i]); } } static void threadRoutine (void *para) { PARAMETER *prm; int i; prm = (PARAMETER *) para; //printf("%dth thread with threadID %d, hash_table %p\n",id,prm.threadID,prm.hash_table); while (1) { if (*(prm->selfSignal) == 1) { emptyDarray (darrayBuf[prm->threadID]); for (i = 0; i < scafInBuf; i++) { check1scaf (i, prm->threadID); } *(prm->selfSignal) = 0; } else if (*(prm->selfSignal) == 2) { *(prm->selfSignal) = 0; break; } usleep (1); } } static void creatThrds (pthread_t * threads, PARAMETER * paras) { unsigned char i; int temp; for (i = 0; i < thrd_num; i++) { if ((temp = pthread_create (&threads[i], NULL, (void *) threadRoutine, &(paras[i]))) != 0) { printf ("create threads failed\n"); exit (1); } } printf ("%d thread created in prlReadFillGap\n...\n", thrd_num); } static void sendWorkSignal (unsigned char SIG, unsigned char *thrdSignals) { int t; for (t = 0; t < thrd_num; t++) { thrdSignals[t + 1] = SIG; } while (1) { usleep (10); for (t = 0; t < thrd_num; t++) if (thrdSignals[t + 1]) { break; } if (t == thrd_num) { break; } } } static void thread_wait (pthread_t * threads) { int i; for (i = 0; i < thrd_num; i++) if (threads[i] != 0) { pthread_join (threads[i], NULL); } } static void outputSeqs (FILE * fo, FILE * foc, FILE * fo2, FILE * fo3, int scafInBuf) { int i, thrdID; for (i = 0; i < scafInBuf; i++) { thrdID = thrdNoBuf[i]; outputScafSeq (fo, foc, fo3, scafCounter + i + 1, ctgStackBuffer[i], darrayBuf[thrdID],loci_id[i],loci_count[i],curr_type[i]); outputGapSeq (fo2, scafCounter + i + 1, ctgStackBuffer[i], darrayBuf[thrdID]); } } static void MaskContig (unsigned int ctg) { contig_array[ctg].mask = 1; contig_array[getTwinCtg (ctg)].mask = 1; } static void MarkCtgOccu (unsigned int ctg) { contig_array[ctg].flag = 1; contig_array[getTwinCtg (ctg)].flag = 1; } static void output_ctg (unsigned int ctg, FILE * fo) { if (contig_array[ctg].length < 1) { return; } int len; unsigned int bal_ctg = getTwinCtg (ctg); len = contig_array[ctg].length + overlaplen; int col = 0; if (contig_array[ctg].seq) { fprintf (fo, ">C%d %4.1f\n", ctg, (double) contig_array[ctg].cvg); outputTightStr (fo, contig_array[ctg].seq, 0, len, len, 0, &col); } else if (contig_array[bal_ctg].seq) { fprintf (fo, ">C%d %4.1f\n", bal_ctg, (double) contig_array[ctg].cvg); outputTightStr (fo, contig_array[bal_ctg].seq, 0, len, len, 0, &col); } contig_array[ctg].flag = 1; contig_array[bal_ctg].flag = 1; fprintf (fo, "\n"); } void prlReadsCloseGap (char *graphfile) { //thrd_num=1; if (fillGap) { boolean flag; printf ("\nStart to load reads for gap filling. %d length discrepancy is allowed\n", GLDiff); printf ("...\n"); flag = loadReads4gap (graphfile); if (!flag) { return; } } if (orig2new) { convertIndex (); orig2new = 0; } FILE *fp, *fo, *fo2, *fo3, *foc; char line[1024]; CTGinSCAF *actg; STACK *ctgStack, *aStack; int index = 0, offset = 0, counter, overallLen; int i, starter, prev_start, gapLen, catchable; unsigned int ctg, prev_ctg = 0; boolean IsPrevGap; pthread_t threads[thrd_num]; unsigned char thrdSignal[thrd_num + 1]; PARAMETER paras[thrd_num]; for (ctg = 1; ctg <= num_ctg; ctg++) { contig_array[ctg].flag = 0; } MAXKMER = maxKmer (); ctgStack = (STACK *) createStack (1000, sizeof (CTGinSCAF)); sprintf (line, "%s.scaf_gap", graphfile); fp = ckopen (line, "r"); sprintf (line, "%s.scafSeq", graphfile); fo = ckopen (line, "w"); sprintf (line, "%s.contigPosInscaff", graphfile); foc = ckopen (line, "w"); sprintf (line, "%s.agp", graphfile); fo3 = ckopen (line, "w"); sprintf (line, "%s.gapSeq", graphfile); fo2 = ckopen (line, "w"); pthread_mutex_init (&mutex, NULL); flagBuf = (boolean *) ckalloc (scafBufSize * sizeof (boolean));; thrdNoBuf = (unsigned char *) ckalloc (scafBufSize * sizeof (unsigned char));; memset (thrdNoBuf, 0, scafBufSize * sizeof (char)); ctgStackBuffer = (STACK **) ckalloc (scafBufSize * sizeof (STACK *)); initStackBuf (ctgStackBuffer, scafBufSize); darrayBuf = (DARRAY **) ckalloc (thrd_num * sizeof (DARRAY *)); counters = (int *) ckalloc (thrd_num * sizeof (int)); loci_id=(unsigned int *)ckalloc(sizeof(unsigned int )*100000 ); loci_count=(unsigned int *)ckalloc(sizeof(unsigned int )*100000 ); for (i = 0; i < thrd_num; i++) { counters[i] = 0; darrayBuf[i] = (DARRAY *) createDarray (100000, sizeof (char)); thrdSignal[i + 1] = 0; paras[i].threadID = i; paras[i].mainSignal = &thrdSignal[0]; paras[i].selfSignal = &thrdSignal[i + 1]; } if (fillGap) { creatThrds (threads, paras); } Ncounter = scafCounter = scafInBuf = allGaps = 0; curr_type = (char ** ) ckalloc ( sizeof(char *)*scafBufSize); for(i=0;i') { if (index) { aStack = ctgStackBuffer[scafInBuf]; flagBuf[scafInBuf++] = 0; reverseStack (aStack, ctgStack); if (scafInBuf == scafBufSize) { if (fillGap) { sendWorkSignal (1, thrdSignal); } outputSeqs (fo, foc, fo2, fo3, scafInBuf); scafCounter += scafInBuf; scafInBuf = 0; } if (index % 1000 == 0) { printf ("Processed %d scaffolds\n", index); } } //read next scaff emptyStack (ctgStack); IsPrevGap = offset = prev_ctg = 0; sscanf (line + 9, "%d %d %d Locus_%u_%d %s\n", &index,&counter, &overallLen, &(loci_id[scafInBuf]), &(loci_count[scafInBuf]),curr_type[scafInBuf]); continue; } if (line[0] == 'G') // gap appears { if (fillGap) { gapLen = procGap (line, ctgStack); IsPrevGap = 1; } continue; } if (line[0] >= '0' && line[0] <= '9') // a contig line { sscanf (line, "%d %d", &ctg, &starter); actg = (CTGinSCAF *) stackPush (ctgStack); actg->ctgID = ctg; if (contig_array[ctg].flag) { MaskContig (ctg); } else { MarkCtgOccu (ctg); } initiateCtgInScaf (actg); if (!prev_ctg) { actg->cutHead = 0; } else if (!IsPrevGap) { allGaps++; } if (!IsPrevGap) { if (prev_ctg && (starter - prev_start - (int) contig_array[prev_ctg].length) < ((int) overlaplen * 4)) { /* if(fillGap) catchable = contigCatch(prev_ctg,ctg); else */ catchable = 0; if (catchable) // prev_ctg and ctg overlap **bp { allGaps--; /* if(isLargerThanTwin(prev_ctg)) fprintf(stderr,"%d ####### by_overlap\n",getTwinCtg(prev_ctg)); else fprintf(stderr,"%d ####### by_overlap\n",prev_ctg); */ actg->scaftig_start = 0; actg->cutHead = catchable; offset += -(starter - prev_start - contig_array[prev_ctg].length) + (overlaplen - catchable); } else { actg->scaftig_start = 1; } } else { actg->scaftig_start = 1; } } else { offset += -(starter - prev_start - contig_array[prev_ctg].length) + gapLen; actg->scaftig_start = 0; } actg->start = starter + offset; actg->end = actg->start + contig_array[ctg].length - 1; actg->mask = contig_array[ctg].mask; IsPrevGap = 0; prev_ctg = ctg; prev_start = starter; } } if (index) { aStack = ctgStackBuffer[scafInBuf]; flagBuf[scafInBuf++] = 0; reverseStack (aStack, ctgStack); if (fillGap) { sendWorkSignal (1, thrdSignal); } outputSeqs (fo, foc, fo2, fo3, scafInBuf); } if (fillGap) { sendWorkSignal (2, thrdSignal); thread_wait (threads); } for (ctg = 1; ctg <= num_ctg; ctg++) { if ((contig_array[ctg].length + overlaplen) < 100 || contig_array[ctg].flag) { continue; } output_ctg (ctg, fo); } printf ("Done with %d scaffolds, %d gaps finished, %d gaps overall\n", index, allGaps - Ncounter, allGaps); index = 0; for (i = 0; i < thrd_num; i++) { freeDarray (darrayBuf[i]); index += counters[i]; } if (fillGap) { printf ("Threads processed %d scaffolds\n", index); } free ((void *) darrayBuf); if (readSeqInGap) { freeDarray (readSeqInGap); } fclose (fp); fclose (fo); fclose (fo2); fclose (foc); freeStack (ctgStack); freeStackBuf (ctgStackBuffer, scafBufSize); for(i=0;i