/* * read2scaf.c * * Copyright (c) 2008-2016 Ruibang Luo . * * This file is part of SOAPdenovo. * * SOAPdenovo is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * SOAPdenovo is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with SOAPdenovo. If not, see . * */ #include "stdinc.h" #include "newhash.h" #include "kmerhash.h" #include "extfunc.h" #include "extvab.h" static int Ncounter; static int allGaps; // for multi threads static int scafBufSize = 100; static STACK **ctgStackBuffer; static int scafCounter; static int scafInBuf; static void convertIndex () { int *length_array = ( int * ) ckalloc ( ( num_ctg + 1 ) * sizeof ( int ) ); unsigned int i; for ( i = 1; i <= num_ctg; i++ ) { length_array[i] = 0; } for ( i = 1; i <= num_ctg; i++ ) { if ( index_array[i] > 0 ) { length_array[index_array[i]] = i; } } for ( i = 1; i <= num_ctg; i++ ) { index_array[i] = length_array[i]; } //contig i with new index: index_array[i] free ( ( void * ) length_array ); } static void reverseStack ( STACK *dStack, STACK *sStack ) { CTGinSCAF *actg, *ctgPt; emptyStack ( dStack ); while ( ( actg = ( CTGinSCAF * ) stackPop ( sStack ) ) != NULL ) { ctgPt = ( CTGinSCAF * ) stackPush ( dStack ); ctgPt->ctgID = actg->ctgID; ctgPt->start = actg->start; ctgPt->end = actg->end; } stackBackup ( dStack ); } static void initStackBuf ( STACK **ctgStackBuffer, int scafBufSize ) { int i; for ( i = 0; i < scafBufSize; i++ ) { ctgStackBuffer[i] = ( STACK * ) createStack ( 100, sizeof ( CTGinSCAF ) ); } } static void freeStackBuf ( STACK **ctgStackBuffer, int scafBufSize ) { int i; for ( i = 0; i < scafBufSize; i++ ) { freeStack ( ctgStackBuffer[i] ); } } static void mapCtg2Scaf ( int scafInBuf ) { int i, scafID; CTGinSCAF *actg; STACK *ctgsStack; unsigned int ctg, bal_ctg; for ( i = 0; i < scafInBuf; i++ ) { scafID = scafCounter + i + 1; ctgsStack = ctgStackBuffer[i]; while ( ( actg = stackPop ( ctgsStack ) ) != NULL ) { ctg = actg->ctgID; bal_ctg = getTwinCtg ( ctg ); if ( contig_array[ctg].from_vt != 0 ) { contig_array[ctg].multi = 1; contig_array[bal_ctg].multi = 1; continue; } contig_array[ctg].from_vt = scafID; contig_array[ctg].to_vt = actg->start; contig_array[ctg].flag = 0; //ctg and scaf on the same strand contig_array[bal_ctg].from_vt = scafID; contig_array[bal_ctg].to_vt = actg->start; contig_array[bal_ctg].flag = 1; } } } static void locateContigOnscaff ( char *graphfile ) { FILE *fp; char line[1024]; CTGinSCAF *actg; STACK *ctgStack, *aStack; int index = 0, counter, overallLen; int starter, prev_start, gapN, scafLen; unsigned int ctg, prev_ctg = 0; for ( ctg = 1; ctg <= num_ctg; ctg++ ) { contig_array[ctg].from_vt = 0; contig_array[ctg].multi = 0; } ctgStack = ( STACK * ) createStack ( 1000, sizeof ( CTGinSCAF ) ); sprintf ( line, "%s.scaf_gap", graphfile ); fp = ckopen ( line, "r" ); ctgStackBuffer = ( STACK ** ) ckalloc ( scafBufSize * sizeof ( STACK * ) ); initStackBuf ( ctgStackBuffer, scafBufSize ); Ncounter = scafCounter = scafInBuf = allGaps = 0; while ( fgets ( line, sizeof ( line ), fp ) != NULL ) { if ( line[0] == '>' ) { if ( index ) { aStack = ctgStackBuffer[scafInBuf++]; reverseStack ( aStack, ctgStack ); if ( scafInBuf == scafBufSize ) { mapCtg2Scaf ( scafInBuf ); scafCounter += scafInBuf; scafInBuf = 0; } if ( index % 1000 == 0 ) { fprintf ( stderr, "Processed %d scaffolds.\n", index ); } } //read next scaff scafLen = prev_ctg = 0; emptyStack ( ctgStack ); sscanf ( line + 9, "%d %d %d", &index, &counter, &overallLen ); //fprintf(stderr,">%d\n",index); continue; } if ( line[0] == 'G' ) // gap appears { continue; } if ( line[0] >= '0' && line[0] <= '9' ) // a contig line { sscanf ( line, "%d %d", &ctg, &starter ); actg = ( CTGinSCAF * ) stackPush ( ctgStack ); actg->ctgID = ctg; if ( !prev_ctg ) { actg->start = scafLen; actg->end = actg->start + overlaplen + contig_array[ctg].length - 1; } else { gapN = starter - prev_start - ( int ) contig_array[prev_ctg].length; gapN = gapN < 1 ? 1 : gapN; actg->start = scafLen + gapN; actg->end = actg->start + contig_array[ctg].length - 1; } //fprintf(stderr,"%d\t%d\n",actg->start,actg->end); scafLen = actg->end + 1; prev_ctg = ctg; prev_start = starter; } } if ( index ) { aStack = ctgStackBuffer[scafInBuf++]; reverseStack ( aStack, ctgStack ); mapCtg2Scaf ( scafInBuf ); } gapN = 0; for ( ctg = 1; ctg <= num_ctg; ctg++ ) { if ( contig_array[ctg].from_vt == 0 || contig_array[ctg].multi == 1 ) { continue; } gapN++; } fprintf ( stderr, "\nDone with %d scaffolds, %d contigs in Scaffolld\n", index, gapN ); /* if(readSeqInGap) freeDarray(readSeqInGap); */ fclose ( fp ); freeStack ( ctgStack ); freeStackBuf ( ctgStackBuffer, scafBufSize ); free ( ( void * ) ctgStackBuffer ); } static boolean contigElligible ( unsigned int contigno ) { unsigned int ctg = index_array[contigno]; if ( contig_array[ctg].from_vt == 0 || contig_array[ctg].multi == 1 ) { return 0; } else { return 1; } } static void output1read ( FILE *fo, long long readno, unsigned int contigno, int pos ) { unsigned int ctg = index_array[contigno]; int posOnScaf; char orien; pos = pos < 0 ? 0 : pos; if ( contig_array[ctg].flag == 0 ) { posOnScaf = contig_array[ctg].to_vt + pos - overlaplen; orien = '+'; } else { posOnScaf = contig_array[ctg].to_vt + contig_array[ctg].length - pos; orien = '-'; } /* if(readno==676) printf("Read %lld in region from %d, extend %d, pos %d, orien %c\n", readno,contig_array[ctg].to_vt,contig_array[ctg].length,posOnScaf,orien); */ fprintf ( fo, "%lld\t%d\t%d\t%c\n", readno, contig_array[ctg].from_vt, posOnScaf, orien ); } void locateReadOnScaf ( char *graphfile ) { char name[1024], line[1024]; FILE *fp, *fo; long long readno, counter = 0, pre_readno = 0; unsigned int contigno, pre_contigno; int pre_pos, pos; locateContigOnscaff ( graphfile ); sprintf ( name, "%s.readOnContig", graphfile ); fp = ckopen ( name, "r" ); sprintf ( name, "%s.readOnScaf", graphfile ); fo = ckopen ( name, "w" ); if ( !orig2new ) { convertIndex (); orig2new = 1; } fgets ( line, 1024, fp ); while ( fgets ( line, 1024, fp ) != NULL ) { sscanf ( line, "%lld %d %d", &readno, &contigno, &pos ); if ( ( readno % 2 == 0 ) && ( pre_readno == readno - 1 ) // they are a pair of reads && contigElligible ( pre_contigno ) && contigElligible ( contigno ) ) { output1read ( fo, pre_readno, pre_contigno, pre_pos ); output1read ( fo, readno, contigno, pos ); counter++; } pre_readno = readno; pre_contigno = contigno; pre_pos = pos; } fprintf ( stderr, "%lld pair(s) on contig\n", counter ); fclose ( fp ); fclose ( fo ); }