/* * prlRead2Ctg.c * * Copyright (c) 2008-2016 Ruibang Luo . * * This file is part of SOAPdenovo. * * SOAPdenovo is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * SOAPdenovo is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with SOAPdenovo. If not, see . * */ #include "stdinc.h" #include "newhash.h" #include "kmerhash.h" #include "extfunc.h" #include "extvab.h" #include "zlib.h" static long long readsInGap = 0; static int buffer_size = 100000000; static long long readCounter; static long long mapCounter; static int ALIGNLEN = 0; //buffer related varibles for chop kmer static int read_c; static char **rcSeq; static char **seqBuffer; static int *lenBuffer; static unsigned int *ctgIdArray; static int *posArray; static char *orienArray; static char *footprint; // flag indicates whether the read shoulld leave markers on contigs static char **read_name; // kmer related variables static int kmer_c; static Kmer *kmerBuffer; static ubyte8 *hashBanBuffer; static kmer_t **nodeBuffer; static boolean *smallerBuffer; static unsigned int *indexArray; static int *insSizeArray; static int *deletion; static void parse1read ( int t ); static void threadRoutine ( void *thrdID ); static void searchKmer ( int t, KmerSet *kset ); static void chopKmer4read ( int t, int threadID ); static void thread_wait ( pthread_t *threads ); static void creatThrds ( pthread_t *threads, PARAMETER *paras ) { unsigned char i; int temp; for ( i = 0; i < thrd_num; i++ ) { if ( ( temp = pthread_create ( &threads[i], NULL, ( void * ) threadRoutine, & ( paras[i] ) ) ) != 0 ) { fprintf ( stderr, "Create threads failed.\n" ); exit ( 1 ); } } fprintf ( stderr, "%d thread(s) initialized.\n", thrd_num ); } static void threadRoutine ( void *para ) { PARAMETER *prm; int i, t; unsigned char id; prm = ( PARAMETER * ) para; id = prm->threadID; while ( 1 ) { if ( * ( prm->selfSignal ) == 1 ) { for ( i = 0; i < kmer_c; i++ ) { if ( ( hashBanBuffer[i] % thrd_num ) != id ) { continue; } searchKmer ( i, KmerSets[id] ); } * ( prm->selfSignal ) = 0; } else if ( * ( prm->selfSignal ) == 2 ) { for ( i = 0; i < read_c; i++ ) { if ( i % thrd_num != id ) { continue; } chopKmer4read ( i, id + 1 ); } * ( prm->selfSignal ) = 0; } else if ( * ( prm->selfSignal ) == 3 ) { // parse reads for ( t = 0; t < read_c; t++ ) { if ( t % thrd_num != id ) { continue; } parse1read ( t ); } * ( prm->selfSignal ) = 0; } else if ( * ( prm->selfSignal ) == 5 ) { * ( prm->selfSignal ) = 0; break; } usleep ( 1 ); } } /* static void chopReads() { int i; for(i=0;ideleted ) { nodeBuffer[t] = node; } else { nodeBuffer[t] = NULL; } } /************************************************* Function: parse1read Description: Aligns read to contig by choosing the contig on which the read has most kmers hits. Input: 1. t: read index Output: None. Return: None. *************************************************/ static void parse1read ( int t ) { unsigned int j, i, s; unsigned int contigID; int counter = 0, counter2 = 0; unsigned int ctgLen, pos = 0; kmer_t *node; boolean isSmaller; int flag, maxOcc = 0; kmer_t *maxNode = NULL; int alldgnLen = lenBuffer[t] > ALIGNLEN ? ALIGNLEN : lenBuffer[t]; int multi = alldgnLen - overlaplen + 1 < 2 ? 2 : alldgnLen - overlaplen + 1; unsigned int start, finish; footprint[t] = 0; start = indexArray[t]; finish = indexArray[t + 1]; if ( finish == start ) { ctgIdArray[t] = 0; return; } for ( j = start; j < finish; j++ ) { node = nodeBuffer[j]; if ( !node ) //same as previous { continue; } flag = 1; for ( s = j + 1; s < finish; s++ ) { if ( !nodeBuffer[s] ) { continue; } if ( get_kmer_contig_id(*nodeBuffer[s]) == get_kmer_contig_id(*node) ) { flag++; nodeBuffer[s] = NULL; } } if ( ( overlaplen < 32 && flag >= 2 ) || overlaplen > 32 ) { counter2++; } if ( flag >= multi ) { counter++; } else { continue; } if ( flag > maxOcc ) { pos = j; maxOcc = flag; maxNode = node; } } if ( !counter ) { ctgIdArray[t] = 0; return; } if ( counter2 > 1 ) { footprint[t] = 1; } j = pos; i = pos - start + 1; node = nodeBuffer[j]; isSmaller = smallerBuffer[j]; contigID = get_kmer_contig_id(*node); ctgLen = contig_array[contigID].length; pos = node->r_links; if ( node->twin == isSmaller ) { orienArray[t] = '-'; ctgIdArray[t] = getTwinCtg ( contigID ); posArray[t] = ctgLen - pos - overlaplen - i + 1; } else { orienArray[t] = '+'; ctgIdArray[t] = contigID; posArray[t] = pos - i + 1; } } static void sendWorkSignal ( unsigned char SIG, unsigned char *thrdSignals ) { int t; for ( t = 0; t < thrd_num; t++ ) { thrdSignals[t + 1] = SIG; } while ( 1 ) { usleep ( 10 ); for ( t = 0; t < thrd_num; t++ ) if ( thrdSignals[t + 1] ) { break; } if ( t == thrd_num ) { break; } } } static void locate1read ( int t ) { int i, j, start, finish; kmer_t *node; unsigned int contigID; int pos, ctgLen; boolean isSmaller; start = indexArray[t]; finish = indexArray[t + 1]; for ( j = start; j < finish; j++ ) { node = nodeBuffer[j]; if ( !node ) //same as previous { continue; } i = j - start + 1; isSmaller = smallerBuffer[j]; contigID = get_kmer_contig_id(*node); ctgLen = contig_array[contigID].length; pos = node->r_links; if ( node->twin == isSmaller ) { ctgIdArray[t] = getTwinCtg ( contigID ); posArray[t] = ctgLen - pos - overlaplen - i + 1; } else { ctgIdArray[t] = contigID; posArray[t] = pos - i + 1; } } } static void output1read_gz ( int t, gzFile *outfp, gzFile *outfp2, char orien, int dhflag ) { int len = lenBuffer[t]; int index; readsInGap++; for ( index = 0; index < len; index++ ) { writeChar2tightString ( seqBuffer[t][index], rcSeq[1], index ); } gzwrite ( outfp, &len, sizeof ( int ) ); gzwrite ( outfp, &ctgIdArray[t], sizeof ( int ) ); gzwrite ( outfp, &posArray[t], sizeof ( int ) ); gzwrite ( outfp, rcSeq[1], ( unsigned ) ( len / 4 + 1 ) ); if ( fill && insSizeArray[t] < 2000 && len > 0 ) { gzprintf ( outfp2, ">%d\t%d\t%d\t%c\t%d\t%d\n", len, ctgIdArray[t], posArray[t], orien, insSizeArray[t], dhflag ); for ( index = 0; index < len; index++ ) { gzprintf ( outfp2, "%c", int2base ( ( int ) seqBuffer[t][index] ) ); } gzprintf ( outfp2, "\n" ); } } static void output1read ( int t, FILE *outfp1, FILE *outfp2, char orien, int dhflag ) { int len = lenBuffer[t]; int index; readsInGap++; /* if(ctgIdArray[t]==735||ctgIdArray[t]==getTwinCtg(735)){ printf("%d\t%d\t%d\t",t+1,ctgIdArray[t],posArray[t]); int j; for(j=0;j 0 ) { fprintf ( outfp2, ">%d\t%d\t%d\t%c\t%d\t%d\n", len, ctgIdArray[t], posArray[t], orien, insSizeArray[t], dhflag ); for ( index = 0; index < len; index++ ) { fprintf ( outfp2, "%c", int2base ( ( int ) seqBuffer[t][index] ) ); } fprintf ( outfp2, "\n" ); } } static void output1Nread ( int t, FILE *outfp ) { int len = lenBuffer[t]; int index; fprintf ( outfp, "%d\t%d\t%d\n", lenBuffer[t], ctgIdArray[t], posArray[t] ); fprintf ( outfp, ">%s\n", read_name[t] ); for ( index = 0; index < len; index++ ) { fprintf ( outfp, "%c", int2base ( ( int ) seqBuffer[t][index] ) ); } fprintf ( outfp, "\n" ); } static void getPEreadOnContig ( int t, gzFile *outfp ) { int len1, len2, index; char orien1, orien2; len1 = lenBuffer[t - 1]; len2 = lenBuffer[t]; orien1 = orienArray[t - 1]; orien2 = orienArray[t]; if ( insSizeArray[t] < 2000 && insSizeArray[t] == insSizeArray[t - 1] ) { gzwrite ( outfp, &len1, sizeof ( int ) ); gzwrite ( outfp, &ctgIdArray[t - 1], sizeof ( int ) ); gzwrite ( outfp, &posArray[t - 1], sizeof ( int ) ); gzwrite ( outfp, &orien1, sizeof ( char ) ); gzwrite ( outfp, &insSizeArray[t - 1], sizeof ( int ) ); for ( index = 0; index < len1; index++ ) { writeChar2tightString ( seqBuffer[t - 1][index], rcSeq[1], index ); } gzwrite ( outfp, rcSeq[1], ( unsigned ) ( len1 / 4 + 1 ) ); gzwrite ( outfp, &len2, sizeof ( int ) ); gzwrite ( outfp, &ctgIdArray[t], sizeof ( int ) ); gzwrite ( outfp, &posArray[t], sizeof ( int ) ); gzwrite ( outfp, &orien2, sizeof ( char ) ); gzwrite ( outfp, &insSizeArray[t], sizeof ( int ) ); for ( index = 0; index < len2; index++ ) { writeChar2tightString ( seqBuffer[t][index], rcSeq[1], index ); } gzwrite ( outfp, rcSeq[1], ( unsigned ) ( len2 / 4 + 1 ) ); } } /* static void getPEreadOnContig(int t,FILE* outfp) { int len1,len2,index; char orien1,orien2; len1 = lenBuffer[t-1]; len2 = lenBuffer[t]; orien1 = orienArray[t-1]; orien2 = orienArray[t]; if(insSizeArray[t]<2000&&insSizeArray[t]==insSizeArray[t-1]){ fprintf(outfp,">%d\t%d\t%d\t%c\t%d\n",len1,ctgIdArray[t-1],posArray[t-1],orien1,insSizeArray[t-1]); for(index=0;index%d\t%d\t%d\t%c\t%d\n",len2,ctgIdArray[t],posArray[t],orien2,insSizeArray[t]); for(index=0;index R1 <-- R2 output1read_gz ( read2, outfp1, outfp2, orientation, 2 ); } } static void recordLongRead ( FILE *outfp1, FILE *outfp2 ) { int t; for ( t = 0; t < read_c; t++ ) { readCounter++; if ( footprint[t] ) { output1read ( t, outfp1, outfp2, orienArray[t], 0 ); } } } static void recordAlldgn ( gzFile *outfp, int *insSizeArr, gzFile *outfp1, gzFile *outfp2, gzFile *outfp4 ) { int t, ctgId; boolean rd1gap, rd2gap; char orientation; for ( t = 0; t < read_c; t++ ) { readCounter++; rd1gap = rd2gap = 0; ctgId = ctgIdArray[t]; if ( outfp1 && t % 2 == 1 ) //make sure this is read2 in a pair { if ( ctgIdArray[t] < 1 && ctgIdArray[t - 1] > 0 ) { getReadIngap ( t, insSizeArr[t], outfp1, outfp2, 0 ); rd2gap = 1; } else if ( ctgIdArray[t] > 0 && ctgIdArray[t - 1] < 1 ) { getReadIngap ( t - 1, insSizeArr[t - 1], outfp1, outfp2, 1 ); rd1gap = 1; } else if ( ctgIdArray[t] > 0 && ctgIdArray[t - 1] > 0 ) //PE read on contig { if ( fill ) { getPEreadOnContig ( t, outfp4 ); } } } if ( ctgId < 1 ) { continue; } mapCounter++; gzprintf ( outfp, "%lld\t%u\t%d\t%c\n", readCounter, ctgIdArray[t], posArray[t], orienArray[t] ); if ( t % 2 == 0 ) { continue; } // reads are not located by pe info but across edges if ( outfp1 && footprint[t - 1] && !rd1gap ) { if ( ctgIdArray[t - 1] < 1 ) { locate1read ( t - 1 ); } if ( orienArray[t] == '+' ) { orientation = '-'; } else { orientation = '+'; } output1read_gz ( t - 1, outfp1, outfp2, orientation, 1 ); //read1 in gap. } if ( outfp1 && footprint[t] && !rd2gap ) { if ( ctgIdArray[t] < 1 ) { locate1read ( t ); } if ( orienArray[t - 1] == '+' ) { orientation = '-'; } else { orientation = '+'; } output1read_gz ( t, outfp1, outfp2, orientation, 2 ); //read2 in gap. } } } /************************************************* Function: basicContigInfo Description: Loads contig index and length infromation. Input: 1. infile: the graph file prefix Output: None. Return: None. *************************************************/ void basicContigInfo ( char *infile ) { char name[256], lldne[1024]; FILE *fp; int length, bal_ed, num_all, num_long, index; sprintf ( name, "%s.ContigIndex", infile ); fp = ckopen ( name, "r" ); fgets ( lldne, sizeof ( lldne ), fp ); sscanf ( lldne + 8, "%d %d", &num_all, &num_long ); fprintf ( stderr, "%d edge(s) in the graph.\n", num_all ); num_ctg = num_all; contig_array = ( CONTIG * ) ckalloc ( ( num_all + 1 ) * sizeof ( CONTIG ) ); fgets ( lldne, sizeof ( lldne ), fp ); num_long = 0; while ( fgets ( lldne, sizeof ( lldne ), fp ) != NULL ) { sscanf ( lldne, "%d %d %d", &index, &length, &bal_ed ); contig_array[++num_long].length = length; contig_array[num_long].bal_edge = bal_ed + 1; if ( index != num_long ) { fprintf ( stderr, "BasicContigInfo: %d vs %d.\n", index, num_long ); } if ( bal_ed == 0 ) { continue; } contig_array[++num_long].length = length; contig_array[num_long].bal_edge = -bal_ed + 1; } fclose ( fp ); } /************************************************* Function: prlRead2Ctg Description: Maps reads to contigs one by one. Input: 1. libfile: the reads config file 2. outfile: the graph file prefix Output: None. Return: None. *************************************************/ void prlRead2Ctg ( char *libfile, char *outfile ) { long long i; char *src_name, *next_name, name[256]; FILE *fo2; gzFile *fo, *outfp1 = NULL, *outfp2 = NULL, *outfp3 = NULL, *outfp4 = NULL; int maxReadNum, libNo, prevLibNo, insSize = 0; boolean flag, pairs = 1; pthread_t threads[thrd_num]; unsigned char thrdSignal[thrd_num + 1]; PARAMETER paras[thrd_num]; //init maxReadLen = 0; maxNameLen = 256; scan_libInfo ( libfile ); alloc_pe_mem ( num_libs ); if ( !maxReadLen ) { maxReadLen = 100; } fprintf ( stderr, "In file: %s, max seq len %d, max name len %d\n", libfile, maxReadLen, maxNameLen ); if ( maxReadLen > maxReadLen4all ) { maxReadLen4all = maxReadLen; } src_name = ( char * ) ckalloc ( ( maxNameLen + 1 ) * sizeof ( char ) ); next_name = ( char * ) ckalloc ( ( maxNameLen + 1 ) * sizeof ( char ) ); kmerBuffer = ( Kmer * ) ckalloc ( buffer_size * sizeof ( Kmer ) ); hashBanBuffer = ( ubyte8 * ) ckalloc ( buffer_size * sizeof ( ubyte8 ) ); nodeBuffer = ( kmer_t ** ) ckalloc ( buffer_size * sizeof ( kmer_t * ) ); smallerBuffer = ( boolean * ) ckalloc ( buffer_size * sizeof ( boolean ) ); maxReadNum = buffer_size / ( maxReadLen - overlaplen + 1 ); maxReadNum = maxReadNum % 2 == 0 ? maxReadNum : maxReadNum - 1; //make sure paired reads are processed at the same batch seqBuffer = ( char ** ) ckalloc ( maxReadNum * sizeof ( char * ) ); lenBuffer = ( int * ) ckalloc ( maxReadNum * sizeof ( int ) ); indexArray = ( unsigned int * ) ckalloc ( ( maxReadNum + 1 ) * sizeof ( unsigned int ) ); ctgIdArray = ( unsigned int * ) ckalloc ( ( maxReadNum + 1 ) * sizeof ( unsigned int ) ); posArray = ( int * ) ckalloc ( ( maxReadNum + 1 ) * sizeof ( int ) ); orienArray = ( char * ) ckalloc ( ( maxReadNum + 1 ) * sizeof ( char ) ); footprint = ( char * ) ckalloc ( ( maxReadNum + 1 ) * sizeof ( char ) ); insSizeArray = ( int * ) ckalloc ( ( maxReadNum + 1 ) * sizeof ( int ) ); read_name = ( char ** ) ckalloc ( maxReadNum * sizeof ( char * ) ); if ( gLineLen < maxReadLen ) { gStr = ( char * ) ckalloc ( ( maxReadLen + 1 ) * sizeof ( char ) ); } for ( i = 0; i < maxReadNum; i++ ) { read_name[i] = ( char * ) ckalloc ( ( maxNameLen + 1 ) * sizeof ( char ) ); } for ( i = 0; i < maxReadNum; i++ ) { seqBuffer[i] = ( char * ) ckalloc ( maxReadLen * sizeof ( char ) ); } rcSeq = ( char ** ) ckalloc ( ( thrd_num + 1 ) * sizeof ( char * ) ); thrdSignal[0] = 0; if ( 1 ) { for ( i = 0; i < thrd_num; i++ ) { rcSeq[i + 1] = ( char * ) ckalloc ( maxReadLen * sizeof ( char ) ); thrdSignal[i + 1] = 0; paras[i].threadID = i; paras[i].mainSignal = &thrdSignal[0]; paras[i].selfSignal = &thrdSignal[i + 1]; } creatThrds ( threads, paras ); } if ( !contig_array ) { basicContigInfo ( outfile ); } sprintf ( name, "%s.readInGap.gz", outfile ); outfp1 = gzopen ( name, "wb" ); if ( fill ) { sprintf ( name, "%s.shortreadInGap.gz", outfile ); outfp2 = gzopen ( name, "w" ); } sprintf ( name, "%s.readOnContig.gz", outfile ); fo = gzopen ( name, "w" ); if ( fill ) { sprintf ( name, "%s.PEreadOnContig.gz", outfile ); outfp4 = gzopen ( name, "wb" ); } gzprintf ( fo, "read\tcontig\tpos\n" ); readCounter = mapCounter = readsInGap = 0; kmer_c = n_solexa = read_c = i = libNo = readNumBack = gradsCounter = 0; prevLibNo = -1; int type = 0; //decide whether the PE reads is good or bad while ( ( flag = read1seqInLib ( seqBuffer[read_c], read_name[read_c], & ( lenBuffer[read_c] ), &libNo, pairs, 0, &type ) ) != 0 ) { if ( type == -1 ) //if the reads is bad, go back. { i--; if ( lenBuffer[read_c - 1] >= overlaplen + 1 ) { kmer_c -= lenBuffer[read_c - 1] - overlaplen + 1; } read_c--; n_solexa -= 2; continue; } if ( libNo != prevLibNo ) { prevLibNo = libNo; insSize = lib_array[libNo].avg_ins; ALIGNLEN = lib_array[libNo].map_len; if ( insSize > 1000 ) { ALIGNLEN = ALIGNLEN < 35 ? 35 : ALIGNLEN; } else { ALIGNLEN = ALIGNLEN < 32 ? 32 : ALIGNLEN; } fprintf ( stderr, "Current insert size is %d, map_len is %d.\n", insSize, ALIGNLEN ); } insSizeArray[read_c] = insSize; if ( insSize > 1000 ) { ALIGNLEN = ALIGNLEN < ( lenBuffer[read_c] / 2 + 1 ) ? ( lenBuffer[read_c] / 2 + 1 ) : ALIGNLEN; } if ( ( ++i ) % 100000000 == 0 ) { fprintf ( stderr, "--- %lldth reads.\n", i ); } indexArray[read_c] = kmer_c; if ( lenBuffer[read_c] >= overlaplen + 1 ) { kmer_c += lenBuffer[read_c] - overlaplen + 1; } read_c++; if ( read_c == maxReadNum ) { indexArray[read_c] = kmer_c; sendWorkSignal ( 2, thrdSignal ); //chopKmer4read sendWorkSignal ( 1, thrdSignal ); //searchKmer sendWorkSignal ( 3, thrdSignal ); //parse1read recordAlldgn ( fo, insSizeArray, outfp1, outfp2, outfp4 ); kmer_c = 0; read_c = 0; } } if ( read_c ) { indexArray[read_c] = kmer_c; sendWorkSignal ( 2, thrdSignal ); //chopKmer4read sendWorkSignal ( 1, thrdSignal ); //searchKmer sendWorkSignal ( 3, thrdSignal ); //parse1read recordAlldgn ( fo, insSizeArray, outfp1, outfp2, outfp4 ); fprintf ( stderr, "\nTotal reads %lld\n", readCounter ); fprintf ( stderr, "Reads in gaps %lld\n", readsInGap ); fprintf ( stderr, "Ratio %.1f%%\n", ( float ) readsInGap / readCounter * 100 ); } fprintf ( stderr, "Reads on contigs %lld\n", mapCounter ); fprintf ( stderr, "Ratio %.1f%%\n", ( float ) mapCounter / readCounter * 100 ); sendWorkSignal ( 5, thrdSignal ); //stop threads thread_wait ( threads ); gzclose ( fo ); sprintf ( name, "%s.peGrads", outfile ); fo2 = ckopen ( name, "w" ); fprintf ( fo2, "grads&num: %d\t%lld\t%d\n", gradsCounter, n_solexa, maxReadLen4all ); if ( pairs ) { if ( gradsCounter ) { fprintf ( stderr, "%d pe insert size, the largest boundary is %lld.\n\n", gradsCounter, pes[gradsCounter - 1].PE_bound ); } else { fprintf ( stderr, "No paired reads found.\n" ); } for ( i = 0; i < gradsCounter; i++ ) { fprintf ( fo2, "%d\t%lld\t%d\t%d\n", pes[i].insertS, pes[i].PE_bound, pes[i].rank, pes[i].pair_num_cut ); } fclose ( fo2 ); } gzclose ( outfp1 ); if ( fill ) { gzclose ( outfp2 ); gzclose ( outfp4 ); } free_pe_mem (); free_libs (); if ( 1 ) // multi-threads { for ( i = 0; i < thrd_num; i++ ) { free ( ( void * ) rcSeq[i + 1] ); } } free ( ( void * ) rcSeq ); for ( i = 0; i < maxReadNum; i++ ) { free ( ( void * ) seqBuffer[i] ); } free ( ( void * ) seqBuffer ); free ( ( void * ) lenBuffer ); free ( ( void * ) indexArray ); for ( i = 0; i < maxReadNum; i++ ) { free ( ( void * ) read_name[i] ); } free ( ( void * ) read_name ); free ( ( void * ) kmerBuffer ); free ( ( void * ) smallerBuffer ); free ( ( void * ) hashBanBuffer ); free ( ( void * ) nodeBuffer ); free ( ( void * ) ctgIdArray ); free ( ( void * ) posArray ); free ( ( void * ) orienArray ); free ( ( void * ) footprint ); free ( ( void * ) insSizeArray ); free ( ( void * ) src_name ); free ( ( void * ) next_name ); if ( gLineLen < maxReadLen ) { free ( ( void * ) gStr ); gStr = NULL; } if ( contig_array ) { free ( ( void * ) contig_array ); contig_array = NULL; } } static void thread_wait ( pthread_t *threads ) { int i; for ( i = 0; i < thrd_num; i++ ) if ( threads[i] != 0 ) { pthread_join ( threads[i], NULL ); } } /************************************************* Function: prlLongRead2Ctg Description: Aligns long reads to contigs. Input: 1. libfile: the reads contig file 2. outfile: the output file prefix Output: None. Return: None. *************************************************/ void prlLongRead2Ctg ( char *libfile, char *outfile ) { long long i; char *src_name, *next_name, name[256]; FILE *outfp1, *outfp2, *outfp3; int maxReadNum, libNo, prevLibNo; boolean flag, pairs = 0; pthread_t threads[thrd_num]; unsigned char thrdSignal[thrd_num + 1]; PARAMETER paras[thrd_num]; maxReadLen = 0; maxNameLen = 256; scan_libInfo ( libfile ); if ( !maxReadLen ) { maxReadLen = 100; } int longReadLen = getMaxLongReadLen ( num_libs ); if ( longReadLen < 1 ) // no long reads { return; } maxReadLen4all = maxReadLen < longReadLen ? longReadLen : maxReadLen; fprintf ( stderr, "In file: %s, long read len %d, max name len %d.\n", libfile, longReadLen, maxNameLen ); maxReadLen = longReadLen; src_name = ( char * ) ckalloc ( ( maxNameLen + 1 ) * sizeof ( char ) ); next_name = ( char * ) ckalloc ( ( maxNameLen + 1 ) * sizeof ( char ) ); kmerBuffer = ( Kmer * ) ckalloc ( buffer_size * sizeof ( Kmer ) ); hashBanBuffer = ( ubyte8 * ) ckalloc ( buffer_size * sizeof ( ubyte8 ) ); nodeBuffer = ( kmer_t ** ) ckalloc ( buffer_size * sizeof ( kmer_t * ) ); smallerBuffer = ( boolean * ) ckalloc ( buffer_size * sizeof ( boolean ) ); maxReadNum = buffer_size / ( maxReadLen - overlaplen + 1 ); maxReadNum = maxReadNum % 2 == 0 ? maxReadNum : maxReadNum - 1; //make sure paired reads are processed at the same batch seqBuffer = ( char ** ) ckalloc ( maxReadNum * sizeof ( char * ) ); read_name = ( char ** ) ckalloc ( maxReadNum * sizeof ( char * ) ); for ( i = 0; i < maxReadNum; i++ ) { read_name[i] = ( char * ) ckalloc ( ( maxNameLen + 1 ) * sizeof ( char ) ); } lenBuffer = ( int * ) ckalloc ( maxReadNum * sizeof ( int ) ); indexArray = ( unsigned int * ) ckalloc ( ( maxReadNum + 1 ) * sizeof ( unsigned int ) ); ctgIdArray = ( unsigned int * ) ckalloc ( ( maxReadNum + 1 ) * sizeof ( unsigned int ) ); posArray = ( int * ) ckalloc ( ( maxReadNum + 1 ) * sizeof ( int ) ); orienArray = ( char * ) ckalloc ( ( maxReadNum + 1 ) * sizeof ( char ) ); footprint = ( char * ) ckalloc ( ( maxReadNum + 1 ) * sizeof ( char ) ); insSizeArray = ( int * ) ckalloc ( ( maxReadNum + 1 ) * sizeof ( int ) ); if ( gLineLen < maxReadLen ) { gStr = ( char * ) ckalloc ( ( maxReadLen + 1 ) * sizeof ( char ) ); } for ( i = 0; i < maxReadNum; i++ ) { seqBuffer[i] = ( char * ) ckalloc ( maxReadLen * sizeof ( char ) ); } rcSeq = ( char ** ) ckalloc ( ( thrd_num + 1 ) * sizeof ( char * ) ); deletion = ( int * ) ckalloc ( ( thrd_num + 1 ) * sizeof ( int ) ); thrdSignal[0] = 0; deletion[0] = 0; if ( 1 ) { for ( i = 0; i < thrd_num; i++ ) { rcSeq[i + 1] = ( char * ) ckalloc ( maxReadLen * sizeof ( char ) ); deletion[i + 1] = 0; thrdSignal[i + 1] = 0; paras[i].threadID = i; paras[i].mainSignal = &thrdSignal[0]; paras[i].selfSignal = &thrdSignal[i + 1]; } creatThrds ( threads, paras ); } if ( !contig_array ) { basicContigInfo ( outfile ); } sprintf ( name, "%s.longReadInGap", outfile ); outfp1 = ckopen ( name, "wb" ); if ( fill ) { sprintf ( name, "%s.RlongReadInGap", outfile ); outfp2 = ckopen ( name, "w" ); } readCounter = 0; kmer_c = n_solexa = read_c = i = libNo = 0; prevLibNo = -1; int type = 0; //decide whether the PE reads is good or bad while ( ( flag = read1seqInLib ( seqBuffer[read_c], read_name[read_c], & ( lenBuffer[read_c] ), &libNo, pairs, 4, &type ) ) != 0 ) { if ( type == -1 ) //if the reads is bad, go back. { i--; if ( lenBuffer[read_c - 1] >= overlaplen + 1 ) { kmer_c -= lenBuffer[read_c - 1] - overlaplen + 1; } read_c--; n_solexa -= 2; continue; } if ( libNo != prevLibNo ) { prevLibNo = libNo; ALIGNLEN = lib_array[libNo].map_len; ALIGNLEN = ALIGNLEN < 35 ? 35 : ALIGNLEN; fprintf ( stderr, "Map_len %d.\n", ALIGNLEN ); } insSizeArray[read_c] = 18; if ( ( ++i ) % 100000000 == 0 ) { fprintf ( stderr, "--- %lldth reads.\n", i ); } indexArray[read_c] = kmer_c; if ( lenBuffer[read_c] >= overlaplen + 1 ) { kmer_c += lenBuffer[read_c] - overlaplen + 1; } read_c++; if ( read_c == maxReadNum ) { indexArray[read_c] = kmer_c; sendWorkSignal ( 2, thrdSignal ); //chopKmer4read sendWorkSignal ( 1, thrdSignal ); //searchKmer sendWorkSignal ( 3, thrdSignal ); //parse1read recordLongRead ( outfp1, outfp2 ); kmer_c = 0; read_c = 0; } } if ( read_c ) { indexArray[read_c] = kmer_c; sendWorkSignal ( 2, thrdSignal ); //chopKmer4read sendWorkSignal ( 1, thrdSignal ); //searchKmer sendWorkSignal ( 3, thrdSignal ); //parse1read recordLongRead ( outfp1, outfp2 ); fprintf ( stderr, "Output %lld out of %lld (%.1f)%% reads in gaps.\n", readsInGap, readCounter, ( float ) readsInGap / readCounter * 100 ); } sendWorkSignal ( 5, thrdSignal ); //stop thread_wait ( threads ); fclose ( outfp1 ); if ( fill ) { fclose ( outfp2 ); } free_libs (); if ( 1 ) // multi-threads { for ( i = 0; i < thrd_num; i++ ) { deletion[0] += deletion[i + 1]; free ( ( void * ) rcSeq[i + 1] ); } } fprintf ( stderr, "%d reads deleted.\n", deletion[0] ); free ( ( void * ) rcSeq ); free ( ( void * ) deletion ); for ( i = 0; i < maxReadNum; i++ ) { free ( ( void * ) seqBuffer[i] ); } for ( i = 0; i < maxReadNum; i++ ) { free ( ( void * ) read_name[i] ); } free ( ( void * ) seqBuffer ); free ( ( void * ) lenBuffer ); free ( ( void * ) indexArray ); free ( ( void * ) kmerBuffer ); free ( ( void * ) smallerBuffer ); free ( ( void * ) hashBanBuffer ); free ( ( void * ) nodeBuffer ); free ( ( void * ) ctgIdArray ); free ( ( void * ) posArray ); free ( ( void * ) orienArray ); free ( ( void * ) footprint ); free ( ( void * ) insSizeArray ); free ( ( void * ) src_name ); free ( ( void * ) next_name ); if ( gLineLen < maxReadLen ) { free ( ( void * ) gStr ); gStr = NULL; } }