/* * orderContig.c * * Copyright (c) 2011-2013 BGI-Shenzhen . * * This file is part of SOAPdenovo-Trans. * * SOAPdenovo-Trans is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * SOAPdenovo-Trans is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with SOAPdenovo-Trans. If not, see . * */ #include "stdinc.h" #include "newhash.h" #include "extfunc.h" #include "extvab.h" #include "dfibHeap.h" #include "fibHeap.h" #include "darray.h" #include "zlib.h" #define CNBLOCKSIZE 10000 #define MAXC 10000 #define MAXCinBetween 200 #define MaxNodeInSub 10000 #define GapLowerBound -2000 #define GapUpperBound 300000 static boolean static_f = 0; static double OverlapPercent = 0.05; static double ConflPercent = 0.05; static int gapCounter; static int orienCounter; static int throughCounter; static DARRAY *solidArray; static DARRAY *tempArray; static int solidCounter; static CTGinHEAP ctg4heapArray[MaxNodeInSub + 1]; // index in this array are put to heaps, start from 1 static unsigned int nodesInSub[MaxNodeInSub]; static int nodeDistance[MaxNodeInSub]; static int nodeCounter; static unsigned int nodesInSubInOrder[MaxNodeInSub]; static int nodeDistanceInOrder[MaxNodeInSub]; static DARRAY *scaf3, *scaf5; static DARRAY *gap3, *gap5; static unsigned int downstreamCTG[MAXCinBetween]; static unsigned int upstreamCTG[MAXCinBetween]; static int dsCtgCounter; static int usCtgCounter; static CONNECT *checkConnect (unsigned int from_c, unsigned int to_c); static int maskPuzzle (int num_connect, unsigned int contigLen); static void freezing (); static boolean checkOverlapInBetween (double tolerance); static int setConnectDelete (unsigned int from_c, unsigned int to_c, char flag, boolean cleanBinding); static int setConnectWP (unsigned int from_c, unsigned int to_c, char flag); static void general_linearization (boolean strict); static void debugging2 (); static void smallScaf (); static void detectBreakScaf (); static boolean checkSimple (DARRAY * ctgArray, int count); static void checkCircle (); //find the only connection involved in connection binding static CONNECT *getBindCnt (unsigned int ctg) { CONNECT *ite_cnt; CONNECT *bindCnt = NULL; CONNECT *temp_cnt = NULL; CONNECT *temp3_cnt = NULL; int count = 0; int count2 = 0; int count3 = 0; ite_cnt = contig_array[ctg].downwardConnect; while (ite_cnt) { if (ite_cnt->nextInScaf) { count++; bindCnt = ite_cnt; } if (ite_cnt->prevInScaf) { temp_cnt = ite_cnt; count2++; } if (ite_cnt->singleInScaf) { temp3_cnt = ite_cnt; count3++; } ite_cnt = ite_cnt->next; } if (count == 1) { return bindCnt; } if (count == 0 && count2 == 1) { return temp_cnt; } if (count == 0 && count2 == 0 && count3 == 1) { return temp3_cnt; } return NULL; } static void createAnalogousCnt (unsigned int sourceStart, CONNECT * originCnt, int gap, unsigned int targetStart, unsigned int targetStop) { CONNECT *temp_cnt; unsigned int balTargetStart = getTwinCtg (targetStart); unsigned int balTargetStop = getTwinCtg (targetStop); unsigned int balSourceStart = getTwinCtg (sourceStart); unsigned int balSourceStop = getTwinCtg (originCnt->contigID); originCnt->deleted = 1; temp_cnt = getCntBetween (balSourceStop, balSourceStart); temp_cnt->deleted = 1; if (gap < GapLowerBound) { gapCounter++; return; } temp_cnt = add1Connect (targetStart, targetStop, gap, originCnt->weight, 1); if (temp_cnt) { temp_cnt->inherit = 1; } temp_cnt = add1Connect (balTargetStop, balTargetStart, gap, originCnt->weight, 1); if (temp_cnt) { temp_cnt->inherit = 1; } } // increase #long_pe_support for a conncet by 1 static void add1LongPEcov (unsigned int fromCtg, unsigned int toCtg, int weight) { //check if they are on the same scaff if (contig_array[fromCtg].from_vt != contig_array[toCtg].from_vt || contig_array[fromCtg].to_vt != contig_array[toCtg].to_vt) { printf ("Warning from add1LongPEcov: contig %d and %d not on the same scaffold\n", fromCtg, toCtg); return; } if (contig_array[fromCtg].indexInScaf >= contig_array[toCtg].indexInScaf) { printf ("Warning from add1LongPEcov: wrong about order between contig %d and %d\n", fromCtg, toCtg); return; } CONNECT *bindCnt; unsigned int prevCtg = fromCtg; bindCnt = getBindCnt (fromCtg); while (bindCnt) { if (bindCnt->maxGap + weight <= 1000) { bindCnt->maxGap += weight; } else { bindCnt->maxGap = 1000; } if (fromCtg == 0 && toCtg == 0) printf ("link (%d %d ) covered by link (%d %d), wt %d\n", prevCtg, bindCnt->contigID, fromCtg, toCtg, weight); if (bindCnt->contigID == toCtg) { break; } prevCtg = bindCnt->contigID; bindCnt = bindCnt->nextInScaf; } unsigned int bal_fc = getTwinCtg (fromCtg); unsigned int bal_tc = getTwinCtg (toCtg); bindCnt = getBindCnt (bal_tc); prevCtg = bal_tc; while (bindCnt) { if (bindCnt->maxGap + weight <= 1000) { bindCnt->maxGap += weight; } else { bindCnt->maxGap = 1000; } if (fromCtg == 0 && toCtg == 0) printf ("link (%d %d ) covered by link (%d %d), wt %d\n", prevCtg, bindCnt->contigID, fromCtg, toCtg, weight); if (bindCnt->contigID == bal_fc) { return; } prevCtg = bindCnt->contigID; bindCnt = bindCnt->nextInScaf; } printf ("Warning from add1LongPEcov: not reach the end (%d %d) (B)\n", bal_tc, bal_fc); } // for long pair ends, move the connections along scaffolds established by shorter pair ends till reach the ends static void downSlide () { int len = 0, gap; unsigned int i; CONNECT *ite_cnt, *bindCnt, *temp_cnt; unsigned int bottomCtg, topCtg, bal_i; unsigned int targetCtg, bal_target; boolean getThrough, orienConflict; int slideLen, slideLen2; orienCounter = throughCounter = 0; for (i = 1; i <= num_ctg; i++) { if (contig_array[i].mask || !contig_array[i].downwardConnect) { continue; } bindCnt = getBindCnt (i); if (!bindCnt) { continue; } bal_i = getTwinCtg (i); len = slideLen = 0; bottomCtg = i; //find the last unmasked contig in this binding while (bindCnt->nextInScaf) { len += bindCnt->gapLen + contig_array[bindCnt->contigID].length; if (contig_array[bindCnt->contigID].mask == 0) { bottomCtg = bindCnt->contigID; slideLen = len; } bindCnt = bindCnt->nextInScaf; } len += bindCnt->gapLen + contig_array[bindCnt->contigID].length; if (contig_array[bindCnt->contigID].mask == 0 || bottomCtg == 0) { bottomCtg = bindCnt->contigID; slideLen = len; } //check each connetion from long pair ends ite_cnt = contig_array[i].downwardConnect; while (ite_cnt) { if (ite_cnt->deleted || ite_cnt->mask || ite_cnt->singleInScaf || ite_cnt->nextInScaf || ite_cnt->prevInScaf || ite_cnt->inherit) { ite_cnt = ite_cnt->next; continue; } targetCtg = ite_cnt->contigID; if (contig_array[i].from_vt == contig_array[targetCtg].from_vt) // on the same scaff { if (contig_array[i].indexInScaf > contig_array[targetCtg].indexInScaf) { orienCounter++; } else { throughCounter++; } setConnectDelete (i, ite_cnt->contigID, 1, 0); ite_cnt = ite_cnt->next; continue; } //check if this connection conflicts with previous scaffold orientationally temp_cnt = getBindCnt (targetCtg); orienConflict = 0; if (temp_cnt) { while (temp_cnt->nextInScaf) { if (temp_cnt->contigID == i) { orienConflict = 1; printf ("Warning from downSlide: still on the same scaff: %d and %d\n", i, targetCtg); printf ("on scaff %d and %d\n", contig_array[i].from_vt, contig_array[targetCtg].from_vt); printf ("on bal_scaff %d and %d\n", contig_array[bal_target].to_vt, contig_array[bal_i].to_vt); break; } temp_cnt = temp_cnt->nextInScaf; } if (temp_cnt->contigID == i) { orienConflict = 1; } } if (orienConflict) { orienCounter++; setConnectDelete (i, ite_cnt->contigID, 1, 0); ite_cnt = ite_cnt->next; continue; } //find the most top contig along previous scaffold starting with the target contig of this connection bal_target = getTwinCtg (targetCtg); slideLen2 = 0; if (contig_array[targetCtg].mask == 0) { topCtg = bal_target; } else { topCtg = 0; } temp_cnt = getBindCnt (bal_target); getThrough = len = 0; if (temp_cnt) { //find the last contig in this binding while (temp_cnt->nextInScaf) { //check if this route reaches bal_i if (temp_cnt->contigID == bal_i) { printf ("Warning from downSlide: (B) still on the same scaff: %d and %d (%d and %d)\n", i, targetCtg, bal_target, bal_i); printf ("on scaff %d and %d\n", contig_array[i].from_vt, contig_array[targetCtg].from_vt); printf ("on bal_scaff %d and %d\n", contig_array[bal_target].to_vt, contig_array[bal_i].to_vt); getThrough = 1; break; } len += temp_cnt->gapLen + contig_array[temp_cnt->contigID].length; if (contig_array[temp_cnt->contigID].mask == 0) { topCtg = temp_cnt->contigID; slideLen2 = len; } temp_cnt = temp_cnt->nextInScaf; } len += temp_cnt->gapLen + contig_array[temp_cnt->contigID].length; if (contig_array[temp_cnt->contigID].mask == 0 || topCtg == 0) { topCtg = temp_cnt->contigID; slideLen2 = len; } if (temp_cnt->contigID == bal_i) { getThrough = 1; } else { topCtg = getTwinCtg (topCtg); } } else { topCtg = targetCtg; } if (getThrough) { throughCounter++; setConnectDelete (i, ite_cnt->contigID, 1, 0); ite_cnt = ite_cnt->next; continue; } //add a connection between bottomCtg and topCtg gap = ite_cnt->gapLen - slideLen - slideLen2; if (bottomCtg != topCtg && !(i == bottomCtg && targetCtg == topCtg)) { createAnalogousCnt (i, ite_cnt, gap, bottomCtg, topCtg); if (contig_array[bottomCtg].mask || contig_array[topCtg].mask) { printf ("downSlide to masked contig\n"); } } ite_cnt = ite_cnt->next; } //for each connect } // for each contig printf ("downSliding is done...orienConflict %d, fall inside %d\n", orienCounter, throughCounter); } static boolean setNextInScaf (CONNECT * cnt, CONNECT * nextCnt) { if (!cnt) { printf ("setNextInScaf: empty pointer\n"); return 0; } if (!nextCnt) { cnt->nextInScaf = nextCnt; return 1; } if (cnt->mask || cnt->deleted) { printf ("setNextInScaf: cnt is masked or deleted\n"); return 0; } if (nextCnt->deleted || nextCnt->mask) { printf ("setNextInScaf: nextCnt is masked or deleted\n"); return 0; } cnt->nextInScaf = nextCnt; return 1; } static boolean setPrevInScaf (CONNECT * cnt, boolean flag) { if (!cnt) { printf ("setPrevInScaf: empty pointer\n"); return 0; } if (!flag) { cnt->prevInScaf = flag; return 1; } if (cnt->mask || cnt->deleted) { printf ("setPrevInScaf: cnt is masked or deleted\n"); return 0; } cnt->prevInScaf = flag; return 1; } /* connect A is upstream to B, replace A with C from_c > branch_c - to_c from_c_new */ static void substitueUSinScaf (CONNECT * origin, unsigned int from_c_new) { if (!origin || !origin->nextInScaf) { return; } unsigned int branch_c, to_c; unsigned int bal_branch_c, bal_to_c; unsigned int bal_from_c_new = getTwinCtg (from_c_new); CONNECT *bal_origin, *bal_nextCNT, *prevCNT, *bal_prevCNT; branch_c = origin->contigID; to_c = origin->nextInScaf->contigID; bal_branch_c = getTwinCtg (branch_c); bal_to_c = getTwinCtg (to_c); prevCNT = checkConnect (from_c_new, branch_c); bal_nextCNT = checkConnect (bal_to_c, bal_branch_c); if (!bal_nextCNT) { printf ("substitueUSinScaf: no connect between %d and %d\n", bal_to_c, bal_branch_c); return; } bal_origin = bal_nextCNT->nextInScaf; bal_prevCNT = checkConnect (bal_branch_c, bal_from_c_new); setPrevInScaf (bal_nextCNT->nextInScaf, 0); setNextInScaf (prevCNT, origin->nextInScaf); setNextInScaf (bal_nextCNT, bal_prevCNT); setPrevInScaf (bal_prevCNT, 1); setNextInScaf (origin, NULL); setPrevInScaf (bal_origin, 0); } /* connect B is downstream to C, replace B with A to_c from_c - branch_c < to_c_new */ static void substitueDSinScaf (CONNECT * origin, unsigned int branch_c, unsigned int to_c_new) { if (!origin || !origin->prevInScaf) { return; } unsigned int to_c; unsigned int bal_branch_c, bal_to_c, bal_to_c_new; unsigned int from_c, bal_from_c; CONNECT *bal_origin, *prevCNT, *bal_prevCNT; CONNECT *nextCNT, *bal_nextCNT; to_c = origin->contigID; bal_branch_c = getTwinCtg (branch_c); bal_to_c = getTwinCtg (to_c); bal_origin = getCntBetween (bal_to_c, bal_branch_c); if (!bal_origin) { printf ("substitueDSinScaf: no connect between %d and %d\n", bal_to_c, bal_branch_c); return; } bal_from_c = bal_origin->nextInScaf->contigID; from_c = getTwinCtg (bal_from_c); bal_to_c_new = getTwinCtg (to_c_new); prevCNT = checkConnect (from_c, branch_c); nextCNT = checkConnect (branch_c, to_c_new); setNextInScaf (prevCNT, nextCNT); setPrevInScaf (nextCNT, 1); bal_nextCNT = checkConnect (bal_to_c_new, bal_branch_c); bal_prevCNT = checkConnect (bal_branch_c, bal_from_c); setNextInScaf (bal_nextCNT, bal_prevCNT); setPrevInScaf (origin, 0); setNextInScaf (bal_origin, NULL); } static int validConnect (unsigned int ctg, CONNECT * preCNT) { if (preCNT && preCNT->nextInScaf) { return 1; } CONNECT *cn_temp; int count = 0; if (!contig_array[ctg].downwardConnect) { return count; } cn_temp = contig_array[ctg].downwardConnect; while (cn_temp) { if (!cn_temp->deleted && !cn_temp->mask) { count++; } cn_temp = cn_temp->next; } return count; } static CONNECT *getNextContig (unsigned int ctg, CONNECT * preCNT, boolean * exception) { CONNECT *cn_temp, *retCNT = NULL; int count = 0, valid_in; unsigned int nextCtg, bal_ctg; *exception = 0; if (preCNT && preCNT->nextInScaf) { if (preCNT->contigID != ctg) { printf ("pre cnt does not lead to %d\n", ctg); } nextCtg = preCNT->nextInScaf->contigID; cn_temp = getCntBetween (ctg, nextCtg); if (cn_temp && (cn_temp->mask || cn_temp->deleted)) { printf ("getNextContig: arc(%d %d) twin (%d %d) with mask %d deleted %d\n", ctg, nextCtg, getTwinCtg (nextCtg), getTwinCtg (ctg), cn_temp->mask, cn_temp->deleted); if (!cn_temp->prevInScaf) { printf ("not even has a prevInScaf\n"); } cn_temp = getCntBetween (getTwinCtg (nextCtg), getTwinCtg (ctg)); if (!cn_temp->nextInScaf) { printf ("its twin cnt not has a nextInScaf\n"); } fflush (stdout); *exception = 1; } else { return preCNT->nextInScaf; } } bal_ctg = getTwinCtg (ctg); valid_in = validConnect (bal_ctg, NULL); if (valid_in > 1) { return NULL; } if (!contig_array[ctg].downwardConnect) { return NULL; } cn_temp = contig_array[ctg].downwardConnect; while (cn_temp) { if (cn_temp->mask || cn_temp->deleted) { cn_temp = cn_temp->next; continue; } count++; if (count == 1) { retCNT = cn_temp; } else if (count == 2) { return NULL; } cn_temp = cn_temp->next; } return retCNT; } // get the valid connect between 2 given ctgs static CONNECT *checkConnect (unsigned int from_c, unsigned int to_c) { CONNECT *cn_temp = getCntBetween (from_c, to_c); if (!cn_temp) { return NULL; } if (!cn_temp->mask && !cn_temp->deleted) { return cn_temp; } return NULL; } static int setConnectMask (unsigned int from_c, unsigned int to_c, char mask) { CONNECT *cn_temp, *cn_bal, *cn_ds, *cn_us; unsigned int bal_fc = getTwinCtg (from_c); unsigned int bal_tc = getTwinCtg (to_c); unsigned int ctg3, bal_ctg3; cn_temp = getCntBetween (from_c, to_c); cn_bal = getCntBetween (bal_tc, bal_fc); if (!cn_temp || !cn_bal) { return 0; } cn_temp->mask = mask; cn_bal->mask = mask; if (!mask) { return 1; } if (cn_temp->nextInScaf) //undo the binding { setPrevInScaf (cn_temp->nextInScaf, 0); ctg3 = cn_temp->nextInScaf->contigID; setNextInScaf (cn_temp, NULL); bal_ctg3 = getTwinCtg (ctg3); cn_ds = getCntBetween (bal_ctg3, bal_tc); setNextInScaf (cn_ds, NULL); setPrevInScaf (cn_bal, 0); } // ctg3 -> from_c -> to_c // bal_ctg3 <- bal_fc <- bal_tc if (cn_bal->nextInScaf) { setPrevInScaf (cn_bal->nextInScaf, 0); bal_ctg3 = cn_bal->nextInScaf->contigID; setNextInScaf (cn_bal, NULL); ctg3 = getTwinCtg (bal_ctg3); cn_us = getCntBetween (ctg3, from_c); setNextInScaf (cn_us, NULL); setPrevInScaf (cn_temp, 0); } return 1; } static boolean setConnectUsed (unsigned int from_c, unsigned int to_c, char flag) { CONNECT *cn_temp, *cn_bal; unsigned int bal_fc = getTwinCtg (from_c); unsigned int bal_tc = getTwinCtg (to_c); cn_temp = getCntBetween (from_c, to_c); cn_bal = getCntBetween (bal_tc, bal_fc); if (!cn_temp || !cn_bal) { return 0; } cn_temp->used = flag; cn_bal->used = flag; return 1; } static int setConnectWP (unsigned int from_c, unsigned int to_c, char flag) { CONNECT *cn_temp, *cn_bal; unsigned int bal_fc = getTwinCtg (from_c); unsigned int bal_tc = getTwinCtg (to_c); cn_temp = getCntBetween (from_c, to_c); cn_bal = getCntBetween (bal_tc, bal_fc); if (!cn_temp || !cn_bal) { return 0; } cn_temp->weakPoint = flag; cn_bal->weakPoint = flag; //fprintf(stderr,"contig %d and %d, weakPoint %d\n",from_c,to_c,cn_temp->weakPoint); //fprintf(stderr,"contig %d and %d, weakPoint %d\n",bal_tc,bal_fc,cn_bal->weakPoint); return 1; } static int setConnectDelete (unsigned int from_c, unsigned int to_c, char flag, boolean cleanBinding) { CONNECT *cn_temp, *cn_bal; unsigned int bal_fc = getTwinCtg (from_c); unsigned int bal_tc = getTwinCtg (to_c); cn_temp = getCntBetween (from_c, to_c); cn_bal = getCntBetween (bal_tc, bal_fc); if (!cn_temp || !cn_bal) { return 0; } cn_temp->deleted = flag; cn_bal->deleted = flag; if (!flag) { return 1; } if (cleanBinding) { cn_temp->prevInScaf = 0; cn_temp->nextInScaf = NULL; cn_bal->prevInScaf = 0; cn_bal->nextInScaf = NULL; } return 1; } static void maskContig (unsigned int ctg, boolean flag) { unsigned int bal_ctg, ctg2, bal_ctg2; CONNECT *cn_temp; bal_ctg = getTwinCtg (ctg); cn_temp = contig_array[ctg].downwardConnect; while (cn_temp) { if (cn_temp->mask || cn_temp->prevInScaf || cn_temp->nextInScaf || cn_temp->singleInScaf) { cn_temp = cn_temp->next; continue; } ctg2 = cn_temp->contigID; setConnectMask (ctg, ctg2, flag); cn_temp = cn_temp->next; } // bal_ctg2 <- bal_ctg cn_temp = contig_array[bal_ctg].downwardConnect; while (cn_temp) { if (cn_temp->mask || cn_temp->prevInScaf || cn_temp->nextInScaf || cn_temp->singleInScaf) { cn_temp = cn_temp->next; continue; } bal_ctg2 = cn_temp->contigID; setConnectMask (bal_ctg, bal_ctg2, flag); cn_temp = cn_temp->next; } contig_array[ctg].mask = flag; contig_array[bal_ctg].mask = flag; } static int maskPuzzle (int num_connect, unsigned int contigLen) { int in_num, out_num, flag = 0, puzzleCounter = 0; unsigned int i, bal_i; for (i = 1; i <= num_ctg; i++) { if (contigLen && contig_array[i].length > contigLen) { break; } if (contig_array[i].mask) { continue; } bal_i = getTwinCtg (i); in_num = validConnect (bal_i, NULL); out_num = validConnect (i, NULL); if ((in_num > 1 || out_num > 1) && (in_num + out_num >= num_connect)) { flag++; maskContig (i, 1); } in_num = validConnect (bal_i, NULL); out_num = validConnect (i, NULL); if (in_num > 1 || out_num > 1) { puzzleCounter++; //debugging2(i); } if (isSmallerThanTwin (i)) { i++; } } printf ("Masked %d contigs, %d puzzle left\n", flag, puzzleCounter); return flag; } static void deleteWeakCnt (int cut_off) { unsigned int i; CONNECT *cn_temp1; int weaks = 0, counter = 0; for (i = 1; i <= num_ctg; i++) { cn_temp1 = contig_array[i].downwardConnect; while (cn_temp1) { if (!cn_temp1->mask && !cn_temp1->deleted && !cn_temp1->nextInScaf && !cn_temp1->singleInScaf && !cn_temp1->prevInScaf) { counter++; } if (cn_temp1->weak && cn_temp1->deleted && cn_temp1->weight >= cut_off) { cn_temp1->deleted = 0; cn_temp1->weak = 0; } else if (!cn_temp1->deleted && cn_temp1->weight > 0 && cn_temp1->weight < cut_off && !cn_temp1->nextInScaf && !cn_temp1->prevInScaf) { cn_temp1->deleted = 1; cn_temp1->weak = 1; if (cn_temp1->singleInScaf) { cn_temp1->singleInScaf = 0; } if (!cn_temp1->mask) { weaks++; } } cn_temp1 = cn_temp1->next; } } printf ("%d weak connects removed (there were %d active cnnects))\n", weaks, counter); checkCircle (); } //check if one contig is linearly connected to the other ->C1->C2... static int linearC2C (unsigned int starter, CONNECT * cnt2c1, unsigned int c2, int min_dis, int max_dis) { int out_num, in_num; CONNECT *prevCNT, *cnt, *cn_temp; unsigned int c1, bal_c1, ctg, bal_c2; int len = 0; unsigned int bal_start = getTwinCtg (starter); boolean excep; c1 = cnt2c1->contigID; if (c1 == c2) { printf ("linearC2C: c1(%d) and c2(%d) are the same contig\n", c1, c2); return -1; } bal_c1 = getTwinCtg (c1); in_num = validConnect (bal_c1, NULL); if (in_num > 1) { return 0; } dsCtgCounter = 1; usCtgCounter = 0; downstreamCTG[dsCtgCounter++] = c1; bal_c2 = getTwinCtg (c2); upstreamCTG[usCtgCounter++] = bal_c2; // check if c1 is linearly connected to c2 by pe connections cnt = prevCNT = cnt2c1; while ((cnt = getNextContig (c1, prevCNT, &excep)) != NULL) { c1 = cnt->contigID; len += cnt->gapLen + contig_array[c1].length; if (c1 == c2) { return 1; } if (len > max_dis || c1 == starter || c1 == bal_start) { return 0; } downstreamCTG[dsCtgCounter++] = c1; if (dsCtgCounter >= MAXCinBetween) { printf ("%d downstream contigs, start at %d, max_dis %d, current dis %d\n", dsCtgCounter, starter, max_dis, len); return 0; } prevCNT = cnt; } out_num = validConnect (c1, NULL); if (out_num) { return 0; } //find the most upstream contig to c2 cnt = prevCNT = NULL; ctg = bal_c2; while ((cnt = getNextContig (ctg, prevCNT, &excep)) != NULL) { ctg = cnt->contigID; len += cnt->gapLen + contig_array[ctg].length; if (len > max_dis || ctg == starter || ctg == bal_start) { return 0; } prevCNT = cnt; upstreamCTG[usCtgCounter++] = ctg; if (usCtgCounter >= MAXCinBetween) { printf ("%d upstream contigs, start at %d, max_dis %d, current dis %d\n", usCtgCounter, starter, max_dis, len); return 0; } } if (dsCtgCounter + usCtgCounter > MAXCinBetween) { printf ("%d downstream and %d upstream contigs\n", dsCtgCounter, usCtgCounter); return 0; } out_num = validConnect (ctg, NULL); if (out_num) { return 0; } c2 = getTwinCtg (ctg); min_dis -= len; max_dis -= len; if (c1 == c2 || c1 == ctg || max_dis < 0) { return 0; } cn_temp = getCntBetween (c1, c2); if (cn_temp) { setConnectMask (c1, c2, 0); setConnectDelete (c1, c2, 0, 0); return 1; } len = (min_dis + max_dis) / 2 >= 0 ? (min_dis + max_dis) / 2 : 0; cn_temp = allocateCN (c2, len); if (cntLookupTable) { putCnt2LookupTable (c1, cn_temp); } cn_temp->weight = 0; // special connect from the original graph cn_temp->next = contig_array[c1].downwardConnect; contig_array[c1].downwardConnect = cn_temp; bal_c1 = getTwinCtg (c1); bal_c2 = getTwinCtg (c2); cn_temp = allocateCN (bal_c1, len); if (cntLookupTable) { putCnt2LookupTable (bal_c2, cn_temp); } cn_temp->weight = 0; // special connect from the original graph cn_temp->next = contig_array[bal_c2].downwardConnect; contig_array[bal_c2].downwardConnect = cn_temp; return 1; } //catenate upstream contig array and downstream contig array to solidArray static void catUsDsContig () { int i; for (i = 0; i < dsCtgCounter; i++) { *(unsigned int *) darrayPut (solidArray, i) = downstreamCTG[i]; } for (i = usCtgCounter - 1; i >= 0; i--) { *(unsigned int *) darrayPut (solidArray, dsCtgCounter++) = getTwinCtg (upstreamCTG[i]); } solidCounter = dsCtgCounter; } //binding the connections between contigs in solidArray static void consolidate () { int i, j; CONNECT *prevCNT = NULL; CONNECT *cnt; unsigned int to_ctg; unsigned int from_ctg = *(unsigned int *) darrayGet (solidArray, 0); for (i = 1; i < solidCounter; i++) { to_ctg = *(unsigned int *) darrayGet (solidArray, i); cnt = checkConnect (from_ctg, to_ctg); if (!cnt) { printf ("consolidate A: no connect from %d to %d\n", from_ctg, to_ctg); for (j = 0; j < solidCounter; j++) { printf ("%d-->", *(unsigned int *) darrayGet (solidArray, j)); } printf ("\n"); return; } cnt->singleInScaf = solidCounter == 2 ? 1 : 0; if (prevCNT) { setNextInScaf (prevCNT, cnt); setPrevInScaf (cnt, 1); } prevCNT = cnt; from_ctg = to_ctg; } //the reverse complementary path from_ctg = getTwinCtg (*(unsigned int *) darrayGet (solidArray, solidCounter - 1)); prevCNT = NULL; for (i = solidCounter - 2; i >= 0; i--) { to_ctg = getTwinCtg (*(unsigned int *) darrayGet (solidArray, i)); cnt = checkConnect (from_ctg, to_ctg); if (!cnt) { printf ("consolidate B: no connect from %d to %d\n", from_ctg, to_ctg); return; } cnt->singleInScaf = solidCounter == 2 ? 1 : 0; if (prevCNT) { setNextInScaf (prevCNT, cnt); setPrevInScaf (cnt, 1); } prevCNT = cnt; from_ctg = to_ctg; } } static void debugging1 (unsigned int ctg1, unsigned int ctg2) { CONNECT *cn1; cn1 = getCntBetween (ctg1, ctg2); if (cn1) { printf ("(%d,%d) mask %d deleted %d w %d,singleInScaf %d\n", ctg1, ctg2, cn1->mask, cn1->deleted, cn1->weight, cn1->singleInScaf); if (cn1->nextInScaf) { printf ("%d->%d->%d\n", ctg1, ctg2, cn1->nextInScaf->contigID); } if (cn1->prevInScaf) { printf ("*->%d->%d\n", ctg1, ctg2); } else if (!cn1->nextInScaf) { printf ("NULL->%d->%d->NULL\n", ctg1, ctg2); } } else { printf ("%d -X- %d\n", ctg1, ctg2); } } //remove transitive connections which cross linear paths (these paths may be broken) //if a->b->c and a->c, mask a->c static void removeTransitive () { unsigned int i, bal_ctg; int flag = 1, out_num, in_num, count, min, max, linear; CONNECT *cn_temp, *cn1 = NULL, *cn2 = NULL; while (flag) { flag = 0; for (i = 1; i <= num_ctg; i++) { if (contig_array[i].mask) { continue; } out_num = validConnect (i, NULL); if (out_num != 2) { continue; } cn_temp = contig_array[i].downwardConnect; count = 0; while (cn_temp) { if (cn_temp->deleted || cn_temp->mask) { cn_temp = cn_temp->next; continue; } count++; if (count == 1) { cn1 = cn_temp; } else if (count == 2) { cn2 = cn_temp; } else // count > 2 { break; } cn_temp = cn_temp->next; } if (count > 2) { printf ("%d valid connections from ctg %d\n", count, i); continue; } if (cn1->gapLen > cn2->gapLen) { cn_temp = cn1; cn1 = cn2; cn2 = cn_temp; } //make sure cn1 is closer to contig i than cn2 if (cn1->prevInScaf && cn2->prevInScaf) { continue; } bal_ctg = getTwinCtg (cn2->contigID); in_num = validConnect (bal_ctg, NULL); if (in_num > 2) { continue; } min = cn2->gapLen - cn1->gapLen - contig_array[cn1->contigID].length - ins_size_var / 2; max = cn2->gapLen - cn1->gapLen - contig_array[cn1->contigID].length + ins_size_var / 2; if (max < 0) { continue; } //temprarily delete cn2 setConnectDelete (i, cn2->contigID, 1, 0); linear = linearC2C (i, cn1, cn2->contigID, min, max); if (linear != 1) { setConnectDelete (i, cn2->contigID, 0, 0); continue; } else { downstreamCTG[0] = i; catUsDsContig (); if (!checkSimple (solidArray, solidCounter)) { continue; } cn1 = getCntBetween (*(unsigned int *) darrayGet (solidArray, solidCounter - 2), *(unsigned int *) darrayGet (solidArray, solidCounter - 1)); if (cn1 && cn1->nextInScaf && cn2->nextInScaf) { setConnectDelete (i, cn2->contigID, 0, 0); continue; } consolidate (); if (cn2->prevInScaf) substitueDSinScaf (cn2, *(unsigned int *) darrayGet (solidArray, 0), *(unsigned int *) darrayGet (solidArray, 1)); if (cn2->nextInScaf) { substitueUSinScaf (cn2, *(unsigned int *) darrayGet (solidArray, solidCounter - 2)); } flag++; } } //for each contig printf ("a remove transitive lag, %d connections removed\n", flag); } } //get repeat contigs back into the scaffold according to connected unique contigs on both sides /* A ------ D > [i] < B E */ static void debugging2 (unsigned int ctg) { CONNECT *cn1 = contig_array[ctg].downwardConnect; while (cn1) { if (cn1->nextInScaf) { fprintf (stderr, "with nextInScaf,"); } if (cn1->prevInScaf) { fprintf (stderr, "with prevInScaf,"); } fprintf (stderr, "%u >> %d, mask %d deleted %d, inherit %d, singleInScaf %d\n", ctg, cn1->contigID, cn1->mask, cn1->deleted, cn1->inherit, cn1->singleInScaf); cn1 = cn1->next; } } static void debugging () { /* debugging1(1777,1468); debugging2(8065); debugging2(8066); */ } static void simplifyCnt () { removeTransitive (); debugging (); general_linearization (1); debugging (); } static int getIndexInArray (unsigned int node) { int index; for (index = 0; index < nodeCounter; index++) if (nodesInSub[index] == node) { return index; } return -1; } static boolean putNodeIntoSubgraph (FibHeap * heap, int distance, unsigned int node, int index) { int pos = getIndexInArray (node); if (pos > 0) { //printf("exists\n"); return 0; } if (index >= MaxNodeInSub) { return -1; } insertNodeIntoHeap (heap, distance, node); nodesInSub[index] = node; nodeDistance[index] = distance; return 1; } static boolean putChainIntoSubgraph (FibHeap * heap, int distance, unsigned int node, int *index, CONNECT * prevC) { unsigned int ctg = node; CONNECT *nextCnt; boolean excep, flag; int counter = *index; while (1) { nextCnt = getNextContig (ctg, prevC, &excep); if (excep || !nextCnt) { *index = counter; return 1; } ctg = nextCnt->contigID; distance += nextCnt->gapLen + contig_array[ctg].length; // distance += nextCnt->gapLen + ctg; flag = putNodeIntoSubgraph (heap, distance, ctg, counter); if (flag < 0) { return 0; } if (flag > 0) { counter++; } prevC = nextCnt; } } // check if a contig is unique by trying to line its downstream/upstream nodes together static boolean checkUnique (unsigned int node, double tolerance) { CONNECT *ite_cnt; unsigned int currNode; int distance; int popCounter = 0; boolean flag; currNode = node; FibHeap *heap = newFibHeap (); putNodeIntoSubgraph (heap, 0, currNode, 0); nodeCounter = 1; ite_cnt = contig_array[currNode].downwardConnect; while (ite_cnt) { if (ite_cnt->deleted || ite_cnt->mask) { ite_cnt = ite_cnt->next; continue; } currNode = ite_cnt->contigID; distance = ite_cnt->gapLen + contig_array[currNode].length; flag = putNodeIntoSubgraph (heap, distance, currNode, nodeCounter); if (flag < 0) { destroyHeap (heap); return 0; } if (flag > 0) { nodeCounter++; } flag = putChainIntoSubgraph (heap, distance, currNode, &nodeCounter, ite_cnt); if (!flag) { destroyHeap (heap); return 0; } ite_cnt = ite_cnt->next; } if (nodeCounter <= 2) // no more than 2 valid connections { destroyHeap (heap); return 1; } while ((currNode = removeNextNodeFromHeap (heap)) != 0) { nodesInSubInOrder[popCounter++] = currNode; } destroyHeap (heap); flag = checkOverlapInBetween (tolerance); return flag; } //mask contigs with downstream and/or upstream can not be lined static void maskRepeat () { int in_num, out_num, flagA, flagB; int counter = 0; int puzzleCounter = 0; unsigned int i, bal_i; for (i = 1; i <= num_ctg; i++) { if (contig_array[i].mask) { continue; } bal_i = getTwinCtg (i); in_num = validConnect (bal_i, NULL); out_num = validConnect (i, NULL); if (in_num > 1 || out_num > 1) { puzzleCounter++; } else { if (isSmallerThanTwin (i)) { i++; } continue; } if (contig_array[i].cvg > 2 * cvgAvg) { counter++; maskContig (i, 1); //printf("thick mask contig %d and %d\n",i,bal_i); if (isSmallerThanTwin (i)) { i++; } continue; } if (in_num > 1) { flagA = checkUnique (bal_i, OverlapPercent); } else { flagA = 1; } if (out_num > 1) { flagB = checkUnique (i, OverlapPercent); } else { flagB = 1; } if (!flagA || !flagB) { counter++; maskContig (i, 1); } if (isSmallerThanTwin (i)) { i++; } } printf ("maskRepeat: %d contigs masked from %d puzzles\n", counter, puzzleCounter); } static void ordering (boolean deWeak, boolean downS, boolean nonlinear, char *infile) { debugging (); if (downS) { downSlide (); debugging (); if (deWeak) { deleteWeakCnt (weakPE); } } else { if (deWeak) { deleteWeakCnt (weakPE); } } //output_scaf(infile); debugging (); printf ("variance for insert size %d\n", ins_size_var); simplifyCnt (); debugging (); maskRepeat (); debugging (); simplifyCnt (); if (nonlinear) { printf ("non-strict linearization\n"); general_linearization (0); //linearization(0,0); } maskPuzzle (2, 0); debugging (); freezing (); debugging (); } //check if contigs next to each other have reasonable overlap boolean checkOverlapInBetween (double tolerance) { int i, gap; int index; unsigned int node; int lenSum, lenOlp; lenSum = lenOlp = 0; for (i = 0; i < nodeCounter; i++) { node = nodesInSubInOrder[i]; lenSum += contig_array[node].length; index = getIndexInArray (node); nodeDistanceInOrder[i] = nodeDistance[index]; } if (lenSum < 1) { return 1; } for (i = 0; i < nodeCounter - 1; i++) { gap = nodeDistanceInOrder[i + 1] - nodeDistanceInOrder[i] - contig_array[nodesInSubInOrder[i + 1]].length; if (-gap > 0) { lenOlp += -gap; } //if(-gap>ins_size_var) if ((double) lenOlp / lenSum > tolerance) { return 0; } } return 1; } /********* the following codes are for freezing current scaffolds ****************/ //set connections between contigs in a array to used or not //meanwhile set mask to the opposite value static boolean setUsed (unsigned int start, unsigned int *array, int max_steps, boolean flag) { unsigned int prevCtg = start; unsigned int twinA, twinB; int j; CONNECT *cnt; boolean usedFlag = 0; // save 'used' to 'checking' prevCtg = start; for (j = 0; j < max_steps; j++) { if (array[j] == 0) { break; } cnt = getCntBetween (prevCtg, array[j]); if (!cnt) { printf ("setUsed: no connect between %d and %d\n", prevCtg, array[j]); prevCtg = array[j]; continue; } if (cnt->used == flag || cnt->nextInScaf || cnt->prevInScaf || cnt->singleInScaf) { return 1; } cnt->checking = cnt->used; twinA = getTwinCtg (prevCtg); twinB = getTwinCtg (array[j]); cnt = getCntBetween (twinB, twinA); if (cnt) { cnt->checking = cnt->used; } prevCtg = array[j]; } // set used to flag prevCtg = start; for (j = 0; j < max_steps; j++) { if (array[j] == 0) { break; } cnt = getCntBetween (prevCtg, array[j]); if (!cnt) { prevCtg = array[j]; continue; } if (cnt->used == flag) { usedFlag = 1; break; } cnt->used = flag; twinA = getTwinCtg (prevCtg); twinB = getTwinCtg (array[j]); cnt = getCntBetween (twinB, twinA); if (cnt) { cnt->used = flag; } prevCtg = array[j]; } // set mask to 'NOT flag' or set used to original value prevCtg = start; for (j = 0; j < max_steps; j++) { if (array[j] == 0) { break; } cnt = getCntBetween (prevCtg, array[j]); if (!cnt) { prevCtg = array[j]; continue; } if (!usedFlag) { cnt->mask = 1 - flag; } else { cnt->used = cnt->checking; } twinA = getTwinCtg (prevCtg); twinB = getTwinCtg (array[j]); cnt = getCntBetween (twinB, twinA); cnt->used = 1 - flag; if (!usedFlag) { cnt->mask = 1 - flag; } else { cnt->used = cnt->checking; } prevCtg = array[j]; } return usedFlag; } // break down scaffolds poorly supported by longer PE static void recoverMask () { unsigned int i, ctg, bal_ctg, start, finish; int num3, num5, j, t; CONNECT *bindCnt, *cnt; int min, max, max_steps = 5, num_route, length; int tempCounter, recoverCounter = 0; boolean multiUSE, change; for (i = 1; i <= num_ctg; i++) { contig_array[i].flag = 0; } so_far = (unsigned int *) ckalloc (max_n_routes * sizeof (unsigned int)); found_routes = (unsigned int **) ckalloc (max_n_routes * sizeof (unsigned int *)); for (j = 0; j < max_n_routes; j++) { found_routes[j] = (unsigned int *) ckalloc (max_steps * sizeof (unsigned int)); } for (i = 1; i <= num_ctg; i++) { if (contig_array[i].flag || contig_array[i].mask || !contig_array[i].downwardConnect) { continue; } bindCnt = getBindCnt (i); if (!bindCnt) { continue; } //first scan get the average coverage by longer pe num5 = num3 = 0; ctg = i; *(unsigned int *) darrayPut (scaf5, num5++) = i; contig_array[i].flag = 1; contig_array[getTwinCtg (i)].flag = 1; while (bindCnt) { if (bindCnt->used) { break; } setConnectUsed (ctg, bindCnt->contigID, 1); ctg = bindCnt->contigID; *(unsigned int *) darrayPut (scaf5, num5++) = ctg; bal_ctg = getTwinCtg (ctg); contig_array[ctg].flag = 1; contig_array[bal_ctg].flag = 1; bindCnt = bindCnt->nextInScaf; } ctg = getTwinCtg (i); bindCnt = getBindCnt (ctg); while (bindCnt) { if (bindCnt->used) { break; } setConnectUsed (ctg, bindCnt->contigID, 1); ctg = bindCnt->contigID; bal_ctg = getTwinCtg (ctg); contig_array[ctg].flag = 1; contig_array[bal_ctg].flag = 1; *(unsigned int *) darrayPut (scaf3, num3++) = bal_ctg; bindCnt = bindCnt->nextInScaf; } if (num5 + num3 < 2) { continue; } tempCounter = solidCounter = 0; for (j = num3 - 1; j >= 0; j--) *(unsigned int *) darrayPut (tempArray, tempCounter++) = *(unsigned int *) darrayGet (scaf3, j); for (j = 0; j < num5; j++) *(unsigned int *) darrayPut (tempArray, tempCounter++) = *(unsigned int *) darrayGet (scaf5, j); change = 0; for (t = 0; t < tempCounter - 1; t++) { *(unsigned int *) darrayPut (solidArray, solidCounter++) = *(unsigned int *) darrayGet (tempArray, t); start = *(unsigned int *) darrayGet (tempArray, t); finish = *(unsigned int *) darrayGet (tempArray, t + 1); num_route = num_trace = 0; cnt = checkConnect (start, finish); if (!cnt) { printf ("Warning from recoverMask: no connection (%d %d), start at %d\n", start, finish, i); cnt = getCntBetween (start, finish); if (cnt) { debugging1 (start, finish); } continue; } length = cnt->gapLen + contig_array[finish].length; min = length - 1.5 * ins_size_var; max = length + 1.5 * ins_size_var; traceAlongMaskedCnt (finish, start, max_steps, min, max, 0, 0, &num_route); if (finish == start) { for (j = 0; j < tempCounter; j++) { printf ("->%d", *(unsigned int *) darrayGet (tempArray, j)); } printf (": start at %d\n", i); } if (num_route == 1) { for (j = 0; j < max_steps; j++) if (found_routes[0][j] == 0) { break; } if (j < 1) { continue; } //check if connects have been used more than once multiUSE = setUsed (start, found_routes[0], max_steps, 1); if (multiUSE) { continue; } for (j = 0; j < max_steps; j++) { if (j + 1 == max_steps || found_routes[0][j + 1] == 0) { break; } *(unsigned int *) darrayPut (solidArray, solidCounter++) = found_routes[0][j]; contig_array[found_routes[0][j]].flag = 1; contig_array[getTwinCtg (found_routes[0][j])].flag = 1; } recoverCounter += j; setConnectDelete (start, finish, 1, 1); change = 1; } //end if num_route=1 } // for each gap *(unsigned int *) darrayPut (solidArray, solidCounter++) = *(unsigned int *) darrayGet (tempArray, tempCounter - 1); if (change) { consolidate (); } } printf ("%d contigs recovered\n", recoverCounter); fflush (stdout); for (i = 1; i <= num_ctg; i++) { cnt = contig_array[i].downwardConnect; while (cnt) { cnt->used = 0; cnt->checking = 0; cnt = cnt->next; } } for (j = 0; j < max_n_routes; j++) { free ((void *) found_routes[j]); } free ((void *) found_routes); free ((void *) so_far); } // A -> B -> C -> D un-bind link B->C to link A->B and B->C // A' <- B' <- C' <- D' static void unBindLink (unsigned int CB, unsigned int CC) { //fprintf(stderr,"Unbind link (%d %d) to others...\n",CB,CC); CONNECT *cnt1 = getCntBetween (CB, CC); if (!cnt1) { return; } if (cnt1->singleInScaf) { cnt1->singleInScaf = 0; } CONNECT *cnt2 = getCntBetween (getTwinCtg (CC), getTwinCtg (CB)); if (!cnt2) { return; } if (cnt2->singleInScaf) { cnt2->singleInScaf = 0; } if (cnt1->nextInScaf) { unsigned int CD = cnt1->nextInScaf->contigID; cnt1->nextInScaf->prevInScaf = 0; cnt1->nextInScaf = NULL; CONNECT *cnt3 = getCntBetween (getTwinCtg (CD), getTwinCtg (CC)); if (cnt3) { cnt3->nextInScaf = NULL; } cnt2->prevInScaf = 0; } if (cnt2->nextInScaf) { unsigned int bal_CA = cnt2->nextInScaf->contigID; cnt2->nextInScaf->prevInScaf = 0; cnt2->nextInScaf = NULL; CONNECT *cnt4 = getCntBetween (getTwinCtg (bal_CA), CB); if (cnt4) { cnt4->nextInScaf = NULL; } cnt1->prevInScaf = 0; } } static void freezing () { int num5, num3; unsigned int ctg, bal_ctg; unsigned int i; int j, t; CONNECT *cnt, *prevCNT, *nextCnt; boolean excep; for (i = 1; i <= num_ctg; i++) { contig_array[i].flag = 0; contig_array[i].from_vt = 0; contig_array[i].to_vt = 0; cnt = contig_array[i].downwardConnect; while (cnt) { cnt->used = 0; cnt->checking = 0; cnt->singleInScaf = 0; cnt = cnt->next; } } for (i = 1; i <= num_ctg; i++) { if (contig_array[i].flag || contig_array[i].mask) { continue; } if (!contig_array[i].downwardConnect || !validConnect (i, NULL)) { continue; } num5 = num3 = 0; ctg = i; *(unsigned int *) darrayPut (scaf5, num5++) = i; contig_array[i].flag = 1; contig_array[getTwinCtg (i)].flag = 1; prevCNT = NULL; cnt = getNextContig (ctg, prevCNT, &excep); while (cnt) { if (contig_array[cnt->contigID].flag) { unBindLink (ctg, cnt->contigID); break; } nextCnt = getNextContig (cnt->contigID, cnt, &excep); setConnectUsed (ctg, cnt->contigID, 1); ctg = cnt->contigID; *(unsigned int *) darrayPut (scaf5, num5++) = ctg; bal_ctg = getTwinCtg (ctg); contig_array[ctg].flag = 1; contig_array[bal_ctg].flag = 1; prevCNT = cnt; cnt = nextCnt; } ctg = getTwinCtg (i); if (num5 >= 2) { prevCNT = checkConnect (getTwinCtg (*(unsigned int *) darrayGet (scaf5, 1)), ctg); } else { prevCNT = NULL; } cnt = getNextContig (ctg, prevCNT, &excep); while (cnt) { if (contig_array[cnt->contigID].flag) { unBindLink (ctg, cnt->contigID); break; } nextCnt = getNextContig (cnt->contigID, cnt, &excep); setConnectUsed (ctg, cnt->contigID, 1); ctg = cnt->contigID; bal_ctg = getTwinCtg (ctg); contig_array[ctg].flag = 1; contig_array[bal_ctg].flag = 1; *(unsigned int *) darrayPut (scaf3, num3++) = bal_ctg; prevCNT = cnt; cnt = nextCnt; } if (num5 + num3 < 2) { continue; } solidCounter = 0; for (j = num3 - 1; j >= 0; j--) *(unsigned int *) darrayPut (solidArray, solidCounter++) = *(unsigned int *) darrayGet (scaf3, j); for (j = 0; j < num5; j++) *(unsigned int *) darrayPut (solidArray, solidCounter++) = *(unsigned int *) darrayGet (scaf5, j); unsigned int firstCtg = 0; unsigned int lastCtg = 0; unsigned int firstTwin = 0; unsigned int lastTwin = 0; for (t = 0; t < solidCounter; t++) if (!contig_array[*(unsigned int *) darrayGet (solidArray, t)].mask) { firstCtg = *(unsigned int *) darrayGet (solidArray, t); break; } for (t = solidCounter - 1; t >= 0; t--) if (!contig_array[*(unsigned int *) darrayGet (solidArray, t)].mask) { lastCtg = *(unsigned int *) darrayGet (solidArray, t); break; } if (firstCtg == 0 || lastCtg == 0) { printf ("scaffold start at %d, stop at %d, freezing began with %d\n", firstCtg, lastCtg, i); for (j = 0; j < solidCounter; j++) printf ("->%d(%d %d)", *(unsigned int *) darrayGet (solidArray, j), contig_array[*(unsigned int *) darrayGet (solidArray, j)].mask, contig_array[*(unsigned int *) darrayGet (solidArray, j)].flag); printf ("\n"); } else { firstTwin = getTwinCtg (firstCtg); lastTwin = getTwinCtg (lastCtg); } for (t = 0; t < solidCounter; t++) { unsigned int ctg = *(unsigned int *) darrayGet (solidArray, t); if (contig_array[ctg].from_vt > 0) { contig_array[ctg].mask = 1; contig_array[getTwinCtg (ctg)].mask = 1; printf ("Repeat: contig %d (%d) appears more than once\n", ctg, getTwinCtg (ctg)); } else { contig_array[ctg].from_vt = firstCtg; contig_array[ctg].to_vt = lastCtg; contig_array[ctg].indexInScaf = t + 1; contig_array[getTwinCtg (ctg)].from_vt = lastTwin; contig_array[getTwinCtg (ctg)].to_vt = firstTwin; contig_array[getTwinCtg (ctg)].indexInScaf = solidCounter - t; } } consolidate (); } printf ("Freezing is done....\n"); fflush (stdout); for (i = 1; i <= num_ctg; i++) { if (contig_array[i].flag) { contig_array[i].flag = 0; } if (contig_array[i].from_vt == 0) { contig_array[i].from_vt = i; contig_array[i].to_vt = i; } cnt = contig_array[i].downwardConnect; while (cnt) { cnt->used = 0; cnt->checking = 0; cnt = cnt->next; } } } /************** codes below this line are for pulling the scaffolds out ************/ void output1gap (FILE * fo, int max_steps) { int i, len, seg; len = seg = 0; for (i = 0; i < max_steps - 1; i++) { if (found_routes[0][i + 1] == 0) { break; } len += contig_array[found_routes[0][i]].length; seg++; } fprintf (fo, "GAP %d %d", len, seg); for (i = 0; i < max_steps - 1; i++) { if (found_routes[0][i + 1] == 0) { break; } fprintf (fo, " %d", found_routes[0][i]); } fprintf (fo, "\n"); } static int weakCounter; static boolean printCnts (FILE * fp, unsigned int ctg) { CONNECT *cnt = contig_array[ctg].downwardConnect; boolean flag = 0, ret = 0; unsigned int bal_ctg = getTwinCtg (ctg); unsigned int linkCtg; if (isSameAsTwin (ctg)) { return ret; } CONNECT *bindCnt = getBindCnt (ctg); if (bindCnt && bindCnt->bySmall && bindCnt->weakPoint) { weakCounter++; fprintf (fp, "\tWP"); ret = 1; } while (cnt) { if (cnt->weight && !cnt->inherit) { if (!flag) { flag = 1; fprintf (fp, "\t#DOWN "); } linkCtg = cnt->contigID; if (isLargerThanTwin (linkCtg)) { linkCtg = getTwinCtg (linkCtg); } fprintf (fp, "%d:%d:%d ", index_array[linkCtg], cnt->weight, cnt->gapLen); } cnt = cnt->next; } flag = 0; cnt = contig_array[bal_ctg].downwardConnect; while (cnt) { if (cnt->weight && !cnt->inherit) { if (!flag) { flag = 1; fprintf (fp, "\t#UP "); } linkCtg = cnt->contigID; if (isLargerThanTwin (linkCtg)) { linkCtg = getTwinCtg (linkCtg); } fprintf (fp, "%d:%d:%d ", index_array[linkCtg], cnt->weight, cnt->gapLen); } cnt = cnt->next; } fprintf (fp, "\n"); return ret; } void ScafStat ( int len_cut, char * graphfile ) { FILE * fp, *fp2, *fo; char line[1024]; sprintf ( line, "%s.scafSeq", graphfile ); fp = ckopen ( line, "r" ); sprintf ( line, "%s.contig", graphfile ); fp2 = ckopen ( line, "r" ); sprintf ( line, "%s.scafStatistics", graphfile ); fo = ckopen ( line, "w" ); fprintf(fo,"<-- Information for assembly Scaffold '%s.scafSeq'.(cut_off_length < 100bp) -->\n\n",graphfile); int cut_off_len = 0; char Nucleotide; char buf[4000]; long Scaffold_Number = 0; long Scaffold_Number_Scaf = 0; long Scaffold_Number_Contig = 0; long Singleton_Number_Scaf = 0; long Singleton_Number = 0; long *Singleton_Seq = (long *)malloc(sizeof(long)); long long A_num_all = 0; long *A_num = (long *)malloc(sizeof(long)); long long C_num_all = 0; long *C_num = (long *)malloc(sizeof(long)); long long G_num_all = 0; long *G_num = (long *)malloc(sizeof(long)); long long T_num_all = 0; long *T_num = (long *)malloc(sizeof(long)); long long N_num_all = 0; long *N_num = (long *)malloc(sizeof(long)); long long Non_ACGTN_all = 0; long *Non_ACGTN = (long *)malloc(sizeof(long)); long long Size_includeN = 0; long *Size_Seq = (long *)malloc(sizeof(long)); int k; long long Sum =0; int flag[10]; int flag_known = 0; long n100 = 0; long n500 = 0; long n1k = 0; long n10k = 0; long n100k= 0; long n1m = 0; long N50 = 0; long N50_known = 0; long Num_N50_known = 0; cut_off_len = len_cut; A_num[Scaffold_Number] = 0; C_num[Scaffold_Number] = 0; G_num[Scaffold_Number] = 0; T_num[Scaffold_Number] = 0; N_num[Scaffold_Number] = 0; Non_ACGTN[Scaffold_Number] = 0; Size_Seq[Scaffold_Number] = 0; Singleton_Seq[Scaffold_Number] = 0; Nucleotide = fgetc(fp); while (Nucleotide != EOF) { if(Nucleotide == '>') { if ((Scaffold_Number > 0) && (Size_Seq[Scaffold_Number-1] < cut_off_len)) { A_num_all = A_num_all - A_num[Scaffold_Number-1]; C_num_all = C_num_all - C_num[Scaffold_Number-1]; G_num_all = G_num_all - G_num[Scaffold_Number-1]; T_num_all = T_num_all - T_num[Scaffold_Number-1]; N_num_all = N_num_all - N_num[Scaffold_Number-1]; Non_ACGTN_all = Non_ACGTN_all - Non_ACGTN[Scaffold_Number-1]; Size_includeN = Size_includeN - Size_Seq[Scaffold_Number-1]; Singleton_Number = Singleton_Number - Singleton_Seq[Scaffold_Number-1]; Scaffold_Number = Scaffold_Number -1; } else { Size_Seq = (long *)realloc(Size_Seq, (Scaffold_Number+2)*sizeof(long)); A_num = (long *)realloc(A_num, (Scaffold_Number+2)*sizeof(long)); C_num = (long *)realloc(C_num, (Scaffold_Number+2)*sizeof(long)); G_num = (long *)realloc(G_num, (Scaffold_Number+2)*sizeof(long)); T_num = (long *)realloc(T_num, (Scaffold_Number+2)*sizeof(long)); N_num = (long *)realloc(N_num, (Scaffold_Number+2)*sizeof(long)); Non_ACGTN= (long *)realloc(Non_ACGTN,(Scaffold_Number+2)*sizeof(long)); Singleton_Seq = (long *)realloc(Singleton_Seq, (Scaffold_Number+2)*sizeof(long)); } Scaffold_Number++; A_num[Scaffold_Number-1] = 0; C_num[Scaffold_Number-1] = 0; G_num[Scaffold_Number-1] = 0; T_num[Scaffold_Number-1] = 0; N_num[Scaffold_Number-1] = 0; Non_ACGTN[Scaffold_Number-1] = 0; Size_Seq[Scaffold_Number-1] = 0; Singleton_Seq[Scaffold_Number-1] = 0; Nucleotide = fgetc(fp); if(Nucleotide == 'C') { Singleton_Number++; Singleton_Seq[Scaffold_Number-1] ++; } fgets(buf,4000,fp); } else if((Nucleotide == 'N')||(Nucleotide == 'n')) { N_num[Scaffold_Number-1] ++; N_num_all++; Size_Seq[Scaffold_Number-1] ++; Size_includeN++; } else if((Nucleotide == 'A')||(Nucleotide == 'a')) { A_num[Scaffold_Number-1] ++; A_num_all++; Size_Seq[Scaffold_Number-1] ++; Size_includeN++; } else if((Nucleotide == 'C')||(Nucleotide == 'c')) { C_num[Scaffold_Number-1] ++; C_num_all++; Size_Seq[Scaffold_Number-1] ++; Size_includeN++; } else if((Nucleotide == 'G')||(Nucleotide == 'g')) { G_num[Scaffold_Number-1] ++; G_num_all++; Size_Seq[Scaffold_Number-1] ++; Size_includeN++; } else if((Nucleotide == 'T')||(Nucleotide == 't')) { T_num[Scaffold_Number-1] ++; T_num_all++; Size_Seq[Scaffold_Number-1] ++; Size_includeN++; } else { if((Nucleotide != '\n')&&(Nucleotide != '\r')) { Non_ACGTN[Scaffold_Number-1] ++; Non_ACGTN_all++; Size_Seq[Scaffold_Number-1] ++; Size_includeN++; } } Nucleotide = fgetc(fp); } if (Size_Seq[Scaffold_Number-1] < cut_off_len) { A_num_all = A_num_all - A_num[Scaffold_Number-1]; C_num_all = C_num_all - C_num[Scaffold_Number-1]; G_num_all = G_num_all - G_num[Scaffold_Number-1]; T_num_all = T_num_all - T_num[Scaffold_Number-1]; N_num_all = N_num_all - N_num[Scaffold_Number-1]; Non_ACGTN_all = Non_ACGTN_all - Non_ACGTN[Scaffold_Number-1]; Size_includeN = Size_includeN - Size_Seq[Scaffold_Number-1]; Singleton_Number = Singleton_Number - Singleton_Seq[Scaffold_Number-1]; Scaffold_Number = Scaffold_Number -1; } qsort(Size_Seq,Scaffold_Number,sizeof(Size_Seq[0]),cmp_int); fprintf(fo,"Size_includeN\t%lld\n",Size_includeN); fprintf(fo,"Size_withoutN\t%lld\n",Size_includeN - N_num_all); fprintf(fo,"Scaffold_Num\t%ld\n",Scaffold_Number); fprintf(fo,"Mean_Size\t%lld\n",Size_includeN/Scaffold_Number); fprintf(fo,"Median_Size\t%ld\n",Size_Seq[(Scaffold_Number+1)/2-1]); fprintf(fo,"Longest_Seq\t%ld\n",Size_Seq[Scaffold_Number-1]); fprintf(fo,"Shortest_Seq\t%ld\n",Size_Seq[0]); fprintf(fo,"Singleton_Num\t%ld\n",Singleton_Number); fprintf(fo,"Average_length_of_break(N)_in_scaffold\t%lld\n",N_num_all/Scaffold_Number); fprintf(fo,"\n"); if (known_genome_size) { fprintf(fo,"Known_genome_size\t%ld\n",known_genome_size); fprintf(fo,"Total_scaffold_length_as_percentage_of_known_genome_size\t%.2f%\n",100*(1.0*Size_includeN/known_genome_size)); } else { fprintf(fo,"Known_genome_size\tNaN\n"); fprintf(fo,"Total_scaffold_length_as_percentage_of_known_genome_size\tNaN\n"); } fprintf(fo,"\n"); for(k=0; k 100) { n100++; } if (Size_Seq[k] > 500) { n500++; } if (Size_Seq[k] > 1000) { n1k++; } if (Size_Seq[k] > 10000) { n10k++; } if (Size_Seq[k] > 100000) { n100k++; } if (Size_Seq[k] > 1000000) { n1m++; } } fprintf(fo,"scaffolds>100 \t%ld\t%.2f%\n",n100 ,100*(1.0*n100/Scaffold_Number)); fprintf(fo,"scaffolds>500 \t%ld\t%.2f%\n",n500 ,100*(1.0*n500/Scaffold_Number)); fprintf(fo,"scaffolds>1K \t%ld\t%.2f%\n",n1k ,100*(1.0*n1k/Scaffold_Number)); fprintf(fo,"scaffolds>10K \t%ld\t%.2f%\n",n10k ,100*(1.0*n10k/Scaffold_Number)); fprintf(fo,"scaffolds>100K\t%ld\t%.2f%\n",n100k,100*(1.0*n100k/Scaffold_Number)); fprintf(fo,"scaffolds>1M \t%ld\t%.2f%\n",n1m ,100*(1.0*n1m/Scaffold_Number)); fprintf(fo,"\n"); fprintf(fo,"Nucleotide_A\t%lld\t%.2f%\n",A_num_all,100*(1.0*A_num_all/Size_includeN)); fprintf(fo,"Nucleotide_C\t%lld\t%.2f%\n",C_num_all,100*(1.0*C_num_all/Size_includeN)); fprintf(fo,"Nucleotide_G\t%lld\t%.2f%\n",G_num_all,100*(1.0*G_num_all/Size_includeN)); fprintf(fo,"Nucleotide_T\t%lld\t%.2f%\n",T_num_all,100*(1.0*T_num_all/Size_includeN)); fprintf(fo,"GapContent_N\t%lld\t%.2f%\n",N_num_all,100*(1.0*N_num_all/Size_includeN)); fprintf(fo,"Non_ACGTN\t%lld\t%.2f%\n",Non_ACGTN_all,100*(1.0*Non_ACGTN_all/Size_includeN)); fprintf(fo,"GC_Content\t%.2f%\t\t(G+C)/(A+C+G+T)\n",100*(1.0*(G_num_all+C_num_all)/(A_num_all+C_num_all+G_num_all+T_num_all))); fprintf(fo,"\n"); for (k=0; k<10; k++) flag[k]=0; for(k=Scaffold_Number-1; k>=0; k--) { Sum = Sum + Size_Seq[k]; if ((Sum >= Size_includeN*0.1) && (Sum < Size_includeN*0.2) && (flag[1] == 0)) {fprintf(fo,"N10\t%ld\t%ld\n",Size_Seq[k],Scaffold_Number-k); flag[1] = 1;} else if ((Sum >= Size_includeN*0.2) && (Sum < Size_includeN*0.3) && (flag[2] == 0)) {fprintf(fo,"N20\t%ld\t%ld\n",Size_Seq[k],Scaffold_Number-k); flag[2] = 1;} else if ((Sum >= Size_includeN*0.3) && (Sum < Size_includeN*0.4) && (flag[3] == 0)) {fprintf(fo,"N30\t%ld\t%ld\n",Size_Seq[k],Scaffold_Number-k); flag[3] = 1;} else if ((Sum >= Size_includeN*0.4) && (Sum < Size_includeN*0.5) && (flag[4] == 0)) {fprintf(fo,"N40\t%ld\t%ld\n",Size_Seq[k],Scaffold_Number-k); flag[4] = 1;} else if ((Sum >= Size_includeN*0.5) && (Sum < Size_includeN*0.6) && (flag[5] == 0)) {fprintf(fo,"N50\t%ld\t%ld\n",Size_Seq[k],Scaffold_Number-k); flag[5] = 1; N50 = Size_Seq[k]; } else if ((Sum >= Size_includeN*0.6) && (Sum < Size_includeN*0.7) && (flag[6] == 0)) {fprintf(fo,"N60\t%ld\t%ld\n",Size_Seq[k],Scaffold_Number-k); flag[6] = 1;} else if ((Sum >= Size_includeN*0.7) && (Sum < Size_includeN*0.8) && (flag[7] == 0)) {fprintf(fo,"N70\t%ld\t%ld\n",Size_Seq[k],Scaffold_Number-k); flag[7] = 1;} else if ((Sum >= Size_includeN*0.8) && (Sum < Size_includeN*0.9) && (flag[8] == 0)) {fprintf(fo,"N80\t%ld\t%ld\n",Size_Seq[k],Scaffold_Number-k); flag[8] = 1;} else if ((Sum >= Size_includeN*0.9) && (flag[9] == 0)) {fprintf(fo,"N90\t%ld\t%ld\n",Size_Seq[k],Scaffold_Number-k); flag[9] = 1;} if ((Sum >= known_genome_size*0.5) && (flag_known == 0)) { N50_known = Size_Seq[k]; Num_N50_known = Scaffold_Number-k; flag_known = 1;} } if(flag[5] == 0) { Sum = 0; for(k=Scaffold_Number-1; k>=0; k--) { Sum = Sum + Size_Seq[k]; if (Sum >= Size_includeN*0.5) { fprintf(fo,"N50\t%ld\t%ld\n",Size_Seq[k],Scaffold_Number-k); break; } } } fprintf(fo,"\n"); if (known_genome_size) { fprintf(fo,"NG50\t%ld\t%ld\n",N50_known,Num_N50_known); fprintf(fo,"N50_scaffold-NG50_scaffold_length_difference\t%ld\n",abs(N50 - N50_known)); } else { fprintf(fo,"NG50\tNaN\tNaN\n"); fprintf(fo,"N50_scaffold-NG50_scaffold_length_difference\tNaN\n"); } fprintf(fo,"\n"); free(A_num); free(C_num); free(G_num); free(T_num); free(N_num); free(Non_ACGTN); free(Singleton_Seq); free(Size_Seq); Scaffold_Number_Scaf = Scaffold_Number; Singleton_Number_Scaf = Singleton_Number; //*********************** Contig ****************************** fprintf(fo,"<-- Information for assembly Contig '%s.contig'.(cut_off_length < 100bp) -->\n\n",graphfile); cut_off_len = 0; Scaffold_Number = 0; Singleton_Number = 0; Singleton_Seq = (long *)malloc(sizeof(long)); A_num_all = 0; A_num = (long *)malloc(sizeof(long)); C_num_all = 0; C_num = (long *)malloc(sizeof(long)); G_num_all = 0; G_num = (long *)malloc(sizeof(long)); T_num_all = 0; T_num = (long *)malloc(sizeof(long)); N_num_all = 0; N_num = (long *)malloc(sizeof(long)); Non_ACGTN_all = 0; Non_ACGTN = (long *)malloc(sizeof(long)); Size_includeN = 0; Size_Seq = (long *)malloc(sizeof(long)); Sum =0; n100 = 0; n500 = 0; n1k = 0; n10k = 0; n100k= 0; n1m = 0; N50 = 0; N50_known = 0; Num_N50_known = 0; flag_known = 0; cut_off_len = len_cut; A_num[Scaffold_Number] = 0; C_num[Scaffold_Number] = 0; G_num[Scaffold_Number] = 0; T_num[Scaffold_Number] = 0; N_num[Scaffold_Number] = 0; Non_ACGTN[Scaffold_Number] = 0; Size_Seq[Scaffold_Number] = 0; Singleton_Seq[Scaffold_Number] = 0; Nucleotide = fgetc(fp2); while (Nucleotide != EOF) { if(Nucleotide == '>') { if ((Scaffold_Number > 0) && (Size_Seq[Scaffold_Number-1] < cut_off_len)) { A_num_all = A_num_all - A_num[Scaffold_Number-1]; C_num_all = C_num_all - C_num[Scaffold_Number-1]; G_num_all = G_num_all - G_num[Scaffold_Number-1]; T_num_all = T_num_all - T_num[Scaffold_Number-1]; N_num_all = N_num_all - N_num[Scaffold_Number-1]; Non_ACGTN_all = Non_ACGTN_all - Non_ACGTN[Scaffold_Number-1]; Size_includeN = Size_includeN - Size_Seq[Scaffold_Number-1]; Singleton_Number = Singleton_Number - Singleton_Seq[Scaffold_Number-1]; Scaffold_Number = Scaffold_Number -1; } else { Size_Seq = (long *)realloc(Size_Seq, (Scaffold_Number+2)*sizeof(long)); A_num = (long *)realloc(A_num, (Scaffold_Number+2)*sizeof(long)); C_num = (long *)realloc(C_num, (Scaffold_Number+2)*sizeof(long)); G_num = (long *)realloc(G_num, (Scaffold_Number+2)*sizeof(long)); T_num = (long *)realloc(T_num, (Scaffold_Number+2)*sizeof(long)); N_num = (long *)realloc(N_num, (Scaffold_Number+2)*sizeof(long)); Non_ACGTN= (long *)realloc(Non_ACGTN,(Scaffold_Number+2)*sizeof(long)); Singleton_Seq = (long *)realloc(Singleton_Seq, (Scaffold_Number+2)*sizeof(long)); } Scaffold_Number++; A_num[Scaffold_Number-1] = 0; C_num[Scaffold_Number-1] = 0; G_num[Scaffold_Number-1] = 0; T_num[Scaffold_Number-1] = 0; N_num[Scaffold_Number-1] = 0; Non_ACGTN[Scaffold_Number-1] = 0; Size_Seq[Scaffold_Number-1] = 0; Singleton_Seq[Scaffold_Number-1] = 0; Nucleotide = fgetc(fp2); if(Nucleotide == 'C') { Singleton_Number++; Singleton_Seq[Scaffold_Number-1] ++; } fgets(buf,4000,fp2); } else if((Nucleotide == 'N')||(Nucleotide == 'n')) { N_num[Scaffold_Number-1] ++; N_num_all++; Size_Seq[Scaffold_Number-1] ++; Size_includeN++; } else if((Nucleotide == 'A')||(Nucleotide == 'a')) { A_num[Scaffold_Number-1] ++; A_num_all++; Size_Seq[Scaffold_Number-1] ++; Size_includeN++; } else if((Nucleotide == 'C')||(Nucleotide == 'c')) { C_num[Scaffold_Number-1] ++; C_num_all++; Size_Seq[Scaffold_Number-1] ++; Size_includeN++; } else if((Nucleotide == 'G')||(Nucleotide == 'g')) { G_num[Scaffold_Number-1] ++; G_num_all++; Size_Seq[Scaffold_Number-1] ++; Size_includeN++; } else if((Nucleotide == 'T')||(Nucleotide == 't')) { T_num[Scaffold_Number-1] ++; T_num_all++; Size_Seq[Scaffold_Number-1] ++; Size_includeN++; } else { if((Nucleotide != '\n')&&(Nucleotide != '\r')) { Non_ACGTN[Scaffold_Number-1] ++; Non_ACGTN_all++; Size_Seq[Scaffold_Number-1] ++; Size_includeN++; } } Nucleotide = fgetc(fp2); } if (Size_Seq[Scaffold_Number-1] < cut_off_len) { A_num_all = A_num_all - A_num[Scaffold_Number-1]; C_num_all = C_num_all - C_num[Scaffold_Number-1]; G_num_all = G_num_all - G_num[Scaffold_Number-1]; T_num_all = T_num_all - T_num[Scaffold_Number-1]; N_num_all = N_num_all - N_num[Scaffold_Number-1]; Non_ACGTN_all = Non_ACGTN_all - Non_ACGTN[Scaffold_Number-1]; Size_includeN = Size_includeN - Size_Seq[Scaffold_Number-1]; Singleton_Number = Singleton_Number - Singleton_Seq[Scaffold_Number-1]; Scaffold_Number = Scaffold_Number -1; } qsort(Size_Seq,Scaffold_Number,sizeof(Size_Seq[0]),cmp_int); fprintf(fo,"Size_includeN\t%lld\n",Size_includeN); fprintf(fo,"Size_withoutN\t%lld\n",Size_includeN - N_num_all); fprintf(fo,"Contig_Num\t%ld\n",Scaffold_Number); fprintf(fo,"Mean_Size\t%lld\n",Size_includeN/Scaffold_Number); fprintf(fo,"Median_Size\t%ld\n",Size_Seq[(Scaffold_Number+1)/2-1]); fprintf(fo,"Longest_Seq\t%ld\n",Size_Seq[Scaffold_Number-1]); fprintf(fo,"Shortest_Seq\t%ld\n",Size_Seq[0]); fprintf(fo,"\n"); for(k=0; k 100) { n100++; } if (Size_Seq[k] > 500) { n500++; } if (Size_Seq[k] > 1000) { n1k++; } if (Size_Seq[k] > 10000) { n10k++; } if (Size_Seq[k] > 100000) { n100k++; } if (Size_Seq[k] > 1000000) { n1m++; } } fprintf(fo,"Contig>100 \t%ld\t%.2f%\n",n100 ,100*(1.0*n100/Scaffold_Number)); fprintf(fo,"Contig>500 \t%ld\t%.2f%\n",n500 ,100*(1.0*n500/Scaffold_Number)); fprintf(fo,"Contig>1K \t%ld\t%.2f%\n",n1k ,100*(1.0*n1k/Scaffold_Number)); fprintf(fo,"Contig>10K \t%ld\t%.2f%\n",n10k ,100*(1.0*n10k/Scaffold_Number)); fprintf(fo,"Contig>100K\t%ld\t%.2f%\n",n100k,100*(1.0*n100k/Scaffold_Number)); fprintf(fo,"Contig>1M \t%ld\t%.2f%\n",n1m ,100*(1.0*n1m/Scaffold_Number)); fprintf(fo,"\n"); fprintf(fo,"Nucleotide_A\t%lld\t%.2f%\n",A_num_all,100*(1.0*A_num_all/Size_includeN)); fprintf(fo,"Nucleotide_C\t%lld\t%.2f%\n",C_num_all,100*(1.0*C_num_all/Size_includeN)); fprintf(fo,"Nucleotide_G\t%lld\t%.2f%\n",G_num_all,100*(1.0*G_num_all/Size_includeN)); fprintf(fo,"Nucleotide_T\t%lld\t%.2f%\n",T_num_all,100*(1.0*T_num_all/Size_includeN)); fprintf(fo,"GapContent_N\t%lld\t%.2f%\n",N_num_all,100*(1.0*N_num_all/Size_includeN)); fprintf(fo,"Non_ACGTN\t%lld\t%.2f%\n",Non_ACGTN_all,100*(1.0*Non_ACGTN_all/Size_includeN)); fprintf(fo,"GC_Content\t%.2f%\t\t(G+C)/(A+C+G+T)\n",100*(1.0*(G_num_all+C_num_all)/(A_num_all+C_num_all+G_num_all+T_num_all))); fprintf(fo,"\n"); for (k=0; k<10; k++) flag[k]=0; for(k=Scaffold_Number-1; k>=0; k--) { Sum = Sum + Size_Seq[k]; if ((Sum >= Size_includeN*0.1) && (Sum < Size_includeN*0.2) && (flag[1] == 0)) {fprintf(fo,"N10\t%ld\t%ld\n",Size_Seq[k],Scaffold_Number-k); flag[1] = 1;} else if ((Sum >= Size_includeN*0.2) && (Sum < Size_includeN*0.3) && (flag[2] == 0)) {fprintf(fo,"N20\t%ld\t%ld\n",Size_Seq[k],Scaffold_Number-k); flag[2] = 1;} else if ((Sum >= Size_includeN*0.3) && (Sum < Size_includeN*0.4) && (flag[3] == 0)) {fprintf(fo,"N30\t%ld\t%ld\n",Size_Seq[k],Scaffold_Number-k); flag[3] = 1;} else if ((Sum >= Size_includeN*0.4) && (Sum < Size_includeN*0.5) && (flag[4] == 0)) {fprintf(fo,"N40\t%ld\t%ld\n",Size_Seq[k],Scaffold_Number-k); flag[4] = 1;} else if ((Sum >= Size_includeN*0.5) && (Sum < Size_includeN*0.6) && (flag[5] == 0)) {fprintf(fo,"N50\t%ld\t%ld\n",Size_Seq[k],Scaffold_Number-k); flag[5] = 1; N50 = Size_Seq[k]; } else if ((Sum >= Size_includeN*0.6) && (Sum < Size_includeN*0.7) && (flag[6] == 0)) {fprintf(fo,"N60\t%ld\t%ld\n",Size_Seq[k],Scaffold_Number-k); flag[6] = 1;} else if ((Sum >= Size_includeN*0.7) && (Sum < Size_includeN*0.8) && (flag[7] == 0)) {fprintf(fo,"N70\t%ld\t%ld\n",Size_Seq[k],Scaffold_Number-k); flag[7] = 1;} else if ((Sum >= Size_includeN*0.8) && (Sum < Size_includeN*0.9) && (flag[8] == 0)) {fprintf(fo,"N80\t%ld\t%ld\n",Size_Seq[k],Scaffold_Number-k); flag[8] = 1;} else if ((Sum >= Size_includeN*0.9) && (flag[9] == 0)) {fprintf(fo,"N90\t%ld\t%ld\n",Size_Seq[k],Scaffold_Number-k); flag[9] = 1;} if ((Sum >= known_genome_size*0.5) && (flag_known == 0)) { N50_known = Size_Seq[k]; Num_N50_known = Scaffold_Number-k; flag_known = 1;} } if(flag[5] == 0) { Sum = 0; for(k=Scaffold_Number-1; k>=0; k--) { Sum = Sum + Size_Seq[k]; if (Sum >= Size_includeN*0.5) { fprintf(fo,"N50\t%ld\t%ld\n",Size_Seq[k],Scaffold_Number-k); break; } } } fprintf(fo,"\n"); if (known_genome_size) { fprintf(fo,"NG50\t%ld\t%ld\n",N50_known,Num_N50_known); fprintf(fo,"N50_contig-NG50_contig_length_difference\t%ld\n",abs(N50 - N50_known)); } else { fprintf(fo,"NG50\tNaN\tNaN\n"); fprintf(fo,"N50_contig-NG50_contig_length_difference\tNaN\n"); } fprintf(fo,"\n"); free(A_num); free(C_num); free(G_num); free(T_num); free(N_num); free(Non_ACGTN); free(Singleton_Seq); free(Size_Seq); Scaffold_Number_Contig = Scaffold_Number; fprintf(fo,"Number_of_contigs_in_scaffolds(Singleton)\t%ld\n",Singleton_Number_Scaf); fprintf(fo,"Number_of_contigs_not_in_scaffolds\t%ld\n",Scaffold_Number_Contig - Singleton_Number_Scaf); fprintf(fo,"Average_number_of_contigs_per_scaffold\t%.1f\n", 1.0 * Scaffold_Number_Contig / Scaffold_Number_Scaf); fprintf(fo,"\n"); fclose ( fp ); fclose ( fp2 ); fclose ( fo ); } void RPKMStat (char * graphfile ) { FILE * fp, *fp2, *fp3,*fo; char line[1024]; sprintf ( line, "%s.scafSeq", graphfile ); fp = ckopen ( line, "r" ); sprintf ( line, "%s.RPKM.Stat", graphfile ); fo = ckopen ( line, "w" ); char Nucleotide; char buf[10000000]; int m; int Scaffold_Num=0; int Singleton_Num=0; int i; Nucleotide = fgetc(fp); while (Nucleotide != EOF) { if(Nucleotide == '>') { Nucleotide = fgetc(fp); if(Nucleotide == 's') { fgets(buf,8,fp); fscanf(fp, "%d",&m); if(m>Scaffold_Num) Scaffold_Num = m; } else if(Nucleotide == 'C') { fscanf(fp, "%d",&m); if(m>Singleton_Num) Singleton_Num = m; } fgets(buf,10000000,fp); } Nucleotide = fgetc(fp); } //printf("Scaffold_Num=%d Singleton_Num=%d\n", Scaffold_Num,Singleton_Num); fclose ( fp ); //*************************************************** sprintf ( line, "%s.scafSeq", graphfile ); fp = ckopen ( line, "r" ); int *S_Length = (int *)malloc(sizeof(int)* (Scaffold_Num+1)); int *C_Length = (int *)malloc(sizeof(int)* (Singleton_Num+1)); char type = 'N'; int index = 0; int len = 0; for(i=0;i<=Scaffold_Num;i++) S_Length[i]=0; for(i=0;i<=Singleton_Num;i++) C_Length[i]=0; Nucleotide = fgetc(fp); while (Nucleotide != EOF) { if(Nucleotide == '>') { if(type == 'S') S_Length[index] = len; else if(type == 'C') C_Length[index] = len; len = 0; Nucleotide = fgetc(fp); if(Nucleotide == 's') { fgets(buf,8,fp); fscanf(fp, "%d",&m); type = 'S'; index = m; } else if(Nucleotide == 'C') { fscanf(fp, "%d",&m); type = 'C'; index = m; } fgets(buf,10000000,fp); } else { if((Nucleotide != '\n')&&(Nucleotide != '\r')) { len ++; } } Nucleotide = fgetc(fp); } //last sequence if(type == 'S') S_Length[index] = len; else if(type == 'C') C_Length[index] = len; //for(index=0;index<=Scaffold_Num;index++) printf(">scaffold%d\t%d\n",index,S_Length[index]); //for(index=0;index<=Singleton_Num;index++) printf(">C%d\t%d\n",index,C_Length[index]); fclose ( fp ); //*************************************************** sprintf ( line, "%s.peGrads", graphfile ); fp3 = ckopen ( line, "r" ); int Reads_Num = 0; fscanf(fp3, "%s %d %d", buf, &i, &Reads_Num); //printf("Reads_Num=%d\n", Reads_Num); fclose (fp3); //*************************************************** sprintf ( line, "%s.readOnScaf", graphfile ); fp2 = ckopen ( line, "r" ); char ch_temp[40]; int *Reads = (int *)malloc(sizeof(int)* (Reads_Num+1)); for(i=0;i<=Reads_Num;i++) Reads[i]=0; while (!feof(fp2)) { fscanf(fp2, "%s ", ch_temp); if(ch_temp[0] == '>') { //printf("%s\n",ch_temp); if(ch_temp[1] == 's') fgets(buf,10000000,fp2); } else { i=atoi(ch_temp); Reads[i]++; fgets(buf,10000000,fp2); //printf("%d\t%d\n",i,Reads[i]); } } Reads[i]--; //last line fclose (fp2); //*************************************************** sprintf ( line, "%s.readOnScaf", graphfile ); fp2 = ckopen ( line, "r" ); int *S_hits = (int *)malloc(sizeof(int)* (Scaffold_Num+1)); int *C_hits = (int *)malloc(sizeof(int)* (Singleton_Num+1)); int uniq_reads_num = 0; for(i=0;i<=Scaffold_Num;i++) S_hits[i]=0; for(i=0;i<=Singleton_Num;i++) C_hits[i]=0; while (!feof(fp2)) { fscanf(fp2, "%s ", ch_temp); if(ch_temp[0] == '>') { Nucleotide = ch_temp[1]; if(Nucleotide == 's') { type = 'S'; sscanf(ch_temp, "%9s%d",buf,&m); index = m; fgets(buf,10000000,fp2); } else if(Nucleotide == 'C') { type = 'C'; sscanf(ch_temp, "%2s%d",buf,&m); index = m; } //printf(">%c\t%d\n",type,index); } else { i=atoi(ch_temp); if(Reads[i] == 1) { uniq_reads_num++; if(type == 'S') S_hits[index]++; else if(type == 'C') C_hits[index]++; //printf("%d\t%d\t%d\t%d\n",i,Reads[i],S_hits[index],C_hits[index]); } fgets(buf,10000000,fp2); } } //last line if(type == 'S') S_hits[index]--; else if(type == 'C') C_hits[index]--; //Output fprintf(fo,"# Notice:RPKM calculation base on K-mer mapping.\n"); fprintf(fo,"# Total_unique_reads_num=%d\n",uniq_reads_num); fprintf(fo,"Transcript_ID\tLength\tUniq_reads_num\tRPKM\n"); double rpkm; for(i=0;i<=Scaffold_Num;i++) { if(S_Length[i] != 0) { rpkm= (S_hits[i]*(1e+9))/(1.0*uniq_reads_num*S_Length[i]); fprintf (fo, "scaffold%d\t%d\t%d\t%f\n", i,S_Length[i],S_hits[i],rpkm); //fprintf (fo, "scaffold%d\t%d\t%f\n", i,S_hits[i],rpkm); } } for(i=0;i<=Singleton_Num;i++) { if(C_Length[i] != 0) { rpkm= (C_hits[i]*(1e+9))/(1.0*uniq_reads_num*C_Length[i]); fprintf (fo, "C%d\t%d\t%d\t%f\n", i,C_Length[i],C_hits[i],rpkm); //fprintf (fo, "C%d\t%d\t%f\n", i,C_hits[i],rpkm); } } printf("Output assembly RPKM statistics finished! \n "); free(S_Length); free(C_Length); free(S_hits); free(C_hits); free(Reads); fclose ( fp2 ); fclose ( fo ); } void scaffolding (unsigned int len_cut, char *outfile) { unsigned int prev_ctg, ctg, bal_ctg, *length_array, count = 0, num_lctg = 0; unsigned int i, max_steps = 5; int num5, num3, j, len, flag, num_route, gap_c = 0; short gap = 0; long long sum = 0, N50, N90; FILE *fp, *fo = NULL; char name[256]; CONNECT *cnt, *prevCNT, *nextCnt; boolean excep, weak; weakCounter = 0; so_far = (unsigned int *) ckalloc (max_n_routes * sizeof (unsigned int)); found_routes = (unsigned int **) ckalloc (max_n_routes * sizeof (unsigned int *)); for (j = 0; j < max_n_routes; j++) { found_routes[j] = (unsigned int *) ckalloc (max_steps * sizeof (unsigned int)); } length_array = (unsigned int *) ckalloc ((num_ctg + 1) * sizeof (unsigned int)); //use length_array to change info in index_array for (i = 1; i <= num_ctg; i++) { length_array[i] = 0; } for (i = 1; i <= num_ctg; i++) { if (index_array[i] > 0) { length_array[index_array[i]] = i; } } for (i = 1; i <= num_ctg; i++) { index_array[i] = length_array[i]; } //contig i with original index: index_array[i] orig2new = 0; sprintf (name, "%s.scaf", outfile); fp = ckopen (name, "w"); sprintf (name, "%s.scaf_gap", outfile); fo = ckopen (name, "w"); scaf3 = (DARRAY *) createDarray (1000, sizeof (unsigned int)); scaf5 = (DARRAY *) createDarray (1000, sizeof (unsigned int)); gap3 = (DARRAY *) createDarray (1000, sizeof (int)); gap5 = (DARRAY *) createDarray (1000, sizeof (int)); for (i = 1; i <= num_ctg; i++) { contig_array[i].flag = 0; } for (i = 1; i <= num_ctg; i++) { if (contig_array[i].length + (unsigned int) overlaplen >= len_cut) { num_lctg++; } else { continue; } if (contig_array[i].flag || contig_array[i].mask || !contig_array[i].downwardConnect || !validConnect (i, NULL)) { continue; } num5 = num3 = 0; ctg = i; //printf("%d",i); *(unsigned int *) darrayPut (scaf5, num5++) = i; contig_array[i].flag = 1; bal_ctg = getTwinCtg (ctg); contig_array[bal_ctg].flag = 1; len = contig_array[i].length; prevCNT = NULL; cnt = getNextContig (ctg, prevCNT, &excep); while (cnt) { nextCnt = getNextContig (cnt->contigID, cnt, &excep); if (excep && prevCNT) { printf ("scaffolding: exception --- prev cnt from %u\n", prevCNT->contigID); } if (nextCnt && nextCnt->used) { break; } setConnectUsed (ctg, cnt->contigID, 1); *(int *) darrayPut (gap5, num5 - 1) = cnt->gapLen; ctg = cnt->contigID; *(unsigned int *) darrayPut (scaf5, num5++) = ctg; len += cnt->gapLen + contig_array[ctg].length; bal_ctg = getTwinCtg (ctg); contig_array[ctg].flag = 1; contig_array[bal_ctg].flag = 1; prevCNT = cnt; cnt = nextCnt; //printf("->%d",ctg); } //printf("\n"); ctg = getTwinCtg (i); if (num5 >= 2) { prevCNT = checkConnect (getTwinCtg (*(unsigned int *) darrayGet (scaf5, 1)), ctg); } else { prevCNT = NULL; } //printf("%d",i); //fflush(stdout); cnt = getNextContig (ctg, prevCNT, &excep); while (cnt) { nextCnt = getNextContig (cnt->contigID, cnt, &excep); if (excep && prevCNT) { printf ("scaffolding: exception -- prev cnt from %u\n", prevCNT->contigID); } if (nextCnt && nextCnt->used) { break; } setConnectUsed (ctg, cnt->contigID, 1); ctg = cnt->contigID; len += cnt->gapLen + contig_array[ctg].length; bal_ctg = getTwinCtg (ctg); contig_array[ctg].flag = 1; contig_array[bal_ctg].flag = 1; //printf("<-%d",bal_ctg); //fflush(stdout); *(int *) darrayPut (gap3, num3) = cnt->gapLen; *(unsigned int *) darrayPut (scaf3, num3++) = bal_ctg; prevCNT = cnt; cnt = nextCnt; } //printf("\n"); len += overlaplen; sum += len; length_array[count++] = len; if (num5 + num3 < 1) { printf ("no scaffold created for contig %d\n", i); continue; } fprintf (fp, ">scaffold%d %d %d\n", count, num5 + num3, len); fprintf (fo, ">scaffold%d %d %d\n", count, num5 + num3, len); len = prev_ctg = 0; for (j = num3 - 1; j >= 0; j--) { if (!isLargerThanTwin (*(unsigned int *) darrayGet (scaf3, j))) { fprintf (fp, "%-10d %-10d + %d ", index_array[*(unsigned int *) darrayGet (scaf3, j)], len, contig_array[*(unsigned int *) darrayGet (scaf3, j)].length + overlaplen); weak = printCnts (fp, *(unsigned int *) darrayGet (scaf3, j)); /* if(weak) fprintf(stderr,"scaffold%d\n",count); */ } else { fprintf (fp, "%-10d %-10d - %d ", index_array[getTwinCtg (*(unsigned int *) darrayGet (scaf3, j))], len, contig_array[*(unsigned int *) darrayGet (scaf3, j)].length + overlaplen); weak = printCnts (fp, *(unsigned int *) darrayGet (scaf3, j)); /* if(weak) fprintf(stderr,"scaffold%d\n",count); */ } if (prev_ctg) { num_route = num_trace = 0; traceAlongArc (*(unsigned int *) darrayGet (scaf3, j), prev_ctg, max_steps, gap - ins_size_var, gap + ins_size_var, 0, 0, &num_route); if (num_route == 1) { output1gap (fo, max_steps); gap_c++; } } fprintf (fo, "%-10d %-10d\n", *(unsigned int *) darrayGet (scaf3, j), len); len += contig_array[*(unsigned int *) darrayGet (scaf3, j)].length + *(int *) darrayGet (gap3, j); prev_ctg = *(unsigned int *) darrayGet (scaf3, j); gap = *(int *) darrayGet (gap3, j) > 0 ? *(int *) darrayGet (gap3, j) : 0; } for (j = 0; j < num5; j++) { if (!isLargerThanTwin (*(unsigned int *) darrayGet (scaf5, j))) { fprintf (fp, "%-10d %-10d + %d ", index_array[*(unsigned int *) darrayGet (scaf5, j)], len, contig_array[*(unsigned int *) darrayGet (scaf5, j)].length + overlaplen); weak = printCnts (fp, *(unsigned int *) darrayGet (scaf5, j)); /* if(weak) fprintf(stderr,"scaffold%d\n",count); */ } else { fprintf (fp, "%-10d %-10d - %d ", index_array[getTwinCtg (*(unsigned int *) darrayGet (scaf5, j))], len, contig_array[*(unsigned int *) darrayGet (scaf5, j)].length + overlaplen); weak = printCnts (fp, *(unsigned int *) darrayGet (scaf5, j)); /* if(weak) fprintf(stderr,"scaffold%d\n",count); */ } if (prev_ctg) { num_route = num_trace = 0; traceAlongArc (*(unsigned int *) darrayGet (scaf5, j), prev_ctg, max_steps, gap - ins_size_var, gap + ins_size_var, 0, 0, &num_route); if (num_route == 1) { output1gap (fo, max_steps); gap_c++; } } fprintf (fo, "%-10d %-10d\n", *(unsigned int *) darrayGet (scaf5, j), len); if (j < num5 - 1) { len += contig_array[*(unsigned int *) darrayGet (scaf5, j)].length + *(int *) darrayGet (gap5, j); prev_ctg = *(unsigned int *) darrayGet (scaf5, j); gap = *(int *) darrayGet (gap5, j) > 0 ? *(int *) darrayGet (gap5, j) : 0; } } } freeDarray (scaf3); freeDarray (scaf5); freeDarray (gap3); freeDarray (gap5); fclose (fp); fclose (fo); printf ("\nthe final rank"); if (count != 0) { printf ("\n%d scaffolds from %d contigs sum up %lldbp, with average length %lld, %d gaps filled\n", count, num_lctg / 2, sum, sum / count, gap_c); } //output singleton for (i = 1; i <= num_ctg; i++) { if (contig_array[i].length + (unsigned int) overlaplen < len_cut || contig_array[i].flag) { continue; } length_array[count++] = contig_array[i].length; sum += contig_array[i].length; if (isSmallerThanTwin (i)) { i++; } } // calculate N50/N90 if (count != 0) { printf ("%d scaffolds&singleton sum up %lldbp, with average length %lld\n", count, sum, sum / count); qsort (length_array, count, sizeof (length_array[0]), cmp_int); printf ("the longest is %dbp,", length_array[count - 1]); } N50 = sum * 0.5; N90 = sum * 0.9; sum = flag = 0; for (j = count - 1; j >= 0; j--) { sum += length_array[j]; if (!flag && sum >= N50) { printf ("scaffold N50 is %d bp, ", length_array[j]); flag++; } if (sum >= N90) { printf ("scaffold N90 is %d bp\n", length_array[j]); break; } } printf ("Found %d weak points in scaffolds\n", weakCounter); fflush (stdout); free ((void *) length_array); for (j = 0; j < max_n_routes; j++) { free ((void *) found_routes[j]); } free ((void *) found_routes); free ((void *) so_far); } void scaffold_count (unsigned int len_cut) { static DARRAY *scaf3, *scaf5; static DARRAY *gap3, *gap5; unsigned int prev_ctg, ctg, bal_ctg, *length_array, count = 0, num_lctg = 0; unsigned int i, max_steps = 5; int num5, num3, j, len, flag, num_route, gap_c = 0; short gap = 0; long long sum = 0, N50, N90; CONNECT *cnt, *prevCNT, *nextCnt; boolean excep; so_far = (unsigned int *) ckalloc (max_n_routes * sizeof (unsigned int)); found_routes = (unsigned int **) ckalloc (max_n_routes * sizeof (unsigned int *)); for (j = 0; j < max_n_routes; j++) { found_routes[j] = (unsigned int *) ckalloc (max_steps * sizeof (unsigned int)); } length_array = (unsigned int *) ckalloc ((num_ctg + 1) * sizeof (unsigned int)); //use length_array to change info in index_array for (i = 1; i <= num_ctg; i++) { length_array[i] = 0; } for (i = 1; i <= num_ctg; i++) { if (index_array[i] > 0) { length_array[index_array[i]] = i; } } for (i = 1; i <= num_ctg; i++) { index_array[i] = length_array[i]; } //contig i with original index: index_array[i] orig2new = 0; scaf3 = (DARRAY *) createDarray (1000, sizeof (unsigned int)); scaf5 = (DARRAY *) createDarray (1000, sizeof (unsigned int)); gap3 = (DARRAY *) createDarray (1000, sizeof (int)); gap5 = (DARRAY *) createDarray (1000, sizeof (int)); for (i = 1; i <= num_ctg; i++) { contig_array[i].flag = 0; } for (i = 1; i <= num_ctg; i++) { if (contig_array[i].length + (unsigned int) overlaplen >= len_cut) { num_lctg++; } else { continue; } if (contig_array[i].flag || contig_array[i].mask || !contig_array[i].downwardConnect || !validConnect (i, NULL)) { continue; } num5 = num3 = 0; ctg = i; //printf("%d",i); *(unsigned int *) darrayPut (scaf5, num5++) = i; contig_array[i].flag = 1; bal_ctg = getTwinCtg (ctg); contig_array[bal_ctg].flag = 1; len = contig_array[i].length; prevCNT = NULL; cnt = getNextContig (ctg, prevCNT, &excep); while (cnt) { nextCnt = getNextContig (cnt->contigID, cnt, &excep); if (excep && prevCNT) { printf ("scaffolding: exception --- prev cnt from %u\n", prevCNT->contigID); } if (nextCnt && nextCnt->used) { break; } setConnectUsed (ctg, cnt->contigID, 1); *(int *) darrayPut (gap5, num5 - 1) = cnt->gapLen; ctg = cnt->contigID; *(unsigned int *) darrayPut (scaf5, num5++) = ctg; len += cnt->gapLen + contig_array[ctg].length; bal_ctg = getTwinCtg (ctg); contig_array[ctg].flag = 1; contig_array[bal_ctg].flag = 1; prevCNT = cnt; cnt = nextCnt; //printf("->%d",ctg); } //printf("\n"); ctg = getTwinCtg (i); if (num5 >= 2) { prevCNT = checkConnect (getTwinCtg (*(unsigned int *) darrayGet (scaf5, 1)), ctg); } else { prevCNT = NULL; } //printf("%d",i); //fflush(stdout); cnt = getNextContig (ctg, prevCNT, &excep); while (cnt) { nextCnt = getNextContig (cnt->contigID, cnt, &excep); if (excep && prevCNT) { printf ("scaffolding: exception -- prev cnt from %u\n", prevCNT->contigID); } if (nextCnt && nextCnt->used) { break; } setConnectUsed (ctg, cnt->contigID, 1); ctg = cnt->contigID; len += cnt->gapLen + contig_array[ctg].length; bal_ctg = getTwinCtg (ctg); contig_array[ctg].flag = 1; contig_array[bal_ctg].flag = 1; //printf("<-%d",bal_ctg); //fflush(stdout); *(int *) darrayPut (gap3, num3) = cnt->gapLen; *(unsigned int *) darrayPut (scaf3, num3++) = bal_ctg; prevCNT = cnt; cnt = nextCnt; } //printf("\n"); len += overlaplen; sum += len; length_array[count++] = len; if (num5 + num3 < 1) { printf ("no scaffold created for contig %d\n", i); continue; } len = prev_ctg = 0; for (j = num3 - 1; j >= 0; j--) { if (prev_ctg) { num_route = num_trace = 0; traceAlongArc (*(unsigned int *) darrayGet (scaf3, j), prev_ctg, max_steps, gap - ins_size_var, gap + ins_size_var, 0, 0, &num_route); if (num_route == 1) { gap_c++; } } len += contig_array[*(unsigned int *) darrayGet (scaf3, j)].length + *(int *) darrayGet (gap3, j); prev_ctg = *(unsigned int *) darrayGet (scaf3, j); gap = *(int *) darrayGet (gap3, j) > 0 ? *(int *) darrayGet (gap3, j) : 0; } for (j = 0; j < num5; j++) { if (prev_ctg) { num_route = num_trace = 0; traceAlongArc (*(unsigned int *) darrayGet (scaf5, j), prev_ctg, max_steps, gap - ins_size_var, gap + ins_size_var, 0, 0, &num_route); if (num_route == 1) { gap_c++; } } if (j < num5 - 1) { len += contig_array[*(unsigned int *) darrayGet (scaf5, j)].length + *(int *) darrayGet (gap5, j); prev_ctg = *(unsigned int *) darrayGet (scaf5, j); gap = *(int *) darrayGet (gap5, j) > 0 ? *(int *) darrayGet (gap5, j) : 0; } } } freeDarray (scaf3); freeDarray (scaf5); freeDarray (gap3); freeDarray (gap5); if (count != 0) { printf ("\n%d scaffolds from %d contigs sum up %lldbp, with average length %lld, %d gaps filled\n", count, num_lctg / 2, sum, sum / count, gap_c); } //output singleton for (i = 1; i <= num_ctg; i++) { if (contig_array[i].length + (unsigned int) overlaplen < len_cut || contig_array[i].flag) { continue; } length_array[count++] = contig_array[i].length; sum += contig_array[i].length; if (isSmallerThanTwin (i)) { i++; } } // calculate N50/N90 if (count != 0) { printf ("%d scaffolds&singleton sum up %lldbp, with average length %lld\n", count, sum, sum / count); qsort (length_array, count, sizeof (length_array[0]), cmp_int); printf ("the longest is %dbp,", length_array[count - 1]); } N50 = sum * 0.5; N90 = sum * 0.9; sum = flag = 0; for (j = count - 1; j >= 0; j--) { sum += length_array[j]; if (!flag && sum >= N50) { printf ("scaffold N50 is %d bp, ", length_array[j]); flag++; } if (sum >= N90) { printf ("scaffold N90 is %d bp\n", length_array[j]); break; } } fflush (stdout); free ((void *) length_array); for (j = 0; j < max_n_routes; j++) { free ((void *) found_routes[j]); } free ((void *) found_routes); free ((void *) so_far); } static void outputLinks (FILE * fp, int insertS) { unsigned int i, bal_ctg, bal_toCtg; CONNECT *cnts, *temp_cnt; //printf("outputLinks, %d contigs\n",num_ctg); for (i = 1; i <= num_ctg; i++) { cnts = contig_array[i].downwardConnect; bal_ctg = getTwinCtg (i); while (cnts) { if (cnts->weight < 1) { cnts = cnts->next; continue; } fprintf (fp, "%-10d %-10d\t%d\t%d\t%d\n", i, cnts->contigID, cnts->gapLen, cnts->weight, insertS); cnts->weight = 0; bal_toCtg = getTwinCtg (cnts->contigID); temp_cnt = getCntBetween (bal_toCtg, bal_ctg); if (temp_cnt) { temp_cnt->weight = 0; } cnts = cnts->next; } } } //use pe info in ascent order void PE2Links (char *infile) { char name[256], *line; FILE *linkF; gzFile *fp; int i; int flag = 0; unsigned int j; sprintf (name, "%s.links", infile); /*linkF = gzopen(name,"r"); if(linkF){ printf("file %s exists, skip creating the links...\n",name); fclose(linkF); return; } */ linkF = ckopen (name, "w"); if (!pes) { loadPEgrads (infile); } sprintf (name, "%s.readOnContig", infile); fp = gzopen (name, "r"); lineLen = 1024; line = (char *) ckalloc (lineLen * sizeof (char)); gzgets (fp, line, lineLen); line[0] = '\0'; printf ("\n"); for (i = 0; i < gradsCounter; i++) { createCntMemManager (); createCntLookupTable (); newCntCounter = 0; flag += connectByPE_grad (fp, i, line); printf ("%lld new connections\n", newCntCounter / 2); if (!flag) { destroyConnectMem (); deleteCntLookupTable (); for (j = 1; j <= num_ctg; j++) { contig_array[j].downwardConnect = NULL; } printf ("\n"); continue; } flag = 0; outputLinks (linkF, pes[i].insertS); destroyConnectMem (); deleteCntLookupTable (); for (j = 1; j <= num_ctg; j++) { contig_array[j].downwardConnect = NULL; } } free ((void *) line); gzclose (fp); fclose (linkF); printf ("all PEs attached\n"); } static int inputLinks (FILE * fp, int insertS, char *line) { unsigned int ctg, bal_ctg, toCtg, bal_toCtg; int gap, wt, ins; unsigned int counter = 0, onScafCounter = 0; unsigned int maskCounter = 0; CONNECT * connection; if (strlen (line)) { sscanf (line, "%d %d %d %d %d", &ctg, &toCtg, &gap, &wt, &ins); if (ins != insertS) { return counter; } //if(contig_array[ctg].length>=ctg_short&&contig_array[toCtg].length>=ctg_short){ if (1) { bal_ctg = getTwinCtg (ctg); bal_toCtg = getTwinCtg (toCtg); if(bal_ctg == ctg ||bal_toCtg==toCtg ){ add1Connect (ctg, toCtg, gap, wt, 0); add1Connect (bal_toCtg, bal_ctg, gap, wt, 0); connection=getCntBetween(ctg, toCtg); connection->PECount = wt; connection=getCntBetween(bal_toCtg, bal_ctg); connection->PECount = wt; counter++; if (contig_array[ctg].mask || contig_array[toCtg].mask) { maskCounter++; } if (insertS > 1000 && contig_array[ctg].from_vt == contig_array[toCtg].from_vt && // on the same scaff contig_array[ctg].indexInScaf < contig_array[toCtg].indexInScaf) { add1LongPEcov (ctg, toCtg, wt); onScafCounter++; } } } } while (fgets (line, lineLen, fp) != NULL) { sscanf (line, "%d %d %d %d %d", &ctg, &toCtg, &gap, &wt, &ins); if (ins > insertS) //if(ins>insertS) { break; } /* if(contig_array[ctg].length 1000 && contig_array[ctg].from_vt == contig_array[toCtg].from_vt && // on the same scaff contig_array[ctg].indexInScaf < contig_array[toCtg].indexInScaf) { add1LongPEcov (ctg, toCtg, wt); onScafCounter++; } bal_ctg = getTwinCtg (ctg); bal_toCtg = getTwinCtg (toCtg); if(bal_ctg == ctg ||bal_toCtg==toCtg ) continue; add1Connect (ctg, toCtg, gap, wt, 0); add1Connect (bal_toCtg, bal_ctg, gap, wt, 0); connection=getCntBetween(ctg, toCtg); connection->PECount += wt; connection=getCntBetween(bal_toCtg, bal_ctg); connection->PECount += wt; counter++; if (contig_array[ctg].mask || contig_array[toCtg].mask) { maskCounter++; } } printf ("%d link to masked contigs, %d links on a single scaff\n", maskCounter, onScafCounter); return counter; } static void ArcToConnect() { unsigned int index; preARC* parc=NULL; unsigned int to_contig,bal_contig,bal_to_contig; unsigned int weight; CONNECT * connection; for(index=1;index<=num_ctg;index++) { parc=contig_array[index].arcs; while(parc) { weight=parc->multiplicity; to_contig=parc->to_ed; // printf("%d->%d\n",index,to_contig); bal_contig=getTwinCtg(index); bal_to_contig=getTwinCtg(to_contig); connection=add1Connect(index,to_contig, 0,weight ,0); if(connection) connection->SECount+=weight; connection=add1Connect(bal_to_contig,bal_contig,0,weight,0); if(connection) connection->SECount+=weight; parc=parc->next; } } // exit(0); } //use linkage info in ascent order void Links2Scaf (char *infile) { char name[256], *line; FILE *fp; int i, j = 1, lib_n = 0, cutoff_sum = 0; int flag = 0, flag2; boolean downS, nonLinear = 0, smallPE = 0, isPrevSmall = 0, markSmall; if (!pes) { loadPEgrads (infile); } sprintf (name, "%s.links", infile); fp = ckopen (name, "r"); createCntMemManager (); createCntLookupTable (); lineLen = 1024; line = (char *) ckalloc (lineLen * sizeof (char)); fgets (line, lineLen, fp); line[0] = '\0'; solidArray = (DARRAY *) createDarray (1000, sizeof (unsigned int)); tempArray = (DARRAY *) createDarray (1000, sizeof (unsigned int)); scaf3 = (DARRAY *) createDarray (1000, sizeof (unsigned int)); scaf5 = (DARRAY *) createDarray (1000, sizeof (unsigned int)); gap3 = (DARRAY *) createDarray (1000, sizeof (int)); gap5 = (DARRAY *) createDarray (1000, sizeof (int)); weakPE = 3; //0531 printf ("\n"); for (i = 0; i < gradsCounter; i++) { if (pes[i].insertS < 1000) { isPrevSmall = 1; } else if (pes[i].insertS > 1000 && isPrevSmall) { // smallScaf (); isPrevSmall = 0; } flag2 = inputLinks (fp, pes[i].insertS, line); printf ("Insert size %d: %d links input\n", pes[i].insertS, flag2); if (flag2) { lib_n++; cutoff_sum += pes[i].pair_num_cut; } flag += flag2; if (!flag) { printf ("\n"); continue; } if (i == gradsCounter - 1 || pes[i + 1].rank != pes[i].rank) { flag = nonLinear = downS = markSmall = 0; if (pes[i].insertS > 1000 && pes[i].rank > 1) { downS = 1; } if (pes[i].insertS <= 1000) { smallPE = 1; } if (pes[i].insertS >= 1000) { ins_size_var = 50; OverlapPercent = 0.05; } else if (pes[i].insertS >= 300) { ins_size_var = 30; OverlapPercent = 0.05; } else { ins_size_var = 20; OverlapPercent = 0.05; } if (pes[i].insertS > 1000) { weakPE = 5; } if (lib_n > 0) { weakPE = weakPE < cutoff_sum / lib_n ? cutoff_sum / lib_n : weakPE; lib_n = cutoff_sum = 0; } printf ("Cutoff for number of pairs to make a reliable connection: %d\n", weakPE); if (i == gradsCounter - 1) { nonLinear = 1; } /* if (i == gradsCounter - 1) { ArcToConnect(); }*/ } } freeDarray (tempArray); freeDarray (solidArray); freeDarray (scaf3); freeDarray (scaf5); freeDarray (gap3); freeDarray (gap5); free ((void *) line); fclose (fp); printf ("all links loaded\n"); } /* below for picking up a subgraph (with at most one node has upstream connections to the rest and at most one downstream connections) in general */ // static int nodeCounter static boolean putNodeInArray (unsigned int node, int maxNodes, int dis) { if (contig_array[node].inSubGraph) { return 1; } int index = nodeCounter; if (index > maxNodes) { return 0; } if (contig_array[getTwinCtg (node)].inSubGraph) { return 0; } ctg4heapArray[index].ctgID = node; ctg4heapArray[index].dis = dis; contig_array[node].inSubGraph = 1; ctg4heapArray[index].ds_shut4dheap = 0; ctg4heapArray[index].us_shut4dheap = 0; ctg4heapArray[index].ds_shut4uheap = 0; ctg4heapArray[index].us_shut4uheap = 0; return 1; } static void setInGraph (boolean flag) { int i; int node; nodeCounter = nodeCounter > MaxNodeInSub ? MaxNodeInSub : nodeCounter; for (i = 1; i <= nodeCounter; i++) { node = ctg4heapArray[i].ctgID; if (node > 0) { contig_array[node].inSubGraph = flag; } } } static boolean dispatch1node (int dis, unsigned int tempNode, int maxNodes, FibHeap * dheap, FibHeap * uheap, int *DmaxDis, int *UmaxDis) { boolean ret; if (dis >= 0) // put it to Dheap { nodeCounter++; ret = putNodeInArray (tempNode, maxNodes, dis); if (!ret) { return 0; } insertNodeIntoHeap (dheap, dis, nodeCounter); if (dis > *DmaxDis) { *DmaxDis = dis; } return 1; } else // put it to Uheap { nodeCounter++; ret = putNodeInArray (tempNode, maxNodes, dis); if (!ret) { return 0; } insertNodeIntoHeap (uheap, -dis, nodeCounter); int temp_len = contig_array[tempNode].length; if (-dis + temp_len > *UmaxDis) { *UmaxDis = -dis + contig_array[tempNode].length; } return -1; } return 0; } static boolean canDheapWait (unsigned int currNode, int dis, int DmaxDis) { if (dis < DmaxDis) { return 0; } else { return 1; } } static boolean workOnDheap (FibHeap * dheap, FibHeap * uheap, boolean * Dwait, boolean * Uwait, int *DmaxDis, int *UmaxDis, int maxNodes) { if (*Dwait) { return 1; } unsigned int currNode, twin, tempNode; CTGinHEAP *ctgInHeap; int indexInArray; CONNECT *us_cnt, *ds_cnt; int dis0, dis; boolean ret, isEmpty; while ((indexInArray = removeNextNodeFromHeap (dheap)) != 0) { ctgInHeap = &ctg4heapArray[indexInArray]; currNode = ctgInHeap->ctgID; dis0 = ctgInHeap->dis; isEmpty = IsHeapEmpty (dheap); twin = getTwinCtg (currNode); us_cnt = ctgInHeap->us_shut4dheap ? NULL : contig_array[twin].downwardConnect; while (us_cnt) { if (us_cnt->deleted || us_cnt->mask || contig_array[getTwinCtg (us_cnt->contigID)].inSubGraph) { us_cnt = us_cnt->next; continue; } tempNode = getTwinCtg (us_cnt->contigID); if (contig_array[tempNode].inSubGraph) { us_cnt = us_cnt->next; continue; } dis = dis0 - us_cnt->gapLen - (int) contig_array[twin].length; ret = dispatch1node (dis, tempNode, maxNodes, dheap, uheap, DmaxDis, UmaxDis); if (ret == 0) { return 0; } else if (ret < 0) { *Uwait = 0; } us_cnt = us_cnt->next; } if (nodeCounter > 1 && isEmpty) { *Dwait = canDheapWait (currNode, dis0, *DmaxDis); if (*Dwait) { isEmpty = IsHeapEmpty (dheap); insertNodeIntoHeap (dheap, dis0, indexInArray); ctg4heapArray[indexInArray].us_shut4dheap = 1; if (isEmpty) { return 1; } else { continue; } } } ds_cnt = ctgInHeap->ds_shut4dheap ? NULL : contig_array[currNode].downwardConnect; while (ds_cnt) { if (ds_cnt->deleted || ds_cnt->mask || contig_array[ds_cnt->contigID].inSubGraph) { ds_cnt = ds_cnt->next; continue; } tempNode = ds_cnt->contigID; dis = dis0 + ds_cnt->gapLen + (int) contig_array[tempNode].length; ret = dispatch1node (dis, tempNode, maxNodes, dheap, uheap, DmaxDis, UmaxDis); if (ret == 0) { return 0; } else if (ret < 0) { *Uwait = 0; } } // for each downstream connections } // for each node comes off the heap *Dwait = 1; return 1; } static boolean canUheapWait (unsigned int currNode, int dis, int UmaxDis) { int temp_len = contig_array[currNode].length; if (-dis + temp_len < UmaxDis) { return 0; } else { return 1; } } static boolean workOnUheap (FibHeap * dheap, FibHeap * uheap, boolean * Dwait, boolean * Uwait, int *DmaxDis, int *UmaxDis, int maxNodes) { if (*Uwait) { return 1; } unsigned int currNode, twin, tempNode; CTGinHEAP *ctgInHeap; int indexInArray; CONNECT *us_cnt, *ds_cnt; int dis0, dis; boolean ret, isEmpty; while ((indexInArray = removeNextNodeFromHeap (uheap)) != 0) { ctgInHeap = &ctg4heapArray[indexInArray]; currNode = ctgInHeap->ctgID; dis0 = ctgInHeap->dis; isEmpty = IsHeapEmpty (uheap); ds_cnt = ctgInHeap->ds_shut4uheap ? NULL : contig_array[currNode].downwardConnect; while (ds_cnt) { if (ds_cnt->deleted || ds_cnt->mask || contig_array[ds_cnt->contigID].inSubGraph) { ds_cnt = ds_cnt->next; continue; } tempNode = ds_cnt->contigID; dis = dis0 + ds_cnt->gapLen + contig_array[tempNode].length; ret = dispatch1node (dis, tempNode, maxNodes, dheap, uheap, DmaxDis, UmaxDis); if (ret == 0) { return 0; } else if (ret > 0) { *Dwait = 0; } } // for each downstream connections if (nodeCounter > 1 && isEmpty) { *Uwait = canUheapWait (currNode, dis0, *UmaxDis); if (*Uwait) { isEmpty = IsHeapEmpty (uheap); insertNodeIntoHeap (uheap, dis0, indexInArray); ctg4heapArray[indexInArray].ds_shut4uheap = 1; if (isEmpty) { return 1; } else { continue; } } } twin = getTwinCtg (currNode); us_cnt = ctgInHeap->us_shut4uheap ? NULL : contig_array[twin].downwardConnect; while (us_cnt) { if (us_cnt->deleted || us_cnt->mask || contig_array[getTwinCtg (us_cnt->contigID)].inSubGraph) { us_cnt = us_cnt->next; continue; } tempNode = getTwinCtg (us_cnt->contigID); if (contig_array[tempNode].inSubGraph) { us_cnt = us_cnt->next; continue; } dis = dis0 - us_cnt->gapLen - contig_array[twin].length; ret = dispatch1node (dis, tempNode, maxNodes, dheap, uheap, DmaxDis, UmaxDis); if (ret == 0) { return 0; } else if (ret > 0) { *Dwait = 1; } us_cnt = us_cnt->next; } } // for each node comes off the heap *Uwait = 1; return 1; } static boolean pickUpGeneralSubgraph (unsigned int node1, int maxNodes) { FibHeap *Uheap = newFibHeap (); // heap for upstream contigs to node1 FibHeap *Dheap = newFibHeap (); int UmaxDis; // max distance upstream to node1 int DmaxDis; boolean Uwait; // wait signal for Uheap boolean Dwait; int dis; boolean ret; //initiate: node1 is put to array once, and to both Dheap and Uheap dis = 0; nodeCounter = 1; putNodeInArray (node1, maxNodes, dis); insertNodeIntoHeap (Dheap, dis, nodeCounter); ctg4heapArray[nodeCounter].us_shut4dheap = 1; Dwait = 0; DmaxDis = 0; insertNodeIntoHeap (Uheap, dis, nodeCounter); ctg4heapArray[nodeCounter].ds_shut4uheap = 1; Uwait = 1; UmaxDis = contig_array[node1].length; while (1) { ret = workOnDheap (Dheap, Uheap, &Dwait, &Uwait, &DmaxDis, &UmaxDis, maxNodes); if (!ret) { setInGraph (0); destroyHeap (Dheap); destroyHeap (Uheap); return 0; } ret = workOnUheap (Dheap, Uheap, &Dwait, &Uwait, &DmaxDis, &UmaxDis, maxNodes); if (!ret) { setInGraph (0); destroyHeap (Dheap); destroyHeap (Uheap); return 0; } if (Uwait && Dwait) { destroyHeap (Dheap); destroyHeap (Uheap); return 1; } } } static int cmp_ctg (const void *a, const void *b) { CTGinHEAP *A, *B; A = (CTGinHEAP *) a; B = (CTGinHEAP *) b; if (A->dis > B->dis) { return 1; } else if (A->dis == B->dis) { return 0; } else { return -1; } } static boolean checkEligible () { unsigned int firstNode = ctg4heapArray[1].ctgID; unsigned int twin; int i; boolean flag = 0; //check if the first node has incoming link from twin of any node in subgraph // or it has multi outgoing links bound to incoming links twin = getTwinCtg (firstNode); CONNECT *ite_cnt = contig_array[twin].downwardConnect; while (ite_cnt) { if (ite_cnt->deleted || ite_cnt->mask) { ite_cnt = ite_cnt->next; continue; } if (contig_array[ite_cnt->contigID].inSubGraph) { /* if(firstNode==3693) printf("eligible link %d -> %d\n",twin,ite_cnt->contigID); */ return 0; } if (ite_cnt->prevInScaf) { if (flag) { return 0; } flag = 1; } ite_cnt = ite_cnt->next; } //check if the last node has outgoing link to twin of any node in subgraph // or it has multi outgoing links bound to incoming links unsigned int lastNode = ctg4heapArray[nodeCounter].ctgID; ite_cnt = contig_array[lastNode].downwardConnect; flag = 0; while (ite_cnt) { if (ite_cnt->deleted || ite_cnt->mask) { ite_cnt = ite_cnt->next; continue; } twin = getTwinCtg (ite_cnt->contigID); if (contig_array[twin].inSubGraph) { /* if(firstNode==3693) printf("eligible link %d -> %d\n",lastNode,ite_cnt->contigID); */ return 0; } if (ite_cnt->prevInScaf) { if (flag) { return 0; } flag = 1; } ite_cnt = ite_cnt->next; } //check if any node has outgoing link to node outside the subgraph for (i = 1; i < nodeCounter; i++) { ite_cnt = contig_array[ctg4heapArray[i].ctgID].downwardConnect; while (ite_cnt) { if (ite_cnt->deleted || ite_cnt->mask) { ite_cnt = ite_cnt->next; continue; } if (!contig_array[ite_cnt->contigID].inSubGraph) { /* printf("eligible check: ctg %d links to ctg %d\n", ctg4heapArray[i].ctgID,ite_cnt->contigID); */ return 0; } ite_cnt = ite_cnt->next; } } //check if any node has incoming link from node outside the subgraph for (i = 2; i <= nodeCounter; i++) { twin = getTwinCtg (ctg4heapArray[i].ctgID); ite_cnt = contig_array[twin].downwardConnect; while (ite_cnt) { if (ite_cnt->deleted || ite_cnt->mask) { ite_cnt = ite_cnt->next; continue; } if (!contig_array[getTwinCtg (ite_cnt->contigID)].inSubGraph) { /* printf("eligible check: ctg %d links to ctg %d\n", ctg4heapArray[i].ctgID,ite_cnt->contigID); */ return 0; } ite_cnt = ite_cnt->next; } } return 1; } //put nodes in sub-graph in a line static void arrangeNodes_general () { int i, gap; CONNECT *ite_cnt, *temp_cnt, *bal_cnt, *prev_cnt, *next_cnt; unsigned int node1, node2; unsigned int bal_nd1, bal_nd2; //delete original connections for (i = 1; i <= nodeCounter; i++) { node1 = ctg4heapArray[i].ctgID; ite_cnt = contig_array[node1].downwardConnect; while (ite_cnt) { if (ite_cnt->mask || ite_cnt->deleted || !contig_array[ite_cnt->contigID].inSubGraph) { ite_cnt = ite_cnt->next; continue; } ite_cnt->deleted = 1; setNextInScaf (ite_cnt, NULL); setPrevInScaf (ite_cnt, 0); ite_cnt = ite_cnt->next; } bal_nd1 = getTwinCtg (node1); ite_cnt = contig_array[bal_nd1].downwardConnect; while (ite_cnt) { if (ite_cnt->mask || ite_cnt->deleted || !contig_array[getTwinCtg (ite_cnt->contigID)].inSubGraph) { ite_cnt = ite_cnt->next; continue; } ite_cnt->deleted = 1; setNextInScaf (ite_cnt, NULL); setPrevInScaf (ite_cnt, 0); ite_cnt = ite_cnt->next; } } //create new connections prev_cnt = next_cnt = NULL; for (i = 1; i < nodeCounter; i++) { node1 = ctg4heapArray[i].ctgID; node2 = ctg4heapArray[i + 1].ctgID; bal_nd1 = getTwinCtg (node1); bal_nd2 = getTwinCtg (node2); gap = ctg4heapArray[i + 1].dis - ctg4heapArray[i].dis - contig_array[node2].length; temp_cnt = getCntBetween (node1, node2); if (temp_cnt) { temp_cnt->deleted = 0; temp_cnt->mask = 0; //temp_cnt->gapLen = gap; bal_cnt = getCntBetween (bal_nd2, bal_nd1); bal_cnt->deleted = 0; bal_cnt->mask = 0; //bal_cnt->gapLen = gap; } else { temp_cnt = allocateCN (node2, gap); if (cntLookupTable) { putCnt2LookupTable (node1, temp_cnt); } temp_cnt->next = contig_array[node1].downwardConnect; contig_array[node1].downwardConnect = temp_cnt; bal_cnt = allocateCN (bal_nd1, gap); if (cntLookupTable) { putCnt2LookupTable (bal_nd2, bal_cnt); } bal_cnt->next = contig_array[bal_nd2].downwardConnect; contig_array[bal_nd2].downwardConnect = bal_cnt; } if (prev_cnt) { setNextInScaf (prev_cnt, temp_cnt); setPrevInScaf (temp_cnt, 1); } if (next_cnt) { setNextInScaf (bal_cnt, next_cnt); setPrevInScaf (next_cnt, 1); } prev_cnt = temp_cnt; next_cnt = bal_cnt; } //re-binding connection at both ends bal_nd2 = getTwinCtg (ctg4heapArray[1].ctgID); ite_cnt = contig_array[bal_nd2].downwardConnect; while (ite_cnt) { if (ite_cnt->deleted || ite_cnt->mask) { ite_cnt = ite_cnt->next; continue; } if (ite_cnt->prevInScaf) { break; } ite_cnt = ite_cnt->next; } if (ite_cnt) { bal_nd1 = ite_cnt->contigID; node1 = getTwinCtg (bal_nd1); node2 = ctg4heapArray[1].ctgID; temp_cnt = checkConnect (node1, node2); bal_cnt = ite_cnt; next_cnt = checkConnect (ctg4heapArray[1].ctgID, ctg4heapArray[2].ctgID); prev_cnt = checkConnect (getTwinCtg (ctg4heapArray[2].ctgID), getTwinCtg (ctg4heapArray[1].ctgID)); if (temp_cnt) { setNextInScaf (temp_cnt, next_cnt); setPrevInScaf (temp_cnt->nextInScaf, 0); setPrevInScaf (next_cnt, 1); setNextInScaf (prev_cnt, bal_cnt); } } node1 = ctg4heapArray[nodeCounter].ctgID; ite_cnt = contig_array[node1].downwardConnect; while (ite_cnt) { if (ite_cnt->deleted || ite_cnt->mask) { ite_cnt = ite_cnt->next; continue; } if (ite_cnt->prevInScaf) { break; } ite_cnt = ite_cnt->next; } if (ite_cnt) { node2 = ite_cnt->contigID; bal_nd1 = getTwinCtg (node1); bal_nd2 = getTwinCtg (node2); temp_cnt = ite_cnt; bal_cnt = checkConnect (bal_nd2, bal_nd1); next_cnt = checkConnect (getTwinCtg (ctg4heapArray[nodeCounter].ctgID), getTwinCtg (ctg4heapArray[nodeCounter - 1].ctgID)); prev_cnt = checkConnect (ctg4heapArray[nodeCounter - 1].ctgID, ctg4heapArray[nodeCounter].ctgID); setNextInScaf (prev_cnt, temp_cnt); setNextInScaf (bal_cnt, next_cnt); setPrevInScaf (next_cnt, 1); } } //check if contigs next to each other have reasonable overlap boolean checkOverlapInBetween_general (double tolerance) { int i, gap; unsigned int node; int lenSum, lenOlp; lenSum = lenOlp = 0; for (i = 1; i <= nodeCounter; i++) { node = ctg4heapArray[i].ctgID; lenSum += contig_array[node].length; } if (lenSum < 1) { return 1; } for (i = 1; i < nodeCounter; i++) { gap = ctg4heapArray[i + 1].dis - ctg4heapArray[i].dis - contig_array[ctg4heapArray[i + 1].ctgID].length; if (-gap > 0) { lenOlp += -gap; } //if(-gap>ins_size_var) if ((double) lenOlp / lenSum > tolerance) { return 0; } } return 1; } //check if there's any connect indicates the opposite order between nodes in sub-graph static boolean checkConflictCnt_general (double tolerance) { int i, j; int supportCounter = 0; int objectCounter = 0; CONNECT *cnt; for (i = 1; i < nodeCounter; i++) { for (j = i + 1; j <= nodeCounter; j++) { //cnt=getCntBetween(nodesInSubInOrder[j],nodesInSubInOrder[i]); cnt = checkConnect (ctg4heapArray[i].ctgID, ctg4heapArray[j].ctgID); if (cnt) { supportCounter += cnt->weight; } cnt = checkConnect (ctg4heapArray[j].ctgID, ctg4heapArray[i].ctgID); if (cnt) { objectCounter += cnt->weight; } //return 1; } } if (supportCounter < 1) { return 1; } if ((double) objectCounter / supportCounter < tolerance) { return 0; } return 1; } // turn sub-graph to linear struct static void general_linearization (boolean strict) { unsigned int i; int subCounter = 0; int out_num; boolean flag; int conflCounter = 0, overlapCounter = 0, eligibleCounter = 0; double overlapTolerance, conflTolerance; for (i = num_ctg; i > 0; i--) { if (contig_array[i].mask) { continue; } out_num = validConnect (i, NULL); if (out_num < 2) { continue; } //flag = pickSubGraph(i,strict); flag = pickUpGeneralSubgraph (i, MaxNodeInSub); if (!flag) { continue; } subCounter++; qsort (&ctg4heapArray[1], nodeCounter, sizeof (CTGinHEAP), cmp_ctg); flag = checkEligible (); if (!flag) { eligibleCounter++; setInGraph (0); continue; } if (strict) { overlapTolerance = OverlapPercent; conflTolerance = ConflPercent; } else { overlapTolerance = 2 * OverlapPercent; conflTolerance = 2 * ConflPercent; } flag = checkOverlapInBetween_general (overlapTolerance); if (!flag) { overlapCounter++; setInGraph (0); continue; } flag = checkConflictCnt_general (conflTolerance); if (flag) { conflCounter++; setInGraph (0); continue; } arrangeNodes_general (); setInGraph (0); } printf ("Picked %d subgraphs,%d have conflicting connections,%d have significant overlapping, %d eligible\n", subCounter, conflCounter, overlapCounter, eligibleCounter); } /**** the fowllowing codes for detecting and break down scaffold at weak point **********/ // mark connections in scaffolds made by small pe static void smallScaf () { unsigned int i, ctg, bal_ctg, prevCtg; int counter = 0; CONNECT *bindCnt, *cnt; for (i = 1; i <= num_ctg; i++) { contig_array[i].flag = 0; } for (i = 1; i <= num_ctg; i++) { if (contig_array[i].flag || contig_array[i].mask || !contig_array[i].downwardConnect) { continue; } bindCnt = getBindCnt (i); if (!bindCnt) { continue; } counter++; contig_array[i].flag = 1; contig_array[getTwinCtg (i)].flag = 1; prevCtg = getTwinCtg (i); while (bindCnt) { ctg = bindCnt->contigID; bal_ctg = getTwinCtg (ctg); bindCnt->bySmall = 1; cnt = getCntBetween (bal_ctg, prevCtg); if (cnt) { cnt->bySmall = 1; } contig_array[ctg].flag = 1; contig_array[bal_ctg].flag = 1; prevCtg = bal_ctg; bindCnt = bindCnt->nextInScaf; } ctg = getTwinCtg (i); bindCnt = getBindCnt (ctg); prevCtg = i; while (bindCnt) { ctg = bindCnt->contigID; bal_ctg = getTwinCtg (ctg); bindCnt->bySmall = 1; cnt = getCntBetween (bal_ctg, prevCtg); if (cnt) { cnt->bySmall = 1; } contig_array[ctg].flag = 1; contig_array[bal_ctg].flag = 1; prevCtg = bal_ctg; bindCnt = bindCnt->nextInScaf; } } printf ("Report from smallScaf: %d scaffolds by smallPE\n", counter); } static boolean putItem2Sarray (unsigned int scaf, int wt, DARRAY * SCAF, DARRAY * WT, int counter) { int i; unsigned int *scafP, *wtP; for (i = 0; i < counter; i++) { scafP = (unsigned int *) darrayGet (SCAF, i); if ((*scafP) == scaf) { wtP = (unsigned int *) darrayGet (WT, i); *wtP = (*wtP + wt); return 0; } } scafP = (unsigned int *) darrayPut (SCAF, counter); wtP = (unsigned int *) darrayPut (WT, counter); *scafP = scaf; *wtP = wt; return 1; } static int getDSLink2Scaf (STACK * scafStack, DARRAY * SCAF, DARRAY * WT) { CONNECT *ite_cnt; unsigned int ctg, targetCtg, *pt; int counter = 0; boolean inc; stackRecover (scafStack); while ((pt = (unsigned int *) stackPop (scafStack)) != NULL) { ctg = *pt; if (contig_array[ctg].mask || !contig_array[ctg].downwardConnect) { continue; } ite_cnt = contig_array[ctg].downwardConnect; while (ite_cnt) { if (ite_cnt->deleted || ite_cnt->mask || ite_cnt->singleInScaf || ite_cnt->nextInScaf || ite_cnt->prevInScaf || ite_cnt->inherit) { ite_cnt = ite_cnt->next; continue; } targetCtg = ite_cnt->contigID; if (contig_array[ctg].from_vt == contig_array[targetCtg].from_vt) // on the same scaff { ite_cnt = ite_cnt->next; continue; } inc = putItem2Sarray (contig_array[targetCtg].from_vt, ite_cnt->weight, SCAF, WT, counter); if (inc) { counter++; } ite_cnt = ite_cnt->next; } } return counter; } static int getScaffold (unsigned int start, STACK * scafStack) { int len = contig_array[start].length; unsigned int *pt, ctg; emptyStack (scafStack); pt = (unsigned int *) stackPush (scafStack); *pt = start; CONNECT *bindCnt = getBindCnt (start); while (bindCnt) { ctg = bindCnt->contigID; pt = (unsigned int *) stackPush (scafStack); *pt = ctg; len += contig_array[ctg].length; bindCnt = bindCnt->nextInScaf; } stackBackup (scafStack); return len; } static boolean isLinkReliable (DARRAY * WT, int count) { int i; for (i = 0; i < count; i++) if (*(int *) darrayGet (WT, i) >= weakPE) { return 1; } return 0; } static int getWtFromSarray (DARRAY * SCAF, DARRAY * WT, int count, unsigned int scaf) { int i; for (i = 0; i < count; i++) if (*(unsigned int *) darrayGet (SCAF, i) == scaf) { return *(int *) darrayGet (WT, i); } return 0; } static void switch2twin (STACK * scafStack) { unsigned int *pt; stackRecover (scafStack); while ((pt = (unsigned int *) stackPop (scafStack)) != NULL) { *pt = getTwinCtg (*pt); } } /* ------> scaf1 --- --- -- -- --- scaf2 -- --- --- -- ----> */ static boolean checkScafConsist (STACK * scafStack1, STACK * scafStack2) { DARRAY *downwardTo1 = (DARRAY *) createDarray (1000, sizeof (unsigned int)); // scaf links to those scaffolds DARRAY *downwardTo2 = (DARRAY *) createDarray (1000, sizeof (unsigned int)); DARRAY *downwardWt1 = (DARRAY *) createDarray (1000, sizeof (unsigned int)); // scaf links to scaffolds with those wt DARRAY *downwardWt2 = (DARRAY *) createDarray (1000, sizeof (unsigned int)); int linkCount1 = getDSLink2Scaf (scafStack1, downwardTo1, downwardWt1); int linkCount2 = getDSLink2Scaf (scafStack2, downwardTo2, downwardWt2); if (!linkCount1 || !linkCount2) { freeDarray (downwardTo1); freeDarray (downwardTo2); freeDarray (downwardWt1); freeDarray (downwardWt2); return 1; } boolean flag1 = isLinkReliable (downwardWt1, linkCount1); boolean flag2 = isLinkReliable (downwardWt2, linkCount2); if (!flag1 || !flag2) { freeDarray (downwardTo1); freeDarray (downwardTo2); freeDarray (downwardWt1); freeDarray (downwardWt2); return 1; } unsigned int scaf; int i, wt1, wt2, ret = 1; for (i = 0; i < linkCount1; i++) { wt1 = *(int *) darrayGet (downwardWt1, i); if (wt1 < weakPE) { continue; } scaf = *(unsigned int *) darrayGet (downwardTo1, i); wt2 = getWtFromSarray (downwardTo2, downwardWt2, linkCount2, scaf); if (wt2 < 1) { //fprintf(stderr,"Inconsistant link to %d\n",scaf); ret = 0; break; } } freeDarray (downwardTo1); freeDarray (downwardTo2); freeDarray (downwardWt1); freeDarray (downwardWt2); return ret; } static void setBreakPoints (DARRAY * ctgArray, int count, int weakest, int *start, int *finish) { int index = weakest - 1; unsigned int thisCtg; unsigned int nextCtg = *(unsigned int *) darrayGet (ctgArray, weakest); CONNECT *cnt; *start = weakest; while (index >= 0) { thisCtg = *(unsigned int *) darrayGet (ctgArray, index); cnt = getCntBetween (thisCtg, nextCtg); if (cnt->maxGap > 2) { break; } else { *start = index; } nextCtg = thisCtg; index--; } unsigned int prevCtg = *(unsigned int *) darrayGet (ctgArray, weakest + 1); *finish = weakest + 1; index = weakest + 2; while (index < count) { thisCtg = *(unsigned int *) darrayGet (ctgArray, index); cnt = getCntBetween (prevCtg, thisCtg); if (cnt->maxGap > 2) { break; } else { *finish = index; } prevCtg = thisCtg; index++; } } static void changeScafEnd (STACK * scafStack, unsigned int end) { unsigned int ctg, *pt; unsigned int start = getTwinCtg (end); stackRecover (scafStack); while ((pt = (unsigned int *) stackPop (scafStack)) != NULL) { ctg = *pt; contig_array[ctg].to_vt = end; contig_array[getTwinCtg (ctg)].from_vt = start; } } static void changeScafBegin (STACK * scafStack, unsigned int start) { unsigned int ctg, *pt; unsigned int end = getTwinCtg (start); stackRecover (scafStack); while ((pt = (unsigned int *) stackPop (scafStack)) != NULL) { ctg = *pt; contig_array[ctg].from_vt = start; contig_array[getTwinCtg (ctg)].to_vt = end; } } // break down scaffolds poorly supported by longer PE static void detectBreakScaf () { unsigned int i, avgPE, scafLen, len, ctg, bal_ctg, prevCtg, thisCtg; long long peCounter, linkCounter; int num3, num5, weakPoint, tempCounter, j, t, counter = 0; CONNECT *bindCnt, *cnt, *weakCnt; STACK *scafStack1 = (STACK *) createStack (1000, sizeof (unsigned int)); STACK *scafStack2 = (STACK *) createStack (1000, sizeof (unsigned int)); for (i = 1; i <= num_ctg; i++) { contig_array[i].flag = 0; } for (i = 1; i <= num_ctg; i++) { if (contig_array[i].flag || contig_array[i].mask || !contig_array[i].downwardConnect) { continue; } bindCnt = getBindCnt (i); if (!bindCnt) { continue; } //first scan get the average coverage by longer pe num5 = num3 = peCounter = linkCounter = 0; scafLen = contig_array[i].length; ctg = i; *(unsigned int *) darrayPut (scaf5, num5++) = i; contig_array[i].flag = 1; contig_array[getTwinCtg (i)].flag = 1; while (bindCnt) { if (!bindCnt->bySmall) { break; } linkCounter++; peCounter += bindCnt->maxGap; ctg = bindCnt->contigID; scafLen += contig_array[ctg].length; *(unsigned int *) darrayPut (scaf5, num5++) = ctg; bal_ctg = getTwinCtg (ctg); contig_array[ctg].flag = 1; contig_array[bal_ctg].flag = 1; bindCnt = bindCnt->nextInScaf; } ctg = getTwinCtg (i); bindCnt = getBindCnt (ctg); while (bindCnt) { if (!bindCnt->bySmall) { break; } linkCounter++; peCounter += bindCnt->maxGap; ctg = bindCnt->contigID; scafLen += contig_array[ctg].length; bal_ctg = getTwinCtg (ctg); contig_array[ctg].flag = 1; contig_array[bal_ctg].flag = 1; *(unsigned int *) darrayPut (scaf3, num3++) = bal_ctg; bindCnt = bindCnt->nextInScaf; } if (linkCounter < 1 || scafLen < 5000) { continue; } avgPE = peCounter / linkCounter; if (avgPE < 10) { continue; } tempCounter = 0; for (j = num3 - 1; j >= 0; j--) *(unsigned int *) darrayPut (tempArray, tempCounter++) = *(unsigned int *) darrayGet (scaf3, j); for (j = 0; j < num5; j++) *(unsigned int *) darrayPut (tempArray, tempCounter++) = *(unsigned int *) darrayGet (scaf5, j); prevCtg = *(unsigned int *) darrayGet (tempArray, 0); weakCnt = NULL; weakPoint = 0; len = contig_array[prevCtg].length; for (t = 1; t < tempCounter; t++) { thisCtg = *(unsigned int *) darrayGet (tempArray, t); if (len < 2000) { len += contig_array[thisCtg].length; prevCtg = thisCtg; continue; } else if (len > scafLen - 2000) { break; } len += contig_array[thisCtg].length; if (contig_array[prevCtg].from_vt != contig_array[thisCtg].from_vt || contig_array[prevCtg].indexInScaf > contig_array[thisCtg].indexInScaf) { prevCtg = thisCtg; continue; } cnt = getCntBetween (prevCtg, thisCtg); if (!weakCnt || weakCnt->maxGap > cnt->maxGap) { weakCnt = cnt; weakPoint = t; } prevCtg = thisCtg; } if (!weakCnt || (weakCnt->maxGap > 2 && weakCnt->maxGap > avgPE / 5)) { continue; } prevCtg = *(unsigned int *) darrayGet (tempArray, weakPoint - 1); thisCtg = *(unsigned int *) darrayGet (tempArray, weakPoint); if (contig_array[prevCtg].from_vt != contig_array[thisCtg].from_vt || contig_array[prevCtg].indexInScaf > contig_array[thisCtg].indexInScaf) { printf ("contig %d and %d not on the same scaff\n", prevCtg, thisCtg); continue; } setConnectWP (prevCtg, thisCtg, 1); /* fprintf(stderr,"scaffold len %d, avg long pe cov %d (%ld/%ld)\n", scafLen,avgPE,peCounter,linkCounter); fprintf(stderr,"Weak connect (%d) between %d(%dth of %d) and %d\n" ,weakCnt->maxGap,prevCtg,weakPoint-1,tempCounter,thisCtg); */ // set start and end to break down the scaffold int index1, index2; setBreakPoints (tempArray, tempCounter, weakPoint - 1, &index1, &index2); //fprintf(stderr,"break %d ->...-> %d\n",index1,index2); unsigned int start = *(unsigned int *) darrayGet (tempArray, index1); unsigned int finish = *(unsigned int *) darrayGet (tempArray, index2); int len1 = getScaffold (getTwinCtg (start), scafStack1); int len2 = getScaffold (finish, scafStack2); if (len1 < 2000 || len2 < 2000) { continue; } switch2twin (scafStack1); int flag1 = checkScafConsist (scafStack1, scafStack2); switch2twin (scafStack1); switch2twin (scafStack2); int flag2 = checkScafConsist (scafStack2, scafStack1); if (!flag1 || !flag2) { changeScafBegin (scafStack1, getTwinCtg (start)); changeScafEnd (scafStack2, getTwinCtg (finish)); //unbind links unsigned int nextCtg = *(unsigned int *) darrayGet (tempArray, index1 + 1); thisCtg = *(unsigned int *) darrayGet (tempArray, index1); cnt = getCntBetween (getTwinCtg (nextCtg), getTwinCtg (thisCtg)); if (cnt->nextInScaf) { prevCtg = getTwinCtg (cnt->nextInScaf->contigID); cnt->nextInScaf->prevInScaf = 0; cnt = getCntBetween (prevCtg, thisCtg); cnt->nextInScaf = NULL; } prevCtg = *(unsigned int *) darrayGet (tempArray, index2 - 1); thisCtg = *(unsigned int *) darrayGet (tempArray, index2); cnt = getCntBetween (prevCtg, thisCtg); if (cnt->nextInScaf) { nextCtg = cnt->nextInScaf->contigID; cnt->nextInScaf->prevInScaf = 0; cnt = getCntBetween (getTwinCtg (nextCtg), getTwinCtg (thisCtg)); cnt->nextInScaf = NULL; } prevCtg = *(unsigned int *) darrayGet (tempArray, index1); for (t = index1 + 1; t <= index2; t++) { thisCtg = *(unsigned int *) darrayGet (tempArray, t); cnt = getCntBetween (prevCtg, thisCtg); cnt->mask = 1; cnt->nextInScaf = NULL; cnt->prevInScaf = 0; cnt = getCntBetween (getTwinCtg (thisCtg), getTwinCtg (prevCtg)); cnt->mask = 1; cnt->nextInScaf = NULL; cnt->prevInScaf = 0; /* fprintf(stderr,"(%d %d)/(%d %d) ", prevCtg,thisCtg,getTwinCtg(thisCtg),getTwinCtg(prevCtg)); */ prevCtg = thisCtg; } //fprintf(stderr,": BREAKING\n"); counter++; } } freeStack (scafStack1); freeStack (scafStack2); printf ("Report from checkScaf: %d scaffold segments broken\n", counter); } static boolean checkSimple (DARRAY * ctgArray, int count) { int i; unsigned int ctg; for (i = 0; i < count; i++) { ctg = *(unsigned int *) darrayGet (ctgArray, i); contig_array[ctg].flag = 0; contig_array[getTwinCtg (ctg)].flag = 0; } for (i = 0; i < count; i++) { ctg = *(unsigned int *) darrayGet (ctgArray, i); if (contig_array[ctg].flag) { return 0; } contig_array[ctg].flag = 1; contig_array[getTwinCtg (ctg)].flag = 1; } return 1; } static void checkCircle () { unsigned int i, ctg; CONNECT *cn_temp1; int counter = 0; for (i = 1; i <= num_ctg; i++) { cn_temp1 = contig_array[i].downwardConnect; while (cn_temp1) { if (cn_temp1->weak || cn_temp1->deleted) { cn_temp1 = cn_temp1->next; continue; } ctg = cn_temp1->contigID; if (checkConnect (ctg, i)) { counter++; maskContig (i, 1); maskContig (ctg, 1); } cn_temp1 = cn_temp1->next; } } printf ("%d loops found and removed in Graph\n", counter); }