//------------------------------------------------------------------------------ // Programmer: Adam M Phillippy, The Institute for Genomic Research // File: show-coords.cc // Date: 08 / 13 / 2002 // // Usage: show-coords [options] // Try 'show-coords -h' for more information // // Description: For use in conjunction with the MUMmer package. // "show-coords" displays human readable information from the // .delta output of the "nucmer" and "promer" programs. Outputs // various useful information to stdout. Works for both nucleotide // and amino-acid alignments. // //------------------------------------------------------------------------------ #include "delta.hh" #include "tigrinc.hh" #include #include using namespace std; //------------------------------------------------------------- Constants ----// //-- for the -k option, used in comparing two overlapping alignments const float MIN_OVERLAP_P = 0.5; // min percent overlap to spawn knockout const float MIN_LENGTH_P = 0.75; // winner must be atleast this big as loser const float MAX_OHANG_P = 0.05; // max overlap hang as a percentage of overlap const float MAX_PIDYDIFF = 0.01; //------------------------------------------------------ Type Definitions ----// struct AlignStats //-- Alignment statistics data structure { bool isLAS; // involved in a longest ascending subset float Idy; // percent identity (0.0 - 100.0) float Sim; // percent similarity (0.0 - 100.0) float Stp; // precent stop codon (0.0 - 100.0) int FrameA, FrameB; // reading frame long int sA, eA, sB, eB; // start, end in A, start, end in B long int SeqLenA, SeqLenB; // length of seq A, seq B char * IdA, * IdB; // Id of seq A, Id of seq B char annot [12]; // annotation string }; struct LASstats { bool ori; long int score, wscore; long int from, wfrom; bool wpoint; vector::iterator asi; }; struct LAS_Sort { bool operator() (const LASstats & pA, const LASstats & pB) { if ( pA . asi -> sA < pB . asi -> sA ) return true; else return false; } }; struct sA_Sort //-- For sorting AlignStats by sA { bool operator() (const AlignStats & pA, const AlignStats & pB) { //-- sort sA if ( pA.sA < pB.sA ) return true; else return false; } }; struct IdA_sA_IdB_sB_Sort //-- For sorting AlignStats by IdA, sA, IdB, sB in that order (-r option) { bool operator() (const AlignStats & pA, const AlignStats & pB) { int i = strcmp (pA.IdA, pB.IdA); int j = strcmp (pA.IdB, pB.IdB); //-- sort IdA if ( i < 0 ) return true; else if ( i > 0 ) return false; //-- sort sA else { if ( pA.sA < pB.sA ) return true; else if ( pA.sA > pB.sA ) return false; //-- sort IdB else { if ( j < 0 ) return true; else if ( j > 0 ) return false; //-- sort sB else { if ( pA.sB < pB.sB ) return true; else return false; } } } } }; struct IdB_sB_IdA_sA_Sort //-- For sorting AlignStats by IdB, sB, IdA, sA in that order (-q option) { bool operator() (const AlignStats & pA, const AlignStats & pB) { int i = strcmp (pA.IdB, pB.IdB); int j = strcmp (pA.IdA, pB.IdA); //-- sort IdB if ( i < 0 ) return true; else if ( i > 0 ) return false; //-- sort sB else { if ( pA.sB < pB.sB ) return true; else if ( pA.sB > pB.sB ) return false; //-- sort IdA else { if ( j < 0 ) return true; else if ( j > 0 ) return false; //-- sort sA else { if ( pA.sA < pB.sA ) return true; else return false; } } } } }; //--------------------------------------------------- Global Option Flags ----// bool isKnockout = false; // -k option bool isAnnotateOverlaps = false; // -o option bool isBrief = false; // -b option bool isBtab = false; // -B option bool isShowCoverage = false; // -c option bool isShowDir = false; // -d option bool isShowSeqLens = false; // -l option bool isShowWarnings = false; // -w option bool isSortByQuery = false; // -q option bool isSortByReference = false; // -r option bool isLAS = false; // -g option bool isWLAS = false; // -G option bool isPrintTabular = false; // -T option bool isPrintHeader = true; // -H option bool isAnnotation = false; // true if either -w or -o float idyCutoff = 0; // -I option long int lenCutoff = 0; // -L option int whichDataType = NUCMER_DATA; // set by .delta header char InputFileName [MAX_LINE]; // I/O filenames char RefFileName [MAX_LINE], QryFileName [MAX_LINE]; //------------------------------------------------- Function Declarations ----// void generateWarnings (vector & Stats); inline long int Max (long int A, long int B) { if (A < B) return B; else return A; } inline long int Min (long int A, long int B) { if (A >= B) return B; else return A; } void annotateOverlaps (vector & Stats); void flagLAS (vector & Stats); void parseDelta (vector & Stats); void printBtab (vector Stats); void printHuman (vector Stats); void printTabular (vector Stats); inline long int revC (long int Coord, long int Len); void simplifyAlignments (vector & Stats); void printHelp (const char * s); void printUsage (const char * s); inline void swap (long int & a, long int & b) { static long int t; t = a; a = b; b = t; } //-------------------------------------------------- Function Definitions ----// int main (int argc, char ** argv) { vector Stats; // all the alignment regions //-- Parse the command line arguments { int ch, errflg = 0; optarg = NULL; while ( !errflg && ((ch = getopt (argc, argv, "bkBdhTHqrgGclowI:L:")) != EOF) ) switch (ch) { case 'b' : isBrief = true; break; case 'B' : isBtab = true; break; case 'c' : isShowCoverage = true; break; case 'd' : isShowDir = true; break; case 'h' : printHelp (argv[0]); exit (EXIT_SUCCESS); break; case 'H' : isPrintHeader = false; break; case 'I' : idyCutoff = atof (optarg); break; case 'k' : isKnockout = true; break; case 'l' : isShowSeqLens = true; break; case 'L' : lenCutoff = atol (optarg); break; case 'o' : isAnnotateOverlaps = true; break; case 'q' : isSortByQuery = true; break; case 'r' : isSortByReference = true; break; case 'g' : isLAS = true; break; case 'G' : isLAS = true; isWLAS = true; break; case 'T' : isPrintTabular = true; break; case 'w' : isShowWarnings = true; break; default : errflg ++; } if ( errflg > 0 || argc - optind != 1 ) { printUsage (argv[0]); exit (EXIT_FAILURE); } if ( isShowWarnings && isAnnotateOverlaps ) { fprintf (stderr, "WARNING: -o and -w are mutually exclusive, -w ignored\n"); isShowWarnings = false; } if ( isWLAS || isShowWarnings ) { fprintf (stderr, "WARNING: hidden option used, I hope you know what you are doing\n"); } if ( isLAS && (!isSortByReference && !isSortByQuery) ) { fprintf (stderr, "WARNING: -g option requires -r or -q, using -r\n"); isSortByReference = true; } if ( isBtab && (!isPrintHeader || isShowCoverage || isShowSeqLens || isPrintTabular || isShowWarnings || isBrief || isAnnotateOverlaps) ) { fprintf (stderr, "WARNING: the options -bdHcloTw are all ignored for btab\n"); isPrintHeader = isShowCoverage = isShowSeqLens = isPrintTabular = isShowWarnings = isBrief = isAnnotateOverlaps = false; } if ( isBrief && (isKnockout || isLAS) ) { fprintf (stderr, "WARNING: -k and -g have no effect when used with -b, -k and -g ignored\n"); isKnockout = false; isLAS = false; } if ( isSortByReference && isSortByQuery ) fprintf (stderr, "WARNING: both -r and -q were passed, -q ignored\n"); if ( idyCutoff < 0.0 || idyCutoff > 100.0 ) { fprintf(stderr, "\nERROR: Percentages must be within the range [0.0. 100.0]\n"); exit (EXIT_FAILURE); } if ( lenCutoff < 0 ) { fprintf(stderr, "\nERROR: Length values must be positive\n"); exit (EXIT_FAILURE); } if ( isShowWarnings || isAnnotateOverlaps ) isAnnotation = true; } srand (time (NULL)); //-- Open and parse the delta file strcpy (InputFileName, argv[optind ++]); parseDelta (Stats); //-- Can only pick best frame for promer data if ( isKnockout && whichDataType != PROMER_DATA ) { fprintf(stderr,"WARNING: -k only applies to PROmer data, -k ignored\n"); isKnockout = false; } if ( whichDataType == PROMER_DATA ) isShowDir = true; //-- NOTE: simplifyAlignments assumes all alignments from two // sequences are grouped together (as ouput be postpro/nuc) if ( isBrief || isKnockout ) simplifyAlignments (Stats); //-- NOTE: flagLAS assumes all alignments from two // sequences are grouped together (as ouput be postpro/nuc) if ( isLAS ) flagLAS (Stats); //-- Sort the alignment regions if user passed -r or -q option if ( isSortByReference ) sort (Stats.begin( ), Stats.end( ), IdA_sA_IdB_sB_Sort( )); else if ( isSortByQuery ) sort (Stats.begin( ), Stats.end( ), IdB_sB_IdA_sA_Sort( )); //-- Generate overlap warnings if user passed -w option if ( isShowWarnings ) generateWarnings (Stats); //-- Generate overlap annotations if user passed -o option if ( isAnnotateOverlaps ) annotateOverlaps (Stats); //-- Output data to stdout, tabular if -T option was used if ( isBtab ) printBtab (Stats); else if ( isPrintTabular ) printTabular (Stats); else printHuman (Stats); return EXIT_SUCCESS; } void flagLAS (vector & Stats) { LASstats alas; vector las; vector ties; vector::iterator Sp; vector::iterator Sep; long int i, j, n, best, next; long int lenA, lenB, len, olapA, olapB, olap; bool wrap; Sp = Stats.begin( ); while ( Sp < Stats.end( ) ) { las . clear( ); for ( Sep = Sp; Sep < Stats.end( ); Sep ++ ) { if ( strcmp (Sp->IdA, Sep->IdA) != 0 || strcmp (Sp->IdB, Sep->IdB) != 0 ) break; Sep -> isLAS = false; if ( Sep -> sA < 0 ) continue; if ( Sep -> FrameA < 0 ) swap (Sep -> sA, Sep -> eA); if ( Sep -> FrameB < 0 ) swap (Sep -> sB, Sep -> eB); if ( (Sep -> FrameA < 0 && Sep -> FrameB < 0) || (Sep -> FrameA > 0 && Sep -> FrameB > 0) ) alas . ori = 0; else { alas . ori = 1; swap (Sep -> sB, Sep -> eB); Sep -> sB = revC (Sep -> sB, Sep -> SeqLenB); Sep -> eB = revC (Sep -> eB, Sep -> SeqLenB); } lenA = Sep -> eA - Sep -> sA + 1; lenB = Sep -> eB - Sep -> sB + 1; alas . score = alas . wscore = lenA < lenB ? lenA : lenB; alas . from = alas . wfrom = -1; alas . wpoint = false; alas . asi = Sep; las . push_back (alas); } sort (las . begin( ), las . end( ), LAS_Sort( )); n = las . size( ); for ( i = 0; i < n; i ++ ) for ( j = 0; j < i; j ++ ) { if ( las [i] . ori != las [j] . ori ) continue; lenA = las [i] . asi -> eA - las [i] . asi -> sA + 1; lenB = las [i] . asi -> eB - las [i] . asi -> sB + 1; len = lenA < lenB ? lenA : lenB; olapA = las [j] . asi -> eA - las [i] . asi -> sA + 1; olap = olapA > 0 ? olapA : 0; if ( las [j] . score + len - olap > las [i] . wscore ) { las [i] . wfrom = j; las [i] . wscore = las [j] . score + len - olap; las [i] . wpoint = true; } olapB = las [j] . asi -> eB - las [i] . asi -> sB + 1; olap = olapB > olap ? olapB : olap; if ( las [j] . score + len - olap > las [i] . score ) { las [i] . from = j; las [i] . score = las [j] . score + len - olap; } if ( las [j] . wscore + len - olap > las [i] . wscore ) { las [i] . wfrom = j; las [i] . wscore = las [j] . wscore + len - olap; las [i] . wpoint = false; } } best = 0; if ( isWLAS ) { for ( i = 0; i < n; i ++ ) { if ( las [i] . wscore > las [best] . wscore || (las [i] . wscore == las [best] . wscore && las [i] . asi -> Idy > las [best] . asi -> Idy) ) best = i; } } else { for ( i = 0; i < n; i ++ ) { if ( las [i] . score > las [best] . score || (las [i] . score == las [best] . score && las [i] . asi -> Idy > las [best] . asi -> Idy) ) best = i; } } ties . clear( ); for ( i = 0; i < n; i ++ ) { if ( las [i] . ori == 1 ) { swap (las [i] . asi -> sB, las [i] . asi -> eB); las[i].asi->sB = revC (las[i].asi->sB,las[i].asi->SeqLenB); las[i].asi->eB = revC (las[i].asi->eB,las[i].asi->SeqLenB); } if ( las [i] . asi -> FrameA < 0 ) swap (las [i] . asi -> sA, las [i] . asi -> eA); if ( las [i] . asi -> FrameB < 0 ) swap (las [i] . asi -> sB, las [i] . asi -> eB); if ( ((isWLAS && las [i] . wscore == las [best] . wscore) || (!isWLAS && las [i] . score == las [best] . score)) && las [i] . asi -> Idy >= las [best] . asi -> Idy - MAX_PIDYDIFF) ties . push_back (i); } if ( ties . size( ) > 1 ) best = ties [(long int)((double)ties . size( ) * rand( ) / (RAND_MAX + 1.0))]; if ( isWLAS ) { wrap = false; for ( i = best; i >= 0; i = next ) { las [i] . asi -> isLAS = true; if ( wrap ) { next = las [i] . from; las [i] . wpoint = false; } else next = las [i] . wfrom; if ( las [i] . wpoint ) wrap = true; } } else { for ( i = best; i >= 0; i = las [i] . from ) las [i] . asi -> isLAS = true; } Sp = Sep; } for ( Sp = Stats . begin( ); Sp < Stats . end( ); Sp ++ ) if ( Sp -> sA > 0 && !Sp -> isLAS ) Sp -> sA *= -1; } void annotateOverlaps (vector & Stats) { vector::iterator Sip; long int loR, hiR, loQ, hiQ; long int max_ohang; for ( Sip = Stats . begin( ); Sip < Stats . end( ); Sip ++ ) { Sip -> annot [0] = '\0'; if ( Sip -> sA < 0 ) continue; loR = Sip -> sA; hiR = Sip -> eA; loQ = Sip -> sB; hiQ = Sip -> eB; if ( Sip -> FrameA < 0 ) swap (loR, hiR); if ( Sip -> FrameB < 0 ) swap (loQ, hiQ); if ( !(Sip -> FrameA < 0 && Sip -> FrameB < 0) && !(Sip -> FrameA > 0 && Sip -> FrameB > 0) ) { swap (loQ, hiQ); loQ = revC (loQ, Sip -> SeqLenB); hiQ = revC (hiQ, Sip -> SeqLenB); } max_ohang = (long int)(MAX_OHANG_P * (float)(hiR - loR > hiQ - loQ ? hiR - loR + 1 : hiQ - loQ + 1)); if ( (loR <= max_ohang || loQ <= max_ohang) && (hiR + max_ohang > Sip -> SeqLenA || hiQ + max_ohang > Sip -> SeqLenB) ) { if ( loR <= max_ohang && loQ <= max_ohang && hiR + max_ohang > Sip -> SeqLenA && hiQ + max_ohang > Sip -> SeqLenB ) strcpy (Sip -> annot, "[IDENTITY]"); else if ( loR <= max_ohang && hiR + max_ohang > Sip -> SeqLenA ) strcpy (Sip -> annot, "[CONTAINED]"); else if ( loQ <= max_ohang && hiQ + max_ohang > Sip -> SeqLenB ) strcpy (Sip -> annot, "[CONTAINS]"); else if ( loQ <= max_ohang ) strcpy (Sip -> annot, "[END]"); else if ( loR <= max_ohang ) strcpy (Sip -> annot, "[BEGIN]"); else assert (false); } } } void generateWarnings (vector & Stats) // Generate warnings for each alignment region if it overlaps or in any // other way conflicts with the alignment preceding it. Set its // warning field appropriately. { vector::iterator Sip; vector::iterator Sprev; long int s1, e1, s2, e2, ps1, pe1, ps2, pe2; long int tmp; if ( !Stats.empty( ) ) Stats.begin()->annot [0] = '\0'; for ( Sip = Stats.begin( ) + 1; Sip < Stats.end( ); Sip ++ ) { Sip->annot [0] = '\0'; if ( Sip->sA < 0 ) continue; for ( Sprev = Sip - 1; Sprev >= Stats.begin( ) && Sprev->sA < 0; Sprev -- ) { }; if ( Sprev < Stats.begin( ) || // Sip->FrameA != Sprev->FrameA || // Sip->FrameB != Sprev->FrameB || strcmp ( Sip->IdA , Sprev->IdA ) != 0 || strcmp ( Sip->IdB , Sprev->IdB ) != 0 ) continue; ps1 = Sprev->sA; pe1 = Sprev->eA; ps2 = Sprev->sB; pe2 = Sprev->eB; s1 = Sip->sA; e1 = Sip->eA; s2 = Sip->sB; e2 = Sip->eB; if ( Sprev->FrameA < 0 ) { tmp = ps1; ps1 = pe1; pe1 = tmp; } if ( Sprev->FrameB < 0 ) { tmp = ps2; ps2 = pe2; pe2 = tmp; } if ( Sip->FrameA < 0 ) { tmp = s1; s1 = e1; e1 = tmp; } if ( Sip->FrameB < 0 ) { tmp = s2; s2 = e2; e2 = tmp; } //-- Duplicate region if ( s1 == ps1 && s2 == ps2 && e1 == pe1 && e2 == pe2 ) strcpy ( Sip->annot , "[DUPLICATE]" ); //-- Shadowing region else if ( s1 < ps1 && s2 < ps2 && e1 > pe1 && e2 > pe2 ) strcpy ( Sip->annot , "[CONTAINS]" ); //-- Shadowed region else if ( ps1 < s1 && ps2 < s2 && pe1 > e1 && pe2 > e2 ) strcpy ( Sip->annot , "[SHADOWED]" ); //-- Overlapping region in reference sequence else if ( (s1 >= ps1 && s1 <= pe1) || (e1 >= ps1 && e1 <= pe1) ) strcpy ( Sip->annot , "[OVERLAPS]" ); //-- Overlapping region in query sequence else if ( (s2 >= ps2 && s2 <= pe2) || (e2 >= ps2 && e2 <= pe2) ) strcpy ( Sip->annot , "[OVERLAPS]" ); //-- else No Conflict } } void printBtab (vector Stats) { time_t currtime; char * type; char date[MAX_LINE]; long int len; vector::iterator Sip; currtime = time(NULL); strftime (date, MAX_LINE, "%b %d %Y", localtime(&currtime)); if ( whichDataType == NUCMER_DATA ) type = "NUCMER"; else if ( whichDataType == PROMER_DATA ) type = "PROMER"; else type = "NULL"; //-- Print the stats for ( Sip = Stats.begin( ); Sip < Stats.end( ); Sip ++ ) { if ( Sip->sA < 0 ) continue; len = labs(Sip->eB - Sip->sB) + 1; //-- Output the stats for this alignment in btab format printf("%s\t%s\t%ld\t%s\t%s\t%s\t", Sip->IdB, date, Sip->SeqLenB, type, RefFileName, Sip->IdA); printf("%ld\t%ld\t%ld\t%ld\t%f\t%f\t%ld\t0\t0\tNULL\t", Sip->sB, Sip->eB, Sip->sA, Sip->eA, Sip->Idy, Sip->Sim, len); printf("%d\t%s\t%ld\t0\t0\n", whichDataType == NUCMER_DATA ? 0 : Sip->FrameA, Sip->FrameB < 0 ? "Minus" : "Plus", Sip->SeqLenA); } return; } void printHuman (vector Stats) // Print stats in a human readable format { long int len1, len2; float covA, covB; vector::iterator Sip; //-- Print the output header if ( isPrintHeader ) { printf ("%s %s\n%s\n\n", RefFileName, QryFileName, whichDataType == NUCMER_DATA ? "NUCMER" : "PROMER"); printf("%8s %8s | ", "[S1]", "[E1]"); printf("%8s %8s | ", "[S2]", "[E2]"); printf("%8s %8s | ", "[LEN 1]", "[LEN 2]"); if ( !isBrief ) { printf("%8s ", "[% IDY]"); if ( whichDataType == PROMER_DATA ) printf("%8s %8s ", "[% SIM]", "[% STP]"); printf(" | "); } if ( isShowSeqLens ) printf("%8s %8s | ", "[LEN R]", "[LEN Q]"); if ( isShowCoverage ) printf("%8s %8s | ", "[COV R]", "[COV Q]"); if ( !isBrief && isShowDir ) printf("%5s ", "[FRM]"); printf("%s", "[TAGS]"); printf("\n"); if ( isShowSeqLens ) printf("====================="); if ( isShowCoverage ) printf("====================="); if ( !isBrief ) { printf("============"); if ( isShowDir ) printf("======="); if ( whichDataType == PROMER_DATA ) printf("=================="); } printf("====================================" "=====================================\n"); } //-- Print the stats for ( Sip = Stats.begin( ); Sip < Stats.end( ); Sip ++ ) { if ( Sip->sA < 0 ) continue; len1 = labs(Sip->eA - Sip->sA) + 1; len2 = labs(Sip->eB - Sip->sB) + 1; covA = (float)len1 / (float)Sip->SeqLenA * 100.0; covB = (float)len2 / (float)Sip->SeqLenB * 100.0; //-- Output the statistics for this alignment region printf("%8ld %8ld | ", Sip->sA, Sip->eA); printf("%8ld %8ld | ", Sip->sB, Sip->eB); printf("%8ld %8ld | ", len1, len2); if ( !isBrief ) { printf("%8.2f ", Sip->Idy); if ( whichDataType == PROMER_DATA ) printf("%8.2f %8.2f ", Sip->Sim, Sip->Stp); printf(" | "); } if ( isShowSeqLens ) printf("%8ld %8ld | ", Sip->SeqLenA, Sip->SeqLenB); if ( isShowCoverage ) printf("%8.2f %8.2f | ", covA, covB); if ( !isBrief && isShowDir ) printf("%2d %2d ", Sip->FrameA, Sip->FrameB); printf("%s\t%s", Sip->IdA, Sip->IdB); if ( isAnnotation ) printf("\t%s", Sip->annot); printf("\n"); } return; } void printTabular (vector Stats) // Print stats in a column delimited format { long int len1, len2; float covA, covB; vector::iterator Sip; //-- Print the output header if ( isPrintHeader ) { printf ("%s %s\n%s\n\n", RefFileName, QryFileName, whichDataType == NUCMER_DATA ? "NUCMER" : "PROMER"); printf("%s\t%s\t", "[S1]", "[E1]"); printf("%s\t%s\t", "[S2]", "[E2]"); printf("%s\t%s\t", "[LEN 1]", "[LEN 2]"); if ( !isBrief ) { printf("%s\t", "[% IDY]"); if ( whichDataType == PROMER_DATA ) printf("%s\t%s\t", "[% SIM]", "[% STP]"); } if ( isShowSeqLens ) printf("%s\t%s\t", "[LEN R]", "[LEN Q]"); if ( isShowCoverage ) printf("%s\t%s\t", "[COV R]", "[COV Q]"); if ( !isBrief && isShowDir ) printf("%s\t", "[FRM]"); printf("%s\n", "[TAGS]"); } //-- Print the stats for ( Sip = Stats.begin( ); Sip < Stats.end( ); Sip ++ ) { if ( Sip->sA < 0 ) continue; len1 = labs(Sip->eA - Sip->sA) + 1; len2 = labs(Sip->eB - Sip->sB) + 1; covA = (float)len1 / (float)Sip->SeqLenA * 100.0; covB = (float)len2 / (float)Sip->SeqLenB * 100.0; //-- Output the statistics for this alignment region printf("%ld\t%ld\t", Sip->sA, Sip->eA); printf("%ld\t%ld\t", Sip->sB, Sip->eB); printf("%ld\t%ld\t", len1, len2); if ( !isBrief ) { printf("%.2f\t", Sip->Idy); if ( whichDataType == PROMER_DATA ) printf("%.2f\t%.2f\t", Sip->Sim, Sip->Stp); } if ( isShowSeqLens ) printf("%ld\t%ld\t", Sip->SeqLenA, Sip->SeqLenB); if ( isShowCoverage ) printf("%.2f\t%.2f\t", covA, covB); if ( !isBrief && isShowDir ) printf("%d\t%d\t", Sip->FrameA, Sip->FrameB); printf("%s\t%s", Sip->IdA, Sip->IdB); if ( isAnnotation ) printf("\t%s", Sip->annot); printf("\n"); } return; } void parseDelta (vector & Stats) // parse the delta file { int frameA, frameB; // sequence frame long int sA, eA, sB, eB; AlignStats CurrStats; // single alignment region DeltaReader_t dr; dr.open (InputFileName); whichDataType = dr.getDataType( ) == NUCMER_STRING ? NUCMER_DATA : PROMER_DATA; strcpy (RefFileName, dr.getReferencePath( ).c_str( )); strcpy (QryFileName, dr.getQueryPath( ).c_str( )); //-- for each delta record while ( dr.readNext( ) ) { CurrStats.SeqLenA = dr.getRecord( ).lenR; CurrStats.SeqLenB = dr.getRecord( ).lenQ; CurrStats.IdA = (char *) Safe_malloc (dr.getRecord( ).idR.length( ) + 1); CurrStats.IdB = (char *) Safe_malloc (dr.getRecord( ).idQ.length( ) + 1); strcpy (CurrStats.IdA, dr.getRecord( ).idR.c_str( )); strcpy (CurrStats.IdB, dr.getRecord( ).idQ.c_str( )); //-- for each alignment for ( unsigned int i = 0; i < dr.getRecord( ).aligns.size( ); i ++ ) { CurrStats.sA = dr.getRecord( ).aligns[i].sR; CurrStats.eA = dr.getRecord( ).aligns[i].eR; CurrStats.sB = dr.getRecord( ).aligns[i].sQ; CurrStats.eB = dr.getRecord( ).aligns[i].eQ; CurrStats.Idy = dr.getRecord( ).aligns[i].idy; CurrStats.Sim = dr.getRecord( ).aligns[i].sim; CurrStats.Stp = dr.getRecord( ).aligns[i].stp; if ( CurrStats.Idy < idyCutoff ) continue; if ( CurrStats.Idy > 99.99 && CurrStats.Idy != 100 ) CurrStats.Idy = 99.99; if ( CurrStats.Sim > 99.99 && CurrStats.Sim != 100 ) CurrStats.Sim = 99.99; if ( CurrStats.Stp > 99.99 && CurrStats.Stp != 100 ) CurrStats.Stp = 99.99; if ( CurrStats.Stp < 0.01 && CurrStats.Stp != 0 ) CurrStats.Stp = 0.01; if ( labs (CurrStats.sA - CurrStats.eA) + 1 < lenCutoff || labs (CurrStats.sB - CurrStats.eB) + 1 < lenCutoff ) continue; sA = CurrStats.sA; eA = CurrStats.eA; sB = CurrStats.sB; eB = CurrStats.eB; //-- Reset the coordinates to reference the appropriate strand frameA = 1; frameB = 1; if ( sA > eA ) { sA = revC (sA, CurrStats.SeqLenA); eA = revC (eA, CurrStats.SeqLenA); frameA += 3; } if ( sB > eB ) { sB = revC (sB, CurrStats.SeqLenB); eB = revC (eB, CurrStats.SeqLenB); frameB += 3; } if ( isBrief ) { CurrStats.FrameA = frameA > 3 ? (frameA - 3) * -1 : frameA; CurrStats.FrameB = frameB > 3 ? (frameB - 3) * -1 : frameB; Stats.push_back (CurrStats); continue; } //-- Set/Generate the correct frame for sequences A and B if ( whichDataType == NUCMER_DATA ) assert ( frameA == 1 && (frameB == 1 || frameB == 4)); else // PROMER_DATA { //-- Set the reading frame frameA += (sA + 2) % 3; frameB += (sB + 2) % 3; //-- Translated the coordinates from DNA to Amino Acid // remeber that eA and eB point to the last nucleotide in the // end codon sA = (sA + 2) / 3; eA = (eA) / 3; sB = (sB + 2) / 3; eB = (eB) / 3; } //-- Set the statistics for this alignment region CurrStats.FrameA = frameA > 3 ? (frameA - 3) * -1 : frameA; CurrStats.FrameB = frameB > 3 ? (frameB - 3) * -1 : frameB; //-- Add the alignment region Stats.push_back (CurrStats); } } dr.close ( ); return; } inline long int revC (long int Coord, long int Len) // Reverse complement the given coordinate for the given length. { assert (Len - Coord + 1 > 0); return (Len - Coord + 1); } void simplifyAlignments (vector & Stats) // clean up alignments if the isBrief or isKnockout options were selected { long int tmp; long int olapA, olapB, lenA, lenAi, lenB, lenBi; vector::iterator Sp; vector::iterator Sip; vector::iterator Sep; Sp = Stats.begin( ); while ( Sp < Stats.end( ) ) { for ( Sep = Sp; Sep < Stats.end( ); Sep ++ ) { if ( strcmp (Sp->IdA, Sep->IdA) != 0 || strcmp (Sp->IdB, Sep->IdB) != 0 ) break; if ( Sep->FrameA < 0 ) { tmp = Sep->sA; Sep->sA = Sep->eA; Sep->eA = tmp; } if ( Sep->FrameB < 0 ) { tmp = Sep->sB; Sep->sB = Sep->eB; Sep->eB = tmp; } if ( isBrief ) { Sep->FrameA = 1; Sep->FrameB = 1; } } sort (Sp, Sep, sA_Sort( )); for ( ; Sp < Sep; Sp ++ ) { if ( Sp->sA < 0 ) continue; for ( Sip = Sp + 1; Sip < Sep; Sip ++ ) { if ( Sip->sA > Sp->eA ) break; if ( isBrief ) { if ( Sip->sB <= Sp->eB && Sip->eB >= Sp->sB ) { Sp->sA = Min ( Sp->sA, Sip->sA ); Sp->sB = Min ( Sp->sB, Sip->sB ); Sp->eA = Max ( Sp->eA, Sip->eA ); Sp->eB = Max ( Sp->eB, Sip->eB ); Sip->sA *= -1; } } else if ( isKnockout ) { //-- Are they on the same strand? if ( (Sp->FrameA * Sp->FrameB) * (Sip->FrameA * Sip->FrameB) < 0 ) continue; //-- If overlap by more than 50% of their length // pick the best frame and throw out the loser lenA = Sp->eA - Sp->sA + 1; lenAi = Sip->eA - Sip->sA + 1; lenB = Sp->eB - Sp->sB + 1; lenBi = Sip->eB - Sip->sB + 1; olapA = Sp->eA - Sip->sA + 1; olapB = Sp->eB - Sp->sB + 1; tmp = Sp->eB - Sip->eB; if ( tmp > 0 ) olapB -= tmp; tmp = Sip->sB - Sp->sB; if ( tmp > 0 ) olapB -= tmp; if ( (float)olapA / (float)(lenA < lenAi ? lenA : lenAi) >= MIN_OVERLAP_P && (float)olapB / (float)(lenB < lenBi ? lenB : lenBi) >= MIN_OVERLAP_P ) { if ( Sp->Sim > Sip->Sim && ((float)lenA / (float)lenAi >= MIN_LENGTH_P && (float)lenB / (float)lenBi >= MIN_LENGTH_P) ) Sip->sA *= -1; else { Sp->sA *= -1; break; } } } else assert (false); } if ( Sp->sA > 0 && Sp->FrameA < 0 ) { tmp = Sp->sA; Sp->sA = Sp->eA; Sp->eA = tmp; } if ( Sp->sA > 0 && Sp->FrameB < 0 ) { tmp = Sp->sB; Sp->sB = Sp->eB; Sp->eB = tmp; } } } return; } void printHelp (const char * s) // Display the program's help information to stderr. { fprintf (stderr, "\nUSAGE: %s [options] \n\n", s); fprintf (stderr, "-b Merges overlapping alignments regardless of match dir\n" " or frame and does not display any idenitity information.\n" "-B Switch output to btab format\n" "-c Include percent coverage information in the output\n" "-d Display the alignment direction in the additional\n" " FRM columns (default for promer)\n" "-g Deprecated option. Please use 'delta-filter' instead\n" "-h Display help information\n" "-H Do not print the output header\n" "-I float Set minimum percent identity to display\n" "-k Knockout (do not display) alignments that overlap\n" " another alignment in a different frame by more than 50%%\n" " of their length, AND have a smaller percent similarity\n" " or are less than 75%% of the size of the other alignment\n" " (promer only)\n" "-l Include the sequence length information in the output\n" "-L long Set minimum alignment length to display\n" "-o Annotate maximal alignments between two sequences, i.e.\n" " overlaps between reference and query sequences\n" "-q Sort output lines by query IDs and coordinates\n" "-r Sort output lines by reference IDs and coordinates\n" "-T Switch output to tab-delimited format\n\n"); fprintf (stderr, " Input is the .delta output of either the \"nucmer\" or the\n" "\"promer\" program passed on the command line.\n" " Output is to stdout, and consists of a list of coordinates,\n" "percent identity, and other useful information regarding the\n" "alignment data contained in the .delta file used as input.\n" " NOTE: No sorting is done by default, therefore the alignments\n" "will be ordered as found in the input.\n\n"); return; } void printUsage (const char * s) // Display the program's usage information to stderr. { fprintf (stderr, "\nUSAGE: %s [options] \n\n", s); fprintf (stderr, "Try '%s -h' for more information.\n", s); return; }