//------------------------------------------------------------------------------ // Programmer: Adam M Phillippy, The Institute for Genomic Research // File: postnuc.cc // Date: 07 / 16 / 2002 // // Revision: 08 / 01 / 2002 // Added MUM extension functionality // // Purpose: To translate the coordinates referencing the concatenated // reference sequences back to the individual sequences and deal // with any conflict that may have arisen (i.e. a MUM that spans // the boundry between two sequences). Then to extend each cluster // via Smith-Waterman techniques to expand the alignment coverage. // Alignments which encounter each other will be fused to form one // encompasing alignment when appropriate. // // Input: Input is the output of the .mgaps program from stdin. On the // command line, the file names of the two original sequence files // should come first, followed by the prefix that should be // placed in front of the two output filenames .cluster and // .delta // // NOTE: Cluster file is now suppressed by default (see -d option). // // Output: Output is to two output files, .cluster and .delta. // .cluster lists MUM clusters as identified by "mgaps". // However, the clusters now reference their corresponding // sequence and are all listed under headers that specify this // sequence. The .delta file is the alignment object that // contains all the information necessary to reproduce the alignments // generated by the MUM extension process. Please refer to the // output file documentation for an in-depth description of these // file formats. // // Usage: postnuc < // Where and are the original input sequences of // NUCmer and is the prefix that should be added to the // beginning of the .cluster and .delta output filenames. // //------------------------------------------------------------------------------ //-- NOTE: this option will significantly hamper program performance, // mostly the alignment extension performance (sw_align.h) //#define _DEBUG_ASSERT // self testing assert functions #include "tigrinc.hh" #include "sw_align.hh" #include #include using namespace std; //------------------------------------------------------ Globals -------------// const signed char FORWARD_CHAR = 1; const signed char REVERSE_CHAR = -1; bool DO_DELTA = true; bool DO_EXTEND = true; bool TO_SEQEND = false; bool DO_SHADOWS = false; //------------------------------------------------------ Type Definitions ----// enum LineType //-- The type of input line from { NO_LINE, HEADER_LINE, MATCH_LINE }; struct FastaRecord //-- The essential data of a sequence { char * Id; // the fasta ID header tag long int len; // the length of the sequence char * seq; // the sequence data }; struct Match //-- An exact match between two sequences A and B { long int sA, sB, len; // start coordinate in A, in B and the length }; struct Cluster //-- An ordered list of matches between two sequences A and B { bool wasFused; // have the cluster matches been fused yet? signed char dirB; // the query sequence direction // FORWARD_CHAR or REVERSE_CHAR vector matches; // the ordered set of matches in the cluster }; struct Synteny //-- An ordered list of clusters between two sequences A and B { FastaRecord * AfP; // a pointer to the reference sequence record FastaRecord Bf; // the query sequence record (w/o the sequence) vector clusters; // the ordered set of clusters between A and B }; struct Alignment //-- An alignment object between two sequences A and B { signed char dirB; // the query sequence direction long int sA, sB, eA, eB; // the start in A, B and the end in A, B vector delta; // the delta values, with NO zero at the end long int deltaApos; // sum of abs(deltas) - #of negative deltas // trust me, it is a very helpful value long int Errors, SimErrors, NonAlphas; // errors, similarity errors, nonalphas }; struct AscendingClusterSort //-- For sorting clusters in ascending order of their sA coordinate { bool operator() (const Cluster & pA, const Cluster & pB) { return ( pA.matches.begin( )->sA < pB.matches.begin( )->sA ); } }; //------------------------------------------------- Function Declarations ----// void addNewAlignment (vector & Alignments, vector::iterator Cp, vector::iterator Mp); bool extendBackward (vector & Alignments, vector::iterator CurrAp, vector::iterator TargetAp, const char * A, const char * B); bool extendForward (vector::iterator Ap, const char * A, long int targetA, const char * B, long int targetB, unsigned int m_o); void extendClusters (vector & Clusters, const FastaRecord * Af, const FastaRecord * Bf, FILE * DeltaFile); void flushAlignments (vector & Alignments, const FastaRecord * Af, const FastaRecord * Bf, FILE * DeltaFile); void flushSyntenys (vector & Syntenys, FILE * ClusterFile); vector::iterator getForwardTargetCluster (vector & Clusters, vector::iterator CurrCp, long int & targetA, long int & targetB); vector::iterator getReverseTargetAlignment (vector & Alignments, vector::iterator CurrAp); bool isShadowedCluster (vector::iterator CurrCp, vector & Alignments, vector::iterator Ap); void parseAbort (const char *); void parseDelta (vector & Alignments, const FastaRecord * Af, const FastaRecord *Bf); void processSyntenys (vector & Syntenys, FastaRecord * Af, long int As, FILE * QryFile, FILE * ClusterFile, FILE * DeltaFile); inline long int revC (long int Coord, long int Len); void printHelp (const char *); void printUsage (const char *); void validateData (vector Alignments, vector Clusters, const FastaRecord * Af, const FastaRecord * Bf); //-------------------------------------------------- Function Definitions ----// int main (int argc, char ** argv) { FastaRecord * Af; // array of all the reference sequences vector Syntenys; // vector of all sets of clusters vector::reverse_iterator CurrSp; // current set of clusters vector::reverse_iterator Sp; // temporary synteny pointer Synteny Asyn; // a single set of clusters Cluster Aclu; // a single cluster of matches Match Amat; // a single match LineType PrevLine; // the current input line bool Found; // temporary search flag char Line [MAX_LINE]; // a single line of input char CurrIdB [MAX_LINE], IdA [MAX_LINE], IdB [MAX_LINE]; // fasta ID headers char ClusterFileName [MAX_LINE], DeltaFileName [MAX_LINE]; // file names char RefFileName [MAX_LINE], QryFileName [MAX_LINE]; signed char DirB = FORWARD_CHAR; // the current query strand direction long int i; // temporary counter long int Seqi; // current reference sequence index long int InitSize; // initial sequence memory space long int As = 0; // the size of the reference array long int Ac = 100; // the capacity of the reference array long int sA, sB, len; // current match start in A, B and length FILE * RefFile, * QryFile; // reference and query input files FILE * ClusterFile, * DeltaFile; // cluster and delta output files //-- Set the alignment data type and break length (sw_align.h) setMatrixType ( NUCLEOTIDE ); setBreakLen ( 200 ); //-- Parse the command line arguments { optarg = NULL; int ch, errflg = 0; while ( !errflg && ((ch = getopt (argc, argv, "dehb:st")) != EOF) ) switch (ch) { case 'b' : setBreakLen( atoi (optarg) ); break; case 'd' : DO_DELTA = false; break; case 'e' : DO_EXTEND = false; break; case 'h' : printHelp (argv[0]); exit (EXIT_SUCCESS); break; case 's' : DO_SHADOWS = true; break; case 't' : TO_SEQEND = true; break; default : errflg ++; } if ( errflg > 0 || argc - optind != 3 ) { printUsage (argv[0]); exit (EXIT_FAILURE); } } //-- Read and create the I/O file names strcpy (RefFileName, argv[optind ++]); strcpy (QryFileName, argv[optind ++]); strcpy (ClusterFileName, argv[optind ++]); strcpy (DeltaFileName, ClusterFileName); strcat (ClusterFileName, ".cluster"); strcat (DeltaFileName, ".delta"); //-- Open all the files RefFile = File_Open (RefFileName, "r"); QryFile = File_Open (QryFileName, "r"); if ( DO_DELTA ) { ClusterFile = NULL; DeltaFile = File_Open (DeltaFileName, "w"); } else { ClusterFile = File_Open (ClusterFileName, "w"); DeltaFile = NULL; } //-- Print the headers of the output files if ( DO_DELTA ) fprintf (DeltaFile, "%s %s\nNUCMER\n", RefFileName, QryFileName); else fprintf (ClusterFile, "%s %s\nNUCMER\n", RefFileName, QryFileName); //-- Generate the array of the reference sequences InitSize = INIT_SIZE; Af = (FastaRecord *) Safe_malloc ( sizeof(FastaRecord) * Ac ); Af[As].seq = (char *) Safe_malloc ( sizeof(char) * InitSize ); while ( Read_String (RefFile, Af[As].seq, InitSize, IdA, FALSE) ) { Af[As].Id = (char *) Safe_malloc (sizeof(char) * (strlen(IdA) + 1)); strcpy (Af[As].Id, IdA); Af[As].len = strlen (Af[As].seq + 1); if ( ++ As >= Ac ) { Ac *= 2; Af = (FastaRecord *) Safe_realloc ( Af, sizeof(FastaRecord) * Ac ); } InitSize = INIT_SIZE; Af[As].seq = (char *) Safe_malloc ( sizeof(char) * InitSize ); } fclose (RefFile); if ( As <= 0 ) parseAbort ( RefFileName ); //-- Process the input from line by line PrevLine = NO_LINE; IdA[0] = '\0'; IdB[0] = '\0'; CurrIdB[0] = '\0'; while ( fgets(Line, MAX_LINE, stdin) != NULL ) { //-- If the current line is a fasta HEADER_LINE if ( Line[0] == '>' ) { if ( sscanf (Line + 1, "%s", CurrIdB) != 1 ) parseAbort ("stdin"); DirB = strstr (Line," Reverse") == NULL ? FORWARD_CHAR : REVERSE_CHAR; PrevLine = HEADER_LINE; } //-- If the current line is a cluster HEADER_LINE else if ( Line[0] == '#' ) { PrevLine = HEADER_LINE; } //-- If the current line is a MATCH_LINE else { if ( sscanf (Line, "%ld %ld %ld", &sA, &sB, &len) != 3 ) parseAbort ("stdin"); //-- Re-map the reference coordinate back to its original sequence for ( Seqi = 0; sA > Af[Seqi].len; Seqi ++ ) sA -= Af[Seqi].len + 1; // extra +1 for the x at the end of each seq if ( Seqi >= As ) { fprintf (stderr, "ERROR: A MUM was found with a start coordinate greater than\n" " the sequence length, a serious error has occured.\n" " Please file a bug report\n"); exit (EXIT_FAILURE); } //-- If the match spans across a sequence boundry if ( sA + len - 1 > Af[Seqi].len || sA <= 0) { fprintf (stderr, "WARNING: A MUM was found extending beyond the boundry of:\n" " Reference sequence '>%s'\n\n" "Please check that the '-n' option is activated on 'mummer2'\n" "and try again, or file a bug report\n" "Attempting to continue.\n", Af[Seqi].Id); continue; } //-- Check and update the current synteny region if ( strcmp (IdA, Af[Seqi].Id) != 0 || strcmp (IdB, CurrIdB) != 0 ) { Found = false; if ( strcmp (IdB, CurrIdB) == 0 ) { //-- Has this header been seen before? for ( Sp = Syntenys.rbegin( ); Sp < Syntenys.rend( ); Sp ++ ) { if ( strcmp (Sp->AfP->Id, Af[Seqi].Id) == 0 ) { if ( Sp->AfP->len != Af[Seqi].len ) { fprintf (stderr, "ERROR: The reference file may contain" " sequences with non-unique\n" " header Ids, please check your input" " files and try again\n"); exit (EXIT_FAILURE); } assert ( strcmp (Sp->Bf.Id, IdB) == 0 ); CurrSp = Sp; Found = true; break; } } } else { //-- New B sequence header, process all the old synteny's processSyntenys (Syntenys, Af, As, QryFile, ClusterFile, DeltaFile); } strcpy (IdA, Af[Seqi].Id); strcpy (IdB, CurrIdB); //-- If not seen yet, create a new synteny region if ( ! Found ) { Asyn.AfP = Af + Seqi; Asyn.Bf.len = -1; Asyn.Bf.Id = (char *) Safe_malloc (sizeof(char) * (strlen(IdB) + 1)); strcpy (Asyn.Bf.Id, IdB); Syntenys.push_back (Asyn); CurrSp = Syntenys.rbegin( ); } //-- Add a new cluster to the current synteny if ( !Syntenys.empty( ) && !CurrSp->clusters.empty( ) ) if ( CurrSp->clusters.rbegin( )->matches.empty( ) ) CurrSp->clusters.pop_back( ); // hack to remove empties Aclu.wasFused = false; Aclu.dirB = DirB; CurrSp->clusters.push_back (Aclu); } else if ( PrevLine == HEADER_LINE ) { //-- Add a new cluster to the current synteny if ( !Syntenys.empty( ) && !CurrSp->clusters.empty( ) ) if ( CurrSp->clusters.rbegin( )->matches.empty( ) ) CurrSp->clusters.pop_back( ); Aclu.wasFused = false; Aclu.dirB = DirB; CurrSp->clusters.push_back (Aclu); } //-- Add a new match to the current cluster if ( len > 1 ) { Amat.sA = sA; Amat.sB = sB; Amat.len = len; CurrSp->clusters.rbegin( )->matches.push_back (Amat); } PrevLine = MATCH_LINE; } } //-- Process the left-over syntenys if ( !Syntenys.empty( ) && !CurrSp->clusters.empty( ) ) if ( CurrSp->clusters.rbegin( )->matches.empty( ) ) CurrSp->clusters.pop_back( ); processSyntenys (Syntenys, Af, As, QryFile, ClusterFile, DeltaFile); fclose (QryFile); //-- Free the reference sequences for ( i = 0; i < As; i ++ ) { free (Af[i].Id); free (Af[i].seq); } free (Af); return EXIT_SUCCESS; } void addNewAlignment (vector & Alignments, vector::iterator Cp, vector::iterator Mp) // Create and add a new alignment object based on the current match // and cluster information pointed to by Cp and Mp. { Alignment Align; //-- Initialize the data Align.sA = Mp->sA; Align.sB = Mp->sB; Align.eA = Mp->sA + Mp->len - 1; Align.eB = Mp->sB + Mp->len - 1; Align.dirB = Cp->dirB; Align.delta.clear( ); Align.deltaApos = 0; //-- Add the alignment object Alignments.push_back (Align); return; } bool extendBackward (vector & Alignments, vector::iterator CurrAp, vector::iterator TargetAp, const char * A, const char * B) // Extend an alignment backwards off of the current alignment object. // The current alignment object must be freshly created and consist // only of an exact match (i.e. the delta vector MUST be empty). // If the TargetAp alignment object is reached by the extension, it will // be merged with CurrAp and CurrAp will be destroyed. If TargetAp is // NULL the function will extend as far as possible. It is a strange // and dangerous function because it can delete CurrAp, so edit with // caution. Returns true if TargetAp was reached and merged, else false // Designed only as a subroutine for extendClusters, should be used // nowhere else. { bool target_reached = false; bool overflow_flag = false; bool double_flag = false; vector::iterator Dp; unsigned int m_o; long int targetA, targetB; m_o = BACKWARD_SEARCH; //-- Set the target coordinates if ( TargetAp != Alignments.end( ) ) { targetA = TargetAp->eA; targetB = TargetAp->eB; } else { targetA = 1; targetB = 1; m_o |= OPTIMAL_BIT; } //-- If alignment is too long, bring with bounds and set overflow_flag true if ( CurrAp->sA - targetA + 1 > MAX_ALIGNMENT_LENGTH ) { targetA = CurrAp->sA - MAX_ALIGNMENT_LENGTH + 1; overflow_flag = true; m_o |= OPTIMAL_BIT; } if ( CurrAp->sB - targetB + 1 > MAX_ALIGNMENT_LENGTH ) { targetB = CurrAp->sB - MAX_ALIGNMENT_LENGTH + 1; if ( overflow_flag ) double_flag = true; else overflow_flag = true; m_o |= OPTIMAL_BIT; } if ( TO_SEQEND && !double_flag ) m_o |= SEQEND_BIT; //-- Attempt to reach the target target_reached = alignSearch (A, CurrAp->sA, targetA, B, CurrAp->sB, targetB, m_o); if ( overflow_flag || TargetAp == Alignments.end( ) ) target_reached = false; if ( target_reached ) { //-- Merge the two alignment objects extendForward (TargetAp, A, CurrAp->sA, B, CurrAp->sB, FORCED_FORWARD_ALIGN); TargetAp->eA = CurrAp->eA; TargetAp->eB = CurrAp->eB; Alignments.pop_back( ); } else { alignTarget (A, targetA, CurrAp->sA, B, targetB, CurrAp->sB, CurrAp->delta, FORCED_FORWARD_ALIGN); CurrAp->sA = targetA; CurrAp->sB = targetB; //-- Update the deltaApos value for the alignment object for ( Dp = CurrAp->delta.begin( ); Dp < CurrAp->delta.end( ); Dp ++ ) CurrAp->deltaApos += *Dp > 0 ? *Dp : labs(*Dp) - 1; } return target_reached; } bool extendForward (vector::iterator CurrAp, const char * A, long int targetA, const char * B, long int targetB, unsigned int m_o) // Extend an alignment forwards off the current alignment object until // target or end of sequence is reached, and merge the delta values of the // alignment object with the new delta values generated by the extension. // Return true if the target was reached, else false { long int ValA; long int ValB; unsigned int Di; bool target_reached; bool overflow_flag = false; bool double_flag = false; vector::iterator Dp; //-- Set Di to the end of the delta vector Di = CurrAp->delta.size( ); //-- Calculate the distance between the start and end positions ValA = targetA - CurrAp->eA + 1; ValB = targetB - CurrAp->eB + 1; //-- If the distance is too long, shrink it and set the overflow_flag if ( ValA > MAX_ALIGNMENT_LENGTH ) { targetA = CurrAp->eA + MAX_ALIGNMENT_LENGTH - 1; overflow_flag = true; m_o |= OPTIMAL_BIT; } if ( ValB > MAX_ALIGNMENT_LENGTH ) { targetB = CurrAp->eB + MAX_ALIGNMENT_LENGTH - 1; if ( overflow_flag ) double_flag = true; else overflow_flag = true; m_o |= OPTIMAL_BIT; } if ( double_flag ) m_o &= ~SEQEND_BIT; target_reached = alignTarget (A, CurrAp->eA, targetA, B, CurrAp->eB, targetB, CurrAp->delta, m_o); //-- Notify user if alignment was chopped short if ( target_reached && overflow_flag ) target_reached = false; //-- Pick up delta where left off (Di) and merge with new deltas if ( Di < CurrAp->delta.size( ) ) { //-- Merge the deltas ValA = (CurrAp->eA - CurrAp->sA + 1) - CurrAp->deltaApos - 1; CurrAp->delta[Di] += CurrAp->delta[Di] > 0 ? ValA : -(ValA); if ( CurrAp->delta[Di] == 0 || ValA < 0 ) { fprintf(stderr, "ERROR: failed to merge alignments at position %ld\n" " Please file a bug report\n", CurrAp->eA); exit (EXIT_FAILURE); } //-- Update the deltaApos for ( Dp = CurrAp->delta.begin( ) + Di; Dp < CurrAp->delta.end( ); Dp ++ ) CurrAp->deltaApos += *Dp > 0 ? *Dp : labs(*Dp) - 1; } //-- Set the alignment coordinates CurrAp->eA = targetA; CurrAp->eB = targetB; return target_reached; } void extendClusters (vector & Clusters, const FastaRecord * Af, const FastaRecord * Bf, FILE * DeltaFile) // Connect all the matches in every cluster between sequences A and B. // Also, extend alignments off of the front and back of each cluster to // expand total alignment coverage. When these extensions encounter an // adjacent cluster, fuse the two regions to create one single // encompassing region. This routine will create alignment objects from // these extensions and output the resulting delta information to the // delta output file. { //-- Sort the clusters (ascending) by their start coordinate in sequence A sort (Clusters.begin( ), Clusters.end( ), AscendingClusterSort( )); //-- If no delta file is requested if ( ! DO_DELTA ) return; bool target_reached = false; // reached the adjacent match or cluster char * A, * B; // the sequences A and B char * Brev = NULL; // the reverse complement of B unsigned int m_o; long int targetA, targetB; // alignment extension targets in A and B vector::iterator Mp; // match pointer vector::iterator PrevCp; // where the extensions last left off vector::iterator CurrCp; // the current cluster being extended vector::iterator TargetCp = Clusters.end( ); // the target cluster vector Alignments; // the vector of alignment objects vector::iterator CurrAp = Alignments.begin( ); // current align vector::iterator TargetAp; // target align //-- Extend each cluster A = Af->seq; PrevCp = Clusters.begin( ); CurrCp = Clusters.begin( ); while ( CurrCp < Clusters.end( ) ) { if ( DO_EXTEND ) { //-- Ignore if shadowed or already extended if ( ! target_reached ) if ( CurrCp->wasFused || (!DO_SHADOWS && isShadowedCluster (CurrCp, Alignments, CurrAp)) ) { CurrCp->wasFused = true; CurrCp = ++ PrevCp; continue; } } //-- Pick the right directional sequence for B if ( CurrCp->dirB == FORWARD_CHAR ) B = Bf->seq; else if ( Brev != NULL ) B = Brev; else { Brev = (char *) Safe_malloc ( sizeof(char) * (Bf->len + 2) ); strcpy ( Brev + 1, Bf->seq + 1 ); Brev[0] = '\0'; Reverse_Complement (Brev, 1, Bf->len); B = Brev; } //-- Extend each match in the cluster for ( Mp = CurrCp->matches.begin( ); Mp < CurrCp->matches.end( ); Mp ++ ) { //-- Try to extend the current match backwards if ( target_reached ) { //-- Merge with the previous match if ( CurrAp->eA != Mp->sA || CurrAp->eB != Mp->sB ) { if ( Mp >= CurrCp->matches.end( ) - 1 ) { fprintf (stderr, "ERROR: Target match does not exist, please\n" " file a bug report\n"); exit (EXIT_FAILURE); } continue; } CurrAp->eA += Mp->len - 1; CurrAp->eB += Mp->len - 1; } else { //-- Create a new alignment object addNewAlignment (Alignments, CurrCp, Mp); CurrAp = Alignments.end( ) - 1; if ( DO_EXTEND || Mp != CurrCp->matches.begin ( ) ) { //-- Target the closest/best alignment object TargetAp = getReverseTargetAlignment (Alignments, CurrAp); //-- Extend the new alignment object backwards if ( extendBackward (Alignments, CurrAp, TargetAp, A, B) ) CurrAp = TargetAp; } } m_o = FORWARD_ALIGN; //-- Try to extend the current match forwards if ( Mp < CurrCp->matches.end( ) - 1 ) { //-- Target the next match in the cluster targetA = (Mp + 1)->sA; targetB = (Mp + 1)->sB; //-- Extend the current alignment object forward target_reached = extendForward (CurrAp, A, targetA, B, targetB, m_o); } else if ( DO_EXTEND ) { targetA = Af->len; targetB = Bf->len; //-- Target the closest/best match in a future cluster TargetCp = getForwardTargetCluster (Clusters, CurrCp, targetA, targetB); if ( TargetCp == Clusters.end( ) ) { m_o |= OPTIMAL_BIT; if ( TO_SEQEND ) m_o |= SEQEND_BIT; } //-- Extend the current alignment object forward target_reached = extendForward (CurrAp, A, targetA, B, targetB, m_o); } } if ( TargetCp == Clusters.end( ) ) target_reached = false; CurrCp->wasFused = true; if ( target_reached == false ) CurrCp = ++ PrevCp; else CurrCp = TargetCp; } #ifdef _DEBUG_ASSERT validateData (Alignments, Clusters, Af, Bf); #endif //-- Output the alignment data to the delta file flushAlignments (Alignments, Af, Bf, DeltaFile); if ( Brev != NULL ) free (Brev); return; } void flushAlignments (vector & Alignments, const FastaRecord * Af, const FastaRecord * Bf, FILE * DeltaFile) // Simply output the delta information stored in Alignments to the // given delta file. Free the memory used by Alignments once the // data is successfully output to the file. { vector::iterator Ap; // alignment pointer vector::iterator Dp; // delta pointer fprintf (DeltaFile, ">%s %s %ld %ld\n", Af->Id, Bf->Id, Af->len, Bf->len); //-- Generate the error counts parseDelta (Alignments, Af, Bf); for ( Ap = Alignments.begin( ); Ap < Alignments.end( ); Ap ++ ) { if ( Ap->dirB == FORWARD_CHAR ) fprintf (DeltaFile, "%ld %ld %ld %ld %ld %ld %ld\n", Ap->sA, Ap->eA, Ap->sB, Ap->eB, Ap->Errors, Ap->SimErrors, Ap->NonAlphas); else fprintf (DeltaFile, "%ld %ld %ld %ld %ld %ld %ld\n", Ap->sA, Ap->eA, revC (Ap->sB, Bf->len), revC (Ap->eB, Bf->len), Ap->Errors, Ap->SimErrors, Ap->NonAlphas); for ( Dp = Ap->delta.begin( ); Dp < Ap->delta.end( ); Dp ++ ) fprintf (DeltaFile, "%ld\n", *Dp); fprintf (DeltaFile, "0\n"); Ap->delta.clear( ); } Alignments.clear( ); return; } void flushSyntenys (vector & Syntenys, FILE * ClusterFile) // Simply output the synteny/cluster information generated by the mgaps // program. However, now the coordinates reference their appropriate // reference sequence, and the reference sequecne header is added to // the appropriate lines. Free the memory used by Syntenys once the // data is successfully output to the file. { vector::iterator Sp; // synteny pointer vector::iterator Cp; // cluster pointer vector::iterator Mp; // match pointer if ( ClusterFile ) { for ( Sp = Syntenys.begin( ); Sp < Syntenys.end( ); Sp ++ ) { fprintf (ClusterFile, ">%s %s %ld %ld\n", Sp->AfP->Id, Sp->Bf.Id, Sp->AfP->len, Sp->Bf.len); for ( Cp = Sp->clusters.begin( ); Cp < Sp->clusters.end( ); Cp ++ ) { fprintf (ClusterFile, "%2d %2d\n", FORWARD_CHAR, Cp->dirB); for ( Mp = Cp->matches.begin( ); Mp < Cp->matches.end( ); Mp ++ ) { if ( Cp->dirB == FORWARD_CHAR ) fprintf (ClusterFile, "%8ld %8ld %6ld", Mp->sA, Mp->sB, Mp->len); else fprintf (ClusterFile, "%8ld %8ld %6ld", Mp->sA, revC (Mp->sB, Sp->Bf.len), Mp->len); if ( Mp != Cp->matches.begin( ) ) fprintf (ClusterFile, "%6ld %6ld\n", Mp->sA - (Mp - 1)->sA - (Mp - 1)->len, Mp->sB - (Mp - 1)->sB - (Mp - 1)->len); else fprintf (ClusterFile, "%6s %6s\n", "-", "-"); } Cp->matches.clear( ); } Sp->clusters.clear( ); } } else { for ( Sp = Syntenys.begin( ); Sp < Syntenys.end( ); Sp ++ ) { for ( Cp = Sp->clusters.begin( ); Cp < Sp->clusters.end( ); Cp ++ ) { Cp->matches.clear( ); } Sp->clusters.clear( ); } } Syntenys.clear( ); return; } vector::iterator getForwardTargetCluster (vector & Clusters, vector::iterator CurrCp, long int & targetA, long int & targetB) // Return the cluster that is most likely to successfully join (in a // forward direction) with the current cluster. The returned cluster // must contain 1 or more matches that are strictly greater than the end // of the current cluster. The targeted cluster must also be on a // diagonal close enough to the current cluster, so that a connection // could possibly be made by the alignment extender. Assumes clusters // have been sorted via AscendingClusterSort. Returns targeted cluster // and stores the target coordinates in targetA and targetB. If no // suitable cluster was found, the function will return NULL and target // A and targetB will remain unchanged. { vector::iterator Mip; // match iteratrive pointer vector::iterator Cp; // cluster pointer vector::iterator Cip; // cluster iterative pointer long int eA, eB; // possible target long int greater, lesser; // gap sizes between two clusters long int sA = CurrCp->matches.rbegin( )->sA + CurrCp->matches.rbegin( )->len - 1; // the endA of the current cluster long int sB = CurrCp->matches.rbegin( )->sB + CurrCp->matches.rbegin( )->len - 1; // the endB of the current cluster //-- End of sequences is the default target, set distance accordingly long int dist = (targetA - sA < targetB - sB ? targetA - sA : targetB - sB); //-- For all clusters greater than the current cluster (on sequence A) Cp = Clusters.end( ); for ( Cip = CurrCp + 1; Cip < Clusters.end( ); Cip ++ ) { //-- If the cluster is on the same direction if ( CurrCp->dirB == Cip->dirB ) { eA = Cip->matches.begin( )->sA; eB = Cip->matches.begin( )->sB; //-- If the cluster overlaps the current cluster, strip some matches if ( ( eA < sA || eB < sB ) && Cip->matches.rbegin( )->sA >= sA && Cip->matches.rbegin( )->sB >= sB ) { for ( Mip = Cip->matches.begin( ); Mip < Cip->matches.end( ) && ( eA < sA || eB < sB ); Mip ++ ) { eA = Mip->sA; eB = Mip->sB; } } //-- If the cluster is strictly greater than current cluster if ( eA >= sA && eB >= sB ) { if ( eA - sA > eB - sB ) { greater = eA - sA; lesser = eB - sB; } else { lesser = eA - sA; greater = eB - sB; } //-- If the cluster is close enough if ( greater < getBreakLen( ) || (lesser) * GOOD_SCORE [getMatrixType( )] + (greater - lesser) * CONT_GAP_SCORE [getMatrixType( )] >= 0 ) { Cp = Cip; targetA = eA; targetB = eB; break; } else if ( (greater << 1) - lesser < dist ) { Cp = Cip; targetA = eA; targetB = eB; dist = (greater << 1) - lesser; } } } } return Cp; } vector::iterator getReverseTargetAlignment (vector & Alignments, vector::iterator CurrAp) // Return the alignment that is most likely to successfully join (in a // reverse direction) with the current alignment. The returned alignment // must be strictly less than the current cluster and be on a diagonal // close enough to the current alignment, so that a connection // could possibly be made by the alignment extender. Assumes clusters // have been sorted via AscendingClusterSort and processed in order, so // therefore all alignments are in order by their start A coordinate. { vector::iterator Ap; // alignment pointer vector::iterator Aip; // alignment iterative pointer long int eA, eB; // possible targets long int greater, lesser; // gap sizes between the two alignments long int sA = CurrAp->sA; // the startA of the current alignment long int sB = CurrAp->sB; // the startB of the current alignment //-- Beginning of sequences is the default target, set distance accordingly long int dist = (sA < sB ? sA : sB); //-- For all alignments less than the current alignment (on sequence A) Ap = Alignments.end( ); for ( Aip = CurrAp - 1; Aip >= Alignments.begin( ); Aip -- ) { //-- If the alignment is on the same direction if ( CurrAp->dirB == Aip->dirB ) { eA = Aip->eA; eB = Aip->eB; //-- If the alignment is strictly less than current cluster if ( eA <= sA && eB <= sB ) { if ( sA - eA > sB - eB ) { greater = sA - eA; lesser = sB - eB; } else { lesser = sA - eA; greater = sB - eB; } //-- If the cluster is close enough if ( greater < getBreakLen( ) || (lesser) * GOOD_SCORE [getMatrixType( )] + (greater - lesser) * CONT_GAP_SCORE [getMatrixType( )] >= 0 ) { Ap = Aip; break; } else if ( (greater << 1) - lesser < dist ) { Ap = Aip; dist = (greater << 1) - lesser; } } } } return Ap; } bool isShadowedCluster (vector::iterator CurrCp, vector & Alignments, vector::iterator Ap) // Check if the current cluster is shadowed by a previously produced // alignment region. Return true if it is, else false. { vector::iterator Aip; // alignment pointer long int sA = CurrCp->matches.begin( )->sA; long int eA = CurrCp->matches.rbegin( )->sA + CurrCp->matches.rbegin( )->len - 1; long int sB = CurrCp->matches.begin( )->sB; long int eB = CurrCp->matches.rbegin( )->sB + CurrCp->matches.rbegin( )->len - 1; if ( ! Alignments.empty( ) ) // if there are alignments to use { //-- Look backwards in hope of finding a shadowing alignment for ( Aip = Ap; Aip >= Alignments.begin( ); Aip -- ) { //-- If in the same direction and shadowing the current cluster, break if ( Aip->dirB == CurrCp->dirB ) if ( Aip->eA >= eA && Aip->eB >= eB ) if ( Aip->sA <= sA && Aip->sB <= sB ) break; } //-- Return true if the loop was broken, i.e. shadow found if ( Aip >= Alignments.begin( ) ) return true; } //-- Return false if Alignments was empty or loop was not broken return false; } void parseAbort (const char * s) // Abort the program if there was an error in parsing file 's' { fprintf (stderr, "ERROR: Could not parse input from '%s'. \n" "Please check the filename and format, or file a bug report\n", s); exit (EXIT_FAILURE); } void parseDelta (vector & Alignments, const FastaRecord * Af, const FastaRecord *Bf) // Use the delta information to generate the error counts for each // alignment, and fill this information into the data type { char * A, * B, * Brev = NULL; char ch1, ch2; long int Delta; int Sign; long int i, Apos, Bpos; long int Remain, Total; long int Errors, SimErrors; long int NonAlphas; vector::iterator Ap; vector::iterator Dp; for ( Ap = Alignments.begin( ); Ap < Alignments.end( ); Ap ++ ) { A = Af->seq; B = Bf->seq; if ( Ap->dirB == REVERSE_CHAR ) { if ( Brev == NULL ) { Brev = (char *) Safe_malloc ( sizeof(char) * (Bf->len + 2) ); strcpy ( Brev + 1, Bf->seq + 1 ); Brev[0] = '\0'; Reverse_Complement (Brev, 1, Bf->len); B = Brev; } B = Brev; } Apos = Ap->sA; Bpos = Ap->sB; Errors = 0; SimErrors = 0; NonAlphas = 0; Remain = Ap->eA - Ap->sA + 1; Total = Remain; //-- For all delta's in this alignment for ( Dp = Ap->delta.begin( ); Dp < Ap->delta.end( ); Dp ++ ) { Delta = *Dp; Sign = Delta > 0 ? 1 : -1; Delta = labs ( Delta ); //-- For all the bases before the next indel for ( i = 1; i < Delta; i ++ ) { ch1 = A [Apos ++]; ch2 = B [Bpos ++]; if ( !isalpha (ch1) ) { ch1 = STOP_CHAR; NonAlphas ++; } if ( !isalpha (ch2) ) { ch2 = STOP_CHAR; NonAlphas ++; } ch1 = toupper(ch1); ch2 = toupper(ch2); if ( 1 > MATCH_SCORE [getMatrixType( )] [ch1 - 'A'] [ch2 - 'A'] ) SimErrors ++; if ( ch1 != ch2 ) Errors ++; } //-- Process the current indel Remain -= i - 1; Errors ++; SimErrors ++; if ( Sign == 1 ) { Apos ++; Remain --; } else { Bpos ++; Total ++; } } //-- For all the bases after the final indel for ( i = 0; i < Remain; i ++ ) { //-- Score character match and update error counters ch1 = A [Apos ++]; ch2 = B [Bpos ++]; if ( !isalpha (ch1) ) { ch1 = STOP_CHAR; NonAlphas ++; } if ( !isalpha (ch2) ) { ch2 = STOP_CHAR; NonAlphas ++; } ch1 = toupper(ch1); ch2 = toupper(ch2); if ( 1 > MATCH_SCORE [getMatrixType( )] [ch1 - 'A'] [ch2 - 'A'] ) SimErrors ++; if ( ch1 != ch2 ) Errors ++; } Ap->Errors = Errors; Ap->SimErrors = SimErrors; Ap->NonAlphas = NonAlphas; } if ( Brev != NULL ) free ( Brev ); return; } void processSyntenys (vector & Syntenys, FastaRecord * Af, long int As, FILE * QryFile, FILE * ClusterFile, FILE * DeltaFile) // For each syntenic region with clusters, read in the B sequence and // extend the clusters to expand total alignment coverage. Only should // be called once all the clusters for the contained syntenic regions have // been stored in the data structure. Frees the memory used by the // the syntenic regions once the output of extendClusters and // flushSyntenys has been produced. { FastaRecord Bf; // the current B sequence Bf.Id = (char *) Safe_malloc (sizeof(char) * (MAX_LINE + 1)); long int InitSize = INIT_SIZE; // the initial memory capacity of B vector::iterator CurrSp; // current synteny pointer //-- Initialize the B sequence storage Bf.seq = (char *) Safe_malloc ( sizeof(char) * InitSize ); //-- For all the contained syntenys for ( CurrSp = Syntenys.begin( ); CurrSp < Syntenys.end( ); CurrSp ++ ) { //-- If no clusters, ignore if ( CurrSp->clusters.empty( ) ) continue; //-- If a B sequence not seen yet, read it in //-- IMPORTANT: The B sequences in the synteny object are assumed to be // ordered as output by mgaps, if they are not in order the program // will fail. (All like tags must be adjacent and in the same order // as the query file) if ( CurrSp == Syntenys.begin( ) || strcmp (CurrSp->Bf.Id, (CurrSp-1)->Bf.Id) != 0 ) { //-- Read in the B sequence while ( Read_String (QryFile, Bf.seq, InitSize, Bf.Id, FALSE) ) if ( strcmp (CurrSp->Bf.Id, Bf.Id) == 0 ) break; if ( strcmp (CurrSp->Bf.Id, Bf.Id) != 0 ) parseAbort ( "Query File" ); Bf.len = strlen (Bf.seq + 1); } //-- Extend clusters and create the alignment information CurrSp->Bf.len = Bf.len; extendClusters (CurrSp->clusters, CurrSp->AfP, &Bf, DeltaFile); } //-- Create the cluster information flushSyntenys (Syntenys, ClusterFile); free (Bf.Id); free (Bf.seq); return; } inline long int revC (long int Coord, long int Len) // Reverse complement the given coordinate for the given length. { assert (Len - Coord + 1 > 0); return (Len - Coord + 1); } void printHelp (const char * s) // Display the program's help information to stderr. { fprintf (stderr, "\nUSAGE: %s [options] < \n\n", s); fprintf (stderr, "-b int set the alignment break (give-up) length to int\n" "-d output only match clusters rather than extended alignments\n" "-e do not extend alignments outward from clusters\n" "-h display help information\n" "-s don't remove shadowed alignments, useful for aligning a\n" " sequence to itself to identify repeats\n" "-t force alignment to ends of sequence if within -b distance\n\n"); fprintf (stderr, " Input is the output of the \"mgaps\" program from stdin, and\n" "the two original NUCmer sequence files passed on the command\n" "line. is the prefix to be added to the front of the\n" "output file .delta\n" " .delta is the alignment object that catalogs the distance\n" "between insertions and deletions. For further information\n" "regarding this file, please refer to the documentation under\n" "the .delta output description.\n\n"); return; } void printUsage (const char * s) // Display the program's usage information to stderr. { fprintf (stderr, "\nUSAGE: %s [options] < \n\n", s); fprintf (stderr, "Try '%s -h' for more information.\n", s); return; } void validateData (vector Alignments, vector Clusters, const FastaRecord * Af, const FastaRecord * Bf) // Self test function to check that the cluster and alignment information // is valid { char * A, * B, * Brev = NULL; vector::iterator Cp; vector::iterator Mp; vector::iterator Ap; long int x, y, i; char Xc, Yc; A = Af->seq; for ( Cp = Clusters.begin( ); Cp < Clusters.end( ); Cp ++ ) { assert ( Cp->wasFused ); //-- Pick the right directional sequence for B if ( Cp->dirB == FORWARD_CHAR ) B = Bf->seq; else if ( Brev != NULL ) B = Brev; else { Brev = (char *) Safe_malloc ( sizeof(char) * (Bf->len + 2) ); strcpy ( Brev + 1, Bf->seq + 1 ); Brev[0] = '\0'; Reverse_Complement (Brev, 1, Bf->len); B = Brev; } for ( Mp = Cp->matches.begin( ); Mp < Cp->matches.end( ); Mp ++ ) { //-- assert for each match in cluster, it is indeed a match x = Mp->sA; y = Mp->sB; for ( i = 0; i < Mp->len; i ++ ) assert ( A[x ++] == B[y ++] ); //-- assert for each match in cluster, it is contained in an alignment for ( Ap = Alignments.begin( ); Ap < Alignments.end( ); Ap ++ ) { if ( Ap->sA <= Mp->sA && Ap->sB <= Mp->sB && Ap->eA >= Mp->sA + Mp->len - 1 && Ap->eB >= Mp->sB + Mp->len - 1 ) break; } assert ( Ap < Alignments.end( ) ); } } //-- assert alignments are optimal (quick check if first and last chars equal) for ( Ap = Alignments.begin( ); Ap < Alignments.end( ); Ap ++ ) { if ( Ap->dirB == REVERSE_CHAR ) { assert (Brev != NULL); B = Brev; } else B = Bf->seq; assert ( Ap->sA <= Ap->eA ); assert ( Ap->sB <= Ap->eB ); assert ( Ap->sA >= 1 && Ap->sA <= Af->len ); assert ( Ap->eA >= 1 && Ap->eA <= Af->len ); assert ( Ap->sB >= 1 && Ap->sB <= Bf->len ); assert ( Ap->eB >= 1 && Ap->eB <= Bf->len ); Xc = toupper(isalpha(A[Ap->sA]) ? A[Ap->sA] : STOP_CHAR); Yc = toupper(isalpha(B[Ap->sB]) ? B[Ap->sB] : STOP_CHAR); assert ( 0 <= MATCH_SCORE [0] [Xc - 'A'] [Yc - 'A'] ); Xc = toupper(isalpha(A[Ap->eA]) ? A[Ap->eA] : STOP_CHAR); Yc = toupper(isalpha(B[Ap->eB]) ? B[Ap->eB] : STOP_CHAR); assert ( 0 <= MATCH_SCORE [0] [Xc - 'A'] [Yc - 'A'] ); } if ( Brev != NULL ) free (Brev); return; }