//------------------------------------------------------------------------------ // Programmer: Adam M Phillippy, The Institute for Genomic Research // File: postpro.cc // Date: 08 / 20 / 2002 // // Purpose: To translate the coordinates referencing the concatenated // reference sequences back to the individual sequences and deal // with any conflict that may have arisen (i.e. a MUM that spans // the boundry between two sequences). Then to extend each cluster // via Smith-Waterman techniques to expand the alignment coverage. // Alignments which encounter each other will be fused to form one // encompasing alignment when appropriate. // // Input: Input is the output of the .mgaps program from stdin. On the // command line, the file names of the two original sequence files // should come first, followed by the prefix that should be // placed in front of the two output filenames .cluster and // .delta // // NOTE: Cluster file is now suppressed by default (see -d option). // // Output: Output is to two output files, .cluster and .delta. // .cluster lists MUM clusters as identified by "mgaps". // However, the clusters now reference their corresponding // sequence and are all listed under headers that specify this // sequence. In addition, the coordinates now reference the original // DNA input and not the amino acid translations. The .delta file // is the alignment object that contains all the information necessary // to reproduce the alignments generated by the MUM extension process. // The coordinates in this file reference the original DNA input, however // the delta values represent amino acids, so 1 delta = 3 nucleotides. // Please refer to the output file documentation for an in-depth // description of these file formats. // // Usage: postpro < // Where and are the original input sequences of // PROmer and is the prefix that should be added to the // beginning of the .cluster and .delta output filenames. // //------------------------------------------------------------------------------ //-- NOTE: this option will significantly hamper program performance, // mostly the alignment extension performance (sw_align.h) //#define _DEBUG_ASSERT // self testing assert functions #include "tigrinc.hh" #include "sw_align.hh" #include "translate.hh" #include #include using namespace std; //------------------------------------------------------ Globals -------------// bool DO_DELTA = true; bool DO_EXTEND = true; bool TO_SEQEND = false; //------------------------------------------------------ Type Definitions ----// enum LineType //-- The type of input line from { NO_LINE, HEADER_LINE, MATCH_LINE }; struct FastaRecord //-- The essential data of a sequence { char * Id; // the fasta ID header tag long int len; // the length of the sequence char * seq; // the sequence data }; struct Match //-- An exact match between two sequences A and B { long int sA, sB, len; // start coordinate in A, in B and the length }; struct Cluster //-- An ordered list of matches between two sequences A and B { bool wasFused; // have the cluster matches been extended yet? int frameA; // the reference sequence frame (1-6) int frameB; // the query sequence frame vector matches; // the ordered set of matches in the cluster }; struct Synteny //-- An ordered list of clusters between two sequences A and B { FastaRecord * AfP; // a pointer to the reference sequence record FastaRecord Bf; // the query sequence record (w/o the sequence) vector clusters; // the ordered set of clusters between A and B }; struct Alignment //-- An alignment object between two sequences A and B { int frameA; // the reference sequence frame (1-6) int frameB; // the query sequence frame long int sA, sB, eA, eB; // the start in A, B and the end in A, B vector delta; // the delta values, with NO zero at the end long int deltaApos; // sum of abs(deltas) - #of negative deltas // trust me, it is a very helpful value long int Errors, SimErrors, NonAlphas; // errors, similarity errors, nonalphas }; struct AscendingClusterSort //-- For sorting clusters in ascending order of their sA coordinate { bool operator() (const Cluster & pA, const Cluster & pB) { return ( pA.matches.begin( )->sA < pB.matches.begin( )->sA ); } }; //------------------------------------------------- Function Declarations ----// void addNewAlignment (vector & Alignments, vector::iterator Cp, vector::iterator Mp); bool extendBackward (vector & Alignments, vector::iterator CurrAp, vector::iterator TargetAp, const char * A, const char * B); bool extendForward (vector::iterator Ap, const char * A, long int targetA, const char * B, long int targetB, unsigned int m_o); void extendClusters (vector & Clusters, const FastaRecord * A, const FastaRecord * B, FILE * DeltaFile); void flushAlignments (vector & Alignments, const FastaRecord * Af, const FastaRecord * Bf, FILE * DeltaFile); void flushSyntenys (vector & Syntenys, FILE * ClusterFile); vector::iterator getForwardTargetCluster (vector & Clusters, vector::iterator CurrCp, long int & targetA, long int & targetB); vector::iterator getReverseTargetAlignment (vector & Alignments, vector::iterator CurrAp); bool isShadowedCluster (vector::iterator CurrCp, vector & Alignments, vector::iterator Ap); void parseAbort (const char *); void parseDelta (vector & Alignments, const FastaRecord * Af, const FastaRecord *Bf); void processSyntenys (vector & Syntenys, FastaRecord * Af, long int As, FILE * QryFile, FILE * ClusterFile, FILE * DeltaFile); inline long int transC (long int Coord, int Frame, long int Len); inline long int refLen (long int ntLen); inline long int revC (long int Coord, long int Len); void printHelp (const char *); void printUsage (const char *); void validateData (vector Alignments, vector Clusters, const FastaRecord * Af, const FastaRecord * Bf); //-------------------------------------------------- Function Definitions ----// int main (int argc, char ** argv) { FastaRecord * Af; // array of all the reference sequences vector Syntenys; // vector of all sets of clusters vector::reverse_iterator CurrSp; // current set of clusters vector::reverse_iterator Sp; // temporary synteny pointer Synteny Asyn; // a single set of clusters Cluster Aclu; // a single cluster of matches Match Amat; // a single match LineType PrevLine; // the current input line bool Found; // temporary search flag char Line [MAX_LINE]; // a single line of input char CurrIdB [MAX_LINE], IdA [MAX_LINE], IdB [MAX_LINE]; // fasta ID headers char ClusterFileName [MAX_LINE], DeltaFileName [MAX_LINE]; // file names char RefFileName [MAX_LINE], QryFileName [MAX_LINE]; int FrameA, FrameB; // reading frames (1-6) int CurrFrameA, CurrFrameB; long int i; // temporary counter long int Seqi; // current reference sequence index long int InitSize; // initial sequence memory space long int As = 0; // the size of the reference array long int Ac = 100; // the capacity of the reference array long int sA, sB, len; // current match start in A, B and length FILE * RefFile, * QryFile; // reference and query input files FILE * ClusterFile, * DeltaFile; // cluster and delta output files //-- Set the alignment data type and break length (sw_align.h) setMatrixType ( BLOSUM62 ); setBreakLen ( 60 ); //-- Parse the command line arguments { optarg = NULL; int ch, errflg = 0; while ( !errflg && ((ch = getopt (argc, argv, "dehb:tx:")) != EOF) ) switch (ch) { case 'b' : setBreakLen (atoi (optarg)); break; case 'd' : DO_DELTA = false; break; case 'e' : DO_EXTEND = false; break; case 'h' : printHelp (argv[0]); exit (EXIT_SUCCESS); break; case 't' : TO_SEQEND = true; break; case 'x' : setMatrixType ( atoi (optarg) ); if ( getMatrixType( ) == NUCLEOTIDE ) { fprintf (stderr, "WARNING: invalid matrix type %d, ignoring\n", getMatrixType( )); setMatrixType ( BLOSUM62 ); } break; default : errflg ++; } if ( errflg > 0 || argc - optind != 3 ) { printUsage (argv[0]); exit (EXIT_FAILURE); } } //-- Read and create the I/O file names strcpy (RefFileName, argv[optind ++]); strcpy (QryFileName, argv[optind ++]); strcpy (ClusterFileName, argv[optind ++]); strcpy (DeltaFileName, ClusterFileName); strcat (ClusterFileName, ".cluster"); strcat (DeltaFileName, ".delta"); //-- Open all the files RefFile = File_Open (RefFileName, "r"); QryFile = File_Open (QryFileName, "r"); if ( DO_DELTA ) { ClusterFile = NULL; DeltaFile = File_Open (DeltaFileName, "w"); } else { ClusterFile = File_Open (ClusterFileName, "w"); DeltaFile = NULL; } //-- Print the headers of the output files if ( DO_DELTA ) fprintf (DeltaFile, "%s %s\nPROMER\n", RefFileName, QryFileName); else fprintf (ClusterFile, "%s %s\nPROMER\n", RefFileName, QryFileName); //-- Generate the array of the reference sequences InitSize = INIT_SIZE; Af = (FastaRecord *) Safe_malloc ( sizeof(FastaRecord) * Ac ); Af[As].seq = (char *) Safe_malloc ( sizeof(char) * InitSize ); while ( Read_String (RefFile, Af[As].seq, InitSize, IdA, FALSE) ) { Af[As].Id = (char *) Safe_malloc (sizeof(char) * (strlen(IdA) + 1)); strcpy (Af[As].Id, IdA); Af[As].len = strlen (Af[As].seq + 1); if ( ++ As >= Ac ) { Ac *= 2; Af = (FastaRecord *) Safe_realloc ( Af, sizeof(FastaRecord) * Ac ); } InitSize = INIT_SIZE; Af[As].seq = (char *) Safe_malloc ( sizeof(char) * InitSize ); } fclose (RefFile); if ( As <= 0 ) parseAbort ( RefFileName ); //-- Process the input from line by line PrevLine = NO_LINE; IdA[0] = '\0'; IdB[0] = '\0'; FrameA = 0; FrameB = 0; CurrFrameA = -1; CurrFrameB = -1; CurrIdB[0] = '\0'; while ( fgets(Line, MAX_LINE, stdin) != NULL ) { //-- If the current line is a fasta HEADER_LINE if ( Line[0] == '>' ) { if ( sscanf (Line + 1, "%s", CurrIdB) != 1 ) parseAbort ("stdin"); sscanf (CurrIdB + strlen(CurrIdB) - 1, "%d", &CurrFrameB); CurrIdB [strlen(CurrIdB) - 2] = '\0'; PrevLine = HEADER_LINE; } //-- If the current line is a cluster HEADER_LINE else if ( Line[0] == '#' ) { PrevLine = HEADER_LINE; } //-- If the current line is a MATCH_LINE else { if ( sscanf (Line, "%ld %ld %ld", &sA, &sB, &len) != 3 ) parseAbort ("stdin"); //-- Re-map the reference coordinate back to its original sequence // NOTE: (len + 1) * 2 = refLen(len) = concatenated frame length // including the appended 'X' on each frame translation for ( Seqi = 0; sA > refLen (Af[Seqi].len); Seqi ++ ) sA -= refLen (Af[Seqi].len); assert ( Seqi < As ); //-- Get the correct frame, translate startA coordinate to frame for ( i = 0; sA > (Af[Seqi].len - (i % 3)) / 3 + 1; i ++ ) sA -= (Af[Seqi].len - (i % 3)) / 3 + 1; CurrFrameA = i + 1; assert ( CurrFrameA > 0 && CurrFrameA < 7 ); //-- If the match spans across a frame boundry if ( CurrFrameA < 1 || CurrFrameA > 6 || sA + len - 1 > (Af[Seqi].len - ((CurrFrameA - 1) % 3)) / 3 + 1 || sA <= 0 ) { fprintf (stderr, "\nWARNING: A MUM was found extending beyond the boundry of:\n" " Reference sequence '>%s', frame %d\n" "Please file a bug report\n" "Attempting to continue.\n", Af[Seqi].Id, CurrFrameA); continue; } //-- If the match spans across a sequence boundry if ( sA + len - 1 > refLen (Af[Seqi].len) || sA <= 0 ) { fprintf (stderr, "\nWARNING: A MUM was found extending beyond the boundry of:\n" " Reference sequence '>%s'\n" "Please file a bug report\n" "Attempting to continue.\n", Af[Seqi].Id); continue; } //-- Check and update the current synteny region if ( strcmp (IdA, Af[Seqi].Id) != 0 || strcmp (IdB, CurrIdB) != 0 || CurrFrameA != FrameA || CurrFrameB != FrameB ) { Found = false; if ( strcmp (IdB, CurrIdB) == 0 ) { //-- Has this header been seen before? for ( Sp = Syntenys.rbegin( ); Sp < Syntenys.rend( ); Sp ++ ) { if ( strcmp (Sp->AfP->Id, Af[Seqi].Id) == 0 ) { assert ( Sp->AfP->len == Af[Seqi].len ); assert ( strcmp (Sp->Bf.Id, IdB) == 0 ); CurrSp = Sp; Found = true; break; } } } else { //-- New B sequence header, process all the old synteny's processSyntenys (Syntenys, Af, As, QryFile, ClusterFile, DeltaFile); } strcpy (IdA, Af[Seqi].Id); strcpy (IdB, CurrIdB); FrameA = CurrFrameA; FrameB = CurrFrameB; //-- If not seen yet, create a new synteny region if ( ! Found ) { Asyn.AfP = Af + Seqi; Asyn.Bf.len = -1; Asyn.Bf.Id = (char *) Safe_malloc (sizeof(char) * (strlen(IdB) + 1)); strcpy (Asyn.Bf.Id, IdB); Syntenys.push_back (Asyn); CurrSp = Syntenys.rbegin( ); } //-- Add a new cluster to the current synteny if ( !Syntenys.empty( ) && !CurrSp->clusters.empty( ) ) if ( CurrSp->clusters.rbegin( )->matches.empty( ) ) CurrSp->clusters.pop_back( ); // hack to remove empties Aclu.frameA = FrameA; Aclu.frameB = FrameB; Aclu.wasFused = false; CurrSp->clusters.push_back (Aclu); } else if ( PrevLine == HEADER_LINE ) { //-- Add a new cluster to the current synteny if ( !Syntenys.empty( ) && !CurrSp->clusters.empty( ) ) if ( CurrSp->clusters.rbegin( )->matches.empty( ) ) CurrSp->clusters.pop_back( ); Aclu.frameA = FrameA; Aclu.frameB = FrameB; Aclu.wasFused = false; CurrSp->clusters.push_back (Aclu); } //-- Add a new match to the current cluster // NOTE: A and B coordinates still reference the appropriate // amino acid sequence frame, not the DNA (same with len) if ( len > 1 ) { Amat.sA = sA; Amat.sB = sB; Amat.len = len; CurrSp->clusters.rbegin( )->matches.push_back (Amat); } PrevLine = MATCH_LINE; } } //-- Process the left-over syntenys if ( !Syntenys.empty( ) && !CurrSp->clusters.empty( ) ) if ( CurrSp->clusters.rbegin( )->matches.empty( ) ) CurrSp->clusters.pop_back( ); processSyntenys (Syntenys, Af, As, QryFile, ClusterFile, DeltaFile); fclose (QryFile); //-- Free the reference sequences for ( i = 0; i < As; i ++ ) { free (Af[i].Id); free (Af[i].seq); } free (Af); return EXIT_SUCCESS; } void addNewAlignment (vector & Alignments, vector::iterator Cp, vector::iterator Mp) // Create and add a new alignment object based on the current match // and cluster information pointed to by Cp and Mp. { Alignment Align; //-- Initialize the data Align.sA = Mp->sA; Align.sB = Mp->sB; Align.eA = Mp->sA + Mp->len - 1; Align.eB = Mp->sB + Mp->len - 1; Align.frameA = Cp->frameA; Align.frameB = Cp->frameB; Align.delta.clear( ); Align.deltaApos = 0; //-- Add the alignment object Alignments.push_back (Align); return; } bool extendBackward (vector & Alignments, vector::iterator CurrAp, vector::iterator TargetAp, const char * A, const char * B) // Extend an alignment backwards off of the current alignment object. // The current alignment object must be freshly created and consist // only of an exact match (i.e. the delta vector MUST be empty). // If the TargetAp alignment object is reached by the extension, it will // be merged with CurrAp and CurrAp will be destroyed. If TargetAp is // NULL the function will extend as far as possible. It is a strange // and dangerous function because it can delete CurrAp, so edit with // caution. Returns true if TargetAp was reached and merged, else false // Designed only as a subroutine for extendClusters, should be used // nowhere else. { bool target_reached = false; bool overflow_flag = false; bool double_flag = false; vector::iterator Dp; unsigned int m_o; long int targetA, targetB; m_o = BACKWARD_SEARCH; //-- Set the target coordinates if ( TargetAp != Alignments.end( ) ) { targetA = TargetAp->eA; targetB = TargetAp->eB; } else { targetA = 1; targetB = 1; m_o |= OPTIMAL_BIT; } //-- If alignment is too long, bring with bounds and set overflow_flag true if ( CurrAp->sA - targetA + 1 > MAX_ALIGNMENT_LENGTH ) { targetA = CurrAp->sA - MAX_ALIGNMENT_LENGTH + 1; overflow_flag = true; m_o |= OPTIMAL_BIT; } if ( CurrAp->sB - targetB + 1 > MAX_ALIGNMENT_LENGTH ) { targetB = CurrAp->sB - MAX_ALIGNMENT_LENGTH + 1; if ( overflow_flag ) double_flag = true; else overflow_flag = true; m_o |= OPTIMAL_BIT; } if ( TO_SEQEND && !double_flag ) m_o |= SEQEND_BIT; //-- Attempt to reach the target target_reached = alignSearch (A, CurrAp->sA, targetA, B, CurrAp->sB, targetB, m_o); if ( overflow_flag || TargetAp == Alignments.end( ) ) target_reached = false; if ( target_reached ) { //-- assert that CurrAp is new and at the end of the Alignment vector assert ( CurrAp->delta.empty( ) ); assert ( CurrAp == Alignments.end( ) - 1 ); //-- Merge the two alignment objects extendForward (TargetAp, A, CurrAp->sA, B, CurrAp->sB, FORCED_FORWARD_ALIGN); TargetAp->eA = CurrAp->eA; TargetAp->eB = CurrAp->eB; Alignments.pop_back( ); } else { alignTarget (A, targetA, CurrAp->sA, B, targetB, CurrAp->sB, CurrAp->delta, FORCED_FORWARD_ALIGN); CurrAp->sA = targetA; CurrAp->sB = targetB; //-- Update the deltaApos value for the alignment object for ( Dp = CurrAp->delta.begin( ); Dp < CurrAp->delta.end( ); Dp ++ ) CurrAp->deltaApos += *Dp > 0 ? *Dp : labs(*Dp) - 1; } return target_reached; } bool extendForward (vector::iterator CurrAp, const char * A, long int targetA, const char * B, long int targetB, unsigned int m_o) // Extend an alignment forwards off the current alignment object until // target or end of sequence is reached, and merge the delta values of the // alignment object with the new delta values generated by the extension. // Return true if the target was reached, else false { long int ValA; long int ValB; unsigned int Di; bool target_reached; bool overflow_flag = false; bool double_flag = false; vector::iterator Dp; //-- Set Di to the end of the delta vector Di = CurrAp->delta.size( ); //-- Calculate the distance between the start and end positions ValA = targetA - CurrAp->eA + 1; ValB = targetB - CurrAp->eB + 1; //-- If the distance is too long, shrink it and set the overflow_flag if ( ValA > MAX_ALIGNMENT_LENGTH ) { targetA = CurrAp->eA + MAX_ALIGNMENT_LENGTH - 1; overflow_flag = true; m_o |= OPTIMAL_BIT; } if ( ValB > MAX_ALIGNMENT_LENGTH ) { targetB = CurrAp->eB + MAX_ALIGNMENT_LENGTH - 1; if ( overflow_flag ) double_flag = true; else overflow_flag = true; m_o |= OPTIMAL_BIT; } if ( double_flag ) m_o &= ~SEQEND_BIT; target_reached = alignTarget (A, CurrAp->eA, targetA, B, CurrAp->eB, targetB, CurrAp->delta, m_o); //-- Notify user if alignment was chopped short if ( target_reached && overflow_flag ) target_reached = false; //-- Pick up delta where left off (Di) and merge with new deltas if ( Di < CurrAp->delta.size( ) ) { //-- Merge the deltas ValA = (CurrAp->eA - CurrAp->sA + 1) - CurrAp->deltaApos - 1; CurrAp->delta[Di] += CurrAp->delta[Di] > 0 ? ValA : -(ValA); if ( CurrAp->delta[Di] == 0 || ValA < 0 ) { fprintf(stderr, "ERROR: failed to merge alignments at position %ld\n" " Please file a bug report\n", CurrAp->eA); exit (EXIT_FAILURE); } //-- Update the deltaApos for ( Dp = CurrAp->delta.begin( ) + Di; Dp < CurrAp->delta.end( ); Dp ++ ) CurrAp->deltaApos += *Dp > 0 ? *Dp : labs(*Dp) - 1; } //-- Set the alignment coordinates CurrAp->eA = targetA; CurrAp->eB = targetB; return target_reached; } void extendClusters (vector & Clusters, const FastaRecord * Af, const FastaRecord * Bf, FILE * DeltaFile) // Connect all the matches in every cluster between sequences A and B. // Also, extend alignments off of the front and back of each cluster to // expand total alignment coverage. When these extensions encounter an // adjacent cluster (in a consistent frame), fuse the two regions to // create one single encompassing region. This routine will create // alignment objects from these extensions and output the resulting // delta information to the delta output file. Also, all coordintes // for the clusters and the alignment regions are translated to reference // the original DNA sequecne. { //-- Sort the clusters (ascending) by their start coordinate in sequence A sort (Clusters.begin( ), Clusters.end( ), AscendingClusterSort( )); //-- If no delta file is requested if ( ! DO_DELTA ) return; bool target_reached = false; // reached the adjacent match or cluster // the amino acid sequences for A and B char * A [7] = {NULL, NULL, NULL, NULL, NULL, NULL, NULL}; char * B [7] = {NULL, NULL, NULL, NULL, NULL, NULL, NULL}; long int Alen [7], Blen [7]; // the length of the amino acid sequences int i; unsigned int m_o; long int targetA, targetB; // alignment extension targets in A and B vector::iterator Mp; // match pointer vector::iterator PrevCp; // where the extensions last left off vector::iterator CurrCp; // the current cluster being extended vector::iterator TargetCp = Clusters.end( ); // the target cluster vector Alignments; // the vector of alignment objects vector::iterator CurrAp = Alignments.begin( ); // current align vector::iterator TargetAp; // target align //-- Calculate the length of each amino acid frame translation for ( i = 0; i < 6; i ++ ) { Alen [i + 1] = (Af->len - (i % 3)) / 3; Blen [i + 1] = (Bf->len - (i % 3)) / 3; } //-- Extend each cluster PrevCp = Clusters.begin( ); CurrCp = Clusters.begin( ); while ( CurrCp < Clusters.end( ) ) { if ( DO_EXTEND ) { //-- Ignore if shadowed or already extended if ( ! target_reached ) if ( CurrCp->wasFused || isShadowedCluster (CurrCp, Alignments, CurrAp) ) { CurrCp->wasFused = true; CurrCp = ++ PrevCp; continue; } } //-- Initialize the right amino acid sequence for A and B if need be if ( A [CurrCp->frameA] == NULL ) { A [CurrCp->frameA] = (char *) Safe_malloc ( sizeof(char) * ( (Af->len / 3) + 2) ); A [CurrCp->frameA] [0] = '\0'; Alen [CurrCp->frameA] = Translate_DNA ( Af->seq, A [CurrCp->frameA], CurrCp->frameA ); } if ( B [CurrCp->frameB] == NULL ) { B [CurrCp->frameB] = (char *) Safe_malloc ( sizeof(char) * ( (Bf->len / 3) + 2) ); B [CurrCp->frameB] [0] = '\0'; Blen [CurrCp->frameB] = Translate_DNA ( Bf->seq, B [CurrCp->frameB], CurrCp->frameB ); } //-- Extend each match in the cluster for ( Mp = CurrCp->matches.begin( ); Mp < CurrCp->matches.end( ); Mp ++ ) { //-- Try to extend the current match backwards if ( target_reached ) { //-- Merge with the previous match if ( CurrAp->eA != Mp->sA || CurrAp->eB != Mp->sB ) { //-- Strip matches until the targeted match is found assert (Mp < CurrCp->matches.end( ) - 1); continue; } CurrAp->eA += Mp->len - 1; CurrAp->eB += Mp->len - 1; } else { //-- Create a new alignment object addNewAlignment (Alignments, CurrCp, Mp); CurrAp = Alignments.end( ) - 1; if ( DO_EXTEND || Mp != CurrCp->matches.begin ( ) ) { //-- Target the closest/best alignment object TargetAp = getReverseTargetAlignment (Alignments, CurrAp); //-- Extend the new alignment object backwards if ( extendBackward (Alignments, CurrAp, TargetAp, A [CurrCp->frameA], B [CurrCp->frameB]) ) CurrAp = TargetAp; } } m_o = FORWARD_ALIGN; //-- Try to extend the current match forwards if ( Mp < CurrCp->matches.end( ) - 1 ) { //-- Target the next match in the cluster targetA = (Mp + 1)->sA; targetB = (Mp + 1)->sB; //-- Extend the current alignment object forward target_reached = extendForward (CurrAp, A [CurrCp->frameA], targetA, B [CurrCp->frameB], targetB, m_o); } else if ( DO_EXTEND ) { targetA = Alen [CurrCp->frameA]; targetB = Blen [CurrCp->frameB]; //-- Target the closest/best match in a future cluster TargetCp = getForwardTargetCluster (Clusters, CurrCp, targetA, targetB); if ( TargetCp == Clusters.end( ) ) { m_o |= OPTIMAL_BIT; if ( TO_SEQEND ) m_o |= SEQEND_BIT; } //-- Extend the current alignment object forward target_reached = extendForward (CurrAp, A [CurrCp->frameA], targetA, B [CurrCp->frameB], targetB, m_o); } } if ( TargetCp == Clusters.end( ) ) target_reached = false; CurrCp->wasFused = true; if ( target_reached == false ) CurrCp = ++ PrevCp; else CurrCp = TargetCp; } #ifdef _DEBUG_ASSERT validateData (Alignments, Clusters, Af, Bf); #endif //-- Output the alignment data to the delta file flushAlignments (Alignments, Af, Bf, DeltaFile); for ( i = 0; i < 7; i ++ ) { if ( A [i] != NULL ) free ( A [i] ); if ( B [i] != NULL ) free ( B [i] ); } return; } void flushAlignments (vector & Alignments, const FastaRecord * Af, const FastaRecord * Bf, FILE * DeltaFile) // Simply output the delta information stored in Alignments to the // given delta file. Free the memory used by Alignments once the // data is successfully output to the file. { vector::iterator Ap; // alignment pointer vector::iterator Dp; // delta pointer fprintf (DeltaFile, ">%s %s %ld %ld\n", Af->Id, Bf->Id, Af->len, Bf->len); //-- Generate the error counts parseDelta (Alignments, Af, Bf); for ( Ap = Alignments.begin( ); Ap < Alignments.end( ); Ap ++ ) { fprintf (DeltaFile, "%ld %ld %ld %ld %ld %ld %ld\n", transC (Ap->sA, Ap->frameA, Af->len), transC (Ap->eA, Ap->frameA, Af->len) + (Ap->frameA > 3 ? -2:2), transC (Ap->sB, Ap->frameB, Bf->len), transC (Ap->eB, Ap->frameB, Bf->len) + (Ap->frameB > 3 ? -2:2), Ap->Errors, Ap->SimErrors, Ap->NonAlphas); for ( Dp = Ap->delta.begin( ); Dp < Ap->delta.end( ); Dp ++ ) fprintf (DeltaFile, "%ld\n", *Dp); fprintf (DeltaFile, "0\n"); Ap->delta.clear( ); } Alignments.clear( ); return; } void flushSyntenys (vector & Syntenys, FILE * ClusterFile) // Simply output the synteny/cluster information generated by the mgaps // program. However, now the coordinates reference their appropriate // reference sequence, and the reference sequecne header is added to // the appropriate lines. Free the memory used by Syntenys once the // data is successfully output to the file. { vector::iterator Sp; // synteny pointer vector::iterator Cp; // cluster pointer vector::iterator Mp; // match pointer if ( ClusterFile ) { for ( Sp = Syntenys.begin( ); Sp < Syntenys.end( ); Sp ++ ) { fprintf (ClusterFile, ">%s %s %ld %ld\n", Sp->AfP->Id, Sp->Bf.Id, Sp->AfP->len, Sp->Bf.len); for ( Cp = Sp->clusters.begin( ); Cp < Sp->clusters.end( ); Cp ++ ) { fprintf (ClusterFile, "%2d %2d\n", Cp->frameA > 3 ? (Cp->frameA - 3) * -1 : Cp->frameA, Cp->frameB > 3 ? (Cp->frameB - 3) * -1 : Cp->frameB); for ( Mp = Cp->matches.begin( ); Mp < Cp->matches.end( ); Mp ++ ) { fprintf (ClusterFile, "%8ld %8ld %6ld", transC (Mp->sA, Cp->frameA, Sp->AfP->len), transC (Mp->sB, Cp->frameB, Sp->Bf.len), Mp->len * 3); if ( Mp != Cp->matches.begin( ) ) fprintf (ClusterFile, "%6ld %6ld\n", (Mp->sA - (Mp - 1)->sA - (Mp - 1)->len) * 3, (Mp->sB - (Mp - 1)->sB - (Mp - 1)->len) * 3); else fprintf (ClusterFile, "%6s %6s\n", "-", "-"); } Cp->matches.clear( ); } Sp->clusters.clear( ); } } else { for ( Sp = Syntenys.begin( ); Sp < Syntenys.end( ); Sp ++ ) { for ( Cp = Sp->clusters.begin( ); Cp < Sp->clusters.end( ); Cp ++ ) { Cp->matches.clear( ); } Sp->clusters.clear( ); } } Syntenys.clear( ); return; } vector::iterator getForwardTargetCluster (vector & Clusters, vector::iterator CurrCp, long int & targetA, long int & targetB) // Return the cluster that is most likely to successfully join (in a // forward direction) with the current cluster. The returned cluster // must contain 1 or more matches that are strictly greater than the end // of the current cluster. The targeted cluster must also be on a // diagonal close enough to the current cluster, so that a connection // could possibly be made by the alignment extender. Also, this targeted // cluster must be consistent in frame with the current cluster. Assumes // clusters have been sorted via AscendingClusterSort. Returns targeted // cluster and stores the target coordinates in targetA and targetB. If no // suitable cluster was found, the function will return NULL and target // A and targetB will remain unchanged. { vector::iterator Mip; // match iteratrive pointer vector::iterator Cp; // cluster pointer vector::iterator Cip; // cluster iterative pointer long int eA, eB; // possible target long int greater, lesser; // gap sizes between two clusters long int sA = CurrCp->matches.rbegin( )->sA + CurrCp->matches.rbegin( )->len - 1; // the endA of the current cluster long int sB = CurrCp->matches.rbegin( )->sB + CurrCp->matches.rbegin( )->len - 1; // the endB of the current cluster //-- End of sequences is the default target, set distance accordingly long int dist = (targetA - sA < targetB - sB ? targetA - sA : targetB - sB); //-- For all clusters greater than the current cluster (on sequence A) Cp = Clusters.end( ); for ( Cip = CurrCp + 1; Cip < Clusters.end( ); Cip ++ ) { //-- If the cluster is in the same frame if ( CurrCp->frameA == Cip->frameA && CurrCp->frameB == Cip->frameB ) { eA = Cip->matches.begin( )->sA; eB = Cip->matches.begin( )->sB; //-- If the cluster overlaps the current cluster, strip some matches if ( ( eA < sA || eB < sB ) && Cip->matches.rbegin( )->sA >= sA && Cip->matches.rbegin( )->sB >= sB ) { for ( Mip = Cip->matches.begin( ); Mip < Cip->matches.end( ) && ( eA < sA || eB < sB ); Mip ++ ) { eA = Mip->sA; eB = Mip->sB; } } //-- If the cluster is strictly greater than current cluster if ( eA >= sA && eB >= sB ) { if ( eA - sA > eB - sB ) { greater = eA - sA; lesser = eB - sB; } else { lesser = eA - sA; greater = eB - sB; } //-- If the cluster is close enough if ( greater <= getBreakLen( ) || (lesser) * GOOD_SCORE [getMatrixType( )] + (greater - lesser) * CONT_GAP_SCORE [getMatrixType( )] > 0 ) { Cp = Cip; targetA = eA; targetB = eB; break; } else if ( (greater << 1) - lesser < dist ) { Cp = Cip; targetA = eA; targetB = eB; dist = (greater << 1) - lesser; } } } } return Cp; } vector::iterator getReverseTargetAlignment (vector & Alignments, vector::iterator CurrAp) // Return the alignment that is most likely to successfully join (in a // reverse direction) with the current alignment. The returned alignment // must be strictly less than the current cluster and be on a diagonal // close enough to the current alignment, so that a connection // could possibly be made by the alignment extender. Also, this targeted // alignment must be consistent in frame with the current alignment. // Assumes clusters have been sorted via AscendingClusterSort and // processed in order, so therefore all alignments are in order by their // start A coordinate. { vector::iterator Ap; // alignment pointer vector::iterator Aip; // alignment iterative pointer long int eA, eB; // possible targets long int greater, lesser; // gap sizes between the two alignments long int sA = CurrAp->sA; // the startA of the current alignment long int sB = CurrAp->sB; // the startB of the current alignment //-- Beginning of sequences is the default target, set distance accordingly long int dist = (sA < sB ? sA : sB); //-- For all alignments less than the current alignment (on sequence A) Ap = Alignments.end( ); for ( Aip = CurrAp - 1; Aip >= Alignments.begin( ); Aip -- ) { //-- If the alignment is on the same direction if ( CurrAp->frameA == Aip->frameA && CurrAp->frameB == Aip->frameB ) { eA = Aip->eA; eB = Aip->eB; //-- If the alignment is strictly greater than current cluster if ( eA <= sA && eB <= sB ) { if ( sA - eA > sB - eB ) { greater = sA - eA; lesser = sB - eB; } else { lesser = sA - eA; greater = sB - eB; } //-- If the cluster is close enough if ( greater <= getBreakLen( ) || (lesser) * GOOD_SCORE [getMatrixType( )] + (greater - lesser) * CONT_GAP_SCORE [getMatrixType( )] > 0 ) { Ap = Aip; break; } else if ( (greater << 1) - lesser < dist ) { Ap = Aip; dist = (greater << 1) - lesser; } } } } return Ap; } bool isShadowedCluster (vector::iterator CurrCp, vector & Alignments, vector::iterator Ap) // Check if the current cluster is shadowed by a previously produced // alignment region. Return true if it is, else false. { vector::iterator Aip; // alignment pointer long int sA = CurrCp->matches.begin( )->sA; long int eA = CurrCp->matches.rbegin( )->sA + CurrCp->matches.rbegin( )->len - 1; long int sB = CurrCp->matches.begin( )->sB; long int eB = CurrCp->matches.rbegin( )->sB + CurrCp->matches.rbegin( )->len - 1; if ( ! Alignments.empty( ) ) // if there are alignments to use { //-- Look backwards in hope of finding a shadowing alignment for ( Aip = Ap; Aip >= Alignments.begin( ); Aip -- ) { //-- If in the same frames and shadowing the current cluster, break if ( Aip->frameA == CurrCp->frameA && Aip->frameB == CurrCp->frameB ) if ( Aip->eA >= eA && Aip->eB >= eB ) if ( Aip->sA <= sA && Aip->sB <= sB ) break; } //-- Return true if the loop was broken, i.e. shadow found if ( Aip >= Alignments.begin( ) ) return true; } //-- Return false if Alignments was empty or loop was not broken return false; } void parseAbort (const char * s) // Abort the program if there was an error in parsing file 's' { fprintf (stderr, "ERROR: Could not parse input from '%s'. \n" "Please check the filename and format, or file a bug report\n", s); exit (EXIT_FAILURE); } void parseDelta (vector & Alignments, const FastaRecord * Af, const FastaRecord *Bf) // Use the delta information to generate the error counts for each // alignment, and fill this information into the data type { // pointers to Af.seq and Bf.seq char * A [7] = {NULL, NULL, NULL, NULL, NULL, NULL, NULL}; char * B [7] = {NULL, NULL, NULL, NULL, NULL, NULL, NULL}; char ch1, ch2; long int Delta; int Sign; long int i, Apos, Bpos; long int Remain, Total; long int Errors, SimErrors; long int NonAlphas; vector::iterator Ap; vector::iterator Dp; for ( Ap = Alignments.begin( ); Ap < Alignments.end( ); Ap ++ ) { if ( A [Ap->frameA] == NULL ) { A [Ap->frameA] = (char *) Safe_malloc ( sizeof(char) * ( (Af->len / 3) + 2 ) ); A [Ap->frameA] [0] = '\0'; Translate_DNA ( Af->seq, A [Ap->frameA], Ap->frameA ); } if ( B [Ap->frameB] == NULL ) { B [Ap->frameB] = (char *) Safe_malloc ( sizeof(char) * ( (Bf->len / 3) + 2) ); B [Ap->frameB] [0] = '\0'; Translate_DNA ( Bf->seq, B [Ap->frameB], Ap->frameB ); } Apos = Ap->sA; Bpos = Ap->sB; Errors = 0; SimErrors = 0; NonAlphas = 0; Remain = Ap->eA - Ap->sA + 1; Total = Remain; //-- For all delta's in this alignment for ( Dp = Ap->delta.begin( ); Dp < Ap->delta.end( ); Dp ++ ) { Delta = *Dp; Sign = Delta > 0 ? 1 : -1; Delta = labs ( Delta ); //-- For all the bases before the next indel for ( i = 1; i < Delta; i ++ ) { ch1 = A [Ap->frameA] [Apos ++]; ch2 = B [Ap->frameB] [Bpos ++]; if ( !isalpha (ch1) ) { ch1 = STOP_CHAR; NonAlphas ++; } if ( !isalpha (ch2) ) { ch2 = STOP_CHAR; NonAlphas ++; } ch1 = toupper(ch1); ch2 = toupper(ch2); if ( 1 > MATCH_SCORE [getMatrixType( )] [ch1 - 'A'] [ch2 - 'A'] ) SimErrors ++; if ( ch1 != ch2 ) Errors ++; } //-- Process the current indel Remain -= i - 1; Errors ++; SimErrors ++; if ( Sign == 1 ) { if ( !isalpha (A [Ap->frameA] [Apos ++]) ) NonAlphas ++; Remain --; } else { if ( !isalpha (B [Ap->frameB] [Bpos ++]) ) NonAlphas ++; Total ++; } } //-- For all the bases after the final indel for ( i = 0; i < Remain; i ++ ) { //-- Score character match and update error counters ch1 = A [Ap->frameA] [Apos ++]; ch2 = B [Ap->frameB] [Bpos ++]; if ( !isalpha (ch1) ) { ch1 = STOP_CHAR; NonAlphas ++; } if ( !isalpha (ch2) ) { ch2 = STOP_CHAR; NonAlphas ++; } ch1 = toupper(ch1); ch2 = toupper(ch2); if ( 1 > MATCH_SCORE [getMatrixType( )] [ch1 - 'A'] [ch2 - 'A'] ) SimErrors ++; if ( ch1 != ch2 ) Errors ++; } Ap->Errors = Errors; Ap->SimErrors = SimErrors; Ap->NonAlphas = NonAlphas; } for ( i = 1; i <= 6; i ++ ) { if ( A [i] != NULL ) free ( A [i] ); A [i] = NULL; if ( B [i] != NULL ) free ( B [i] ); B [i] = NULL; } return; } void processSyntenys (vector & Syntenys, FastaRecord * Af, long int As, FILE * QryFile, FILE * ClusterFile, FILE * DeltaFile) // For each syntenic region with clusters, read in the B sequence and // extend the clusters to expand total alignment coverage. Clusters should // still reference the amino acid concatenations, the translation back to // DNA will be taken care of by the extendClusters function. // processSyntenys should ONLY be called once all the clusters for // the contained syntenic regions have been stored in the data structure. // Frees the memory used by the the syntenic regions once the output of // extendClusters and flushSyntenys has been produced. { FastaRecord Bf; // the current B sequence Bf.Id = (char *) Safe_malloc (sizeof(char) * (MAX_LINE + 1)); long int InitSize = INIT_SIZE; // the initial memory capacity of B vector::iterator CurrSp; // current synteny pointer //-- Initialize the B sequence storage Bf.seq = (char *) Safe_malloc ( sizeof(char) * InitSize ); //-- For all the contained syntenys for ( CurrSp = Syntenys.begin( ); CurrSp < Syntenys.end( ); CurrSp ++ ) { //-- If no clusters, ignore if ( CurrSp->clusters.empty( ) ) continue; //-- If a B sequence not seen yet, read it in //-- IMPORTANT: The B sequences in the synteny object are assumed to be // ordered as output by mgaps, if they are not in order the program // will fail. (All like tags must be adjacent and in the same order // as the query file) if ( CurrSp == Syntenys.begin( ) || strcmp (CurrSp->Bf.Id, (CurrSp-1)->Bf.Id) != 0 ) { //-- Read in the B sequence while ( Read_String (QryFile, Bf.seq, InitSize, Bf.Id, FALSE) ) if ( strcmp (CurrSp->Bf.Id, Bf.Id) == 0 ) break; if ( strcmp (CurrSp->Bf.Id, Bf.Id) != 0 ) { fprintf(stderr,"%s\n", CurrSp->Bf.Id); parseAbort ( "Query File" ); } Bf.len = strlen (Bf.seq + 1); } //-- Extend clusters and create the alignment information CurrSp->Bf.len = Bf.len; extendClusters (CurrSp->clusters, CurrSp->AfP, &Bf, DeltaFile); } //-- Create the cluster information and clear the Synteny structure flushSyntenys (Syntenys, ClusterFile); free (Bf.Id); free (Bf.seq); return; } inline long int transC (long int Coord, int Frame, long int Len) // Translate an amino acid coordinate to a nucleotide coordinate // relative to the forward strand. The Len parameter should be the // length of the original DNA strand. { assert ( Frame > 0 && Frame < 7 ); if ( Frame > 3 ) return revC ( (Coord * 3) - (3 - (Frame - 3)), Len ); else return (Coord * 3) - (3 - (Frame)); } inline long int refLen (long int ntLen) // Return the length of the concatenated amino acid sequence frames, // including the appended 'X' at the end of each sequence frame for the // given DNA input length. (The returned length will be the length of // the concatenated frames for this sequence as output by prepro.cc) { return (ntLen + 1) << 1; } inline long int revC (long int Coord, long int Len) // Reverse complement the given coordinate for the given length. { assert (Len - Coord + 1 > 0); return (Len - Coord + 1); } void printHelp (const char * s) // Display the program's help information to stderr. { fprintf (stderr, "\nUSAGE: %s [options] < \n\n", s); fprintf (stderr, "-b int set the alignment break (give-up) length to int (amino acids)\n" "-d output only match clusters rather than extended alignments\n" "-e do not extend alignments outward from clusters\n" "-h display help information\n" "-t force alignment to ends of sequence if within -b distance\n" "-x type set the matrix type to \"type\" - Default is 2 (BLOSUM 62),\n" " other options include 1 (BLOSUM 45) and 3 (BLOSUM 80)\n\n"); fprintf (stderr, " Input is the output of the \"mgaps\" program from stdin, and\n" "the two original PROmer sequence files passed on the command\n" "line. is the prefix to be added to the front of the\n" "output file .delta\n" " .delta is the alignment object that catalogs the distance\n" "between insertions and deletions. For further information\n" "regarding this file, please refer to the documentation under\n" "the .delta output description.\n\n"); return; } void printUsage (const char * s) // Display the program's usage information to stderr. { fprintf (stderr, "\nUSAGE: %s [options] < \n\n", s); fprintf (stderr, "Try '%s -h' for more information.\n", s); return; } void validateData (vector Alignments, vector Clusters, const FastaRecord * Af, const FastaRecord * Bf) // Self test function to check that the cluster and alignment information // is valid { char * A [7] = {NULL, NULL, NULL, NULL, NULL, NULL, NULL}; char * B [7] = {NULL, NULL, NULL, NULL, NULL, NULL, NULL}; long int Alen [7], Blen [7]; // the length of the amino acid sequences vector::iterator Cp; vector::iterator Mp; vector::iterator Ap; long int x, y, i; char Xc, Yc; for ( Cp = Clusters.begin( ); Cp < Clusters.end( ); Cp ++ ) { assert ( Cp->wasFused ); //-- Initialize the right amino acid sequence for A and B if need be if ( A [Cp->frameA] == NULL ) { A [Cp->frameA] = (char *) Safe_malloc ( sizeof(char) * ( (Af->len / 3) + 2) ); A [Cp->frameA] [0] = '\0'; Alen [Cp->frameA] = Translate_DNA ( Af->seq, A [Cp->frameA], Cp->frameA ); } if ( B [Cp->frameB] == NULL ) { B [Cp->frameB] = (char *) Safe_malloc ( sizeof(char) * ( (Bf->len / 3) + 2) ); B [Cp->frameB] [0] = '\0'; Blen [Cp->frameB] = Translate_DNA ( Bf->seq, B [Cp->frameB], Cp->frameB ); } for ( Mp = Cp->matches.begin( ); Mp < Cp->matches.end( ); Mp ++ ) { //-- assert for each match in cluster, it is indeed a match x = Mp->sA; y = Mp->sB; for ( i = 0; i < Mp->len; i ++ ) assert ( A[Cp->frameA][x ++] == B[Cp->frameB][y ++] ); //-- assert for each match in cluster, it is contained in an alignment for ( Ap = Alignments.begin( ); Ap < Alignments.end( ); Ap ++ ) { if ( Ap->sA <= Mp->sA && Ap->sB <= Mp->sB && Ap->eA >= Mp->sA + Mp->len - 1 && Ap->eB >= Mp->sB + Mp->len - 1 ) break; } assert ( Ap < Alignments.end( ) ); } } //-- assert alignments are optimal (quick check if first and last chars equal) for ( Ap = Alignments.begin( ); Ap < Alignments.end( ); Ap ++ ) { assert ( Ap->sA <= Ap->eA ); assert ( Ap->sB <= Ap->eB ); assert ( Ap->sA >= 1 && Ap->sA <= Af->len ); assert ( Ap->eA >= 1 && Ap->eA <= Af->len ); assert ( Ap->sB >= 1 && Ap->sB <= Bf->len ); assert ( Ap->eB >= 1 && Ap->eB <= Bf->len ); Xc = toupper(isalpha(A[Ap->frameA][Ap->sA]) ? A[Ap->frameA][Ap->sA] : STOP_CHAR); Yc = toupper(isalpha(B[Ap->frameB][Ap->sB]) ? B[Ap->frameB][Ap->sB] : STOP_CHAR); assert ( 0 <= MATCH_SCORE [getMatrixType( )] [Xc - 'A'] [Yc - 'A'] ); Xc = toupper(isalpha(A[Ap->frameA][Ap->eA]) ? A[Ap->frameA][Ap->eA] : STOP_CHAR); Yc = toupper(isalpha(B[Ap->frameB][Ap->eB]) ? B[Ap->frameB][Ap->eB] : STOP_CHAR); assert ( 0 <= MATCH_SCORE [getMatrixType( )] [Xc - 'A'] [Yc - 'A'] ); } for ( i = 0; i < 7; i ++ ) { if ( A [i] != NULL ) free ( A [i] ); if ( B [i] != NULL ) free ( B [i] ); } return; }