/* * Programmer: Doug Harmon * File: icm.h * Last Updated: Thu Jul 16 11:31:20 EDT 1998 * * Purpose: Routines for reading in the model and * scoring orfs produced by build-icm */ #ifndef _ICM_H_INCLUDED #define _ICM_H_INCLUDED #include "delcher.h" /* macro that returns the parent of x in our tree */ #define PARENT(x) ( (int) ((x)-1)/4 ) #define SAMPLE_SIZE_BOUND 400 #define SAMPLE_WEIGHT 400 /* the 4 possible bases */ enum base {a, c, g, t}; typedef struct Model { int mut_info_pos; float prob[4]; } tModel; tModel ** MODEL[MODEL_NO]; //tModel * NMODEL; //tModel * IMODEL; int TOTAL_NODES; int MAX_DEPTH; int MODEL_READ[MODEL_NO]; /* TRUE iff model has been read in, FALSE otherwise */ void Fast_Evaluate (char *, int, int, tModel *, tModel *, tModel *, double &); double get_prob_of_window1 (int start, tModel * Delta, char *orf); double get_prob_of_window2 (int pred_pos, tModel * Delta, char *orf); double get_prob_of_window1_max (int start, tModel * Delta, char *orf,double &probmax); double get_prob_of_window2_max (int pred_pos, tModel * Delta, char *orf,double &probmax); void Read_Scoring_Model (FILE *fptr,int m); tModel * Read_NonScoring_Model (FILE *fptr); int ModelNo; int INTERVAL; /******************************************************** * Read_NonCoding_Model ********************************************************* * PURPOSE: read in the DNA sequence model generated by build-icm * INPUT: an OPEN file pointer associated with the file to read * OUTPUT: the MODEL (tModel *) containing the probabilities * to be used to score non-coding regions: upstream, downstream, or introns. *********************************************************/ tModel * Read_NonScoring_Model (FILE *fptr) { tModel *NCModel; char line[150]; int num_node = 0; int prev_node = 0; int i; /* read from bin file */ fgets (line, 150, fptr); //read the text header // read the maximum number of nodes we need and the depth of the tree model if (fread (&TOTAL_NODES, sizeof (int), 1, fptr) == 0) { fprintf (stderr, "ERROR reading binary file\n"); exit (0); } if (fread (&MAX_DEPTH, sizeof (int), 1, fptr) == 0) { fprintf (stderr, "ERROR reading binary file\n"); exit (0); } /* if ( (fscanf (fptr, "%d %d", &TOTAL_NODES, &MAX_DEPTH)) == EOF ) { fprintf (stderr, "Error in model file. Rerun build-icm\n"); exit (1); } // remove newline fgets (line, 150, fptr); */ NCModel = (tModel *) Safe_calloc (TOTAL_NODES, sizeof (tModel)); while (fread (&num_node, sizeof (int), 1, fptr) != 0) { /* read in the probabilities */ if (fread (NCModel[num_node].prob, sizeof (float), 4, fptr) == 0) { fprintf (stderr, "ERROR reading probs from binary file\n"); exit (0); } /* read in the max m.i. pos */ if (fread (&(NCModel[num_node].mut_info_pos), sizeof (int), 1, fptr) == 0) { fprintf (stderr, "ERROR reading m.i. pos from binary file\n"); exit (0); } /* check for cut nodes; */ if (num_node != 0 && (num_node-1) != prev_node) for (i = prev_node+1; i < num_node; i++) NCModel[i].mut_info_pos = -1; prev_node = num_node; } //check for cut nodes in last offset if (num_node != TOTAL_NODES-1) for (i = num_node+1; i < TOTAL_NODES; i++) NCModel[i].mut_info_pos = -1; // MODEL_READ = TRUE; return NCModel; } /******************************************************** * Read_Scoring_Model ********************************************************* * PURPOSE: read in the DNA sequence model generated by build-icm * INPUT: an OPEN file pointer associated with the file to read * OUTPUT: the MODEL (tModel **) containing the probabilities * to be used to score orfs. *********************************************************/ void Read_Scoring_Model (FILE *fptr,int m) { char line[150]; int num_node = 0; int prev_node = 0; int offset = -1; int i; ModelNo=m; /* read from bin file */ fgets (line, 150, fptr); //read the text header // read the maximum number of nodes we need and the depth of the tree model if (fread (&TOTAL_NODES, sizeof (int), 1, fptr) == 0) { fprintf (stderr, "ERROR reading binary file\n"); exit (0); } if (fread (&MAX_DEPTH, sizeof (int), 1, fptr) == 0) { fprintf (stderr, "ERROR reading binary file\n"); exit (0); } /* if ( (fscanf (fptr, "%d %d", &TOTAL_NODES, &MAX_DEPTH)) == EOF ) { fprintf (stderr, "Error in model file. Rerun build-icm\n"); exit (1); } // remove newline fgets (line, 150, fptr); */ MODEL[ModelNo] = (tModel **) Safe_malloc (sizeof (tModel *) * 3); for (i = 0; i < 3; i++) MODEL[ModelNo][i] = (tModel *) Safe_calloc (TOTAL_NODES, sizeof (tModel)); while (fread (&num_node, sizeof (int), 1, fptr) != 0) { if (num_node == 0) offset++; /* read in the probabilities */ if (fread (MODEL[ModelNo][offset][num_node].prob, sizeof (float), 4, fptr) == 0) { fprintf (stderr, "ERROR reading probs from binary file\n"); exit (0); } /* read in the max m.i. pos */ if (fread (&(MODEL[ModelNo][offset][num_node].mut_info_pos), sizeof (int), 1, fptr) == 0) { fprintf (stderr, "ERROR reading m.i. pos from binary file\n"); exit (0); } /* check for cut nodes; */ if (num_node != 0 && (num_node-1) != prev_node) for (i = prev_node+1; i < num_node; i++) MODEL[ModelNo][offset][i].mut_info_pos = -1; if (num_node == 0 && offset != 0) for (i = prev_node+1; i < TOTAL_NODES; i++) MODEL [ModelNo][offset-1][i].mut_info_pos = -1; prev_node = num_node; } if (offset != 2) { fprintf (stderr, "ERROR: offset = %d, rerun build-icm with OFFSET = 3\n", offset); exit (1); } //check for cut nodes in last offset if (num_node != TOTAL_NODES-1) for (i = num_node+1; i < TOTAL_NODES; i++) MODEL[ModelNo][offset][i].mut_info_pos = -1; MODEL_READ[ModelNo] = TRUE; } /************************************************************** * get_prob_of_window1 ************************************************************** * PURPOSE: get the probability of the window contained in orf, * starting at position start. * INPUT: the orf. starting position, start. the Model with the * correct offset to use for computing the probabilities * OUTPUT: the probability of the window *****************************************************************/ double get_prob_of_window1 (int start, tModel * Delta, char *orf) { int i, num_node = 0; double prob; int pos; INTERVAL = MODEL_LEN[ModelNo]; // start at the root's max mut info pos & go from there until // no more info can be obtained or we reach MAX_DEPTH for (i = 0; i < MAX_DEPTH; i++) { pos = Delta [num_node] . mut_info_pos; if (pos == -1) //FLAG indicating no more info is obtainable { num_node = PARENT (num_node); pos = Delta [num_node] . mut_info_pos; break; // no more info avail } switch (Filter (orf [pos+start])) { case 'a': case 'A': num_node = (num_node * 4) + 1; break; case 'c': case 'C': num_node = (num_node * 4) + 2; break; case 'g': case 'G': num_node = (num_node * 4) + 3; break; case 't': case 'T': num_node = (num_node * 4) + 4; break; default: fprintf (stderr, "switch error .%c. in get_prob_of_window1 start=%d\n", orf [pos+start], start); exit (1); } } // either we reached MAX_DEPTH or no more info is obtainable pos = Delta[num_node].mut_info_pos; if (pos == -1) { num_node = PARENT (num_node); pos = Delta[num_node] . mut_info_pos; } switch ( Filter (orf [start+INTERVAL-1])) { case 'a': case 'A': prob = Delta[num_node].prob[0]; break; case 'c': case 'C': prob = Delta[num_node].prob[1]; break; case 'g': case 'G': prob = Delta[num_node].prob[2]; break; case 't': case 'T': prob = Delta[num_node].prob[3]; break; default: fprintf (stderr, "switch error .%c. in 2nd switch of get_prob_of_window1 start=%d\n", orf [start+INTERVAL-1], start); exit (1); } if (prob == 0.0 || pos == -1) fprintf (stderr, "w1 prob = 0 at node %d %d\n", num_node, Delta[num_node].mut_info_pos); return prob; } double get_prob_of_window1_max (int start, tModel * Delta, char *orf, double &probmax) { int i, num_node = 0; double prob; int pos; INTERVAL = MODEL_LEN[ModelNo]; // start at the root's max mut info pos & go from there until // no more info can be obtained or we reach MAX_DEPTH for (i = 0; i < MAX_DEPTH; i++) { pos = Delta [num_node] . mut_info_pos; if (pos == -1) //FLAG indicating no more info is obtainable { num_node = PARENT (num_node); pos = Delta [num_node] . mut_info_pos; break; // no more info avail } switch (Filter (orf [pos+start])) { case 'a': case 'A': num_node = (num_node * 4) + 1; break; case 'c': case 'C': num_node = (num_node * 4) + 2; break; case 'g': case 'G': num_node = (num_node * 4) + 3; break; case 't': case 'T': num_node = (num_node * 4) + 4; break; default: fprintf (stderr, "switch error .%c. in get_prob_of_window1 start=%d\n", orf [pos+start], start); exit (1); } } // either we reached MAX_DEPTH or no more info is obtainable pos = Delta[num_node].mut_info_pos; if (pos == -1) { num_node = PARENT (num_node); pos = Delta[num_node] . mut_info_pos; } switch ( Filter (orf [start+INTERVAL-1])) { case 'a': case 'A': prob = Delta[num_node].prob[0]; break; case 'c': case 'C': prob = Delta[num_node].prob[1]; break; case 'g': case 'G': prob = Delta[num_node].prob[2]; break; case 't': case 'T': prob = Delta[num_node].prob[3]; break; default: fprintf (stderr, "switch error .%c. in 2nd switch of get_prob_of_window1 start=%d\n", orf [start+INTERVAL-1], start); exit (1); } probmax=(double) Delta[num_node].prob[0]; for(i=1;i<4;i++) if(Delta[num_node].prob[i]>probmax) probmax=(double) Delta[num_node].prob[i]; if (prob == 0.0 || pos == -1) fprintf (stderr, "w1 prob = 0 at node %d %d\n", num_node, Delta[num_node].mut_info_pos); return prob; } /************************************************************* * get_prob_of_window2 ************************************************************* * Purpose: Compute the probability of the orf using the positions * prior to pred_pos. * Input: pred_pos: the position to predict. Delta: the Model with the * correct offset to use for computing the probabilities. * orf: the open reading frame to score * Output: the probability at pred_pos **************************************************************/ double get_prob_of_window2 (int pred_pos, tModel * Delta, char *orf) { int num_node, i, offset, pos; double prob; INTERVAL = MODEL_LEN[ModelNo]; offset = INTERVAL - pred_pos - 1; num_node = 0; for (i = 0; i < MAX_DEPTH; i++) { pos = Delta[num_node].mut_info_pos - offset; if (pos < 0) break; switch (Filter (orf [pos]) ) { case 'a': case 'A': num_node = (num_node * 4) + 1; break; case 'c': case 'C': num_node = (num_node * 4) + 2; break; case 'g': case 'G': num_node = (num_node * 4) + 3; break; case 't': case 'T': num_node = (num_node * 4) + 4; break; default: fprintf (stderr, "switch error %c in get_prob_of_window2\n", orf[pos]); exit (-1); } } if (Delta[num_node].mut_info_pos == -1) num_node = PARENT (num_node); switch ( Filter (orf [pred_pos]) ) { case 'a': case 'A': prob = (double) Delta[num_node].prob[0]; break; case 'c': case 'C': prob = (double) Delta[num_node].prob[1]; break; case 'g': case 'G': prob = (double) Delta[num_node].prob[2]; break; case 't': case 'T': prob = (double) Delta[num_node].prob[3]; break; default: fprintf (stderr, "switch error %c in 2nd switch of get_prob_of_window2\n", orf [pred_pos]); exit (-1); } if (prob == 0.0) { fprintf (stderr, "warning, w2 ret prob 0 from node %d %d pred pos: %d %f %f %f %f\n", num_node, Delta[num_node].mut_info_pos, pred_pos, Delta[num_node].prob[0], Delta[num_node].prob[1],Delta[num_node].prob[2], Delta[num_node].prob[3] ); } // fprintf (stderr, "node = %d prob = %f\n", num_node, prob); return prob; } double get_prob_of_window2_max (int pred_pos, tModel * Delta, char *orf,double &probmax) { int num_node, i, offset, pos; double prob; INTERVAL = MODEL_LEN[ModelNo]; offset = INTERVAL - pred_pos - 1; num_node = 0; for (i = 0; i < MAX_DEPTH; i++) { pos = Delta[num_node].mut_info_pos - offset; if (pos < 0) break; switch (Filter (orf [pos]) ) { case 'a': case 'A': num_node = (num_node * 4) + 1; break; case 'c': case 'C': num_node = (num_node * 4) + 2; break; case 'g': case 'G': num_node = (num_node * 4) + 3; break; case 't': case 'T': num_node = (num_node * 4) + 4; break; default: fprintf (stderr, "switch error %c in get_prob_of_window2\n", orf[pos]); exit (-1); } } if (Delta[num_node].mut_info_pos == -1) num_node = PARENT (num_node); switch ( Filter (orf [pred_pos]) ) { case 'a': case 'A': prob = (double) Delta[num_node].prob[0]; break; case 'c': case 'C': prob = (double) Delta[num_node].prob[1]; break; case 'g': case 'G': prob = (double) Delta[num_node].prob[2]; break; case 't': case 'T': prob = (double) Delta[num_node].prob[3]; break; default: fprintf (stderr, "switch error %c in 2nd switch of get_prob_of_window2\n", orf [pred_pos]); exit (-1); } probmax=(double) Delta[num_node].prob[0]; for(i=1;i<4;i++) if(Delta[num_node].prob[i]>probmax) probmax=(double) Delta[num_node].prob[i]; if (prob == 0.0) { fprintf (stderr, "warning, w2 ret prob 0 from node %d %d pred pos: %d %f %f %f %f\n", num_node, Delta[num_node].mut_info_pos, pred_pos, Delta[num_node].prob[0], Delta[num_node].prob[1],Delta[num_node].prob[2], Delta[num_node].prob[3] ); } // fprintf (stderr, "node = %d prob = %f\n", num_node, prob); return prob; } /***************************************************************** * Fast_Evaluate ***************************************************************** * Purpose: Set score to the log of the probability of generating * DNA string orf [0 .. length] in the simple nonhomogeneous Markov * model with window length window_length & conditional probabilites Delta ******************************************************************/ void Fast_Evaluate (char *orf, int length, int window_length, tModel * Delta0, tModel * Delta1, tModel *Delta2, double & score) { int start = 0, stop = window_length-1, i; score = 0.0; //check_for_stop_codons (orf); //Dr D already does for (i = 0; i < window_length - 1; i ++) switch (i % 3) { case 0 : score += log (get_prob_of_window2 (i, Delta1, orf)); break; case 1 : score += log (get_prob_of_window2 (i, Delta2, orf)); break; case 2 : score += log (get_prob_of_window2 (i, Delta0, orf)); break; } for (start = 0; stop < length; start++, stop++) { switch (start % 3) { case 0: score += log (get_prob_of_window1 (start, Delta0, orf)); break; case 1: score += log (get_prob_of_window1 (start, Delta1, orf)); break; case 2: score += log (get_prob_of_window1 (start, Delta2, orf)); break; default: fprintf (stderr, "ERROR in switch in Fast Eval\n"); exit (1); } } } /***************************************************************** * NC_Fast_Evaluate ***************************************************************** * Purpose: Set score to the log of the probability of generating * DNA string orf [0 .. length] in the simple nonhomogeneous Markov * model with window length window_length & conditional probabilites Delta; * since string orf is non-coding (!) there is no need to use * different models according to reading frame ******************************************************************/ void NC_Fast_Evaluate (char *orf, int length, int window_length, tModel * Delta, double &score) { int start = 0, stop = window_length-1, i; score = 0.0; for (i = 0; i < window_length - 1; i ++) score += log (get_prob_of_window2 (i, Delta, orf)); for (start = 0; stop < length; start++, stop++) score += log (get_prob_of_window1 (start, Delta, orf)); } #endif