//Copyright (c) 2003 by Mihaela Pertea and A. L. Delcher. // A. L. Delcher // // File: ~delcher/TIGR/nonuniform.h // Version: 1.02 4 Dec 97 // // Definitions for an array class whose index is strings. // #ifndef STRARRAY_H_INCLUDED #define STRARRAY_H_INCLUDED #include "delcher.h" #include "gene.h" #define USE_CHI_SQUARE 1 #define NEW_VERSION 0 const double ESTIMATE_THRESHOLD = 50.0; const double MINIMUM_DELTA_VAL = 1e-5; const double MINIMUM_COUNT = 0.5; const double MIN_CORRECT_PREDICTS = 15.0; const int SAMPLE_SIZE_BOUND = 400; const double SAMPLE_WEIGHT = 400.0; const double SIGNIFICANCE_THRESHOLD = 0.01; const double Z_SCORE_THRESHOLD = 2.33; const double CHI2_THRESHOLD = 11.3; // 0.01 significance v=3 float Analyze_Counts (float [], float [], int); float New_Analyze (float [], float [], int, float []); int Is_Significant (float, double, double); void Reverse_Complement (char [], long int); class String_Array { private: float * Val; int Max_Str_Len; int Alphabet_Size; int Num_Entries; double Combine (double, double); int String_To_Sub (char [], int); public: String_Array (); String_Array (int, int); ~ String_Array (); void Append (char []); void Combine (String_Array &, String_Array &); void Condense (String_Array &); void Convert_To_Logs (); float * Distribution (char [], int); void Estimate (String_Array &); void Incr (char [], int, double); double Maximization_Step (String_Array &, String_Array &); double Min_Val (); void Normalize (); void Read (char []); void Read (FILE *); void Set (double = 0.0); void Set_All (double = 0.0); void Set_Lambda (String_Array &); void Set_Single (String_Array * [], String_Array * []); int Value (char); void Write (char []); double operator [] (int i) { return Val [i]; } float & operator () (char [], int); }; String_Array :: String_Array () // Default constructor. { Val = NULL; Num_Entries = Max_Str_Len = Alphabet_Size = 0; } String_Array :: String_Array (int L, int S) // Construct a String_Array that can be indexed by strings of length // up to L over an alphabet of size S . { int i; assert (L > 0 && S > 0); Max_Str_Len = L; Alphabet_Size = S; Num_Entries = int (pow (S, L + 1) - 1) / (S - 1); Val = (float *) Safe_malloc (Num_Entries * sizeof (float)); Val [0] = 1.0; for (i = 1; i < Num_Entries; i ++) Val [i] = 0.0; } String_Array :: ~ String_Array () // Destroy this String_Array by freeing its memory. { free (Val); } void String_Array :: Append (char S []) // Append the values of this String_Array to the file named S . { FILE * fp; int i; fp = fopen (S, "a"); if (fp == NULL) { fprintf (stderr, "ERROR: Could not open file \"%s\"\n", S); exit (-1); } fprintf (fp, "%d %d %d\n", Max_Str_Len, Alphabet_Size, Num_Entries); for (i = 0; i < Num_Entries; i ++) fprintf (fp, "%6.5f\n", Val [i]); fclose (fp); return; } double String_Array :: Combine (double Plus, double Minus) // Set the value Plus / (Plus + Minus) . { double Denom; Denom = Plus + Minus; if (Denom == 0) return 0.0; else return Plus / Denom; } void String_Array :: Combine (String_Array & Plus, String_Array & Minus) // Set all the values in this String_Array to corresponding // value Plus / (Plus + Minus) . { int i; for (i = 0; i < Num_Entries; i ++) Val [i] = Combine (Plus . Val [i], Minus . Val [i]); return; } void String_Array :: Condense (String_Array & L) // Reset the delta values for this string array to be // used as a simple model that is equivalent to the context model // using the lambda values in L . { long int i, Ct, Prev_i, Prev_i_Start, Prev_Size; long int Size, Sub; assert (Max_Str_Len == L . Max_Str_Len + 1); Val [0] = 1.0; Ct = Prev_i = Prev_i_Start = Sub = 0; Prev_Size = 1; Size = Alphabet_Size; for (i = 1; i < Num_Entries; i ++) { Val [i] = L . Val [Sub] * Val [i] + (1.0 - L . Val [Sub]) * Val [Prev_i_Start + Prev_i]; if (++ Ct == Size) { Ct = 0; Prev_i_Start += Prev_Size; Prev_Size = Size; Size *= Alphabet_Size; Prev_i = 0; } else Prev_i = (Prev_i + 1) % Prev_Size; if (i % Alphabet_Size == 0) Sub ++; } return; } void String_Array :: Convert_To_Logs () // Reset each values to its logarithm. { int i; for (i = 0; i < Num_Entries; i ++) Val [i] = log (Val [i]); return; } float * String_Array :: Distribution (char S [], int L) // Return a pointer to the first element in the array representing the // probability distribution of the next character occurring after // the context S [0 .. L-1] . { char * Tmp; int i; assert (L >= 0); Tmp = (char *) Safe_malloc (1 + L); strncpy (Tmp, S, L); Tmp [L] = 'a'; i = String_To_Sub (Tmp, L + 1); free (Tmp); return Val + i; } void String_Array :: Estimate (String_Array & D) // Estimate lambda values for this string array from // delta counts in D . { int i; assert (Max_Str_Len == D . Max_Str_Len - 1); Val [0] = 1.0; for (i = 1; i < Num_Entries; i ++) Val [i] = Min (0.99, D . Val [i] / ESTIMATE_THRESHOLD); return; } void String_Array :: Incr (char S [], int L, double D) // Add D to the entry at the subscript indicated by string // S [0 .. L-1] in this String_Array . { int i; i = String_To_Sub (S, L); Val [i] += D; return; } double String_Array :: Maximization_Step (String_Array & Plus, String_Array & Minus) // Reset the values in this String_Array using Plus and Minus . // Return the maximum change between an old and new value. { double Denom, Diff, Old, Max_Change = 0.0; int i; Val [0] = 1.0; for (i = 1; i < Num_Entries; i ++) { Denom = Plus . Val [i] + Minus . Val [i]; Old = Val [i]; if (Denom != 0) { Val [i] = Plus . Val [i] / Denom; if ((Diff = fabs (Val [i] - Old)) > Max_Change) Max_Change = Diff; } } return Max_Change; } double String_Array :: Min_Val () // Return the minimum value in this String_Array . { double Min; int i; Min = Val [0]; for (i = 1; i < Num_Entries; i ++) { if (Val [i] < Min) Min = Val [i]; } return Min; } void String_Array :: Normalize () // Convert the values in this String_Array to probabilities. { double Sum; long int j, Ct, Prev_Size, Prev_Start, Prev_Sub, Size, Start; Val [0] = 1.0; Ct = 0; Prev_Size = 1; Prev_Start = 0; Prev_Sub = 0; Size = Alphabet_Size; for (Start = 1; Start < Num_Entries; Start += Alphabet_Size) { if (Prev_Size > 1) for (j = 0; j < Alphabet_Size; j ++) Val [Start + j] += Val [Prev_Start + Prev_Sub + j]; Sum = 0.0; for (j = 0; j < Alphabet_Size; j ++) Sum += Val [Start + j]; if (Sum == 0.0) { for (j = 0; j < Alphabet_Size; j ++) Val [Start + j] = 0.0; } else { for (j = 0; j < Alphabet_Size; j ++) { Val [Start + j] /= Sum; if (Val [Start + j] < MINIMUM_DELTA_VAL) Val [Start + j] = MINIMUM_DELTA_VAL; } } if (++ Ct == Size) { Ct = 0; Prev_Start += Prev_Size; Prev_Size = Size; Size *= Alphabet_Size; Prev_Sub = 0; } else Prev_Sub = (Prev_Sub + Alphabet_Size) % Prev_Size; } return; } void String_Array :: Read (char S []) // Read the values of this String_Array from the file named S . { FILE * fp; int i, M, A, N; fp = fopen (S, "r"); if (fp == NULL) { fprintf (stderr, "ERROR: Could not open file \"%s\"\n", S); exit (-1); } fscanf (fp, "%d %d %d\n", & M, & A, & N); assert (M == Max_Str_Len && A == Alphabet_Size && N == Num_Entries); for (i = 0; i < Num_Entries; i ++) fscanf (fp, "%e\n", Val + i); Val [0] = 1.0; fclose (fp); return; } void String_Array :: Read (FILE * fp) // Read the values of this String_Array from the file fp , // which should already be open. Leave it open so that the next // String_Array can be read from the same file. { int i, M, A, N; fscanf (fp, "%d %d %d\n", & M, & A, & N); assert (M == Max_Str_Len && A == Alphabet_Size && N == Num_Entries); for (i = 0; i < Num_Entries; i ++) fscanf (fp, "%e\n", Val + i); Val [0] = 1.0; return; } void String_Array :: Set (double X) // Set the value of all elements in this String_Array to X . { int i; Val [0] = 1.0; for (i = 1; i < Num_Entries; i ++) Val [i] = X; return; } void String_Array :: Set_All (double X) // Set *all* the value of all elements in this String_Array to X . { int i; for (i = 0; i < Num_Entries; i ++) Val [i] = X; return; } void String_Array :: Set_Lambda (String_Array & D) // Estimate lambda values for this string array from // delta counts in D . { float * Prob, Sum; int Len; long int Prev_Start, Prev_Sub, Size, Sub; long int i, Ct, Prev_i, Prev_i_Start, Prev_Size, Used; assert (Max_Str_Len == D . Max_Str_Len - 1); printf ("Context Length Usage\n"); printf (" Len Used\n"); Len = 1; Used = 0; Val [0] = 1.0; Prob = (float *) Safe_malloc (D . Num_Entries * sizeof (float)); Sum = 0.0; for (i = 1; i <= Alphabet_Size; i ++) Sum += D . Val [i]; for (i = 1; i <= Alphabet_Size; i ++) Prob [i] = D . Val [i] / Sum; // Process the counts in blocks based on context length (i.e., // first block is contexts of length 1, second is contexts of length 2, ...) // Prev_Start is the first subscript of the one-smaller context block // e.g. i represents the context string whose lambda value is being set. // Suppose it's "at". Then Sub will represent // string "ata". The counts at Sub .. Sub + 3 will determine // the lambda value for string i . Prev_i represents the next // smaller context, in this case "t", whose probabilities are already // computed and stored starting at Prev_Sub .. Prev_Sub + 3 // (representing strings "ta", "tc", "tg" and "tt"). Sub = Alphabet_Size + 1; Prev_Start = 1; Prev_i_Start = 0; Prev_i = Prev_Sub = 0; Ct = 0; Prev_Size = 1; Size = Alphabet_Size; for (i = 1; i < Num_Entries; i ++) { #if 0 if (Val [Prev_i_Start + Prev_i] < 1.0) Val [i] = 0.0; else Val [i] = Analyze_Counts (D . Val + Sub, D . Val + Prev_Start + Prev_Sub, Alphabet_Size); #endif // Set the lambda value for this context based on the counts in D . Val // and the probabilities in Prob . Val [i] = New_Analyze (D . Val + Sub, Prob + Prev_Start + Prev_Sub, Alphabet_Size, Prob + Sub); #if NEW_VERSION Val [i] *= Val [Prev_i_Start + Prev_i]; #endif if (Val [i] > 0.0) Used ++; if (++ Ct == Size) // If jump up to the next context length { printf ("%3d %9ld (%3d%%)\n", Len ++, Used, int (0.5 + (100.0 * Used) / Size)); Used = 0; Ct = 0; Prev_i_Start += Prev_Size; Prev_Start += Size; Prev_Size *= Alphabet_Size; Size *= Alphabet_Size; Prev_i = 0; Prev_Sub = 0; } else { Prev_Sub = (Prev_Sub + Alphabet_Size) % Size; Prev_i = (Prev_i + 1) % Prev_Size; } Sub += Alphabet_Size; } free (Prob); return; } void String_Array :: Set_Single (String_Array * L_Plus [], String_Array * L_Minus []) // Set values so that for each string position of the maximum of all suffix values // is set to 1.0 . { double Max, X; int i, k, Ct, Len, Sub, Size; Ct = Len = 0; Size = 1; for (i = 0; i < Num_Entries; i ++) { Max = Combine ((* L_Plus [Len]) [i], (* L_Minus [Len]) [i]); Sub = Len; for (k = Len - 1; k >= 0; k --) { X = Combine ((* L_Plus [k]) [i], (* L_Minus [k]) [i]); if (X > Max) { Max = X; Sub = k; } } if (Max == 0.0 || Sub < Len || (* L_Plus [Len]) [i] < MIN_CORRECT_PREDICTS) Val [i] = 0.0; else Val [i] = 1.0; if (++ Ct == Size) { Ct = 0; Len ++; Size *= Alphabet_Size; } } return; } int String_Array :: String_To_Sub (char S [], int L) // Convert string S [0 .. L-1] to a subscript in the Val array. { int i, Sub; assert (L <= Max_Str_Len); if (L == 0) return 0; Sub = 0; for (i = 0; i < L; i ++) { Sub = Sub * Alphabet_Size + Value (S [i]); } Sub += int (pow (Alphabet_Size, L) - 1) / (Alphabet_Size - 1); return Sub; } int String_Array :: Value (char Ch) // Return the numeric value of character Ch . { switch (tolower (Ch)) { case 'a' : return 0; case 'c' : return 1; case 'g' : return 2; case 't' : return 3; default : fprintf (stderr, "ERROR: Unexpected character \'%c\' (ASCII %d)\n", Ch, int (Ch)); exit (-1); } return(-1); } void String_Array :: Write (char S []) // Write the values of this String_Array to the file named S . { FILE * fp; int i; fp = fopen (S, "w"); if (fp == NULL) { fprintf (stderr, "ERROR: Could not open file \"%s\"\n", S); exit (-1); } fprintf (fp, "%d %d %d\n", Max_Str_Len, Alphabet_Size, Num_Entries); for (i = 0; i < Num_Entries; i ++) fprintf (fp, "%6.5f\n", Val [i]); fclose (fp); return; } float & String_Array :: operator () (char S [], int L) // Return a reference to the value associated with string S [0 .. L-1] . { int i; i = String_To_Sub (S, L); return Val [i]; } float Analyze_Counts (float A [], float B [], int N) // Return the lambda value for the frequency counts in A [0 .. N-1] // corresponding to the longer context string compared to the frequency // counts in B [0 .. N-1] for the next shorter suffix context string. { double A_Sum, B_Sum, E; double Chi2; int i; A_Sum = 0.0; for (i = 0; i < N; i ++) A_Sum += A [i]; if (A_Sum >= SAMPLE_SIZE_BOUND) return 1.0; B_Sum = 0.0; for (i = 0; i < N; i ++) B_Sum += B [i]; if (B_Sum == 0.0) return 0.0; #if USE_CHI_SQUARE Chi2 = 0.0; for (i = 0; i < N; i ++) { E = A_Sum * (B [i] + MINIMUM_COUNT) / (B_Sum + 4 * MINIMUM_COUNT); Chi2 += pow (A [i] - E, 2.0) / E; } if (Chi2 >= CHI2_THRESHOLD) return Min (1.0, A_Sum / SAMPLE_WEIGHT); #else for (i = 0; i < N; i ++) if (Is_Significant (A [i], A_Sum, B [i] / B_Sum)) return Min (1.0, A_Sum / SAMPLE_WEIGHT); #endif return 0.0; } float New_Analyze (float A [], float P [], int N, float C []) // Return the lambda value for the frequency counts in A [0 .. N-1] // corresponding to the longer context string compared to the predicted // probabilities in P [0 .. N-1] for the next shorter suffix context string. // Also set C [0 .. N-1] to the probabilities for this longer context // string. { const int CHI2_ENTRIES = 7; float Chi2_Val [CHI2_ENTRIES] = {2.37, 4.11, 6.25, 7.81, 9.35, 11.3, 12.8}; float Significance [CHI2_ENTRIES] = {0.50, 0.75, 0.90, 0.95, 0.975, 0.99, 0.995}; double A_Sum, Chi2, E, Lambda; int i; A_Sum = 0.0; for (i = 0; i < N; i ++) A_Sum += A [i]; // If context string has occurred enough, then automatically use it // without interpolating, i.e., set lambda to 1.0 . if (A_Sum >= SAMPLE_SIZE_BOUND) { for (i = 0; i < N; i ++) C [i] = (A [i] + P [i]) / (A_Sum + 1.0); // Add P so that probabilites are not 0.0 return 1.0; } // Calculate chi-square statistic Chi2 = 0.0; for (i = 0; i < N; i ++) { E = A_Sum * P [i]; if (E != 0.0) Chi2 += pow (A [i] - E, 2.0) / E; } // Find the significance level for that statistic for (i = 0; i < CHI2_ENTRIES && Chi2_Val [i] < Chi2; i ++) ; if (i == 0) Lambda = 0.0; else if (i == CHI2_ENTRIES) Lambda = 1.0; else // interpolate Signifance in array Lambda = Significance [i - 1] + ((Chi2 - Chi2_Val [i - 1]) / (Chi2_Val [i] - Chi2_Val [i - 1])) * (Significance [i] - Significance [i - 1]); // Multiply significance by a factor determined by the number of samples Lambda *= A_Sum / SAMPLE_WEIGHT; if (Lambda > 1.0) Lambda = 1.0; // Set probabilities for this context as Lambda times probs from counts // plus (1 - Lambda) times probs of one-shorter context for (i = 0; i < N; i ++) C [i] = Lambda * ((A [i] + P [i]) / (A_Sum + 1.0)) + (1.0 - Lambda) * P [i]; return Lambda; } int Is_Significant (float K, double N, double P) // Return TRUE iff getting K positives in N trials // is significant when the probability of positive is P . { double Coeff, P_Pow, Q, Q_Pow, Std_Dev, Sum, z; double i; int j; assert (N >= 0.0 && K >= 0.0 && K <= N && P >= 0.0 && P <= 1.0); if (N == 0.0) return FALSE; Q = 1.0 - P; if (N * P >= 5.0 && N * Q >= 5.0) { // Use Normal approximation Std_Dev = sqrt (N * P * Q); z = fabs ((K - N * P) / Std_Dev); return (z >= Z_SCORE_THRESHOLD); } if (K >= N * P) { Coeff = 1.0; P_Pow = pow (P, N); Q_Pow = 1.0; Sum = 0.0; j = 1; for (i = N; i >= K; i --) { Sum += Coeff * P_Pow * Q_Pow; Coeff *= ((double) i) / j ++; P_Pow /= P; Q_Pow *= Q; } return (Sum <= SIGNIFICANCE_THRESHOLD); } else { Coeff = 1.0; P_Pow = 1.0; Q_Pow = pow (Q, N); Sum = 0.0; j = 1; for (i = 0; i <= K; i ++) { Sum += Coeff * P_Pow * Q_Pow; Coeff *= (N - i) / j ++; P_Pow *= P; Q_Pow /= Q; } return (Sum <= SIGNIFICANCE_THRESHOLD); } } void Reverse_Complement (char S [], long int T) // Set S [1 .. T] to its DNA reverse complement. { char Ch; long int i, j; for (i = 1, j = T; i < j; i ++, j --) { Ch = S [j]; S [j] = Complement (S [i]); S [i] = Complement (Ch); } if (i == j) S [i] = Complement (S [i]); } #endif