/**
 *
 * struct.h: Basic data structures
 *
 * Author: A.R.Subramanian
 */

#ifndef STRUCT_H_INCLUDED
#define STRUCT_H_INCLUDED

/**
 * score matrix (e.g. BLOSUM62)
 */
struct scr_matrix {
    int length;     // number of amino acids
    int max_score;  // maximum among all entries in the data array
    int *char2num;  // resolves the character of an amino acid to its number
    int *num2char;  // resolves the number of an amino acid to its character
    int *data;      // contains the matrix indexed by the number of the
                    // particular amino acid
    int *dist;      // number of pairs of amino acids (i,j) having a score
                    // equal to the value of the index
    long double **raw_dist;  // NOTE(review): undocumented in the original
    double avg_sim_score;    // NOTE(review): undocumented in the original;
                             // presumably the average similarity score - confirm
};

/**
 * raw sequence
 */
struct seq {
    char *data;         // sequence data
    char *name;         // name/description of the sequence
    int num;            // number of the sequence
    int length;         // length of sequence
    int max_seen;       // NOTE(review): undocumented in the original
    char *dna_num;      // numbers for retranslation from protein to DNA
    int orf_frame;      // reading frame of the longest ORF
    char crick_strand;  // ORF translation or sequence translation on the
                        // Crick strand
};

/**
 * sequence collection
 */
struct seq_col {
    struct seq *seqs;  // array of the sequences
    int avg_length;    // average length of sequences
    int length;        // number of sequences
};

/**
 * probability distribution of scores in diagonals
 */
struct prob_dist {
    struct scr_matrix *smatrix;  // pointer to the associated score matrix
    long double **data;  // distribution of scores: data[i][j] contains the
                         // probability of a diag of length i having score >= j
    double **log_data;   // distribution of scores: log_data[i][j] contains
                         // -log(probability of a diag of length i having
                         // score >= j)
    // long double *expect;  // score expectancy for each diaglen
    unsigned int max_dlen;   // maximum diag length in the distribution
};

/**
 * part of a sequence (auxiliary data structure)
 */
struct seq_part {
    int num;         // a number that indicates a position in an array
    struct seq *sq;  // the pointer to the sequence
    int startpos;    // start position in the sequence
    // int leftmargin;
    // int rightmargin;
};

/**
 * diagonal in the dotmatrix
 */
struct diag {
    struct seq_part seq_p1;  // first sequence part
    struct seq_part seq_p2;  // second sequence part
    unsigned int length;     // length of the diag
    long score;              // score of the diag
    long orig_score;         // original score of the diag

    struct diag *pred_diag;      // predecessor diag for dynamic programming
    struct diag *col_pred_diag;  // column predecessor diag for dynamic
                                 // programming
    int pool_pos;                // position in diag pool

    char meetsThreshold;  // whether diag meets threshold

    // for vertex cover
    int degree;
    int max_degree;
    struct diag **neighbours;

    char anchor;               // if this is an anchor diag
    char marked;               // marking flag for arbitrary use
    char multi_dg;             // is > 0 if this is a multi dg
    struct diag **multi_cont;  // the contained dgs if this is a multi dg
    int multi_length;          // size of multi_cont

    // char onlyOverThres;

    double weight;        // weight of the diag = -log(prob)
    double weight_sum;    // weight sum for dialign
    double weight_fac;    // weight factor
    double ov_weight;     // overlap weight
    double total_weight;  // total weight = weight + overlap weight
};

/**
 * collection of diag
 */
struct simple_diag_col {
    unsigned int length;  // number of diags
    double total_weight;  // total weight
    double weight_fac;    // weight factor
    struct diag **data;   // the array of diags
};

/**
 * guide tree node
 */
struct gt_node {
    char isLeaf;            // whether it is a leaf
    int *seq_num;           // the sequence numbers
    int seq_num_length;     // length of the sequence numbers array
    struct gt_node *succ1;  // successor nodes
    struct gt_node *succ2;
};

/**
 * vertex cover node (currently disabled)
 *
struct vc_node {
    double weight;
    struct diag *dg;
    int degree;
    struct vc_node *adjacents;
}
*/

/**
 * collection of all diagonals sorted by the sequences
 */
struct diag_col {
    int seq_amount;  // number of sequences involved
    struct simple_diag_col **diag_matrix;  // diag_matrix[i + seq_amount * j]
                                           // contains all diags found
                                           // involving the sequences i and j
    double total_weight;       // total weight
    double average_weight;     // average weight
    struct diag **diags;       // all diags, unordered
    unsigned int diag_amount;  // number of diags found
    struct gt_node *gt_root;   // NOTE(review): undocumented in the original;
                               // presumably the root of the guide tree - confirm
};

/**
 * diag container (singly linked list element)
 */
struct diag_cont {
    struct diag *dg;
    struct diag_cont *next;
};

/**
 * alignment position
 */
struct algn_pos {
    // int seq_num;               // sequence number
    // unsigned long pos_in_seq;  // position in the sequence
    char state;                 // orphane: not aligned to any position
                                // (the original comment ends here; any other
                                // states are not documented in this header)
    struct diag_cont *dg_cont;  // diags that are aligned with that position
    int row;
    int col;

    // unsigned int succFPos;  // if orphane, the position holding the succF
    // unsigned int predFPos;  // analogous to succFPos

    // char *isAli;  // if alignment at the positions exists
    int *predF;      // predecessor frontier, only filled if non-orphane
    int *succF;      // successor frontier, only filled if non-orphane

    char *proceed;  // for output
    // char isInherited;  // whether the pointers are inherited

    int predFPos;  // in case of orphane, where to find the predF or succF
    int succFPos;

    int *eqcAlgnPos;             // equivalence class minimum alignment
                                 // position (for output)
    struct algn_pos *eqcParent;  // equivalence class parent
    int eqcRank;                 // equivalence class rank (>= maximum number
                                 // of children)
    // unsigned int *maxpos;  // needed for output of the alignment
};

/**
 * alignment
 */
struct alignment {
    // int seq_amount;   // number of sequences involved
    // char *redo_seqs;  // which pairs of sequences are to be aligned again
    char *seq_is_orphane;  // boolean array indicating for each sequence
                           // whether it is orphane or not
    int max_pos;  // the greatest position in the alignment (including all
                  // -'s); if < 0: the alignment has not yet been prepared
    struct seq_col *scol;    // all the sequences involved
    struct algn_pos **algn;  // the alignment
    double total_weight;     // the total weight of the alignment
    struct alignment *next;  // pointer to next alignment in the sorted
                             // linked list
    // struct alignment *prev;  // pointer to previous alignment in the sorted
    //                          // linked list
    // unsigned long pos;       // position in the sorted linked list
    // struct diag **aligned_diags;  // all aligned diags
    // int aligned_diags_amount;
    // int max_aligned_diags_amount;
    // int orig_max_aligned_diags_amount;
    // struct diag_cont *backlog_diags;  // all backlog diags
};

#endif /* STRUCT_H_INCLUDED */