/* @source compseq application ** ** Counts the composition of dimer/trimer/etc words in a sequence ** ** @author Copyright (C) Gary Williams (gwilliam@hgmp.mrc.ac.uk) ** @@ ** ** This program is free software; you can redistribute it and/or ** modify it under the terms of the GNU General Public License ** as published by the Free Software Foundation; either version 2 ** of the License, or (at your option) any later version. ** ** This program is distributed in the hope that it will be useful, ** but WITHOUT ANY WARRANTY; without even the implied warranty of ** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ** GNU General Public License for more details. ** ** You should have received a copy of the GNU General Public License ** along with this program; if not, write to the Free Software ** Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. ******************************************************************************/ #include "emboss.h" static void compseq_readexpfreq(AjPTable *exptable, AjPFile infile, ajint size); static void compseq_makebigarray(ajulong no_elements, ajulong **bigarray); static double compseq_getexpfreqnuc(const AjPStr dispseq, ajint word, const ajulong *calcfreq_array, ajulong calcfreq_total); static double compseq_getexpfreqprot(const AjPStr dispseq, ajint word, AjBool ignorebz, const ajulong *calcfreq_array, ajulong calcfreq_total); /* @prog compseq ************************************************************** ** ** Counts the composition of dimer/trimer/etc words in a sequence ** ******************************************************************************/ int main(int argc, char **argv) { AjPSeqall seqall; AjPSeq seq; ajuint word; AjBool zerocount; AjBool ignorebz; ajint frame; AjPFile outfile; AjPFile infile; AjBool reverse; AjBool calcfreq; ajuint pos; ajuint endpos; const char *s; ajulong result; ajulong *bigarray; ajulong *calcfreq_array; ajulong no_elements; ajulong no_calcfreq_elements; AjBool first_time_round = ajTrue; ajulong count; AjPStr dispseq = NULL; const AjPStr tmpstr = NULL; /* from table, no need to delete */ ajulong total = 0; /* no. of words counted */ ajulong calcfreq_total = 0; /* no. of single bases/residues counted */ ajulong other = 0; /* no. of words containing 'other' bases/residues */ ajulong calcfreq_other = 0; /* no. of single 'other' bases/residues observed */ AjBool otherflag; AjBool seqisnuc=ajFalse; ajint increment; ajint count_of_sequence_names = 0; AjPTable exptable = NULL; /* table of expected frequencies */ /* holds the string of "Other" for looking in the hash table 'exptable' */ AjPStr strother = NULL; /* Don't have a file of expected frequencies yet */ AjBool have_exp_freq = ajFalse; double default_exp_freq; double obs_freq = 0.0; double exp_freq; double obs_exp; embInit("compseq", argc, argv); seqall = ajAcdGetSeqall("sequence"); word = ajAcdGetInt("word"); outfile = ajAcdGetOutfile("outfile"); infile = ajAcdGetInfile("infile"); zerocount = ajAcdGetBoolean("zerocount"); ignorebz = ajAcdGetBoolean("ignorebz"); frame = ajAcdGetInt("frame"); reverse = ajAcdGetBoolean("reverse"); calcfreq = ajAcdGetBoolean("calcfreq"); /* Output some documentation to the results file */ ajFmtPrintF(outfile, "#\n# Output from 'compseq'\n#\n"); if(!zerocount) ajFmtPrintF(outfile,"# Words with a frequency of zero are " "not reported.\n"); if(!ignorebz) ajFmtPrintF(outfile,"# The amino acid codes 'B' and 'Z' will " "be counted,\n# rather than treated as 'Other'.\n"); if(frame) ajFmtPrintF(outfile,"# Only words in frame %d will be " "counted.\n", frame); if(infile == NULL) { if (calcfreq) ajFmtPrintF(outfile,"# The Expected frequencies are " "calculated from the observed single\n" "# base or residue frequencies in these sequences\n"); else ajFmtPrintF(outfile,"# The Expected frequencies are " "calculated on the (false) assumption that every\n" "# word has equal frequency.\n"); } else { ajFmtPrintF(outfile,"# The Expected frequencies are taken " "from the file: %S\n",ajFileGetPrintnameS(infile)); compseq_readexpfreq(&exptable, infile, word); have_exp_freq = ajTrue; ajFileClose(&infile); } ajFmtPrintF(outfile, "#\n"); ajFmtPrintF(outfile, "# The input sequences are:\n"); /* ** see if using a sliding window or only looking at a word-sized ** single frame */ if(frame) increment = word; else increment = 1; while(ajSeqallNext(seqall, &seq)) { ajSeqTrim(seq); /* note the name of the sequence */ if(count_of_sequence_names++ < 10) ajFmtPrintF(outfile, "#\t%s\n", ajSeqGetNameC(seq)); else if(count_of_sequence_names++ == 11) ajFmtPrintF(outfile, "# ... et al.\n"); /* first of all need to make a store for the results */ if(first_time_round) { seqisnuc = ajSeqIsNuc(seq); if(!embNmerGetNoElements(&no_elements, word, seqisnuc, ignorebz)) ajFatal("The word size is too large for the data " "structure available."); compseq_makebigarray(no_elements, &bigarray); first_time_round = ajFalse; if(calcfreq) { if(!embNmerGetNoElements(&no_calcfreq_elements, word, seqisnuc, ignorebz)) ajFatal("The word size is too large for the data " "structure available."); compseq_makebigarray(no_calcfreq_elements, &calcfreq_array); } } ajSeqFmtUpper(seq); s = ajSeqGetSeqC(seq); /* ** Start at the first position, or at the frame, if it has been ** specified and then look at each word in a sliding window if ** no frame is specified, or at each increment of the word-size ** if frame is specified. Stop when less than a word-length from ** the end of the sequence. */ if(ajSeqGetLen(seq) > word) endpos = ajSeqGetLen(seq)-word; else endpos = 1; for(pos=frame; pos <= endpos; pos += increment) { if(seqisnuc) result = embNmerNuc2int(s, word, pos, &otherflag); else result = embNmerProt2int(s, word, pos, &otherflag, ignorebz); /* count this word */ if(!otherflag) bigarray[result]++; else other++; total++; } if(calcfreq) { /* Count the single bases or residues to get the observed frequences */ for(pos=0; pos < ajSeqGetLen(seq); pos++) { if(seqisnuc) result = embNmerNuc2int(s, 1, pos, &otherflag); else result = embNmerProt2int(s, 1, pos, &otherflag, ignorebz); if(!otherflag) calcfreq_array[result]++; else calcfreq_other++; calcfreq_total++; } } if(seqisnuc && reverse) { /* Do it again on the reverse strand */ ajSeqReverseForce(seq); s = ajSeqGetSeqC(seq); for(pos=frame; pos <= ajSeqGetLen(seq)-word; pos += increment) { result = embNmerNuc2int(s, word, pos, &otherflag); /* count this word */ if(!otherflag) bigarray[result]++; else other++; total++; } if(calcfreq) { /* ** Count the complemented single bases to get their ** observed frequences */ for(pos=0; pos < ajSeqGetLen(seq); pos++) { result = embNmerNuc2int(s, 1, pos, &otherflag); if(!otherflag) calcfreq_array[result]++; else calcfreq_other++; calcfreq_total++; } } } } /* Give the word size used */ ajFmtPrintF(outfile,"\n\nWord size\t%d\n", word); /* Now output the Total count */ ajFmtPrintF(outfile,"Total count\t%Lu\n\n", total); /* we have now counted the frequency of the words in the sequences */ ajFmtPrintF(outfile, "#\n# Word\tObs Count\tObs Frequency\tExp Frequency\t" "Obs/Exp Frequency\n#\n"); /* ** If there's no file of expected word frequencies, ** then use the frequencies of single bases or residues to calculate them. ** If we are not calculating the frequencies, then make a default ** by giving each word an equal frequency. */ default_exp_freq = 1/(double)no_elements; for(count=0; count 1? ** I don't think so, because words may contain more than 1 ** 'other' base, but there isn't an easy way to do this ** without storing the sequence of all words with an 'other' ** in them. Maybe do this in a later version? **/ exp_freq = (double)calcfreq_other/(double)calcfreq_total; } /* get the Other Observed/Expected ratio */ if((!have_exp_freq && !calcfreq) || exp_freq==0.0) { exp_freq = 0.0; /* display a big number rather than a divide by zero error */ obs_exp = 10000000000.0; } else obs_exp = obs_freq/exp_freq; ajFmtPrintF(outfile, "\nOther\t%Lu\t\t%.7f\t%.7f\t%.7f\n", other, obs_freq, exp_freq, obs_exp); ajFileClose(&outfile); ajSeqallDel(&seqall); ajSeqDel(&seq); AJFREE(bigarray); ajStrDel(&dispseq); ajStrDel(&strother); if(have_exp_freq) ajTablestrFree(&exptable); embExit(); return 0; } /* @funcstatic compseq_makebigarray ******************************************* ** ** Create an array to hold counts of the observed frequencies of words. ** ** @param [r] no_elements [ajulong] Size of array ** @param [w] bigarray [ajulong**] Returned pointer to array ** @return [void] ** @@ ******************************************************************************/ static void compseq_makebigarray(ajulong no_elements, ajulong **bigarray) { ajDebug("makebigarray for %Ld elements\n", no_elements); AJCNEW(*bigarray, no_elements); ajDebug("makebigarray - after AJCNEW\n"); return; } /* @funcstatic compseq_readexpfreq ******************************************** ** ** Read the expected frequencies of words from a file produced by a ** previous run of this program. ** ** @param [w] exptable [AjPTable*] Table of words and their ** expected frequencies ** @param [u] infile [AjPFile] Input file holding expected frequencies ** @param [r] size [ajint] Size of word expected (used for validation) ** @return [void] ** @@ ******************************************************************************/ static void compseq_readexpfreq(AjPTable *exptable, AjPFile infile, ajint size) { AjPStr line = NULL; char whiteSpace[] = " \t\n\r"; AjPStrTok tokens; AjPStr sizestr = NULL; ajint thissize; AjPStr key; AjPStr value; /* initialise the hash table - use case-insensitive comparison */ *exptable = ajTablestrNewCase(350); /* read the file */ while(ajReadlineTrim(infile, &line)) { /* skip comment and blank lines */ if(!ajStrFindC(line, "#")) continue; if(!ajStrGetLen(line)) continue; /* look for the word size */ if(!ajStrFindC(line, "Word size")) { ajStrAssignSubS(&sizestr, line, 10, ajStrGetLen(line)-1); ajStrTrimWhite(&sizestr); ajStrToInt(sizestr, &thissize); if(size == thissize) break; ajFatal("The word size you are counting (%d) is different " "to the word\nsize in the file of expected " "frequencies (%d).", size, thissize); } else ajFatal("The 'Word size' line was not found, instead " "found:\n%S\n", line); } /* read the file */ while(ajReadlineTrim(infile, &line)) { /* skip comment and blank lines */ if(!ajStrFindC(line, "#")) continue; if(!ajStrGetLen(line)) continue; /* ** look for the total number of counts - anything after this ** is the data */ if(!ajStrFindC(line, "Total")) break; } /* read in the observed frequencies as a string */ while(ajReadlineTrim(infile, &line)) { /* skip comment and blank lines */ if(!ajStrFindC(line, "#")) continue; if(!ajStrGetLen(line)) continue; tokens = ajStrTokenNewC(line, whiteSpace); /* get the word as the key */ key = ajStrNew(); ajStrTokenNextParse( &tokens, &key); /* ** get the observed frequency as the value - we'll use this as ** the expected frequency */ value = ajStrNew(); /* skip the observed count column */ ajStrTokenNextParse(&tokens, &value); ajStrTokenNextParse(&tokens, &value); ajTablePut(*exptable, key, value); ajStrTokenDel(&tokens); } ajStrDel(&line); ajStrDel(&sizestr); ajStrTokenDel(&tokens); return; } /* @funcstatic compseq_getexpfreqnuc ****************************************** ** ** Using the observed counts of single bases in the sequence this ** calculates the expected frequency of a word of one or more bases ** as the product of the single base frequencies. ** ** @param [r] dispseq [const AjPStr] sequence of word ** @param [r] word [ajint] word length ** @param [r] calcfreq_array [const ajulong *] array of counts of single bases ** @param [r] calcfreq_total [ajulong] number of bases counted ** @return [double] Calculated expected frequency of this word ** @@ ******************************************************************************/ static double compseq_getexpfreqnuc(const AjPStr dispseq, ajint word, const ajulong *calcfreq_array, ajulong calcfreq_total) { ajint i; ajint offset = 0; char c; double result; result = 1.0; for(i=0; i