/* @source newcpgreport application ** ** Reports ALL cpg rich regions in a sequence ** @author Copyright (C) Rodrigo Lopez (rls@ebi.ac.uk) ** @@ ** ** Original program "CPGREPORT" by Rodrigo Lopez (EGCG 1995) ** CpG island finder. Larsen et al Genomics 13 1992 p1095-1107 ** "usually defined as >200bp with %GC > 50% and obs/exp CpG > ** 0.6". Here use running sum rather than window to score: if not CpG ** at position i, then decrement runSum counter, but if CpG then runSum ** += CPGSCORE. Spans > threshold are searched ** for recursively. ** ** This program is free software; you can redistribute it and/or ** modify it under the terms of the GNU General Public License ** as published by the Free Software Foundation; either version 2 ** of the License, or (at your option) any later version. ** ** This program is distributed in the hope that it will be useful, ** but WITHOUT ANY WARRANTY; without even the implied warranty of ** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ** GNU General Public License for more details. ** ** You should have received a copy of the GNU General Public License ** along with this program; if not, write to the Free Software ** Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. ******************************************************************************/ #include "emboss.h" #include #include static void newcpgreport_findbases(const AjPStr substr, ajint len, ajint window, ajint shift, float *obsexp, float *xypc, const AjPStr bases, float *obsexpmax, ajint *plotstart, ajint *plotend); static void newcpgreport_countbases(const char *seq, const char *bases, ajint window, ajint *cx, ajint *cy, ajint *cxpy); static void newcpgreport_identify(AjPFile outf, const float *obsexp, const float *xypc, AjBool *thresh, ajint begin, ajint len, ajint shift, const char *bases, const char *name, ajint minlen, float minobsexp, float minpc, const char *seq); static void newcpgreport_reportislands(AjPFile outf, const AjBool *thresh, const char *bases, const char *name, float minobsexp, float minpc, ajint minlen, ajint begin, ajint len, const char *seq); static void newcpgreport_compisl(AjPFile outf, const char *p, ajint begin1, ajint end1); /* @prog newcpgreport ********************************************************* ** ** Report CpG rich areas ** ******************************************************************************/ int main(int argc, char **argv) { AjPSeqall seqall; AjPSeq seq = NULL; AjPFile outf = NULL; AjPStr strand = NULL; AjPStr substr = NULL; AjPStr bases = NULL; ajint begin; ajint end; ajint len; ajint minlen; float minobsexp; float minpc; ajint window; ajint shift; ajint plotstart; ajint plotend; float *xypc = NULL; float *obsexp = NULL; AjBool *thresh = NULL; float obsexpmax; ajint i; ajint maxarr; embInit("newcpgreport",argc,argv); seqall = ajAcdGetSeqall("sequence"); window = ajAcdGetInt("window"); shift = ajAcdGetInt("shift"); outf = ajAcdGetOutfile("outfile"); minobsexp = ajAcdGetFloat("minoe"); minlen = ajAcdGetInt("minlen"); minpc = ajAcdGetFloat("minpc"); substr = ajStrNew(); bases = ajStrNewC("CG"); maxarr = 0; while(ajSeqallNext(seqall, &seq)) { begin = ajSeqallGetseqBegin(seqall); end = ajSeqallGetseqEnd(seqall); strand = ajSeqGetSeqCopyS(seq); ajStrFmtUpper(&strand); ajStrAssignSubC(&substr,ajStrGetPtr(strand),--begin,--end); len=ajStrGetLen(substr); if(len > maxarr) { AJCRESIZE(obsexp, len); AJCRESIZE(thresh, len); AJCRESIZE(xypc, len); maxarr = len; } for(i=0;i obsexp[j]) ? *obsexpmax : obsexp[j]; } xypc[j] = (cxf/windowf)*(float)100.0 + (cyf/windowf)*(float)100.0; } *plotend = j; return; } /* @funcstatic newcpgreport_countbases **************************************** ** ** Undocumented. ** ** @param [r] seq [const char*] sequence ** @param [r] bases [const char*] two-base to look for e.g. GC ** @param [r] window [ajint] window ** @param [w] cx [ajint*] amount C ** @param [w] cy [ajint*] amount G ** @param [w] cxpy [ajint*] amount CG ** @@ ******************************************************************************/ static void newcpgreport_countbases(const char *seq, const char *bases, ajint window, ajint *cx, ajint *cy, ajint *cxpy) { ajint i; ajint codex; ajint codey; ajint codea; ajint codeb; *cxpy = *cx = *cy = 0; codex = ajBaseAlphaToBin(bases[0]); codey = ajBaseAlphaToBin(bases[1]); codeb = ajBaseAlphaToBin(seq[0]); for(i=0; iminobsexp)&&(avpc>minpc)) { if(!sumlen) first=pos; /* start a new island */ sumlen += shift; ajDebug(" ** hit first: %d sumlen: %d\n", first, sumlen); } else { if(sumlen >= minlen) { /* island long enough? */ for(i=first; i<=pos-shift;++i) thresh[i]=ajTrue; } sumlen = 0; } } if(sumlen>=minlen) { for(i=first;i %.2f.\n",minobsexp); ajFmtPrintF(outf,"CC %% %c + %% %c > %.2f.\n",bases[0], bases[1],minpc); ajFmtPrintF(outf,"CC Length > %d.\n",minlen); ajFmtPrintF(outf,"XX\n"); ajFmtPrintF(outf,"FH Key Location/Qualifiers\n"); island = ajFalse; for(i=0;i