/* @source megamerger application ** ** Merge two large overlapping nucleic acid sequences ** ** @author Copyright (C) Gary Williams (gwilliam@hgmp.mrc.ac.uk) ** @@ ** ** This program is free software; you can redistribute it and/or ** modify it under the terms of the GNU General Public License ** as published by the Free Software Foundation; either version 2 ** of the License, or (at your option) any later version. ** ** This program is distributed in the hope that it will be useful, ** but WITHOUT ANY WARRANTY; without even the implied warranty of ** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ** GNU General Public License for more details. ** ** You should have received a copy of the GNU General Public License ** along with this program; if not, write to the Free Software ** Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. ******************************************************************************/ #include "emboss.h" static void megamerger_Merge(const AjPList matchlist, const AjPSeq seq1, const AjPSeq seq2, AjPSeqout seqout, AjPFile outfile, AjBool prefer); /* @prog megamerger *********************************************************** ** ** Merge two large overlapping nucleic acid sequences ** ******************************************************************************/ int main(int argc, char **argv) { AjPSeq seq1; AjPSeq seq2; AjPSeqout seqout; ajint wordlen; AjPTable seq1MatchTable = NULL; AjPList matchlist = NULL; AjPFile outfile; AjBool prefer; embInit("megamerger", argc, argv); wordlen = ajAcdGetInt("wordsize"); seq1 = ajAcdGetSeq("asequence"); seq2 = ajAcdGetSeq("bsequence"); prefer = ajAcdGetBoolean("prefer"); outfile = ajAcdGetOutfile("outfile"); seqout = ajAcdGetSeqout("outseq"); /* trim sequences to -sbegin and -send */ ajSeqTrim(seq1); ajSeqTrim(seq2); embWordLength(wordlen); if(embWordGetTable(&seq1MatchTable, seq1)) /* get table of words */ matchlist = embWordBuildMatchTable(seq1MatchTable, seq2, ajTrue); else ajFatal("No match found\n"); /* get the minimal set of overlapping matches */ embWordMatchMin(matchlist); if(ajListGetLength(matchlist)) { /* make the output file */ megamerger_Merge(matchlist, seq1, seq2, seqout, outfile, prefer); /* tidy up */ } embWordMatchListDelete(&matchlist); /* free the match structures */ embWordFreeTable(&seq1MatchTable); ajSeqoutClose(seqout); ajFileClose(&outfile); ajSeqDel(&seq1); ajSeqDel(&seq2); ajSeqoutDel(&seqout); embExit(); return 0; } /* @funcstatic megamerger_Merge *********************************************** ** ** Marge and write a report on the merge of the two sequences. ** ** @param [r] matchlist [const AjPList] List of minimal non-overlapping matches ** @param [r] seq1 [const AjPSeq] Sequence to be merged. ** @param [r] seq2 [const AjPSeq] Sequence to be merged. ** @param [w] seqout [AjPSeqout] Output merged sequence ** @param [u] outfile [AjPFile] Output file containing report. ** @param [r] prefer [AjBool] If TRUE, use the first sequence when there ** is a mismatch ** @return [void] ** @@ ******************************************************************************/ static void megamerger_Merge(const AjPList matchlist, const AjPSeq seq1,const AjPSeq seq2, AjPSeqout seqout, AjPFile outfile, AjBool prefer) { AjIList iter = NULL; /* match list iterator */ EmbPWordMatch p = NULL; /* match structure */ ajint count = 0; /* count of matches */ AjPStr seqstr; /* merged sequence string */ AjPStr s1; /* string of seq1 */ AjPStr s2; /* string of seq2 */ ajuint prev1end = 0; ajuint prev2end = 0; /* end positions (+1) of previous match */ ajuint mid1; ajuint mid2; /* middle of a mismatch region */ AjPStr tmp; /* holds sequence string while uppercasing */ AjPSeq seq = NULL; tmp = ajStrNew(); seqstr = ajStrNew(); s1 = ajSeqGetSeqCopyS(seq1); s2 = ajSeqGetSeqCopyS(seq2); /* change the sequences to lowercase to highlight problem areas */ ajStrFmtLower(&s1); ajStrFmtLower(&s2); /* title line */ ajFmtPrintF(outfile, "# Report of megamerger of: %s and %s\n\n", ajSeqGetNameC(seq1), ajSeqGetNameC(seq2)); iter = ajListIterNewread(matchlist); while(!ajListIterDone(iter)) { p = (EmbPWordMatch) ajListIterGet(iter); /* first match? */ if(!count++) { ajFmtPrintF(outfile, "%s overlap starts at %d\n", ajSeqGetNameC(seq1), p->seq1start+1); ajFmtPrintF(outfile, "%s overlap starts at %d\n\n", ajSeqGetNameC(seq2), p->seq2start+1); /* get initial sequence before the overlapping region */ if(p->seq1start == 0) { /* ** ignore the initial bit if both sequences match from ** the start */ if(p->seq2start > 0) { ajFmtPrintF(outfile, "Using %s 1-%d as the " "initial sequence\n\n", ajSeqGetNameC(seq2), p->seq2start); ajStrAssignSubS(&seqstr, s2, 0, p->seq2start-1); } } else if(p->seq2start == 0) { ajFmtPrintF(outfile, "Using %s 1-%d as the initial " "sequence\n\n", ajSeqGetNameC(seq1), p->seq1start); ajStrAssignSubS(&seqstr, s1, 0, p->seq1start-1); } else { ajFmtPrintF(outfile, "WARNING!\n"); ajFmtPrintF(outfile, "Neither sequence's overlap is at " "the start of the sequence\n"); if(p->seq1start > p->seq2start) { ajFmtPrintF(outfile, "Using %s 1-%d as the " "initial sequence\n\n", ajSeqGetNameC(seq1), p->seq1start); ajStrAssignSubS(&tmp, s1, 0, p->seq1start-1); ajStrFmtUpper(&tmp); ajStrAssignS(&seqstr, tmp); } else { ajFmtPrintF(outfile, "Using %s 1-%d as the initial " "sequence\n\n", ajSeqGetNameC(seq2), p->seq2start); ajStrAssignSubS(&tmp, s2, 0, p->seq2start-1); ajStrFmtUpper(&tmp); ajStrAssignS(&seqstr, tmp); } } } else { /* ** output the intervening mismatch between the previous ** matching region */ ajFmtPrintF(outfile, "\nWARNING!\nMismatch region found:\n"); if(prev1endseq1start) ajFmtPrintF(outfile, "Mismatch %s %d-%d\n", ajSeqGetNameC(seq1), prev1end+1, p->seq1start); else ajFmtPrintF(outfile, "Mismatch %s %d\n", ajSeqGetNameC(seq1), prev1end); if(prev2endseq2start) ajFmtPrintF(outfile, "Mismatch %s %d-%d\n", ajSeqGetNameC(seq2), prev2end+1, p->seq2start); else ajFmtPrintF(outfile, "Mismatch %s %d\n", ajSeqGetNameC(seq2), prev2end); if(prefer) { /* use sequence 1 as the 'correct' one */ ajStrAssignSubS(&tmp, s1, prev1end, p->seq1start-1); ajStrFmtUpper(&tmp); ajStrAppendS(&seqstr, tmp); } else { /* ** use the sequence where the mismatch is furthest ** from the end as the 'correct' one */ mid1 = (prev1end + p->seq1start-1)/2; mid2 = (prev2end + p->seq2start-1)/2; /* is the mismatch closer to the ends of seq1 or seq2? */ if(AJMIN(mid1, ajSeqGetLen(seq1)-mid1-1) < AJMIN(mid2, ajSeqGetLen(seq2)-mid2-1)) { ajFmtPrintF(outfile, "Mismatch is closer to the ends " "of %s, so use %s in the merged " "sequence\n\n", ajSeqGetNameC(seq1), ajSeqGetNameC(seq2)); if(prev2end < p->seq2start) { ajStrAssignSubS(&tmp, s2, prev2end, p->seq2start-1); ajStrFmtUpper(&tmp); ajStrAppendS(&seqstr, tmp); } } else { ajFmtPrintF(outfile, "Mismatch is closer to the ends of %s, " "so use %s in the merged sequence\n\n", ajSeqGetNameC(seq2), ajSeqGetNameC(seq1)); if(prev1end < p->seq1start) { ajStrAssignSubS(&tmp, s1, prev1end, p->seq1start-1); ajStrFmtUpper(&tmp); ajStrAppendS(&seqstr, tmp); } } } } /* output the match */ ajFmtPrintF(outfile, "Matching region %s %d-%d : %s %d-%d\n", ajSeqGetNameC(seq1), p->seq1start+1, p->seq1start + p->length, ajSeqGetNameC(seq2), p->seq2start+1, p->seq2start + p->length); ajFmtPrintF(outfile, "Length of match: %d\n", p->length); ajStrAppendSubS(&seqstr, s1, p->seq1start, p->seq1start + p->length-1); /* ** note the end positions (+1) to get the intervening region ** between matches */ prev1end = p->seq1start + p->length; prev2end = p->seq2start + p->length; } /* end of overlapping region */ ajFmtPrintF(outfile, "\n%s overlap ends at %d\n", ajSeqGetNameC(seq1), p->seq1start+p->length); ajFmtPrintF(outfile, "%s overlap ends at %d\n\n", ajSeqGetNameC(seq2), p->seq2start+p->length); /* is seq1 only longer than the matched regions? */ if(prev2end >= ajSeqGetLen(seq2) && prev1end < ajSeqGetLen(seq1)) { ajFmtPrintF(outfile, "Using %s %d-%d as the final " "sequence\n\n", ajSeqGetNameC(seq1), prev1end+1, ajSeqGetLen(seq1)); ajStrAppendSubS(&seqstr, s1, prev1end, ajSeqGetLen(seq1)-1); /* is seq2 only longer than the matched regions? */ } else if(prev1end >= ajSeqGetLen(seq1) && prev2end < ajSeqGetLen(seq2)) { ajFmtPrintF(outfile, "Using %s %d-%d as the final " "sequence\n\n", ajSeqGetNameC(seq2), prev2end+1, ajSeqGetLen(seq2)); ajStrAppendSubS(&seqstr, s2, prev2end, ajSeqGetLen(seq2)-1); /* both are longer! */ } else if(prev1end < ajSeqGetLen(seq1) && prev2end < ajSeqGetLen(seq2)) { ajFmtPrintF(outfile, "WARNING!\n"); ajFmtPrintF(outfile, "Neither sequence's overlap is at the " "end of the sequence\n"); if(ajSeqGetLen(seq1)-prev1end > ajSeqGetLen(seq2)-prev2end) { ajFmtPrintF(outfile, "Using %s %d-%d as the final " "sequence\n\n", ajSeqGetNameC(seq1), prev1end+1, ajSeqGetLen(seq1)); ajStrAssignSubS(&tmp, s1, prev1end, ajSeqGetLen(seq1)-1); ajStrFmtUpper(&tmp); ajStrAppendS(&seqstr, tmp); } else { ajFmtPrintF(outfile, "Using %s %d-%d as the final " "sequence\n\n", ajSeqGetNameC(seq2), prev2end+1, ajSeqGetLen(seq2)); ajStrAssignSubS(&tmp, s2, prev2end, ajSeqGetLen(seq2)-1); ajStrFmtUpper(&tmp); ajStrAppendS(&seqstr, tmp); } } /* write out sequence at end */ seq = ajSeqNewSeq(seq1); ajSeqAssignSeqS(seq, seqstr); ajSeqoutWriteSeq(seqout, seq); ajSeqDel(&seq); ajStrDel(&s1); ajStrDel(&s2); ajStrDel(&tmp); ajStrDel(&seqstr); ajListIterDel(&iter); return; }