package align2; import java.util.Arrays; import stream.SiteScore; import dna.AminoAcid; import dna.ChromosomeArray; import dna.Data; import dna.Gene; /** * "P" for "Packed".
* Same as MSA2P, but the "prevState" field was removed. * Yields identical results to MSA2, but is faster. * For very long reads (over 2000bp) the score may overflow, so MSA2 should be used instead, * or the time field should be shrunk. */ public final class MultiStateAligner9fs { public static void main(String[] args){ byte[] read=args[0].getBytes(); byte[] ref=args[1].getBytes(); byte[] original=ref; boolean colorspace=false; if(args.length>2 && args[2].equalsIgnoreCase("cs")){ colorspace=true; read=AminoAcid.toColorspace(read); ref=AminoAcid.toColorspace(ref); } MultiStateAligner9fs msa=new MultiStateAligner9fs(read.length, ref.length, colorspace); System.out.println("Initial: "); for(int mode=0; mode0); rows=read.length; columns=refEndLoc-refStartLoc+1; if(/*read.length<40 || */false || minScore<=0 || columns>read.length+Tools.min(100, read.length)){ // assert(false) : minScore; return fillUnlimited(read, ref, refStartLoc, refEndLoc); } // final int BARRIER_I2=columns-BARRIER_I1; final int BARRIER_I2=rows-BARRIER_I1; final int BARRIER_D2=rows-BARRIER_D1; minScore-=100; //Increases quality trivially assert(rows<=maxRows) : "Check that values are in-bounds before calling this function: "+rows+", "+maxRows; assert(columns<=maxColumns) : "Check that values are in-bounds before calling this function: "+columns+", "+maxColumns; assert(refStartLoc>=0) : "Check that values are in-bounds before calling this function: "+refStartLoc; assert(refEndLocBADoff); //TODO: Actually, it needs to be substantially more. assert(subfloor=0; i--){ vertLimit[i]=Tools.max(vertLimit[i+1]-POINTSoff_MATCH2, floor); } horizLimit[columns]=minScore_off; for(int i=columns-1; i>=0; i--){ horizLimit[i]=Tools.max(horizLimit[i+1]-POINTSoff_MATCH2, floor); } for(int row=1; row<=rows; row++){ int colStart=minGoodCol; int colStop=maxGoodCol; minGoodCol=-1; maxGoodCol=-2; final int vlimit=vertLimit[row]; if(verbose2){ System.out.println(); System.out.println("row="+row); System.out.println("colStart="+colStart); System.out.println("colStop="+colStop); System.out.println("vlimit="+(vlimit>>SCOREOFFSET)+"\t"+(vlimit)); } if(colStart<0 || colStop1){ assert(row>0); packed[MODE_MATCH][row][colStart-1]=subfloor; packed[MODE_INS][row][colStart-1]=subfloor; packed[MODE_DEL][row][colStart-1]=subfloor; packed[MODE_SUB][row][colStart-1]=subfloor; } for(int col=colStart; col<=columns; col++){ if(verbose2){ System.out.println("\ncol "+col); } final byte call0=(row<2 ? (byte)'?' : read[row-2]); final byte call1=read[row-1]; final byte ref0=(col<2 ? (byte)'!' : ref[refStartLoc+col-2]); final byte ref1=ref[refStartLoc+col-1]; final boolean gap=(ref1==GAPC); assert(call1!=GAPC); // final boolean match=(read[row-1]==ref[refStartLoc+col-1]); // final boolean prevMatch=(row<2 || col<2 ? false : read[row-2]==ref[refStartLoc+col-2]); final boolean match=(call1==ref1 && ref1!='N'); final boolean prevMatch=(call0==ref0 && ref0!='N'); // System.err.println("") iterationsLimited++; final int limit=Tools.max(vlimit, horizLimit[col]); final int limit3=Tools.max(floor, (match ? limit-POINTSoff_MATCH2 : limit-POINTSoff_SUB3)); final int delNeeded=Tools.max(0, row-col-1); final int insNeeded=Tools.max(0, (rows-row)-(columns-col)-1); final int delPenalty=calcDelScoreOffset(delNeeded); final int insPenalty=calcInsScoreOffset(insNeeded); final int scoreFromMatch_MS=packed[MODE_MATCH][row-1][col-1]&SCOREMASK; final int scoreFromSub_MS=packed[MODE_SUB][row-1][col-1]&SCOREMASK; final int scoreFromDel_MS=packed[MODE_DEL][row-1][col-1]&SCOREMASK; final int scoreFromIns_MS=packed[MODE_INS][row-1][col-1]&SCOREMASK; final int scoreFromDiag_DEL=Tools.max(packed[MODE_MATCH][row][col-1]&SCOREMASK, packed[MODE_SUB][row][col-1]&SCOREMASK); final int scoreFromDel_DEL=packed[MODE_DEL][row][col-1]&SCOREMASK; final int scoreFromDiag_INS=Tools.max(packed[MODE_MATCH][row-1][col]&SCOREMASK, packed[MODE_SUB][row-1][col]&SCOREMASK); final int scoreFromIns_INS=packed[MODE_INS][row-1][col]&SCOREMASK; // if(scoreFromDiag_MS=scoreD && scoreMS>=scoreI){ score=scoreMS; time=(prevMatch ? streak+1 : 1); prevState=MODE_MATCH; }else if(scoreD>=scoreI){ score=scoreD; time=1; prevState=MODE_DEL; }else{ score=scoreI; time=1; prevState=MODE_INS; } // if(time>MAX_TIME){time=MAX_TIME-MASK5;} // assert(score>=MINoff_SCORE) : "Score overflow - use MSA2 instead"; // assert(score<=MAXoff_SCORE) : "Score overflow - use MSA2 instead"; //// packed[MODE_MS][row][col]=(score|prevState|time); // packed[MODE_MS][row][col]=(score|time); // assert((score&SCOREMASK)==score); //// assert((prevState&MODEMASK)==prevState); // assert((time&TIMEMASK)==time); final int limit2; if(delNeeded>0){ limit2=limit-delPenalty; }else if(insNeeded>0){ limit2=limit-insPenalty; }else{ limit2=limit; } assert(limit2>=limit); if(verbose2){System.err.println("MS: \tlimit2="+(limit2>>SCOREOFFSET)+"\t, score="+(score>>SCOREOFFSET));} if(score>=limit2){ maxGoodCol=col; if(minGoodCol<0){minGoodCol=col;} }else{ score=subfloor; } if(time>MAX_TIME){time=MAX_TIME-MASK5;} assert(score>=MINoff_SCORE || score==BADoff) : "Score overflow - use MSA2 instead"; assert(score<=MAXoff_SCORE) : "Score overflow - use MSA2 instead"; // packed[MODE_MS][row][col]=(score|prevState|time); packed[MODE_MATCH][row][col]=(score|time); assert((score&SCOREMASK)==score); // assert((prevState&MODEMASK)==prevState); assert((time&TIMEMASK)==time); } } if(gap || (scoreFromMatch_MS<=limit3 && scoreFromSub_MS<=limit3 && scoreFromDel_MS<=limit3 && scoreFromIns_MS<=limit3)){ packed[MODE_SUB][row][col]=subfloor; }else{//Calculate match and sub scores final int streak=(packed[MODE_SUB][row-1][col-1]&TIMEMASK); {//Calculate match/sub score int score; int time; byte prevState; int scoreM=match ? subfloor : scoreFromMatch_MS+POINTSoff_MATCHSUB; int scoreS=scoreFromSub_MS+(streak==0 ? POINTSoff_SUB : streak=scoreD && scoreMS>=scoreI){ score=scoreMS; time=(prevMatch ? 1 : streak+1); // time=(prevMatch ? (streak==1 ? 3 : 1) : streak+1); prevState=MODE_MATCH; }else if(scoreD>=scoreI){ score=scoreD; time=1; prevState=MODE_DEL; }else{ score=scoreI; time=1; prevState=MODE_INS; } // if(time>MAX_TIME){time=MAX_TIME-MASK5;} // assert(score>=MINoff_SCORE) : "Score overflow - use MSA2 instead"; // assert(score<=MAXoff_SCORE) : "Score overflow - use MSA2 instead"; //// packed[MODE_MS][row][col]=(score|prevState|time); // packed[MODE_MS][row][col]=(score|time); // assert((score&SCOREMASK)==score); //// assert((prevState&MODEMASK)==prevState); // assert((time&TIMEMASK)==time); final int limit2; if(delNeeded>0){ limit2=limit-delPenalty; }else if(insNeeded>0){ limit2=limit-insPenalty; }else{ limit2=limit; } assert(limit2>=limit); if(verbose2){System.err.println("MS: \tlimit2="+(limit2>>SCOREOFFSET)+"\t, score="+(score>>SCOREOFFSET));} if(score>=limit2){ maxGoodCol=col; if(minGoodCol<0){minGoodCol=col;} }else{ score=subfloor; } if(time>MAX_TIME){time=MAX_TIME-MASK5;} assert(score>=MINoff_SCORE || score==BADoff) : "Score overflow - use MSA2 instead"; assert(score<=MAXoff_SCORE) : "Score overflow - use MSA2 instead"; // packed[MODE_MS][row][col]=(score|prevState|time); packed[MODE_MATCH][row][col]=(score|time); assert((score&SCOREMASK)==score); // assert((prevState&MODEMASK)==prevState); assert((time&TIMEMASK)==time); */ } } if((scoreFromDiag_DEL<=limit && scoreFromDel_DEL<=limit) || rowBARRIER_D2){ // assert((scoreFromDiag_DEL<=limit && scoreFromDel_DEL<=limit)) : scoreFromDiag_DEL+", "+row; packed[MODE_DEL][row][col]=subfloor; }else{//Calculate DEL score final int streak=packed[MODE_DEL][row][col-1]&TIMEMASK; int scoreMS=scoreFromDiag_DEL+POINTSoff_DEL; int scoreD=scoreFromDel_DEL+(streak==0 ? POINTSoff_DEL : streak=scoreD){ score=scoreMS; time=1; prevState=MODE_MATCH; }else{ score=scoreD; time=streak+1; prevState=MODE_DEL; } final int limit2; if(insNeeded>0){ limit2=limit-insPenalty; }else if(delNeeded>0){ limit2=limit-calcDelScoreOffset(time+delNeeded)+calcDelScoreOffset(time); }else{ limit2=limit; } assert(limit2>=limit); if(verbose2){System.err.println("DEL: \tlimit2="+(limit2>>SCOREOFFSET)+"\t, score="+(score>>SCOREOFFSET));} if(score>=limit2){ maxGoodCol=col; if(minGoodCol<0){minGoodCol=col;} }else{ score=subfloor; } if(time>MAX_TIME){time=MAX_TIME-MASK5;} assert(score>=MINoff_SCORE || score==BADoff) : "Score overflow - use MSA2 instead"; assert(score<=MAXoff_SCORE) : "Score overflow - use MSA2 instead"; // packed[MODE_DEL][row][col]=(score|prevState|time); packed[MODE_DEL][row][col]=(score|time); assert((score&SCOREMASK)==score); // assert((prevState&MODEMASK)==prevState); assert((time&TIMEMASK)==time); } // if(gap || (scoreFromDiag_INS<=limit && scoreFromIns_INS<=limit) || colBARRIER_I2){ if(gap || (scoreFromDiag_INS<=limit && scoreFromIns_INS<=limit) || rowBARRIER_I2){ packed[MODE_INS][row][col]=subfloor; }else{//Calculate INS score final int streak=packed[MODE_INS][row-1][col]&TIMEMASK; int scoreMS=scoreFromDiag_INS+POINTSoff_INS; // int scoreD=scoreFromDel+POINTSoff_INS; int scoreI=scoreFromIns_INS+(streak==0 ? POINTSoff_INS : streak=colStop){ if(col>colStop && maxGoodCol1){ packed[MODE_MATCH][row-1][col+1]=subfloor; packed[MODE_INS][row-1][col+1]=subfloor; packed[MODE_DEL][row-1][col+1]=subfloor; } } } } int maxCol=-1; int maxState=-1; int maxScore=Integer.MIN_VALUE; for(int state=0; statemaxScore){ maxScore=x; maxCol=col; maxState=state; } } } assert(maxScore>=BADoff); // if(maxScore==BADoff){ // return null; // } // if(maxScore>=SCOREOFFSET; // System.err.println("Returning "+rows+", "+maxCol+", "+maxState+", "+maxScore); return new int[] {rows, maxCol, maxState, maxScore}; } /** return new int[] {rows, maxC, maxS, max}; * Will not fill areas that cannot match minScore */ public final int[] fillUnlimited(byte[] read, byte[] ref, int refStartLoc, int refEndLoc, int[] gaps){ if(gaps==null){return fillUnlimited(read, ref, refStartLoc, refEndLoc);} else{ byte[] gref=makeGref(ref, gaps, refStartLoc, refEndLoc); assert(gref!=null) : "Excessively long read:\n"+new String(read); return fillUnlimited(read, gref, 0, greflimit); } } /** return new int[] {rows, maxC, maxS, max}; * Does not require a min score (ie, same as old method) */ private final int[] fillUnlimited(byte[] read, byte[] ref, int refStartLoc, int refEndLoc){ rows=read.length; columns=refEndLoc-refStartLoc+1; final int maxGain=(read.length-1)*POINTSoff_MATCH2+POINTSoff_MATCH; final int subfloor=0-2*maxGain; assert(subfloor>BADoff && subfloor*2>BADoff); //TODO: Actually, it needs to be substantially more. // final int BARRIER_I2=columns-BARRIER_I1; final int BARRIER_I2=rows-BARRIER_I1; final int BARRIER_D2=rows-BARRIER_D1; assert(rows<=maxRows) : "Check that values are in-bounds before calling this function: "+rows+", "+maxRows; assert(columns<=maxColumns) : "Check that values are in-bounds before calling this function: "+columns+", "+maxColumns; assert(refStartLoc>=0) : "Check that values are in-bounds before calling this function: "+refStartLoc; assert(refEndLoc=scoreD && scoreMS>=scoreI){ score=scoreMS; time=(prevMatch ? streak+1 : 1); // prevState=MODE_MS; }else if(scoreD>=scoreI){ score=scoreD; time=1; // prevState=MODE_DEL; }else{ score=scoreI; time=1; // prevState=MODE_INS; } if(time>MAX_TIME){time=MAX_TIME-MASK5;} assert(score>=MINoff_SCORE) : "Score overflow - use MSA2 instead"; assert(score<=MAXoff_SCORE) : "Score overflow - use MSA2 instead"; // packed[MODE_MS][row][col]=(score|prevState|time); packed[MODE_MATCH][row][col]=(score|time); assert((score&SCOREMASK)==score); // assert((prevState&MODEMASK)==prevState); assert((time&TIMEMASK)==time); }else{ int scoreMS; if(ref1!='N' && call1!='N'){ scoreMS=scoreFromDiag+(prevMatch ? (streak<=1 ? POINTSoff_SUBR : POINTSoff_SUB) : (streak==0 ? POINTSoff_SUB : streak=scoreD && scoreMS>=scoreI){ score=scoreMS; time=(prevMatch ? 1 : streak+1); // time=(prevMatch ? (streak==1 ? 3 : 1) : streak+1); prevState=MODE_MATCH; }else if(scoreD>=scoreI){ score=scoreD; time=1; prevState=MODE_DEL; }else{ score=scoreI; time=1; prevState=MODE_INS; } if(time>MAX_TIME){time=MAX_TIME-MASK5;} assert(score>=MINoff_SCORE) : "Score overflow - use MSA2 instead"; assert(score<=MAXoff_SCORE) : "Score overflow - use MSA2 instead"; // packed[MODE_MS][row][col]=(score|prevState|time); packed[MODE_MATCH][row][col]=(score|time); assert((score&SCOREMASK)==score); // assert((prevState&MODEMASK)==prevState); assert((time&TIMEMASK)==time); } } } if(rowBARRIER_D2){ packed[MODE_DEL][row][col]=subfloor; }else{//Calculate DEL score final int streak=packed[MODE_DEL][row][col-1]&TIMEMASK; final int scoreFromDiag=packed[MODE_MATCH][row][col-1]&SCOREMASK; final int scoreFromDel=packed[MODE_DEL][row][col-1]&SCOREMASK; int scoreMS=scoreFromDiag+POINTSoff_DEL; int scoreD=scoreFromDel+(streak==0 ? POINTSoff_DEL : streak=scoreD){ score=scoreMS; time=1; prevState=MODE_MATCH; }else{ score=scoreD; time=streak+1; prevState=MODE_DEL; } if(time>MAX_TIME){time=MAX_TIME-MASK5;} assert(score>=MINoff_SCORE) : "Score overflow - use MSA2 instead"; assert(score<=MAXoff_SCORE) : "Score overflow - use MSA2 instead"; // packed[MODE_DEL][row][col]=(score|prevState|time); packed[MODE_DEL][row][col]=(score|time); assert((score&SCOREMASK)==score); // assert((prevState&MODEMASK)==prevState); assert((time&TIMEMASK)==time); } //Calculate INS score // if(gap || colBARRIER_I2){ if(gap || rowBARRIER_I2){ packed[MODE_INS][row][col]=subfloor; }else{//Calculate INS score final int streak=packed[MODE_INS][row-1][col]&TIMEMASK; final int scoreFromDiag=packed[MODE_MATCH][row-1][col]&SCOREMASK; final int scoreFromIns=packed[MODE_INS][row-1][col]&SCOREMASK; int scoreMS=scoreFromDiag+POINTSoff_INS; // int scoreD=scoreFromDel+POINTSoff_INS; int scoreI=scoreFromIns+(streak==0 ? POINTSoff_INS : streakmaxScore){ maxScore=x; maxCol=col; maxState=state; } } } maxScore>>=SCOREOFFSET; // System.err.println("Returning "+rows+", "+maxCol+", "+maxState+", "+maxScore); return new int[] {rows, maxCol, maxState, maxScore}; } @Deprecated /** return new int[] {rows, maxC, maxS, max}; */ public final int[] fillQ(byte[] read, byte[] ref, byte[] baseScores, int refStartLoc, int refEndLoc){ assert(false) : "Needs to be redone to work with score cutoffs. Not difficult."; rows=read.length; columns=refEndLoc-refStartLoc+1; assert(rows<=maxRows) : "Check that values are in-bounds before calling this function: "+rows+", "+maxRows; assert(columns<=maxColumns) : "Check that values are in-bounds before calling this function: "+columns+", "+maxColumns; assert(refStartLoc>=0) : "Check that values are in-bounds before calling this function: "+refStartLoc; assert(refEndLoc=scoreD && scoreMS>=scoreI){ score=scoreMS; time=(prevMatch ? streak+1 : 1); // prevState=MODE_MS; }else if(scoreD>=scoreI){ score=scoreD; time=1; // prevState=MODE_DEL; }else{ score=scoreI; time=1; // prevState=MODE_INS; } score+=(((int)baseScores[row-1])<MAX_TIME){time=MAX_TIME-MASK5;} assert(score>=MINoff_SCORE) : "Score overflow - use MSA2 instead"; assert(score<=MAXoff_SCORE) : "Score overflow - use MSA2 instead"; // packed[MODE_MS][row][col]=(score|prevState|time); packed[MODE_MATCH][row][col]=(score|time); assert((score&SCOREMASK)==score); // assert((prevState&MODEMASK)==prevState); assert((time&TIMEMASK)==time); }else{ int scoreMS=scoreFromDiag+(prevMatch ? (streak<=1 ? POINTSoff_SUBR : POINTSoff_SUB) : (streak==0 ? POINTSoff_SUB : streak=scoreD && scoreMS>=scoreI){ score=scoreMS; time=(prevMatch ? 1 : streak+1); // time=(prevMatch ? (streak==1 ? 3 : 1) : streak+1); prevState=MODE_MATCH; }else if(scoreD>=scoreI){ score=scoreD; time=1; prevState=MODE_DEL; }else{ score=scoreI; time=1; prevState=MODE_INS; } if(time>MAX_TIME){time=MAX_TIME-MASK5;} assert(score>=MINoff_SCORE) : "Score overflow - use MSA2 instead"; assert(score<=MAXoff_SCORE) : "Score overflow - use MSA2 instead"; // packed[MODE_MS][row][col]=(score|prevState|time); packed[MODE_MATCH][row][col]=(score|time); assert((score&SCOREMASK)==score); // assert((prevState&MODEMASK)==prevState); assert((time&TIMEMASK)==time); } } } {//Calculate DEL score final int streak=packed[MODE_DEL][row][col-1]&TIMEMASK; final int scoreFromDiag=packed[MODE_MATCH][row][col-1]&SCOREMASK; final int scoreFromDel=packed[MODE_DEL][row][col-1]&SCOREMASK; int scoreMS=scoreFromDiag+POINTSoff_DEL; int scoreD=scoreFromDel+(streak==0 ? POINTSoff_DEL : streak=scoreD){ score=scoreMS; time=1; prevState=MODE_MATCH; }else{ score=scoreD; time=streak+1; prevState=MODE_DEL; } if(time>MAX_TIME){time=MAX_TIME-MASK5;} assert(score>=MINoff_SCORE) : "Score overflow - use MSA2 instead"; assert(score<=MAXoff_SCORE) : "Score overflow - use MSA2 instead"; // packed[MODE_DEL][row][col]=(score|prevState|time); packed[MODE_DEL][row][col]=(score|time); assert((score&SCOREMASK)==score); // assert((prevState&MODEMASK)==prevState); assert((time&TIMEMASK)==time); } {//Calculate INS score final int streak=packed[MODE_INS][row-1][col]&TIMEMASK; final int scoreFromDiag=packed[MODE_MATCH][row-1][col]&SCOREMASK; final int scoreFromIns=packed[MODE_INS][row-1][col]&SCOREMASK; int scoreMS=scoreFromDiag+POINTSoff_INS; // int scoreD=scoreFromDel+POINTSoff_INS; int scoreI=scoreFromIns+(streak==0 ? POINTSoff_INS : streakmaxScore){ maxScore=x; maxCol=col; maxState=state; } } } maxScore>>=SCOREOFFSET; // System.err.println("Returning "+rows+", "+maxCol+", "+maxState+", "+maxScore); return new int[] {rows, maxCol, maxState, maxScore}; } /** @return {score, bestRefStart, bestRefStop} */ /** Generates the match string */ public final byte[] traceback(byte[] read, byte[] ref, int refStartLoc, int refEndLoc, int row, int col, int state, boolean gapped){ if(gapped){ final byte[] gref=grefbuffer; int gstart=translateToGappedCoordinate(refStartLoc, gref); int gstop=translateToGappedCoordinate(refEndLoc, gref); byte[] out=traceback2(read, gref, gstart, gstop, row, col, state); return out; }else{ return traceback2(read, ref, refStartLoc, refEndLoc, row, col, state); } } /** Generates the match string */ public final byte[] traceback2(byte[] read, byte[] ref, int refStartLoc, int refEndLoc, int row, int col, int state){ // assert(false); assert(refStartLoc<=refEndLoc) : refStartLoc+", "+refEndLoc; assert(row==rows); byte[] out=new byte[row+col-1]; //TODO if an out of bound crash occurs, try removing the "-1". int outPos=0; int gaps=0; if(state==MODE_INS){ //TODO ? Maybe not needed. } while(row>0 && col>0){ // byte prev0=(byte)(packed[state][row][col]&MODEMASK); final int time=packed[state][row][col]&TIMEMASK; final byte prev; // System.err.println("state="+state+", prev="+prev+", row="+row+", col="+col+", score="+scores[state][row][col]); if(state==MODE_MATCH){ if(time>1){prev=(byte)state;} else{ final int scoreFromDiag=packed[MODE_MATCH][row-1][col-1]&SCOREMASK; final int scoreFromDel=packed[MODE_DEL][row-1][col-1]&SCOREMASK; final int scoreFromIns=packed[MODE_INS][row-1][col-1]&SCOREMASK; if(scoreFromDiag>=scoreFromDel && scoreFromDiag>=scoreFromIns){prev=MODE_MATCH;} else if(scoreFromDel>=scoreFromIns){prev=MODE_DEL;} else{prev=MODE_INS;} } byte c=read[row-1]; byte r=ref[refStartLoc+col-1]; if(c==r){ out[outPos]='m'; }else{ if(!AminoAcid.isFullyDefined(c, colorspace)){ out[outPos]='N'; }else if(!AminoAcid.isFullyDefined(r, colorspace)){ // out[outPos]='X'; out[outPos]='N'; }else{ out[outPos]='S'; } } row--; col--; }else if(state==MODE_DEL){ if(time>1){prev=(byte)state;} else{ final int scoreFromDiag=packed[MODE_MATCH][row][col-1]&SCOREMASK; final int scoreFromDel=packed[MODE_DEL][row][col-1]&SCOREMASK; if(scoreFromDiag>=scoreFromDel){prev=MODE_MATCH;} else{prev=MODE_DEL;} } byte r=ref[refStartLoc+col-1]; if(r==GAPC){ out[outPos]='-'; gaps++; }else{ out[outPos]='D'; } col--; }else{ if(time>1){prev=(byte)state;} else{ final int scoreFromDiag=packed[MODE_MATCH][row-1][col]&SCOREMASK; final int scoreFromIns=packed[MODE_INS][row-1][col]&SCOREMASK; if(scoreFromDiag>=scoreFromIns){prev=MODE_MATCH;} else{prev=MODE_INS;} } assert(state==MODE_INS) : state; if(col==0){ out[outPos]='X'; }else if(col>=columns){ out[outPos]='Y'; }else{ out[outPos]='I'; } row--; } // assert(prev==prev0); state=prev; outPos++; } assert(row==0 || col==0); if(col!=row){ while(row>0){ out[outPos]='X'; outPos++; row--; col--; } if(col>0){ //do nothing } } //Shrink and reverse the string byte[] out2=new byte[outPos]; for(int i=0; i "+translateFromGappedCoordinate(out[1], gref)+" -> "+ translateToGappedCoordinate(translateFromGappedCoordinate(out[1], gref), gref); assert(out[2]==translateToGappedCoordinate(translateFromGappedCoordinate(out[2], gref), gref)); out[1]=translateFromGappedCoordinate(out[1], gref); out[2]=translateFromGappedCoordinate(out[2], gref); if(verbose){System.err.println("returning score "+Arrays.toString(out));} return out; }else{ return score2(read, ref, refStartLoc, refEndLoc, maxRow, maxCol, maxState); } } /** @return {score, bestRefStart, bestRefStop} */ public final int[] score2(final byte[] read, final byte[] ref, final int refStartLoc, final int refEndLoc, final int maxRow, final int maxCol, final int maxState){ int row=maxRow; int col=maxCol; int state=maxState; assert(maxState>=0 && maxState=0 && maxRow=0 && maxColdifC){ score+=POINTSoff_NOREF; difR--; } row+=difR; col+=difR; } assert(refStartLoc<=refEndLoc); assert(row==rows); final int bestRefStop=refStartLoc+col-1; while(row>0 && col>0){ // System.err.println("state="+state+", row="+row+", col="+col); // byte prev0=(byte)(packed[state][row][col]&MODEMASK); final int time=packed[state][row][col]&TIMEMASK; final byte prev; if(state==MODE_MATCH){ if(time>1){prev=(byte)state;} else{ final int scoreFromDiag=packed[MODE_MATCH][row-1][col-1]&SCOREMASK; final int scoreFromDel=packed[MODE_DEL][row-1][col-1]&SCOREMASK; final int scoreFromIns=packed[MODE_INS][row-1][col-1]&SCOREMASK; if(scoreFromDiag>=scoreFromDel && scoreFromDiag>=scoreFromIns){prev=MODE_MATCH;} else if(scoreFromDel>=scoreFromIns){prev=MODE_DEL;} else{prev=MODE_INS;} } row--; col--; }else if(state==MODE_DEL){ if(time>1){prev=(byte)state;} else{ final int scoreFromDiag=packed[MODE_MATCH][row][col-1]&SCOREMASK; final int scoreFromDel=packed[MODE_DEL][row][col-1]&SCOREMASK; if(scoreFromDiag>=scoreFromDel){prev=MODE_MATCH;} else{prev=MODE_DEL;} } col--; }else{ assert(state==MODE_INS); if(time>1){prev=(byte)state;} else{ final int scoreFromDiag=packed[MODE_MATCH][row-1][col]&SCOREMASK; final int scoreFromIns=packed[MODE_INS][row-1][col]&SCOREMASK; if(scoreFromDiag>=scoreFromIns){prev=MODE_MATCH;} else{prev=MODE_INS;} } row--; } if(col<0){ System.err.println(row); break; //prevents an out of bounds access } // assert(prev==prev0); state=prev; // System.err.println("state2="+state+", row2="+row+", col2="+col+"\n"); } // assert(false) : row+", "+col; if(row>col){ col-=row; } final int bestRefStart=refStartLoc+col; score>>=SCOREOFFSET; int[] rvec; if(bestRefStartrefEndLoc){ //Suggest extra padding in cases of overflow int padLeft=Tools.max(0, refStartLoc-bestRefStart); int padRight=Tools.max(0, bestRefStop-refEndLoc); rvec=new int[] {score, bestRefStart, bestRefStop, padLeft, padRight}; }else{ rvec=new int[] {score, bestRefStart, bestRefStop}; } return rvec; } /** Will not fill areas that cannot match minScore. * @return {score, bestRefStart, bestRefStop} */ public final int[] fillAndScoreLimited(byte[] read, byte[] ref, int refStartLoc, int refEndLoc, int minScore, int[] gaps){ int a=Tools.max(0, refStartLoc); int b=Tools.min(ref.length-1, refEndLoc); assert(b>=a); int[] score; if(gaps==null){ if(verbose){ System.err.println("no gaps"); } if(b-a>=maxColumns){ System.err.println("Warning: Max alignment columns exceeded; restricting range. "+(b-a+1)+" > "+maxColumns); assert(false) : refStartLoc+", "+refEndLoc; b=Tools.min(ref.length-1, a+maxColumns-1); } int[] max=fillLimited(read, ref, a, b, minScore, gaps); score=(max==null ? null : score(read, ref, a, b, max[0], max[1], max[2], false)); }else{ if(verbose){System.err.println("\ngaps: "+Arrays.toString(gaps)+"\n"+new String(read)+"\ncoords: "+refStartLoc+", "+refEndLoc);} int[] max=fillLimited(read, ref, a, b, minScore, gaps); if(verbose){System.err.println("max: "+Arrays.toString(max));} // score=(max==null ? null : score(read, grefbuffer, 0, greflimit, max[0], max[1], max[2], true)); score=(max==null ? null : score(read, ref, a, b, max[0], max[1], max[2], true)); } return score; } public final int[] fillAndScoreLimited(byte[] read, SiteScore ss, int thresh, int minScore){ return fillAndScoreLimited(read, ss.chrom, ss.start, ss.stop, thresh, minScore, ss.gaps); } /* public final int[] fillAndScoreLimited_Gapped(byte[] read, SiteScore ss, int thresh, int minScore){ if(ss.gaps==null){return fillAndScoreLimited(read, ss.chrom, ss.start, ss.stop, thresh, minScore);} int[] gaps=ss.gaps; final int bound1=gaps[0]=Tools.min(ss.start, gaps[0]); final int bound2=gaps[gaps.length-1]=Tools.max(ss.stop, gaps[gaps.length-1]); //This block is no longer needed since the array is preallocated. int len=0; final int gb2=GAPBUFFER*2; for(int i=0; iy); int gap=z-y-1; if(gap=len) : ss+"\t"+len+"\t"+gref.length; ChromosomeArray cha=Data.getChromosome(ss.chrom); for(int i=0, j=0; iy); int gap=z-y-1; assert(gap>=MINGAP); if(gap0); int rem=gap%GAPLEN; int lim=y+GAPBUFFER; for(int r=y+1; r<=lim; r++, j++){ gref[j]=cha.get(r); } for(int g=0; g-9999); break; } if(refc!=GAPC){ j++; }else{ j+=GAPLEN; } } assert(rstart2>-9999 && rstop2>-9999); scoreArray[1]=rstart2; scoreArray[2]=rstop2; return scoreArray; }*/ /** * Fills grefbuffer * @param ref * @param a * @param b * @param gaps * @return gref */ public final byte[] makeGref(byte[] ref, int[] gaps, int refStartLoc, int refEndLoc){ assert(gaps!=null && gaps.length>0); assert(refStartLoc<=gaps[0]) : refStartLoc+", "+refEndLoc+", "+Arrays.toString(gaps); assert(refEndLoc>=gaps[gaps.length-1]); final int g0_old=gaps[0]; final int gN_old=gaps[gaps.length-1]; gaps[0]=Tools.min(gaps[0], refStartLoc); gaps[gaps.length-1]=Tools.max(gN_old, refEndLoc); grefRefOrigin=gaps[0]; if(verbose){System.err.println("\ngaps2: "+Arrays.toString(gaps));} // grefRefOrigin=Tools.min(gaps[0], refStartLoc); // //This block is no longer needed since the array is preallocated. // int len=0; // final int gb2=GAPBUFFER*2; // for(int i=0; iy); // int gap=z-y-1; // if(gapy); int gap=z-y-1; assert(gap>=MINGAP) : gap+"\t"+MINGAP; if(gap "+gref.length); return null; } for(int i=greflimit, r=refEndLoc+1; i "+j);} return j; } j+=(c==GAPC ? GAPLEN : 1); // if(c!=GAPC){j++;} // else{j+=GAPLEN;} } System.err.println(grefRefOrigin); System.err.println(point); System.err.println(new String(gref)); throw new RuntimeException("Out of bounds."); } private final int translateToGappedCoordinate(int point, byte[] gref){ if(verbose){System.err.println("translateToGappedCoordinate("+point+"), gro="+grefRefOrigin+", grl="+greflimit);} if(point<=grefRefOrigin){return point-grefRefOrigin;} for(int i=0, j=grefRefOrigin; i "+i);} return i; } j+=(c==GAPC ? GAPLEN : 1); // if(c!=GAPC){j++;} // else{j+=GAPLEN;} } System.err.println(grefRefOrigin); System.err.println(point); System.err.println(new String(gref)); throw new RuntimeException("Out of bounds."); } public final int[] fillAndScoreLimited(byte[] read, int chrom, int start, int stop, int thresh, int minScore, int[] gaps){ return fillAndScoreLimited(read, Data.getChromosome(chrom).array, start-thresh, stop+thresh, minScore, gaps); } @Deprecated public final int[] fillAndScoreQ(byte[] read, byte[] ref, int refStartLoc, int refEndLoc, byte[] baseScores){ int a=Tools.max(0, refStartLoc); int b=Tools.min(ref.length-1, refEndLoc); assert(b>=a); if(b-a>=maxColumns){ System.err.println("Warning: Max alignment columns exceeded; restricting range. "+(b-a+1)+" > "+maxColumns); b=Tools.min(ref.length-1, a+maxColumns-1); } int[] max=fillQ(read, ref, baseScores, a, b); // int[] score=score(read, ref, a, b, max[0], max[1], max[2]); // return score; return null; } @Deprecated public final int[] fillAndScoreQ(byte[] read, SiteScore ss, int thresh, byte[] baseScores){ return fillAndScoreQ(read, ss.chrom, ss.start, ss.stop, thresh, baseScores); } @Deprecated public final int[] fillAndScoreQ(byte[] read, int chrom, int start, int stop, int thresh, byte[] baseScores){ return fillAndScoreQ(read, Data.getChromosome(chrom).array, start-thresh, stop+thresh, baseScores); } // public final int scoreNoIndels(byte[] read, SiteScore ss){ // // ChromosomeArray cha=Data.getChromosome(ss.chrom); // final int refStart=ss.start; // // int score=0; // int mode=MODE_START; // int timeInMode=0; // if(refStart<0 || refStart+read.length>cha.maxIndex+1){return -99999;} //TODO: Partial match // // for(int i=0; icha.maxIndex+1){ // int dif=(cha.maxIndex+1-refStop); // readStop-=dif; // score+=POINTSoff_NOREF*dif; // } // //// if(refStart<0 || refStart+read.length>cha.maxIndex+1){return -99999;} //No longer needed. // // for(int i=readStart; i0){//match if(loc==lastValue){//contiguous match score+=POINTS_MATCH2; }else if(loc==lastLoc || lastLoc<0){//match after a sub, or first match score+=POINTS_MATCH; }else if(loc=0); score+=POINTS_MATCH; score+=POINTS_DEL; int dif=lastLoc-loc+1; if(dif>MINGAP){ int rem=dif%GAPLEN; int div=(dif-GAPBUFFER2)/GAPLEN; score+=(div*POINTS_GAP); assert(rem+GAPBUFFER2LIMIT_FOR_COST_4); //and probably LIMIT_FOR_COST_5 // assert(false) : div; } if(dif>LIMIT_FOR_COST_5){ score+=((dif-LIMIT_FOR_COST_5+MASK5)/TIMESLIP)*POINTS_DEL5; dif=LIMIT_FOR_COST_5; } if(dif>LIMIT_FOR_COST_4){ score+=(dif-LIMIT_FOR_COST_4)*POINTS_DEL4; dif=LIMIT_FOR_COST_4; } if(dif>LIMIT_FOR_COST_3){ score+=(dif-LIMIT_FOR_COST_3)*POINTS_DEL3; dif=LIMIT_FOR_COST_3; } if(dif>1){ score+=(dif-1)*POINTS_DEL2; } timeInMode=1; }else if(loc>lastLoc){//insertion assert(lastLoc>=0); score+=POINTS_MATCH; score+=POINTS_INS; int dif=Tools.min(loc-lastLoc+1, 5); assert(dif>0); if(dif>LIMIT_FOR_COST_4){ score+=(dif-LIMIT_FOR_COST_4)*POINTS_INS4; dif=LIMIT_FOR_COST_4; } if(dif>LIMIT_FOR_COST_3){ score+=(dif-LIMIT_FOR_COST_3)*POINTS_INS3; dif=LIMIT_FOR_COST_3; } if(dif>1){ score+=(dif-1)*POINTS_INS2; } timeInMode=1; }else{ assert(false); } lastLoc=loc; }else{//substitution if(lastValue<0 && timeInMode>0){//contiguous if(timeInMode0){//match if(loc==lastValue){//contiguous match score+=(POINTS_MATCH2+baseScores[i]); }else if(loc==lastLoc || lastLoc<0){//match after a sub, or first match score+=(POINTS_MATCH+baseScores[i]); }else if(loc=0); score+=(POINTS_MATCH+baseScores[i]); score+=POINTS_DEL; int dif=lastLoc-loc+1; if(dif>MINGAP){ int rem=dif%GAPLEN; int div=(dif-GAPBUFFER2)/GAPLEN; score+=(div*POINTS_GAP); assert(rem+GAPBUFFER2LIMIT_FOR_COST_4); //and probably LIMIT_FOR_COST_5 // assert(false) : div; } if(dif>LIMIT_FOR_COST_5){ score+=((dif-LIMIT_FOR_COST_5+MASK5)/TIMESLIP)*POINTS_DEL5; dif=LIMIT_FOR_COST_5; } if(dif>LIMIT_FOR_COST_4){ score+=(dif-LIMIT_FOR_COST_4)*POINTS_DEL4; dif=LIMIT_FOR_COST_4; } if(dif>LIMIT_FOR_COST_3){ score+=(dif-LIMIT_FOR_COST_3)*POINTS_DEL3; dif=LIMIT_FOR_COST_3; } if(dif>1){ score+=(dif-1)*POINTS_DEL2; } timeInMode=1; }else if(loc>lastLoc){//insertion assert(lastLoc>=0); score+=(POINTS_MATCH+baseScores[i]); score+=POINTS_INS; int dif=Tools.min(loc-lastLoc+1, 5); assert(dif>0); if(dif>LIMIT_FOR_COST_4){ score+=(dif-LIMIT_FOR_COST_4)*POINTS_INS4; dif=LIMIT_FOR_COST_4; } if(dif>LIMIT_FOR_COST_3){ score+=(dif-LIMIT_FOR_COST_3)*POINTS_INS3; dif=LIMIT_FOR_COST_3; } if(dif>1){ score+=(dif-1)*POINTS_INS2; } timeInMode=1; }else{ assert(false); } lastLoc=loc; }else{//substitution if(lastValue<0 && timeInMode>0){//contiguous if(timeInModeref.length){ int dif=(refStop-ref.length); readStop-=dif; score+=POINTS_NOREF*dif; } // if(refStart<0 || refStart+read.length>ref.length){return -99999;} //No longer needed. for(int i=readStart; iref.length){ int dif=(refStop-ref.length); readStop-=dif; score+=POINTS_NOREF*dif; } // if(refStart<0 || refStart+read.length>ref.length){return -99999;} //No longer needed. for(int i=readStart; iref.length){ int dif=(refStop-ref.length); System.err.println("dif="+dif+", ref.length="+ref.length+", refStop="+refStop); readStop-=dif; score+=POINTS_NOREF*dif; } assert(refStart+readStop<=ref.length) : "readStart="+readStart+", readStop="+readStop+ ", refStart="+refStart+", refStop="+refStop+", ref.length="+ref.length+", read.length="+read.length; assert(matchReturn!=null); assert(matchReturn.length==1); if(matchReturn[0]==null || matchReturn[0].length!=read.length){ assert(matchReturn[0]==null || matchReturn[0].lengthref.length){ int dif=(refStop-ref.length); System.err.println("dif="+dif+", ref.length="+ref.length+", refStop="+refStop); readStop-=dif; score+=POINTS_NOREF*dif; } assert(refStart+readStop<=ref.length) : "readStart="+readStart+", readStop="+readStop+ ", refStart="+refStart+", refStop="+refStop+", ref.length="+ref.length+", read.length="+read.length; assert(matchReturn!=null); assert(matchReturn.length==1); if(matchReturn[0]==null || matchReturn[0].length!=read.length){ assert(matchReturn[0]==null || matchReturn[0].length=0) : width+", "+s.length()+", "+s+", "+num+", "+spaces; for(int i=0; i=0) : width+", "+s.length()+", "+s+", "+num+", "+spaces; for(int i=0; i>SCOREOFFSET; String s=" "+num; if(s.length()>width){s=num>0 ? maxString : minString;} int spaces=width-s.length(); assert(spaces>=0) : width+", "+s.length()+", "+s+", "+num+", "+spaces; for(int i=0; i=0); for(int i=0; iLIMIT_FOR_COST_5){ score+=((len-LIMIT_FOR_COST_5+MASK5)/TIMESLIP)*POINTS_DEL5; len=LIMIT_FOR_COST_5; } if(len>LIMIT_FOR_COST_4){ score+=(len-LIMIT_FOR_COST_4)*POINTS_DEL4; len=LIMIT_FOR_COST_4; } if(len>LIMIT_FOR_COST_3){ score+=(len-LIMIT_FOR_COST_3)*POINTS_DEL3; len=LIMIT_FOR_COST_3; } if(len>1){ score+=(len-1)*POINTS_DEL2; } return score; } private static int calcDelScoreOffset(int len){ if(len<=0){return 0;} int score=POINTSoff_DEL; if(len>LIMIT_FOR_COST_5){ score+=((len-LIMIT_FOR_COST_5+MASK5)/TIMESLIP)*POINTSoff_DEL5; len=LIMIT_FOR_COST_5; } if(len>LIMIT_FOR_COST_4){ score+=(len-LIMIT_FOR_COST_4)*POINTSoff_DEL4; len=LIMIT_FOR_COST_4; } if(len>LIMIT_FOR_COST_3){ score+=(len-LIMIT_FOR_COST_3)*POINTSoff_DEL3; len=LIMIT_FOR_COST_3; } if(len>1){ score+=(len-1)*POINTSoff_DEL2; } return score; } public static int calcInsScore(int len){ if(len<=0){return 0;} int score=POINTS_INS; if(len>LIMIT_FOR_COST_4){ score+=(len-LIMIT_FOR_COST_4)*POINTS_INS4; len=LIMIT_FOR_COST_4; } if(len>LIMIT_FOR_COST_3){ score+=(len-LIMIT_FOR_COST_3)*POINTS_INS3; len=LIMIT_FOR_COST_3; } if(len>1){ score+=(len-1)*POINTS_INS2; } return score; } private static int calcInsScoreOffset(int len){ if(len<=0){return 0;} int score=POINTSoff_INS; if(len>LIMIT_FOR_COST_4){ score+=(len-LIMIT_FOR_COST_4)*POINTSoff_INS4; len=LIMIT_FOR_COST_4; } if(len>LIMIT_FOR_COST_3){ score+=(len-LIMIT_FOR_COST_3)*POINTSoff_INS3; len=LIMIT_FOR_COST_3; } if(len>1){ score+=(len-1)*POINTSoff_INS2; } return score; } public final int maxRows; public final int maxColumns; private final int[][][] packed; private final byte[] grefbuffer; private int greflimit=-1; private int greflimit2=-1; private int grefRefOrigin=-1; public static final int GAPBUFFER=Shared.GAPBUFFER; public static final int GAPBUFFER2=Shared.GAPBUFFER2; public static final int GAPLEN=Shared.GAPLEN; public static final int MINGAP=Shared.MINGAP; public static final int GAPCOST=Shared.GAPCOST; public static final byte GAPC=Shared.GAPC; private static final int GREFLIMIT2_CUSHION=128; //Tools.max(GAPBUFFER2, GAPLEN); /**DO NOT MODIFY*/ public final byte[] getGrefbuffer(){ return grefbuffer; } public final int[] vertLimit; public final int[] horizLimit; CharSequence showVertLimit(){ StringBuilder sb=new StringBuilder(); for(int i=0; i<=rows; i++){sb.append(vertLimit[i]>>SCOREOFFSET).append(",");} return sb; } CharSequence showHorizLimit(){ StringBuilder sb=new StringBuilder(); for(int i=0; i<=columns; i++){sb.append(horizLimit[i]>>SCOREOFFSET).append(",");} return sb; } // public static final int MODEBITS=2; public static final int TIMEBITS=12; public static final int SCOREBITS=32-TIMEBITS; public static final int MAX_TIME=((1<