package align2; import java.util.Arrays; import stream.SiteScore; import dna.AminoAcid; /** * "P" for "Packed".
* Same as MSA2P, but the "prevState" field was removed. * Yields identical results to MSA2, but is faster. * For very long reads (over 2000bp) the score may overflow, so MSA2 should be used instead, * or the time field should be shrunk. */ public final class MultiStateAligner9ts extends MSA{ public static void main(String[] args){ byte[] read=args[0].getBytes(); byte[] ref=args[1].getBytes(); byte[] original=ref; boolean colorspace=false; if(args.length>2 && args[2].equalsIgnoreCase("cs")){ colorspace=true; read=AminoAcid.toColorspace(read); ref=AminoAcid.toColorspace(ref); } MultiStateAligner9ts msa=new MultiStateAligner9ts(read.length, ref.length, colorspace); System.out.println("Initial: "); for(int mode=0; mode0); rows=read.length; columns=refEndLoc-refStartLoc+1; final int halfband=(bandwidth<1 && bandwidthRatio<=0) ? 0 : Tools.max(Tools.min(bandwidth<1 ? 9999999 : bandwidth, bandwidthRatio<=0 ? 9999999 : 8+(int)(rows*bandwidthRatio)), (columns-rows+8))/2; if(minScore<1 || (columns+rows<90) || ((halfband<1 || halfband*3>columns) && (columns>read.length+Tools.min(170, read.length+20)))){ // assert(false) : minScore; // assert(minScore>0) : minScore; // assert(false) : +minScore+", "+columns+", "+read.length+", "+Tools.min(100, read.length); return fillUnlimited(read, ref, refStartLoc, refEndLoc); } // final int BARRIER_I2=columns-BARRIER_I1; final int BARRIER_I2=rows-BARRIER_I1; final int BARRIER_D2=rows-BARRIER_D1; minScore-=120; //Increases quality trivially assert(rows<=maxRows) : "Check that values are in-bounds before calling this function: "+rows+", "+maxRows+"\n"+ refStartLoc+", "+refEndLoc+", "+rows+", "+maxRows+", "+columns+", "+maxColumns+"\n"+new String(read)+"\n"; assert(columns<=maxColumns) : "Check that values are in-bounds before calling this function: "+columns+", "+maxColumns+"\n"+ refStartLoc+", "+refEndLoc+", "+rows+", "+maxRows+", "+columns+", "+maxColumns+"\n"+new String(read)+"\n"; assert(refStartLoc>=0) : "Check that values are in-bounds before calling this function: "+refStartLoc+"\n"+ refStartLoc+", "+refEndLoc+", "+rows+", "+maxRows+", "+columns+", "+maxColumns+"\n"+new String(read)+"\n"; assert(refEndLocBADoff); //TODO: Actually, it needs to be substantially more. assert(subfloor=0; i--){ byte c=read[i]; if(AminoAcid.isFullyDefined(c)){ vertLimit[i]=Tools.max(vertLimit[i+1]-(prevDefined ? POINTSoff_MATCH2 : POINTSoff_MATCH), floor); prevDefined=true; }else{ vertLimit[i]=Tools.max(vertLimit[i+1]-POINTSoff_NOCALL, floor); prevDefined=false; } } horizLimit[columns]=minScore_off; prevDefined=false; for(int i=columns-1; i>=0; i--){ byte c=ref[refStartLoc+i]; if(AminoAcid.isFullyDefined(c)){ horizLimit[i]=Tools.max(horizLimit[i+1]-(prevDefined ? POINTSoff_MATCH2 : POINTSoff_MATCH), floor); prevDefined=true; }else{ horizLimit[i]=Tools.max(horizLimit[i+1]-(prevDefined && c==GAPC ? POINTSoff_DEL : POINTSoff_NOREF), floor); prevDefined=false; } } // vertLimit[rows]=minScore_off; // for(int i=rows-1; i>=0; i--){ // vertLimit[i]=Tools.max(vertLimit[i+1]-POINTSoff_MATCH2, floor); // } // // horizLimit[columns]=minScore_off; // for(int i=columns-1; i>=0; i--){ // horizLimit[i]=Tools.max(horizLimit[i+1]-POINTSoff_MATCH2, floor); // } for(int row=1; row<=rows; row++){ final int colStart=(halfband<1 ? minGoodCol : Tools.max(minGoodCol, row-halfband)); final int colStop=(halfband<1 ? maxGoodCol : Tools.min(maxGoodCol, row+halfband*2-1)); minGoodCol=-1; maxGoodCol=-2; final int vlimit=vertLimit[row]; if(verbose2){ System.out.println(); System.out.println("row="+row); System.out.println("colStart="+colStart); System.out.println("colStop="+colStop); System.out.println("vlimit="+(vlimit>>SCOREOFFSET)+"\t"+(vlimit)); } if(colStart<0 || colStop1){ assert(row>0); packed[MODE_MS][row][colStart-1]=subfloor; packed[MODE_INS][row][colStart-1]=subfloor; packed[MODE_DEL][row][colStart-1]=subfloor; } for(int col=colStart; col<=columns; col++){ if(verbose2){ System.out.println("\ncol "+col); } final byte call0=(row<2 ? (byte)'?' : read[row-2]); final byte call1=read[row-1]; final byte ref0=(col<2 ? (byte)'!' : ref[refStartLoc+col-2]); final byte ref1=ref[refStartLoc+col-1]; final boolean gap=(ref1==GAPC); assert(call1!=GAPC); // final boolean match=(read[row-1]==ref[refStartLoc+col-1]); // final boolean prevMatch=(row<2 || col<2 ? false : read[row-2]==ref[refStartLoc+col-2]); final boolean match=(call1==ref1 && ref1!='N'); final boolean prevMatch=(call0==ref0 && ref0!='N'); // System.err.println("") iterationsLimited++; final int limit=Tools.max(vlimit, horizLimit[col]); final int limit3=Tools.max(floor, (match ? limit-POINTSoff_MATCH2 : limit-POINTSoff_SUB3)); final int delNeeded=Tools.max(0, row-col-1); final int insNeeded=Tools.max(0, (rows-row)-(columns-col)-1); final int delPenalty=calcDelScoreOffset(delNeeded); final int insPenalty=calcInsScoreOffset(insNeeded); final int scoreFromDiag_MS=packed[MODE_MS][row-1][col-1]&SCOREMASK; final int scoreFromDel_MS=packed[MODE_DEL][row-1][col-1]&SCOREMASK; final int scoreFromIns_MS=packed[MODE_INS][row-1][col-1]&SCOREMASK; final int scoreFromDiag_DEL=packed[MODE_MS][row][col-1]&SCOREMASK; final int scoreFromDel_DEL=packed[MODE_DEL][row][col-1]&SCOREMASK; final int scoreFromDiag_INS=packed[MODE_MS][row-1][col]&SCOREMASK; final int scoreFromIns_INS=packed[MODE_INS][row-1][col]&SCOREMASK; // if(scoreFromDiag_MS=scoreD && scoreMS>=scoreI){ score=scoreMS; time=(prevMatch ? streak+1 : 1); prevState=MODE_MS; }else if(scoreD>=scoreI){ score=scoreD; time=1; prevState=MODE_DEL; }else{ score=scoreI; time=1; prevState=MODE_INS; } // if(time>MAX_TIME){time=MAX_TIME-MASK5;} // assert(score>=MINoff_SCORE) : "Score overflow - use MSA2 instead"; // assert(score<=MAXoff_SCORE) : "Score overflow - use MSA2 instead"; //// packed[MODE_MS][row][col]=(score|prevState|time); // packed[MODE_MS][row][col]=(score|time); // assert((score&SCOREMASK)==score); //// assert((prevState&MODEMASK)==prevState); // assert((time&TIMEMASK)==time); }else{ int scoreMS; if(ref1!='N' && call1!='N'){ scoreMS=scoreFromDiag_MS+(prevMatch ? (streak<=1 ? POINTSoff_SUBR : POINTSoff_SUB) : (streak==0 ? POINTSoff_SUB : streak=scoreD && scoreMS>=scoreI){ score=scoreMS; time=(prevMatch ? 1 : streak+1); // time=(prevMatch ? (streak==1 ? 3 : 1) : streak+1); prevState=MODE_MS; }else if(scoreD>=scoreI){ score=scoreD; time=1; prevState=MODE_DEL; }else{ score=scoreI; time=1; prevState=MODE_INS; } // if(time>MAX_TIME){time=MAX_TIME-MASK5;} // assert(score>=MINoff_SCORE) : "Score overflow - use MSA2 instead"; // assert(score<=MAXoff_SCORE) : "Score overflow - use MSA2 instead"; //// packed[MODE_MS][row][col]=(score|prevState|time); // packed[MODE_MS][row][col]=(score|time); // assert((score&SCOREMASK)==score); //// assert((prevState&MODEMASK)==prevState); // assert((time&TIMEMASK)==time); } final int limit2; if(delNeeded>0){ limit2=limit-delPenalty; }else if(insNeeded>0){ limit2=limit-insPenalty; }else{ limit2=limit; } assert(limit2>=limit); if(verbose2){System.err.println("MS: \tlimit2="+(limit2>>SCOREOFFSET)+"\t, score="+(score>>SCOREOFFSET));} if(score>=limit2){ maxGoodCol=col; if(minGoodCol<0){minGoodCol=col;} }else{ score=subfloor; } if(time>MAX_TIME){time=MAX_TIME-MASK5;} assert(score>=MINoff_SCORE || score==BADoff) : "Score overflow - use MSA2 instead"; assert(score<=MAXoff_SCORE) : "Score overflow - use MSA2 instead"; // packed[MODE_MS][row][col]=(score|prevState|time); packed[MODE_MS][row][col]=(score|time); assert((score&SCOREMASK)==score); // assert((prevState&MODEMASK)==prevState); assert((time&TIMEMASK)==time); } } if((scoreFromDiag_DEL<=limit && scoreFromDel_DEL<=limit) || rowBARRIER_D2){ // assert((scoreFromDiag_DEL<=limit && scoreFromDel_DEL<=limit)) : scoreFromDiag_DEL+", "+row; packed[MODE_DEL][row][col]=subfloor; }else{//Calculate DEL score final int streak=packed[MODE_DEL][row][col-1]&TIMEMASK; int scoreMS=scoreFromDiag_DEL+POINTSoff_DEL; int scoreD=scoreFromDel_DEL+(streak==0 ? POINTSoff_DEL : streak=scoreD){ score=scoreMS; time=1; prevState=MODE_MS; }else{ score=scoreD; time=streak+1; prevState=MODE_DEL; } final int limit2; if(insNeeded>0){ limit2=limit-insPenalty; }else if(delNeeded>0){ limit2=limit-calcDelScoreOffset(time+delNeeded)+calcDelScoreOffset(time); }else{ limit2=limit; } assert(limit2>=limit); if(verbose2){System.err.println("DEL: \tlimit2="+(limit2>>SCOREOFFSET)+"\t, score="+(score>>SCOREOFFSET));} if(score>=limit2){ maxGoodCol=col; if(minGoodCol<0){minGoodCol=col;} }else{ score=subfloor; } if(time>MAX_TIME){time=MAX_TIME-MASK5;} assert(score>=MINoff_SCORE || score==BADoff) : "Score overflow - use MSA2 instead"; assert(score<=MAXoff_SCORE) : "Score overflow - use MSA2 instead"; // packed[MODE_DEL][row][col]=(score|prevState|time); packed[MODE_DEL][row][col]=(score|time); assert((score&SCOREMASK)==score); // assert((prevState&MODEMASK)==prevState); assert((time&TIMEMASK)==time); } // if(gap || (scoreFromDiag_INS<=limit && scoreFromIns_INS<=limit) || colBARRIER_I2){ if(gap || (scoreFromDiag_INS<=limit && scoreFromIns_INS<=limit) || rowBARRIER_I2){ packed[MODE_INS][row][col]=subfloor; }else{//Calculate INS score final int streak=packed[MODE_INS][row-1][col]&TIMEMASK; int scoreMS=scoreFromDiag_INS+POINTSoff_INS; // int scoreD=scoreFromDel+POINTSoff_INS; int scoreI=scoreFromIns_INS+(streak==0 ? POINTSoff_INS : streak=colStop){ if(col>colStop && (maxGoodCol0)){break;} if(row>1){ packed[MODE_MS][row-1][col+1]=subfloor; packed[MODE_INS][row-1][col+1]=subfloor; packed[MODE_DEL][row-1][col+1]=subfloor; } } } } int maxCol=-1; int maxState=-1; int maxScore=Integer.MIN_VALUE; for(int state=0; statemaxScore){ maxScore=x; maxCol=col; maxState=state; } } } assert(maxScore>=BADoff); // if(maxScore==BADoff){ // return null; // } // if(maxScore>=SCOREOFFSET; // System.err.println("Returning "+rows+", "+maxCol+", "+maxState+", "+maxScore); return new int[] {rows, maxCol, maxState, maxScore}; } @Override public final int[] fillUnlimited(byte[] read, byte[] ref, int refStartLoc, int refEndLoc, int[] gaps){ if(gaps==null){return fillUnlimited(read, ref, refStartLoc, refEndLoc);} else{ byte[] gref=makeGref(ref, gaps, refStartLoc, refEndLoc); assert(gref!=null) : "Excessively long read:\n"+new String(read); return fillUnlimited(read, gref, 0, greflimit); } } /** return new int[] {rows, maxC, maxS, max}; * Does not require a min score (ie, same as old method) */ private final int[] fillUnlimited(byte[] read, byte[] ref, int refStartLoc, int refEndLoc){ rows=read.length; columns=refEndLoc-refStartLoc+1; final int maxGain=(read.length-1)*POINTSoff_MATCH2+POINTSoff_MATCH; final int subfloor=0-2*maxGain; assert(subfloor>BADoff && subfloor*2>BADoff) : (read.length-1)+", "+maxGain+", "+subfloor+", "+(subfloor*2)+", "+BADoff+"\n" +rows+", "+columns+", "+POINTSoff_MATCH2+", "+SCOREOFFSET+"\n"+new String(read)+"\n"; //TODO: Actually, it needs to be substantially more. // final int BARRIER_I2=columns-BARRIER_I1; final int BARRIER_I2=rows-BARRIER_I1; final int BARRIER_D2=rows-BARRIER_D1; //temporary, for finding a bug if(rows>maxRows || columns>maxColumns){ throw new RuntimeException("rows="+rows+", maxRows="+maxRows+", cols="+columns+", maxCols="+maxColumns+"\n"+new String(read)+"\n"); } assert(rows<=maxRows) : "Check that values are in-bounds before calling this function: "+rows+", "+maxRows; assert(columns<=maxColumns) : "Check that values are in-bounds before calling this function: "+columns+", "+maxColumns; assert(refStartLoc>=0) : "Check that values are in-bounds before calling this function: "+refStartLoc; assert(refEndLoc=scoreD && scoreMS>=scoreI){ score=scoreMS; time=(prevMatch ? streak+1 : 1); // prevState=MODE_MS; }else if(scoreD>=scoreI){ score=scoreD; time=1; // prevState=MODE_DEL; }else{ score=scoreI; time=1; // prevState=MODE_INS; } if(time>MAX_TIME){time=MAX_TIME-MASK5;} assert(score>=MINoff_SCORE) : "Score overflow - use MSA2 instead"; assert(score<=MAXoff_SCORE) : "Score overflow - use MSA2 instead"; // packed[MODE_MS][row][col]=(score|prevState|time); packed[MODE_MS][row][col]=(score|time); assert((score&SCOREMASK)==score); // assert((prevState&MODEMASK)==prevState); assert((time&TIMEMASK)==time); }else{ int scoreMS; if(ref1!='N' && call1!='N'){ scoreMS=scoreFromDiag+(prevMatch ? (streak<=1 ? POINTSoff_SUBR : POINTSoff_SUB) : (streak==0 ? POINTSoff_SUB : streak=scoreD && scoreMS>=scoreI){ score=scoreMS; time=(prevMatch ? 1 : streak+1); // time=(prevMatch ? (streak==1 ? 3 : 1) : streak+1); prevState=MODE_MS; }else if(scoreD>=scoreI){ score=scoreD; time=1; prevState=MODE_DEL; }else{ score=scoreI; time=1; prevState=MODE_INS; } if(time>MAX_TIME){time=MAX_TIME-MASK5;} assert(score>=MINoff_SCORE) : "Score overflow - use MSA2 instead"; assert(score<=MAXoff_SCORE) : "Score overflow - use MSA2 instead"; // packed[MODE_MS][row][col]=(score|prevState|time); packed[MODE_MS][row][col]=(score|time); assert((score&SCOREMASK)==score); // assert((prevState&MODEMASK)==prevState); assert((time&TIMEMASK)==time); } } } if(rowBARRIER_D2){ packed[MODE_DEL][row][col]=subfloor; }else{//Calculate DEL score final int streak=packed[MODE_DEL][row][col-1]&TIMEMASK; final int scoreFromDiag=packed[MODE_MS][row][col-1]&SCOREMASK; final int scoreFromDel=packed[MODE_DEL][row][col-1]&SCOREMASK; int scoreMS=scoreFromDiag+POINTSoff_DEL; int scoreD=scoreFromDel+(streak==0 ? POINTSoff_DEL : streak=scoreD){ score=scoreMS; time=1; prevState=MODE_MS; }else{ score=scoreD; time=streak+1; prevState=MODE_DEL; } if(time>MAX_TIME){time=MAX_TIME-MASK5;} assert(score>=MINoff_SCORE) : "Score overflow - use MSA2 instead"; assert(score<=MAXoff_SCORE) : "Score overflow - use MSA2 instead"; // packed[MODE_DEL][row][col]=(score|prevState|time); packed[MODE_DEL][row][col]=(score|time); assert((score&SCOREMASK)==score); // assert((prevState&MODEMASK)==prevState); assert((time&TIMEMASK)==time); } //Calculate INS score // if(gap || colBARRIER_I2){ if(gap || rowBARRIER_I2){ packed[MODE_INS][row][col]=subfloor; }else{//Calculate INS score final int streak=packed[MODE_INS][row-1][col]&TIMEMASK; final int scoreFromDiag=packed[MODE_MS][row-1][col]&SCOREMASK; final int scoreFromIns=packed[MODE_INS][row-1][col]&SCOREMASK; int scoreMS=scoreFromDiag+POINTSoff_INS; // int scoreD=scoreFromDel+POINTSoff_INS; int scoreI=scoreFromIns+(streak==0 ? POINTSoff_INS : streakmaxScore){ maxScore=x; maxCol=col; maxState=state; } } } maxScore>>=SCOREOFFSET; // System.err.println("Returning "+rows+", "+maxCol+", "+maxState+", "+maxScore); return new int[] {rows, maxCol, maxState, maxScore}; } @Deprecated /** return new int[] {rows, maxC, maxS, max}; */ public final int[] fillQ(byte[] read, byte[] ref, byte[] baseScores, int refStartLoc, int refEndLoc){ assert(false) : "Needs to be redone to work with score cutoffs. Not difficult."; rows=read.length; columns=refEndLoc-refStartLoc+1; assert(rows<=maxRows) : "Check that values are in-bounds before calling this function: "+rows+", "+maxRows; assert(columns<=maxColumns) : "Check that values are in-bounds before calling this function: "+columns+", "+maxColumns; assert(refStartLoc>=0) : "Check that values are in-bounds before calling this function: "+refStartLoc; assert(refEndLoc=scoreD && scoreMS>=scoreI){ score=scoreMS; time=(prevMatch ? streak+1 : 1); // prevState=MODE_MS; }else if(scoreD>=scoreI){ score=scoreD; time=1; // prevState=MODE_DEL; }else{ score=scoreI; time=1; // prevState=MODE_INS; } score+=(((int)baseScores[row-1])<MAX_TIME){time=MAX_TIME-MASK5;} assert(score>=MINoff_SCORE) : "Score overflow - use MSA2 instead"; assert(score<=MAXoff_SCORE) : "Score overflow - use MSA2 instead"; // packed[MODE_MS][row][col]=(score|prevState|time); packed[MODE_MS][row][col]=(score|time); assert((score&SCOREMASK)==score); // assert((prevState&MODEMASK)==prevState); assert((time&TIMEMASK)==time); }else{ int scoreMS=scoreFromDiag+(prevMatch ? (streak<=1 ? POINTSoff_SUBR : POINTSoff_SUB) : (streak==0 ? POINTSoff_SUB : streak=scoreD && scoreMS>=scoreI){ score=scoreMS; time=(prevMatch ? 1 : streak+1); // time=(prevMatch ? (streak==1 ? 3 : 1) : streak+1); prevState=MODE_MS; }else if(scoreD>=scoreI){ score=scoreD; time=1; prevState=MODE_DEL; }else{ score=scoreI; time=1; prevState=MODE_INS; } if(time>MAX_TIME){time=MAX_TIME-MASK5;} assert(score>=MINoff_SCORE) : "Score overflow - use MSA2 instead"; assert(score<=MAXoff_SCORE) : "Score overflow - use MSA2 instead"; // packed[MODE_MS][row][col]=(score|prevState|time); packed[MODE_MS][row][col]=(score|time); assert((score&SCOREMASK)==score); // assert((prevState&MODEMASK)==prevState); assert((time&TIMEMASK)==time); } } } {//Calculate DEL score final int streak=packed[MODE_DEL][row][col-1]&TIMEMASK; final int scoreFromDiag=packed[MODE_MS][row][col-1]&SCOREMASK; final int scoreFromDel=packed[MODE_DEL][row][col-1]&SCOREMASK; int scoreMS=scoreFromDiag+POINTSoff_DEL; int scoreD=scoreFromDel+(streak==0 ? POINTSoff_DEL : streak=scoreD){ score=scoreMS; time=1; prevState=MODE_MS; }else{ score=scoreD; time=streak+1; prevState=MODE_DEL; } if(time>MAX_TIME){time=MAX_TIME-MASK5;} assert(score>=MINoff_SCORE) : "Score overflow - use MSA2 instead"; assert(score<=MAXoff_SCORE) : "Score overflow - use MSA2 instead"; // packed[MODE_DEL][row][col]=(score|prevState|time); packed[MODE_DEL][row][col]=(score|time); assert((score&SCOREMASK)==score); // assert((prevState&MODEMASK)==prevState); assert((time&TIMEMASK)==time); } {//Calculate INS score final int streak=packed[MODE_INS][row-1][col]&TIMEMASK; final int scoreFromDiag=packed[MODE_MS][row-1][col]&SCOREMASK; final int scoreFromIns=packed[MODE_INS][row-1][col]&SCOREMASK; int scoreMS=scoreFromDiag+POINTSoff_INS; // int scoreD=scoreFromDel+POINTSoff_INS; int scoreI=scoreFromIns+(streak==0 ? POINTSoff_INS : streakmaxScore){ maxScore=x; maxCol=col; maxState=state; } } } maxScore>>=SCOREOFFSET; // System.err.println("Returning "+rows+", "+maxCol+", "+maxState+", "+maxScore); return new int[] {rows, maxCol, maxState, maxScore}; } @Override /** @return {score, bestRefStart, bestRefStop} */ /** Generates the match string */ public final byte[] traceback(byte[] read, byte[] ref, int refStartLoc, int refEndLoc, int row, int col, int state, boolean gapped){ if(gapped){ final byte[] gref=grefbuffer; int gstart=translateToGappedCoordinate(refStartLoc, gref); int gstop=translateToGappedCoordinate(refEndLoc, gref); byte[] out=traceback2(read, gref, gstart, gstop, row, col, state); return out; }else{ return traceback2(read, ref, refStartLoc, refEndLoc, row, col, state); } } @Override /** Generates the match string */ public final byte[] traceback2(byte[] read, byte[] ref, int refStartLoc, int refEndLoc, int row, int col, int state){ // assert(false); assert(refStartLoc<=refEndLoc) : refStartLoc+", "+refEndLoc; assert(row==rows); byte[] out=new byte[row+col-1]; //TODO if an out of bound crash occurs, try removing the "-1". int outPos=0; int gaps=0; if(state==MODE_INS){ //TODO ? Maybe not needed. } while(row>0 && col>0){ // byte prev0=(byte)(packed[state][row][col]&MODEMASK); final int time=packed[state][row][col]&TIMEMASK; final byte prev; // System.err.println("state="+state+", prev="+prev+", row="+row+", col="+col+", score="+scores[state][row][col]); if(state==MODE_MS){ if(time>1){prev=(byte)state;} else{ final int scoreFromDiag=packed[MODE_MS][row-1][col-1]&SCOREMASK; final int scoreFromDel=packed[MODE_DEL][row-1][col-1]&SCOREMASK; final int scoreFromIns=packed[MODE_INS][row-1][col-1]&SCOREMASK; if(scoreFromDiag>=scoreFromDel && scoreFromDiag>=scoreFromIns){prev=MODE_MS;} else if(scoreFromDel>=scoreFromIns){prev=MODE_DEL;} else{prev=MODE_INS;} } byte c=read[row-1]; byte r=ref[refStartLoc+col-1]; if(c==r){ out[outPos]='m'; }else{ if(!AminoAcid.isFullyDefined(c, colorspace)){ out[outPos]='N'; }else if(!AminoAcid.isFullyDefined(r, colorspace)){ // out[outPos]='X'; out[outPos]='N'; }else{ out[outPos]='S'; } } row--; col--; }else if(state==MODE_DEL){ if(time>1){prev=(byte)state;} else{ final int scoreFromDiag=packed[MODE_MS][row][col-1]&SCOREMASK; final int scoreFromDel=packed[MODE_DEL][row][col-1]&SCOREMASK; if(scoreFromDiag>=scoreFromDel){prev=MODE_MS;} else{prev=MODE_DEL;} } byte r=ref[refStartLoc+col-1]; if(r==GAPC){ out[outPos]='-'; gaps++; }else{ out[outPos]='D'; } col--; }else{ if(time>1){prev=(byte)state;} else{ final int scoreFromDiag=packed[MODE_MS][row-1][col]&SCOREMASK; final int scoreFromIns=packed[MODE_INS][row-1][col]&SCOREMASK; if(scoreFromDiag>=scoreFromIns){prev=MODE_MS;} else{prev=MODE_INS;} } assert(state==MODE_INS) : state; if(col==0){ out[outPos]='X'; }else if(col>=columns){ out[outPos]='Y'; }else{ out[outPos]='I'; } row--; } // assert(prev==prev0); state=prev; outPos++; } assert(row==0 || col==0); if(col!=row){ while(row>0){ out[outPos]='X'; outPos++; row--; col--; } if(col>0){ //do nothing } } //Shrink and reverse the string byte[] out2=new byte[outPos]; for(int i=0; i "+translateFromGappedCoordinate(out[1], gref)+" -> "+ translateToGappedCoordinate(translateFromGappedCoordinate(out[1], gref), gref); assert(out[2]==translateToGappedCoordinate(translateFromGappedCoordinate(out[2], gref), gref)); out[1]=translateFromGappedCoordinate(out[1], gref); out[2]=translateFromGappedCoordinate(out[2], gref); if(verbose){System.err.println("returning score "+Arrays.toString(out));} return out; }else{ return score2(read, ref, refStartLoc, refEndLoc, maxRow, maxCol, maxState); } } @Override /** @return {score, bestRefStart, bestRefStop, maxRow, maxCol, maxState},
* or {score, bestRefStart, bestRefStop, maxRow, maxCol, maxState, padLeft, padRight}
* if more padding is needed */ public final int[] score2(final byte[] read, final byte[] ref, final int refStartLoc, final int refEndLoc, final int maxRow, final int maxCol, final int maxState){ int row=maxRow; int col=maxCol; int state=maxState; assert(maxState>=0 && maxState=0 && maxRow=0 && maxColdifC){ score+=POINTSoff_NOREF; difR--; } row+=difR; col+=difR; } assert(refStartLoc<=refEndLoc); assert(row==rows); final int bestRefStop=refStartLoc+col-1; while(row>0 && col>0){ // System.err.println("state="+state+", row="+row+", col="+col); // byte prev0=(byte)(packed[state][row][col]&MODEMASK); final int time=packed[state][row][col]&TIMEMASK; final byte prev; if(state==MODE_MS){ if(time>1){prev=(byte)state;} else{ final int scoreFromDiag=packed[MODE_MS][row-1][col-1]&SCOREMASK; final int scoreFromDel=packed[MODE_DEL][row-1][col-1]&SCOREMASK; final int scoreFromIns=packed[MODE_INS][row-1][col-1]&SCOREMASK; if(scoreFromDiag>=scoreFromDel && scoreFromDiag>=scoreFromIns){prev=MODE_MS;} else if(scoreFromDel>=scoreFromIns){prev=MODE_DEL;} else{prev=MODE_INS;} } row--; col--; }else if(state==MODE_DEL){ if(time>1){prev=(byte)state;} else{ final int scoreFromDiag=packed[MODE_MS][row][col-1]&SCOREMASK; final int scoreFromDel=packed[MODE_DEL][row][col-1]&SCOREMASK; if(scoreFromDiag>=scoreFromDel){prev=MODE_MS;} else{prev=MODE_DEL;} } col--; }else{ assert(state==MODE_INS); if(time>1){prev=(byte)state;} else{ final int scoreFromDiag=packed[MODE_MS][row-1][col]&SCOREMASK; final int scoreFromIns=packed[MODE_INS][row-1][col]&SCOREMASK; if(scoreFromDiag>=scoreFromIns){prev=MODE_MS;} else{prev=MODE_INS;} } row--; } if(col<0){ System.err.println(row); break; //prevents an out of bounds access } // assert(prev==prev0); state=prev; // System.err.println("state2="+state+", row2="+row+", col2="+col+"\n"); } // assert(false) : row+", "+col; if(row>col){ col-=row; } final int bestRefStart=refStartLoc+col; score>>=SCOREOFFSET; int[] rvec; if(bestRefStartrefEndLoc){ //Suggest extra padding in cases of overflow int padLeft=Tools.max(0, refStartLoc-bestRefStart); int padRight=Tools.max(0, bestRefStop-refEndLoc); rvec=new int[] {score, bestRefStart, bestRefStop, maxRow, maxCol, maxState, padLeft, padRight}; }else{ rvec=new int[] {score, bestRefStart, bestRefStop, maxRow, maxCol, maxState}; } return rvec; } /** * Fills grefbuffer * @param ref * @param a * @param b * @param gaps * @return gref */ private final byte[] makeGref(byte[] ref, int[] gaps, int refStartLoc, int refEndLoc){ assert(gaps!=null && gaps.length>0); assert(refStartLoc<=gaps[0]) : refStartLoc+", "+refEndLoc+", "+Arrays.toString(gaps); assert(refEndLoc>=gaps[gaps.length-1]); final int g0_old=gaps[0]; final int gN_old=gaps[gaps.length-1]; gaps[0]=Tools.min(gaps[0], refStartLoc); gaps[gaps.length-1]=Tools.max(gN_old, refEndLoc); grefRefOrigin=gaps[0]; if(verbose){System.err.println("\ngaps2: "+Arrays.toString(gaps));} // grefRefOrigin=Tools.min(gaps[0], refStartLoc); // //This block is no longer needed since the array is preallocated. // int len=0; // final int gb2=GAPBUFFER*2; // for(int i=0; iy); // int gap=z-y-1; // if(gapy); int gap=z-y-1; assert(gap>=MINGAP) : gap+"\t"+MINGAP; if(gapgref.length){ System.err.println("gref buffer overflow: "+lim+" > "+gref.length); return null; } for(int i=greflimit, r=refEndLoc+1; i "+j);} return j; } j+=(c==GAPC ? GAPLEN : 1); // if(c!=GAPC){j++;} // else{j+=GAPLEN;} } System.err.println(grefRefOrigin); System.err.println(point); System.err.println(new String(gref)); throw new RuntimeException("Out of bounds."); } private final int translateToGappedCoordinate(int point, byte[] gref){ if(verbose){System.err.println("translateToGappedCoordinate("+point+"), gro="+grefRefOrigin+", grl="+greflimit);} if(point<=grefRefOrigin){return point-grefRefOrigin;} for(int i=0, j=grefRefOrigin; i "+i);} return i; } j+=(c==GAPC ? GAPLEN : 1); // if(c!=GAPC){j++;} // else{j+=GAPLEN;} } System.err.println(grefRefOrigin); System.err.println(point); System.err.println(new String(gref)); throw new RuntimeException("Out of bounds."); } /** Calculates score based on an array from Index */ private final int calcAffineScore(int[] locArray){ int score=0; int lastLoc=-2; //Last true location int lastValue=-1; int timeInMode=0; for(int i=0; i0){//match if(loc==lastValue){//contiguous match score+=POINTS_MATCH2; }else if(loc==lastLoc || lastLoc<0){//match after a sub, or first match score+=POINTS_MATCH; }else if(loc=0); score+=POINTS_MATCH; score+=POINTS_DEL; int dif=lastLoc-loc+1; if(dif>MINGAP){ int rem=dif%GAPLEN; int div=(dif-GAPBUFFER2)/GAPLEN; score+=(div*POINTS_GAP); assert(rem+GAPBUFFER2LIMIT_FOR_COST_4); //and probably LIMIT_FOR_COST_5 // assert(false) : div; } if(dif>LIMIT_FOR_COST_5){ score+=((dif-LIMIT_FOR_COST_5+MASK5)/TIMESLIP)*POINTS_DEL5; dif=LIMIT_FOR_COST_5; } if(dif>LIMIT_FOR_COST_4){ score+=(dif-LIMIT_FOR_COST_4)*POINTS_DEL4; dif=LIMIT_FOR_COST_4; } if(dif>LIMIT_FOR_COST_3){ score+=(dif-LIMIT_FOR_COST_3)*POINTS_DEL3; dif=LIMIT_FOR_COST_3; } if(dif>1){ score+=(dif-1)*POINTS_DEL2; } timeInMode=1; }else if(loc>lastLoc){//insertion assert(lastLoc>=0); score+=POINTS_MATCH; score+=POINTS_INS; int dif=Tools.min(loc-lastLoc+1, 5); assert(dif>0); if(dif>LIMIT_FOR_COST_4){ score+=(dif-LIMIT_FOR_COST_4)*POINTS_INS4; dif=LIMIT_FOR_COST_4; } if(dif>LIMIT_FOR_COST_3){ score+=(dif-LIMIT_FOR_COST_3)*POINTS_INS3; dif=LIMIT_FOR_COST_3; } if(dif>1){ score+=(dif-1)*POINTS_INS2; } timeInMode=1; }else{ assert(false); } lastLoc=loc; }else{//substitution if(lastValue<0 && timeInMode>0){//contiguous if(timeInMode0){//match if(loc==lastValue){//contiguous match score+=(POINTS_MATCH2+baseScores[i]); }else if(loc==lastLoc || lastLoc<0){//match after a sub, or first match score+=(POINTS_MATCH+baseScores[i]); }else if(loc=0); score+=(POINTS_MATCH+baseScores[i]); score+=POINTS_DEL; int dif=lastLoc-loc+1; if(dif>MINGAP){ int rem=dif%GAPLEN; int div=(dif-GAPBUFFER2)/GAPLEN; score+=(div*POINTS_GAP); assert(rem+GAPBUFFER2LIMIT_FOR_COST_4); //and probably LIMIT_FOR_COST_5 // assert(false) : div; } if(dif>LIMIT_FOR_COST_5){ score+=((dif-LIMIT_FOR_COST_5+MASK5)/TIMESLIP)*POINTS_DEL5; dif=LIMIT_FOR_COST_5; } if(dif>LIMIT_FOR_COST_4){ score+=(dif-LIMIT_FOR_COST_4)*POINTS_DEL4; dif=LIMIT_FOR_COST_4; } if(dif>LIMIT_FOR_COST_3){ score+=(dif-LIMIT_FOR_COST_3)*POINTS_DEL3; dif=LIMIT_FOR_COST_3; } if(dif>1){ score+=(dif-1)*POINTS_DEL2; } timeInMode=1; }else if(loc>lastLoc){//insertion assert(lastLoc>=0); score+=(POINTS_MATCH+baseScores[i]); score+=POINTS_INS; int dif=Tools.min(loc-lastLoc+1, 5); assert(dif>0); if(dif>LIMIT_FOR_COST_4){ score+=(dif-LIMIT_FOR_COST_4)*POINTS_INS4; dif=LIMIT_FOR_COST_4; } if(dif>LIMIT_FOR_COST_3){ score+=(dif-LIMIT_FOR_COST_3)*POINTS_INS3; dif=LIMIT_FOR_COST_3; } if(dif>1){ score+=(dif-1)*POINTS_INS2; } timeInMode=1; }else{ assert(false); } lastLoc=loc; }else if(loc==-1){//substitution if(lastValue<0 && timeInMode>0){//contiguous if(timeInMode1) : minContig; int contig=0; int maxContig=0; int score=0; int lastLoc=-3; //Last true location int lastValue=-1; int timeInMode=0; for(int i=0; i0){//match if(loc==lastValue){//contiguous match contig++; score+=(POINTS_MATCH2+baseScores[i]); }else if(loc==lastLoc || lastLoc<0){//match after a sub, or first match maxContig=Tools.max(maxContig, contig); contig=1; score+=(POINTS_MATCH+baseScores[i]); }else if(loc=0); score+=(POINTS_MATCH+baseScores[i]); score+=POINTS_DEL; int dif=lastLoc-loc+1; if(dif>MINGAP){ int rem=dif%GAPLEN; int div=(dif-GAPBUFFER2)/GAPLEN; score+=(div*POINTS_GAP); assert(rem+GAPBUFFER2LIMIT_FOR_COST_4); //and probably LIMIT_FOR_COST_5 // assert(false) : div; } if(dif>LIMIT_FOR_COST_5){ score+=((dif-LIMIT_FOR_COST_5+MASK5)/TIMESLIP)*POINTS_DEL5; dif=LIMIT_FOR_COST_5; } if(dif>LIMIT_FOR_COST_4){ score+=(dif-LIMIT_FOR_COST_4)*POINTS_DEL4; dif=LIMIT_FOR_COST_4; } if(dif>LIMIT_FOR_COST_3){ score+=(dif-LIMIT_FOR_COST_3)*POINTS_DEL3; dif=LIMIT_FOR_COST_3; } if(dif>1){ score+=(dif-1)*POINTS_DEL2; } timeInMode=1; }else if(loc>lastLoc){//insertion maxContig=Tools.max(maxContig, contig); contig=0; assert(lastLoc>=0); score+=(POINTS_MATCH+baseScores[i]); score+=POINTS_INS; int dif=Tools.min(loc-lastLoc+1, 5); assert(dif>0); if(dif>LIMIT_FOR_COST_4){ score+=(dif-LIMIT_FOR_COST_4)*POINTS_INS4; dif=LIMIT_FOR_COST_4; } if(dif>LIMIT_FOR_COST_3){ score+=(dif-LIMIT_FOR_COST_3)*POINTS_INS3; dif=LIMIT_FOR_COST_3; } if(dif>1){ score+=(dif-1)*POINTS_INS2; } timeInMode=1; }else{ assert(false); } lastLoc=loc; }else if(loc==-1){//substitution if(lastValue<0 && timeInMode>0){//contiguous if(timeInModeref.length){ int dif=(refStop-ref.length); readStop-=dif; score+=POINTS_NOREF*dif; norefs+=dif; } // if(refStart<0 || refStart+read.length>ref.length){return -99999;} //No longer needed. for(int i=readStart; i=ref.length) ? (byte)'N' : ref[j]; if(c=='N' || r=='N'){match[i]='N';} else if(c==r){match[i]='m';} else{match[i]='S';} } return match; } @Override public final int scoreNoIndels(byte[] read, byte[] ref, byte[] baseScores, final int refStart){ return scoreNoIndels(read, ref, baseScores, refStart, null); } @Override public final int scoreNoIndels(byte[] read, byte[] ref, byte[] baseScores, final int refStart, SiteScore ss){ int score=0; int mode=-1; int timeInMode=0; int norefs=0; //This block handles cases where the read runs outside the reference //Of course, padding the reference with 'N' would be better, but... int readStart=0; int readStop=read.length; final int refStop=refStart+read.length; boolean semiperfect=true; if(refStart<0){ readStart=0-refStart; score+=POINTS_NOREF*readStart; norefs+=readStart; } if(refStop>ref.length){ int dif=(refStop-ref.length); readStop-=dif; score+=POINTS_NOREF*dif; norefs+=dif; } // if(refStart<0 || refStart+read.length>ref.length){return -99999;} //No longer needed. for(int i=readStart; iref.length){return -99999;} if(refStart<0){ readStart=0-refStart; score+=POINTS_NOREF*readStart; } if(refStop>ref.length){ int dif=(refStop-ref.length); System.err.println(new String(read)+"\ndif="+dif+", ref.length="+ref.length+", refStop="+refStop); readStop-=dif; score+=POINTS_NOREF*dif; } assert(refStart+readStop<=ref.length) : "readStart="+readStart+", readStop="+readStop+ ", refStart="+refStart+", refStop="+refStop+", ref.length="+ref.length+", read.length="+read.length; assert(matchReturn!=null); assert(matchReturn.length==1); if(matchReturn[0]==null || matchReturn[0].length!=read.length){ assert(matchReturn[0]==null || matchReturn[0].lengthref.length){return -99999;} if(refStart<0){ readStart=0-refStart; score+=POINTS_NOREF*readStart; } if(refStop>ref.length){ int dif=(refStop-ref.length); System.err.println(new String(read)+"\ndif="+dif+", ref.length="+ref.length+", refStop="+refStop); readStop-=dif; score+=POINTS_NOREF*dif; } assert(refStart+readStop<=ref.length) : "readStart="+readStart+", readStop="+readStop+ ", refStart="+refStart+", refStop="+refStop+", ref.length="+ref.length+", read.length="+read.length; assert(matchReturn!=null); assert(matchReturn.length==1); if(matchReturn[0]==null || matchReturn[0].length!=read.length){ assert(matchReturn[0]==null || matchReturn[0].length>SCOREOFFSET).append(",");} return sb; } public static float minIdToMinRatio(double minid){ if(minid>1){minid=minid/100;} assert(minid>0 && minid<=1) : "Min identity should be between 0 and 1. Values above 1 will be assumed to be percent and divided by 100."; double matchdif=POINTS_MATCH-POINTS_MATCH2; double match=POINTS_MATCH2; double sub=-POINTS_MATCH2+0.5*(matchdif+POINTS_SUB)+0.5*POINTS_SUB2; double del=0.1*(matchdif+POINTS_DEL)+0.2*POINTS_DEL2+0.4*POINTS_DEL3+0.3*POINTS_DEL4; double ins=-POINTS_MATCH2+0.4*(matchdif+POINTS_INS)+0.3*(POINTS_INS2)+0.3*(POINTS_INS3); double badAvg=.7*sub+.2*del+.1*ins; double badFraction=1-minid; double minratio=(match+badFraction*badAvg)/match; assert(minratio<=1); minratio=Tools.max(0.1, minratio); return (float)minratio; } public static final int TIMEBITS=11; public static final int SCOREBITS=32-TIMEBITS; public static final int MAX_TIME=((1<