package align2; import java.util.Arrays; import stream.Read; import stream.SiteScore; import dna.AminoAcid; import dna.ChromosomeArray; import dna.Data; import dna.Gene; /** * @author Brian Bushnell * @date Jun 20, 2013 * */ public abstract class MSA { public static final float minIdToMinRatio(double minid, String classname){ if("MultiStateAligner9ts".equalsIgnoreCase(classname)){ return MultiStateAligner9ts.minIdToMinRatio(minid); }else if("MultiStateAligner10ts".equalsIgnoreCase(classname)){ return MultiStateAligner10ts.minIdToMinRatio(minid); }else if("MultiStateAligner11ts".equalsIgnoreCase(classname)){ return MultiStateAligner11ts.minIdToMinRatio(minid); }else if("MultiStateAligner9PacBio".equalsIgnoreCase(classname)){ return MultiStateAligner9PacBio.minIdToMinRatio(minid); }else{ assert(false) : "Unhandled MSA type: "+classname; return MultiStateAligner11ts.minIdToMinRatio(minid); } } public static final MSA makeMSA(int maxRows_, int maxColumns_, boolean colorspace_, String classname){ if("MultiStateAligner9ts".equalsIgnoreCase(classname)){ return new MultiStateAligner9ts(maxRows_, maxColumns_, colorspace_); }else if("MultiStateAligner10ts".equalsIgnoreCase(classname)){ return new MultiStateAligner10ts(maxRows_, maxColumns_, colorspace_); }else if("MultiStateAligner11ts".equalsIgnoreCase(classname)){ return new MultiStateAligner11ts(maxRows_, maxColumns_, colorspace_); }else if("MultiStateAligner9PacBio".equalsIgnoreCase(classname)){ return new MultiStateAligner9PacBio(maxRows_, maxColumns_, colorspace_); }else{ assert(false) : "Unhandled MSA type: "+classname; return new MultiStateAligner11ts(maxRows_, maxColumns_, colorspace_); } } public MSA(int maxRows_, int maxColumns_, boolean colorspace_){ maxRows=maxRows_; maxColumns=maxColumns_; colorspace=colorspace_; // packed=new int[3][maxRows+1][maxColumns+1]; // grefbuffer=new byte[maxColumns+2]; // // vertLimit=new int[maxRows+1]; // horizLimit=new int[maxColumns+1]; // Arrays.fill(vertLimit, BADoff); // Arrays.fill(horizLimit, BADoff); // //// for(int i=0; i=a); int[] score; if(gaps==null){ if(verbose){ System.err.println("no gaps"); } if(b-a>=maxColumns){ System.err.println("Warning: Max alignment columns exceeded; restricting range. "+(b-a+1)+" > "+maxColumns); assert(false) : refStartLoc+", "+refEndLoc; b=Tools.min(ref.length-1, a+maxColumns-1); } int[] max=fillLimited(read, ref, a, b, minScore, gaps); score=(max==null ? null : score(read, ref, a, b, max[0], max[1], max[2], false)); }else{ if(verbose){System.err.println("\ngaps: "+Arrays.toString(gaps)+"\n"+new String(read)+"\ncoords: "+refStartLoc+", "+refEndLoc);} int[] max=fillLimited(read, ref, a, b, minScore, gaps); if(verbose){System.err.println("max: "+Arrays.toString(max));} // score=(max==null ? null : score(read, grefbuffer, 0, greflimit, max[0], max[1], max[2], true)); score=(max==null ? null : score(read, ref, a, b, max[0], max[1], max[2], true)); } return score; } public final int[] fillAndScoreLimited(byte[] read, SiteScore ss, int thresh, int minScore){ return fillAndScoreLimited(read, ss.chrom, ss.start, ss.stop, thresh, minScore, ss.gaps); } // public final int[] translateScoreFromGappedCoordinate(int[] score) public final int[] fillAndScoreLimited(byte[] read, int chrom, int start, int stop, int thresh, int minScore, int[] gaps){ return fillAndScoreLimited(read, Data.getChromosome(chrom).array, start-thresh, stop+thresh, minScore, gaps); } @Deprecated public final int[] fillAndScoreQ(byte[] read, byte[] ref, int refStartLoc, int refEndLoc, byte[] baseScores){ int a=Tools.max(0, refStartLoc); int b=Tools.min(ref.length-1, refEndLoc); assert(b>=a); if(b-a>=maxColumns){ System.err.println("Warning: Max alignment columns exceeded; restricting range. "+(b-a+1)+" > "+maxColumns); b=Tools.min(ref.length-1, a+maxColumns-1); } int[] max=fillQ(read, ref, baseScores, a, b); // int[] score=score(read, ref, a, b, max[0], max[1], max[2]); // return score; return null; } @Deprecated public final int[] fillAndScoreQ(byte[] read, SiteScore ss, int thresh, byte[] baseScores){ return fillAndScoreQ(read, ss.chrom, ss.start, ss.stop, thresh, baseScores); } @Deprecated public final int[] fillAndScoreQ(byte[] read, int chrom, int start, int stop, int thresh, byte[] baseScores){ return fillAndScoreQ(read, Data.getChromosome(chrom).array, start-thresh, stop+thresh, baseScores); } // public final int scoreNoIndels(byte[] read, SiteScore ss){ // // ChromosomeArray cha=Data.getChromosome(ss.chrom); // final int refStart=ss.start; // // int score=0; // int mode=MODE_START; // int timeInMode=0; // if(refStart<0 || refStart+read.length>cha.maxIndex+1){return -99999;} //TODO: Partial match // // for(int i=0; i"+(char)c+"; rpos="+rpos);} prevMode=mode; prevStreak=current; mode=c; current=1; } } if(current>0){ assert(mode==match[match.length-1]); if(mode=='m'){ if(score<=0){ score=0; lastZeroC=cpos; lastZeroM=match.length-current; lastZeroR=rpos; } int add=calcMatchScore(current); score+=(matchPointsMult*add); // if(prevMode=='N' || prevMode=='R'){score=score+POINTS_MATCH2()-POINTS_MATCH();} //Don't penalize first match after N cpos+=current; rpos+=current; if(score>maxScore){ maxScore=score; startLocC=lastZeroC; startLocM=lastZeroM; startLocR=lastZeroR; stopLocC=cpos-1; stopLocM=match.length-1; stopLocR=rpos-1; } }else if(mode=='S'){ score+=calcSubScore(current); if(prevMode=='N' || prevMode=='R'){score=score+POINTS_SUB2()-POINTS_SUB();} //Don't penalize first sub after N else if(prevMode=='m' && prevStreak<2){score=score+POINTS_SUBR()-POINTS_SUB();} cpos+=current; rpos+=current; }else if(mode=='D'){ score+=calcDelScore(current, true); rpos+=current; }else if(mode=='I'){ score+=calcInsScore(current); cpos+=current; }else if(mode=='C'){ cpos+=current; }else if(mode=='X' || mode=='Y'){ score+=calcInsScore(current); cpos+=current; }else if(mode=='N'){ score+=calcNocallScore(current); cpos+=current; rpos+=current; }else if(mode=='R'){ score+=calcNorefScore(current); cpos+=current; rpos+=current; }else if(mode!=0){ assert(false) : "Unhandled symbol "+mode+"\n"+(char)mode+"\n"+new String(match)+"\n"+new String(bases); } if(verbose){System.err.println("mode "+(char)mode+"->end; rpos="+rpos);} } if(startLocC<0 || stopLocC<0){ assert(false) : "Failed."; return false; } if(verbose){System.err.println("A: r.start="+r.start+", r.stop="+r.stop+"; rpos="+rpos+"; len="+bases.length+"; reflen="+(r.stop-r.start+1));} assert(rpos==r.stop+1) : rpos+"!="+r.start+"\n"+r; if(verbose){System.err.println("B: rpos="+rpos+", startLocR="+startLocR+", stopLocR="+stopLocR);} int headTrimR=startLocC; int headTrimM=startLocM; int tailTrimR=bases.length-stopLocC-1; int tailTrimM=match.length-stopLocM-1; if(verbose){System.err.println("C: headTrimR="+headTrimR+", headTrimM="+headTrimM+", tailTrimR="+tailTrimR+", tailTrimM="+tailTrimM);} if(headTrimR<=minToClip && headTrimM<=minToClip){ headTrimR=headTrimM=0; } if(tailTrimR<=minToClip && tailTrimM<=minToClip){ tailTrimR=tailTrimM=0; } if(headTrimR==0 && headTrimM==0 && tailTrimR==0 && tailTrimM==0){ return false; } //Do trimming final int headDelta=headTrimR-headTrimM; final int tailDelta=tailTrimR-tailTrimM; final byte[] match2; if(verbose){System.err.println("D: headTrimR="+headTrimR+", headTrimM="+headTrimM+", tailTrimR="+tailTrimR+", tailTrimM="+tailTrimM);} if(verbose){System.err.println("D: headDelta="+headDelta+", tailDelta="+tailDelta);} if(headDelta==0 && tailDelta==0){ //Length-neutral trimming match2=match; for(int i=0; i0 ? Tools.max(ss.pairedScore+(maxScore-ss.slowScore), 0) : 0; } return true; } /** Assumes match string is in long format. */ public final int score(byte[] match){ if(match==null || match.length<1){return 0;} byte mode=match[0], prevMode='0'; int current=0, prevStreak=0; int score=0; for(int mpos=0; mpos"+(char)c+"\tcurrent="+current+"\tscore="+score);} prevMode=mode; prevStreak=current; mode=c; current=1; } } if(current>0){ assert(mode==match[match.length-1]); if(mode=='m'){ score+=calcMatchScore(current); // if(prevMode=='N' || prevMode=='R'){score=score+POINTS_MATCH2()-POINTS_MATCH();} //Don't penalize first match after N }else if(mode=='S'){ score+=calcSubScore(current); if(prevMode=='N' || prevMode=='R'){score=score+POINTS_SUB2()-POINTS_SUB();} //Don't penalize first sub after N else if(prevMode=='m' && prevStreak<2){score=score+POINTS_SUBR()-POINTS_SUB();} }else if(mode=='D'){ score+=calcDelScore(current, true); }else if(mode=='I'){ score+=calcInsScore(current); }else if(mode=='C'){ //do nothing }else if(mode=='X' || mode=='Y'){ score+=calcInsScore(current); }else if(mode=='N'){ score+=calcNocallScore(current); }else if(mode=='R'){ score+=calcNorefScore(current); }else if(mode!=0){ assert(false) : "Unhandled symbol "+mode+"\n"+(char)mode+"\n"+new String(match); } if(verbose){System.err.println("mode "+(char)mode+"->end; score="+score);} } return score; } // //TODO // public final byte[] softClipBoundsShortmatch(byte[] match, byte[] bases, int minToClip){ // if(match==null || match.length<1){return null;} // int[] score=new int[bases.length]; // // byte mode='0', c='0'; // int current=0; // int rpos=0; // long currentScore; // for(int i=0; i0 || !Character.isDigit(c)){ // current=Tools.max(current, 1); // if(mode=='m'){ // msdicn[0]+=current; // }else if(mode=='S'){ // msdicn[1]+=current; // }else if(mode=='D'){ // msdicn[2]+=current; // }else if(mode=='I'){ // msdicn[3]+=current; // }else if(mode=='C' || mode=='X' || mode=='Y'){ // msdicn[4]+=current; // }else if(mode=='N' || mode=='R'){ // msdicn[5]+=current; // } // } // return msdicn; // } public abstract int maxQuality(int numBases); public abstract int maxQuality(byte[] baseScores); public abstract int maxImperfectScore(int numBases); public abstract int maxImperfectScore(byte[] baseScores); public final static String toString(int[] a){ int width=7; StringBuilder sb=new StringBuilder((a.length+1)*width+2); for(int num : a){ String s=" "+num; int spaces=width-s.length(); assert(spaces>=0) : width+", "+s.length()+", "+s+", "+num+", "+spaces; for(int i=0; i=0) : width+", "+s.length()+", "+s+", "+num+", "+spaces; for(int i=0; i>SCOREOFFSET; String s=" "+num; if(s.length()>width){s=num>0 ? maxString : minString;} int spaces=width-s.length(); assert(spaces>=0) : width+", "+s.length()+", "+s+", "+num+", "+spaces; for(int i=0; i=0); for(int i=0; i0) : len; return POINTS_MATCH()+(len-1)*POINTS_MATCH2(); } public final int calcSubScore(int len){ assert(len>0) : len; final int lim3=LIMIT_FOR_COST_3(); int score=POINTS_SUB(); if(len>lim3){ score+=(len-lim3)*POINTS_SUB3(); len=lim3; } if(len>1){ score+=(len-1)*POINTS_SUB2(); } return score; } public final int calcNorefScore(int len){return len*POINTS_NOREF();} public final int calcNocallScore(int len){return len*POINTS_NOCALL();} public abstract int calcDelScore(int len, boolean approximateGaps); // private static int calcDelScoreOffset(int len){ // if(len<=0){return 0;} // int score=POINTSoff_DEL; // // if(len>LIMIT_FOR_COST_5){ // score+=((len-LIMIT_FOR_COST_5+MASK5)/TIMESLIP)*POINTSoff_DEL5; // len=LIMIT_FOR_COST_5; // } // if(len>LIMIT_FOR_COST_4){ // score+=(len-LIMIT_FOR_COST_4)*POINTSoff_DEL4; // len=LIMIT_FOR_COST_4; // } // if(len>LIMIT_FOR_COST_3){ // score+=(len-LIMIT_FOR_COST_3)*POINTSoff_DEL3; // len=LIMIT_FOR_COST_3; // } // if(len>1){ // score+=(len-1)*POINTSoff_DEL2; // } // return score; // } public abstract int calcInsScore(int len); // private static int calcInsScoreOffset(int len){ // if(len<=0){return 0;} // int score=POINTSoff_INS; // if(len>LIMIT_FOR_COST_4){ // score+=(len-LIMIT_FOR_COST_4)*POINTSoff_INS4; // len=LIMIT_FOR_COST_4; // } // if(len>LIMIT_FOR_COST_3){ // score+=(len-LIMIT_FOR_COST_3)*POINTSoff_INS3; // len=LIMIT_FOR_COST_3; // } // if(len>1){ // score+=(len-1)*POINTSoff_INS2; // } // return score; // } static final int GAPBUFFER=Shared.GAPBUFFER; static final int GAPBUFFER2=Shared.GAPBUFFER2; static final int GAPLEN=Shared.GAPLEN; static final int MINGAP=Shared.MINGAP; static final int GAPCOST=Shared.GAPCOST; static final byte GAPC=Shared.GAPC; /** Seemingly to clear out prior data from the gref. Not sure what else it's used for. */ static final int GREFLIMIT2_CUSHION=128; //Tools.max(GAPBUFFER2, GAPLEN); /**DO NOT MODIFY*/ public abstract byte[] getGrefbuffer(); // public final int[] vertLimit; // public final int[] horizLimit; public abstract CharSequence showVertLimit(); public abstract CharSequence showHorizLimit(); //// public static final int MODEBITS=2; // public static final int TIMEBITS=11; // public static final int SCOREBITS=32-TIMEBITS; // public static final int MAX_TIME=((1<