package jgi; import java.util.ArrayList; import java.util.Arrays; import java.util.BitSet; import kmer.KCountArray; import kmer.KmerCount6; import stream.ConcurrentGenericReadInputStream; import stream.ConcurrentReadStreamInterface; import stream.RTextOutputStream3; import stream.Read; import dna.AminoAcid; import dna.Data; import dna.Timer; import fileIO.FileFormat; import fileIO.ReadWrite; import align2.ListNum; import align2.Tools; /** * @author Brian Bushnell * @date Aug 20, 2012 * */ public class ErrorCorrect extends Thread{ public static void main(String[] args){ String reads1=args[0]; String reads2=(args.length>1 ? args[1] : null); int k=23; int cbits=4; int gap=0; int hashes=1; int thresh1=1; int thresh2=2; int matrixbits=34; long maxReads=-1; int buildpasses=1; long tablereads=-1; //How many reads to process when building the hashtable int buildStepsize=4; String output=null; boolean ordered=true; boolean overwrite=false; for(int i=2; i1 ? split[1] : "true"); if(Tools.isJavaFlag(arg)){ //jvm argument; do nothing }else if(a.equals("null")){ // do nothing }else if(a.equals("k") || a.equals("kmer")){ k=Integer.parseInt(b); }else if(a.startsWith("cbits") || a.startsWith("cellbits")){ cbits=Integer.parseInt(b); }else if(a.equals("initialthresh") || a.equals("thresh1")){ thresh1=Integer.parseInt(b); }else if(a.equals("thresh") || a.equals("thresh2")){ thresh2=Integer.parseInt(b); }else if(a.startsWith("gap")){ gap=Integer.parseInt(b); }else if(a.startsWith("matrixbits")){ matrixbits=Integer.parseInt(b); }else if(a.startsWith("hashes") || a.startsWith("multihash")){ hashes=Integer.parseInt(b); }else if(a.startsWith("maxerrors")){ ERROR_CORRECTION_LIMIT=Integer.parseInt(b); }else if(a.startsWith("passes")){ buildpasses=Integer.parseInt(b); }else if(a.startsWith("stepsize") || a.startsWith("buildstepsize")){ buildStepsize=Integer.parseInt(b); }else if(a.equals("ziplevel") || a.equals("zl")){ ReadWrite.ZIPLEVEL=Integer.parseInt(b); }else if(a.equals("threads") || a.equals("t")){ System.err.println("Can't change threadcount for this class."); //THREADS=Integer.parseInt(b); }else if(a.startsWith("reads") || a.startsWith("maxreads")){ maxReads=Long.parseLong(b); }else if(a.startsWith("tablereads")){ tablereads=Long.parseLong(b); }else if(a.startsWith("build") || a.startsWith("genome")){ Data.setGenome(Integer.parseInt(b)); Data.sysout.println("Set genome to "+Data.GENOME_BUILD); }else if(a.equals("outputinfo") || a.startsWith("info")){ OUTPUT_INFO=Tools.parseBoolean(b); }else if(a.startsWith("out")){ output=b; }else if(a.startsWith("verbose")){ KCountArray.verbose=Tools.parseBoolean(b); // verbose=KCountArray.verbose=Tools.parseBoolean(b); }else if(a.equals("ordered") || a.equals("ord")){ ordered=Tools.parseBoolean(b); }else if(a.equals("overwrite") || a.equals("ow")){ overwrite=Tools.parseBoolean(b); }else{ throw new RuntimeException("Unknown parameter "+args[i]); } } KCountArray kca=makeTable(reads1, reads2, k, cbits, gap, hashes, buildpasses, matrixbits, tablereads, buildStepsize, thresh1, thresh2); detect(reads1, reads2, kca, k, thresh2, maxReads, output, ordered, overwrite); } public static KCountArray makeTable(String reads1, String reads2, int k, int cbits, int gap, int hashes, int buildpasses, int matrixbits, long maxreads, int stepsize, int thresh1, int thresh2){ Timer thash=new Timer(); KmerCount6.maxReads=maxreads; int kbits=2*k; matrixbits=Tools.min(kbits, matrixbits); thash.start(); // Data.sysout.println("kbits="+(kbits)+" -> "+(1L< "+(1L<1); KCountArray trusted=null; for(int i=1; i2;// /*or, alternately, (trusted==null || trusted.capacity()>0.3) int step=(stepsize==1 ? 1 : stepsize+i%2); // if(!conservative){step=(step+3)/4;} if(!conservative){step=Tools.min(3, (step+3)/4);} KmerCount6.countFastq(reads1, reads2, k, cbits, true, kca, trusted, maxreads, thresh1, step, conservative); kca.shutdown(); Data.sysout.println("Trusted: \t"+kca.toShortString()); trusted=kca; kca=KCountArray.makeNew(1L< ln=cris.nextList(); ArrayList reads=(ln!=null ? ln.list : null); long covered=0; long uncovered=0; long coveredFinal=0; long uncoveredFinal=0; long fullyCorrected=0; long failed=0; long totalBases=0; long totalReads=0; while(reads!=null && reads.size()>0){ for(Read r : reads){ Read r2=r.mate; { // if(r.numericID==23){verbose=true;} totalReads++; if(verbose){System.err.println();} totalBases+=r.bases.length; // BitSet bs=detectErrors(r, kca, k, thresh); BitSet bs=detectErrorsBulk(r, kca, k, thresh, 1); if(verbose){System.err.println(toString(bs, r.bases.length));} // Data.sysout.println(toString(detectErrorsTips(r, kca, k, thresh), r.bases.length)); if(verbose){System.err.println(toString(detectErrors(r, kca, k, thresh), r.bases.length-k+1));} if(bs==null){//No errors, or can't detect errors assert(false); }else{ int x=bs.cardinality(); covered+=x; uncovered+=(r.bases.length-x); if(x0){return detectErrorsSplit(r, kca, k, thresh);} final int kbits=2*k; final long mask=~((-1L)<<(kbits)); final int gap=kca.gap; int bslen=r.bases.length-k-gap+1; if(bslen<1){return null;} //Read is too short to detect errors BitSet bs=new BitSet(bslen); int len=0; long kmer=0; byte[] bases=r.bases; for(int i=0; i=k){ int count=kca.read(kmer); if(count>=thresh){ bs.set(i+1-k); } } } } return bs; } public static BitSet detectErrorsBulk(final Read r, final KCountArray kca, final int k, final int thresh, final int stepsize/*, final int offset*/){ if(kca.gap>0){return detectErrorsSplit(r, kca, k, thresh);} final int kbits=2*k; final long mask=~((-1L)<<(kbits)); final int gap=kca.gap; if(r.bases==null || r.bases.length=k && ((len-k)%stepsize==0 || i==bases.length-1)){ int count=kca.read(kmer); if(count>=thresh){ bs.set(i+1-setlen, i+1); } } } } r.errors=bs.cardinality()-r.bases.length; // assert(bases.length==r.bases.length); return bs; } public static BitSet detectTrusted(final Read r, final KCountArray kca, final int k, final int thresh, final int detectStepsize){ if(kca.gap>0){throw new RuntimeException("TODO");} final int kbits=2*k; final long mask=~((-1L)<<(kbits)); final int gap=kca.gap; if(r.bases==null || r.bases.length=k && (i%detectStepsize==0 || i==bases.length-1)){ int count=kca.read(kmer); if(count0){return detectErrorsSplit(r, kca, k, thresh);} final int kbits=2*k; final long mask=~((-1L)<<(kbits)); final int gap=kca.gap; if(r.bases==null || r.bases.length=k){ int count=kca.read(kmer); if(count>=thresh){ bs.set(i+1-setlen); bs.set(i); } } } } return bs; } /** * @param r * @param kca * @param k * @param thresh * @return */ private static BitSet detectErrorsSplit(Read r, KCountArray kca, int k, int thresh) { assert(false) : "TODO"; return null; } /** Assumes bulk mode was used; e.g., any '0' bit is covered by no correct kmers */ public static BitSet correctErrors(final Read r, final KCountArray kca, final int k, final int thresh, BitSet bs, final int maxCorrections, final int maxBurst){ if(kca.gap>0){assert(false) : "TODO";} assert(!OUTPUT_INFO) : "TODO: Outputting correction data is not yet supported."; int corrections=0; //Alternately, corrections=r.errorsCorrected r.errors=0; if(bs.cardinality()==0){//Cannot be corrected r.errors=r.bases.length; return bs; } // verbose=!bs.get(0); if(verbose){ Data.sysout.println(); Data.sysout.println(toString(bs, r.bases.length)); Data.sysout.println(toString(detectErrorsTips(r, kca, k, thresh), r.bases.length)); Data.sysout.println(toString(detectErrors(r, kca, k, thresh), r.bases.length-k+1)); } int lastloc=-99; int burst=1; while(!bs.get(0) && corrections\n"+toString(bs, r.bases.length));} }else{ r.errors=r.bases.length-bs.cardinality(); r.errorsCorrected+=corrections; if(verbose){System.err.println("Could not correct.");} r.bases[errorLoc]='N'; r.quality[errorLoc]=0; return bs; } } burst=1; while(bs.cardinality()\n"+toString(bs, r.bases.length));} }else{ r.errors=r.bases.length-bs.cardinality(); r.errorsCorrected+=corrections; r.bases[errorLoc]='N'; r.quality[errorLoc]=0; if(verbose){System.err.println("Could not correct.");} return bs; } } } r.errors=r.bases.length-bs.cardinality(); r.errorsCorrected+=corrections; assert(corrections<=maxCorrections); return bs; } /** * @param r * @param kca * @param k * @param thresh * @param bs * @param errorLoc * @return */ private static boolean correctFromLeft(Read r, KCountArray kca, int k, int thresh, BitSet bs, int error) { final int kbits=2*k; final long mask=~((-1L)<<(kbits)); final int gap=kca.gap; final int setlen=k+gap; final int startLoc=error-(setlen)+1; final byte oldBase=r.bases[error]; final byte[] bases=r.bases; final int minAdvance=Tools.min(MIN_ADVANCE, bases.length-error); long kmer=0; int len=0; for(int i=startLoc; i=minLoc; i--){ if(!bs.get(i)){ minLoc=i+1; break; } } } if(verbose){ Data.sysout.println("correctFromRight. Error = "+error+", minloc="+minLoc); Data.sysout.println(new String(r.bases)); } for(int bnum=0; bnum<4; bnum++){ byte c=AminoAcid.numberToBase[bnum]; bases[error]=c; if(verbose){System.err.println("Considering "+(char)c);} long key=kmer; for(int loc=error; loc>=minLoc; loc--){ c=bases[loc]; int x=AminoAcid.baseToNumber[c]; if(x<0){ if(verbose){System.err.println("break: N");} break; } key=((key>>2)|(((long)x)<max){ max=array[i]; maxIndex=i; }else if(max==array[i]){ maxIndex=-1; } } return maxIndex; } public static final String toString(BitSet bs, int len){ // assert(verbose); StringBuilder sb=new StringBuilder(len); for(int i=0; i list){ for(int i=0; i0){ if(r.mate==null || r.mate.errors>0){ list.set(i, null); } } } } public static boolean verbose=false; /** Bails out if a read still has errors after correcting this many. */ public static int ERROR_CORRECTION_LIMIT=6; /** Max allowed number of nearby corrections. * A long error burst indicates the read simply has low coverage, and is not being corrected correctly. */ public static int MAX_ERROR_BURST=3; /** Bursts have at most this distance between errors. E.G. '1' means errors are adjacent. */ public static int BURST_THRESH=2; /** Withhold uncorrectable reads from output. */ public static boolean DONT_OUTPUT_BAD_READS=false; /** Do not correct an error if it is at most this far from the next error. Instead, bail out. */ public static int MIN_ADVANCE=1; /** Number of threads used for error correction. Does not control number of threads for creating the hash table. * Additionally, up to 2 threads are used for reading and up to 2 for writing. For this (singlethreaded) class, the number does nothing. */ public static final int THREADS=1; /** Output correction data instead of the corrected read */ public static boolean OUTPUT_INFO=false; }