package jgi; import java.io.File; import java.util.Arrays; import java.util.HashMap; import java.util.HashSet; import stream.FASTQ; import stream.FastaReadInputStream; import stream.Read; import stream.SamLine; import dna.Data; import dna.Scaffold; import fileIO.ByteFile; import fileIO.ReadWrite; import fileIO.TextFile; import fileIO.TextStreamWriter; import align2.LongList; import align2.Shared; import align2.Tools; /** * * Processes a sam file of mapped ESTs. * These ESTs may have been broken into smaller pieces for mapping, * and if so, are reassembled. * * Produces a mapping statistics file. * * @author Brian Bushnell * @date Sep 27, 2013 * */ public class SamToEst { public static void main(String[] args){ ByteFile.FORCE_MODE_BF2=Shared.THREADS>2; FastaReadInputStream.SPLIT_READS=false; stream.FastaReadInputStream.MIN_READ_LEN=1; ReadWrite.USE_UNPIGZ=true; String est=null, stats=null, ref=null, sam=null; float fractionForAllCaptured=0.98f; for(int i=0; i1 ? split[1] : null; while(a.startsWith("-")){a=a.substring(1);} //In case people use hyphens if(Tools.isJavaFlag(arg)){ //jvm argument; do nothing }else if(a.equals("null")){ // do nothing }else if(a.equals("bf1")){ ByteFile.FORCE_MODE_BF1=Tools.parseBoolean(b); ByteFile.FORCE_MODE_BF2=!ByteFile.FORCE_MODE_BF1; }else if(a.equals("bf2")){ ByteFile.FORCE_MODE_BF2=Tools.parseBoolean(b); ByteFile.FORCE_MODE_BF1=!ByteFile.FORCE_MODE_BF2; }else if(a.equals("in") || a.equals("input") || a.equals("in1") || a.equals("sam")){ sam=b; }else if(a.equals("out") || a.equals("output") || a.equals("stats")){ stats=b; }else if(a.equals("ref")){ ref=b; }else if(a.equals("est")){ est=b; }else if(a.equals("fraction")){ fractionForAllCaptured=Float.parseFloat(b); }else if(a.equals("overwrite") || a.equals("ow")){ overwrite=Tools.parseBoolean(b); }else if(a.equals("qauto")){ FASTQ.DETECT_QUALITY=FASTQ.DETECT_QUALITY_OUT=true; }else if(a.equals("tuc") || a.equals("touppercase")){ Read.TO_UPPER_CASE=Tools.parseBoolean(b); }else if(a.equals("ziplevel") || a.equals("zl")){ ReadWrite.ZIPLEVEL=Integer.parseInt(b); }else if(sam==null && i==0 && !arg.contains("=") && (arg.toLowerCase().startsWith("stdin") || new File(arg).exists())){ sam=arg; }else if(stats==null && i==1 && !arg.contains("=")){ stats=arg; }else{ System.err.println("Unknown parameter "+args[i]); assert(false) : "Unknown parameter "+args[i]; // throw new RuntimeException("Unknown parameter "+args[i]); } } if(stats==null){stats="stdout";} SamToEst ste=new SamToEst(sam, stats, ref, est, fractionForAllCaptured); ste.process(); } public SamToEst(String in_, String stats_, String ref_, String est_, float fractionForAll_){ in=in_; stats=stats_; ref=ref_; estFile=est_; fractionForAll=fractionForAll_; } public void process(){ // HashMap table=new HashMap(initialSize); HashMap table=new HashMap(initialSize); TextFile tf=new TextFile(in, true, false); String line=null; String program=null; String version=null; boolean bbmap=false; float bbversion=-1; for(line=tf.nextLine(); line!=null && line.startsWith("@"); line=tf.nextLine()){ final String[] split=line.split("\t"); final String a=split[0]; if(a.equals("@SQ")){ Scaffold sc=new Scaffold(split); // assert(!table.containsKey(sc.name)) : "\nDuplicate scaffold name!\n"+sc+"\n\n"+table.get(sc.name); // table.put(sc.name, sc); refBases+=sc.length; refCount++; }else if(a.equals("@PG")){ for(String s : split){ if(s.startsWith("PN:")){ String s2=s.substring(3); if(s2.equalsIgnoreCase("bbmap") || s2.startsWith("BBMap")){bbmap=true;} if(program==null){program=Data.forceIntern(s.substring(3));} }else if(s.startsWith("VN:")){ if(bbmap && bbversion<0){bbversion=Float.parseFloat(s.substring(3));} if(version==null){version=Data.forceIntern(s.substring(3));} } } }else if(a.equals("@RG")){ //Do nothing }else if(a.equals("@HD")){ //Do nothing }else if(a.equals("@CO")){ //Do nothing }else{ // assert(false) : line; } } EST current=null; boolean err=false; for(; line!=null; line=tf.nextLine()){ if(line.length()==0){ }else if(line.charAt(0)=='@'){ if(!err){ System.err.println("Unexpected header line: "+line); System.err.println("This should not cause problems, and is probably due to concatenated sam files.\n" + "Supressing future unexpected header warnings."); err=true; } if(line.startsWith("@SQ")){ String[] split=line.split("\t"); Scaffold sc=new Scaffold(split); // if(!table.containsKey(sc.name)){ // table.put(sc.name, sc); // refBases+=sc.length; // refCount++; // } } }else{ SamLine sl=new SamLine(line); if(USE_SECONDARY || sl.primary()){ if(sl.mapped() && sl.cigar!=null){ String cigar=sl.cigar; if(cigar.contains("D") || cigar.contains("N")){ int len=0; for(int i=0; i0){ int partlen=name.length()-x-1; if(partlen>0 && partlen<6){ int p2=0; for(int i=x+1; i9){ p2=-1; break; } } if(p2>-1){ part=p2; name=name.substring(0, x); }else{ // assert(false) : x+"\t"+p2+"\t"+name; } }else{ // assert(false) : x+"\t"+name; } }else{ // assert(false) : x+"\t"+name; } if(current==null || !current.name.equals(name)){ // assert(part==1) : "Sam file must be in input order. Run BBMap with the 'ordered' flag.\n"+part+"\n"+sl.qname; if(current!=null){addEst(current);} current=new EST(name); } current.add(sl); } } } if(current!=null){addEst(current);} tf.close(); if(stats!=null){ final TextStreamWriter tsw=new TextStreamWriter(stats, overwrite, false, false); tsw.start(); // numRef: 786 // numEst: 30985 // EST-good: 30312 ( 97.83%) // EST-best: 30312 ( 97.83%) // EST-miss: 379 ( 1.22%) // EST-zero: 294 ( 0.95%) // tsw.println("EST-good:\t"+good+"\t"++""); // tsw.println("EST-best:\t"+best+"\t"++""); // tsw.println("EST-miss:\t"+miss+"\t"++""); // tsw.println("EST-zero:\t"+zero+"\t"++""); boolean oldStyle=false; if(oldStyle){ tsw.println("ref:\t"+ref); tsw.println("est:\t"+estFile); tsw.println("sam:\t"+in); tsw.println("numRef:\t"+refCount+"\t"+refBases); tsw.println("numEst:\t"+estCount+"\t"+estBases); tsw.println("type\t#ests\t%ests\t#bases\t%bases"); }else{ tsw.println("ref_file="+ref); tsw.println("est_file="+estFile); tsw.println("sam_file="+in); tsw.println("n_ref_scaffolds="+refCount); tsw.println("n_ref_bases="+refBases); tsw.println("n_est="+estCount); tsw.println("n_est_bases="+estBases); tsw.println("type\tn_est\tpct_est\tn_bases\tpct_bases"); } double multE=100.0/estCount; double multB=100.0/estBases; double allBasesPct=multE*allBasesMapped; double mostBasesPct=multE*mostBasesMapped; double someBasesPct=multE*someBasesMapped; double noBasesPct=multE*noBasesMapped; double multiScaffoldPct=multE*multiScaffold; double allBasesPctB=multB*allBasesMappedB; double mostBasesPctB=multB*mostBasesMappedB; double someBasesPctB=multB*someBasesMappedB; double noBasesPctB=multB*noBasesMappedB; double multiScaffoldPctB=multB*multiScaffoldB; int min=0, max=0, median=0; long sum=0, count=0; for(int i=minIntron; i0){ if(min==0){min=i;} max=i; sum+=(i*x); count+=x; } } if(count>0){ //If there are any introns long half=(count+1)/2; //50th percentile of number of introns assert(half<=count); long count2=0; //Current sum of length for(int i=0; count20){ count2+=x; median=i; } } } tsw.println("all:\t"+allBasesMapped+"\t"+String.format("%.4f%%",allBasesPct)+"\t"+allBasesMappedB+"\t"+String.format("%.4f%%",allBasesPctB)); tsw.println("most:\t"+mostBasesMapped+"\t"+String.format("%.4f%%",mostBasesPct)+"\t"+mostBasesMappedB+"\t"+String.format("%.4f%%",mostBasesPctB)); tsw.println("some:\t"+someBasesMapped+"\t"+String.format("%.4f%%",someBasesPct)+"\t"+someBasesMappedB+"\t"+String.format("%.4f%%",someBasesPctB)); tsw.println("zero:\t"+noBasesMapped+"\t"+String.format("%.4f%%",noBasesPct)+"\t"+noBasesMappedB+"\t"+String.format("%.4f%%",noBasesPctB)); tsw.println("multi:\t"+multiScaffold+"\t"+String.format("%.4f%%",multiScaffoldPct)+"\t"+multiScaffoldB+"\t"+String.format("%.4f%%",multiScaffoldPctB)); // tsw.println("numIntrons:\t"+count); // tsw.println("minIntron:\t"+min); // tsw.println("maxIntron:\t"+max); // tsw.println("medIntron:\t"+median); // tsw.println("avgIntron:\t"+(long)(sum/(double)(Tools.max(count,1)))); tsw.println("introns\tmin\tmax\tmedian\taverage"); tsw.println(count+"\t"+min+"\t"+max+"\t"+median+"\t"+String.format("%.1f", (sum/(double)(Tools.max(count,1))))); tsw.poisonAndWait(); } } private void addEst(EST est){ // Data.sysout.println("\n"+est); estCount++; partCount+=est.parts; estBases+=est.length; estBasesMapped+=est.mappedLength; partCountMapped+=est.mappedParts; for(int i=0; i1){ multiScaffold++; multiScaffoldB+=est.length; } if(est.mappedParts==est.parts){ // Data.sysout.print("A"); allPartsMapped++; }else if(est.mappedParts>=Tools.max(1, est.parts/2)){ // Data.sysout.print("B"); mostPartsMapped++; }else if(est.mappedParts>0){ // Data.sysout.print("C"); somePartsMapped++; }else{ // Data.sysout.print("D"); noPartsMapped++; } int match=est.match(); if(match>=(est.length*fractionForAll)){ // Data.sysout.print("E"); allBasesMapped++; allBasesMappedB+=est.length; }else if(match>=est.length/2){ // Data.sysout.print("F"); mostBasesMapped++; mostBasesMappedB+=est.length; }else if(match>0){ // Data.sysout.print("G"); someBasesMapped++; someBasesMappedB+=est.length; }else{ // Data.sysout.print("H"); noBasesMapped++; noBasesMappedB+=est.length; } } public final float fractionForAll; public final String in, stats, ref, estFile; public long refBases=0; public long estBases=0; public long estBasesMapped=0; public long refCount=0; public long estCount=0; public long partCount=0; public long partCountMapped=0; public long good=0, best=0, miss=0, zero=0; public long multiScaffold=0, multiScaffoldB=0; public long allPartsMapped=0, mostPartsMapped=0, somePartsMapped=0, noPartsMapped=0; public long allBasesMapped=0, mostBasesMapped=0, someBasesMapped=0, noBasesMapped=0; public long allBasesMappedB=0, mostBasesMappedB=0, someBasesMappedB=0, noBasesMappedB=0; public long[] msdicnOverall=new long[6]; public LongList introns=new LongList(1); public int initialSize=4096; public boolean ADD_FROM_REF=true; public boolean USE_SECONDARY=false; public static int minIntron=10; public static boolean overwrite=true; // public HashMap //Only needed if sam file is unordered. public static class EST{ public EST(String name_){ name=name_; } public void add(SamLine sl){ parts++; // length+=sl.seq.length(); length+=sl.seq.length; if(sl.mapped()){ // mappedLength+=sl.seq.length(); mappedLength+=sl.seq.length; mappedParts++; if(sl.cigar!=null){ String matchTag=sl.matchTag(); int[] temp; if(matchTag==null){ temp=SamLine.cigarToMsdic(sl.cigar); }else{ temp=Read.matchToMsdicn(matchTag.getBytes()); } for(int i=0; i scafnames=new HashSet(4); int[] msdicn=new int[6]; } }