package align2; import java.util.Arrays; import java.util.Random; import stream.FASTQ; import stream.Read; import dna.AminoAcid; import dna.ChromosomeArray; import dna.Data; import dna.Gene; public final class RandomReads { public static void main(String[] args){ Data.GENOME_BUILD=-1; int number=100000; int readlen=100; boolean paired=false; // final int maxInsertionLen=12; // final int maxSubLen=12; // final int maxDeletionLen=400; final int maxInsertionLen=20; final int maxSubLen=20; final int maxDeletionLen=20; int minChrom=-1; int maxChrom=-1; int maxSnps=3; int maxInss=2; int maxDels=2; int maxSubs=2; float snpRate=0.4f; float insRate=0.2f; float delRate=0.2f; float subRate=0.2f; PERFECT_READ_RATIO=0.4f; int minQuality=30; int midQuality=33; int maxQuality=35; minQuality=5; midQuality=20; maxQuality=35; maxSnps=3;//4; maxInss=3;//2; maxDels=3; maxSubs=3;//2; snpRate=0.25f; insRate=0.25f; delRate=0.25f; subRate=0.25f;//0.3f; PERFECT_READ_RATIO=0.2f;//0.2f;//0.8f for(int i=0; i=0); if(minChrom<1){minChrom=1;} if(maxChrom<1){maxChrom=Data.numChroms;} RandomReads rr=new RandomReads(paired); mateLen=readlen; System.err.println("snpRate = \t"+snpRate); System.err.println("insRate = \t"+insRate); System.err.println("delRate = \t"+delRate); System.err.println("subRate = \t"+subRate); System.err.println("Reads = \t"+number); System.err.println("Readlen = \t"+readlen); System.err.println("Paired = \t"+paired); System.err.println("Genome = \t"+Data.GENOME_BUILD); System.err.println("PERFECT_READ_RATIO="+PERFECT_READ_RATIO); String fname="reads_B"+Data.GENOME_BUILD+"_"+number+"x"+readlen+"bp_" +maxSnps+"S_"+maxInss+"I_"+maxDels+"D_"+maxSubs+"U_chr"+minChrom+"-"+maxChrom+(paired ? "_#" : "")+".fq"; Read[] reads=rr.makeRandomReadsX(number, readlen, maxSnps, maxInss, maxDels, maxSubs, snpRate, insRate, delRate, subRate, maxInsertionLen, maxDeletionLen, maxSubLen, minChrom, maxChrom, false, minQuality, midQuality, maxQuality); FASTQ.writeFASTQ(reads, fname.replace("#", "1")); if(paired){ for(int i=0; i=0 && num<=3 && num!=oldNum); byte[] bytes=s.getBytes(); bytes[index]=AminoAcid.numberToBase[num]; return new String(bytes); } public String addSUB(String s, int minlen, int maxlen, int readlen, Random rand){ assert(readlen<=s.length()) : readlen+", "+s.length(); assert(minlen>1); assert(maxlen>=minlen); // int len=minlen+rand.nextInt(maxlen-minlen+1); int len=minlen+Tools.min(rand.nextInt(maxlen-minlen+1), rand.nextInt(maxlen-minlen+1)); assert(len>=minlen); assert(len<=maxlen); assert(readlen<=s.length()); int index=rand.nextInt(readlen-len+1); byte[] bytes=s.getBytes(); int lim=index+len-1; {//Change first and last to anything except old int i=index; byte old=bytes[i]; if(AminoAcid.isFullyDefined(old)){ byte oldNum=AminoAcid.baseToNumber[old]; int num=(oldNum+rand.nextInt(4))%4; assert(num>=0 && num<=3); byte base=AminoAcid.numberToBase[num]; bytes[i]=base; } i=lim; old=bytes[i]; if(AminoAcid.isFullyDefined(old)){ byte oldNum=AminoAcid.baseToNumber[old]; int num=(oldNum+rand.nextInt(4))%4; assert(num>=0 && num<=3); byte base=AminoAcid.numberToBase[num]; bytes[i]=base; } } for(int i=index+1; i=0 && num<=3 && num!=oldNum); byte base=AminoAcid.numberToBase[num]; bytes[i]=base; } } return new String(bytes); } public String addInsertion(String s, int minlen, int maxlen, int readlen, int[] dif, Random rand){ // assert(false) : minlen+","+maxlen; assert(readlen<=s.length()) : readlen+", "+s.length(); assert(minlen>0); assert(maxlen>=minlen); // int len=minlen+rand.nextInt(maxlen-minlen+1); int len=minlen+Tools.min(rand.nextInt(maxlen-minlen+1), rand.nextInt(maxlen-minlen+1)); len=Tools.min(len, readlen-dif[1]-2); // assert(false) : len+", "+readlen+", "+dif[1]; if(len<1){return s;} if(verbose){System.err.println("\nAdding insertion of len="+len+", dif="+dif[0]);} dif[0]-=len; dif[1]+=len; int index=rand.nextInt(readlen-len+1); //Assures that all inserted bases will be within the read String mid=""; for(int i=0; i\n"+s2);} return s2; } public String addDeletion(String s, int minlen, int maxlen, int readlen, int[] dif, Random rand){ assert(s.length()>=readlen+maxlen); assert(minlen>0); assert(maxlen>=minlen); // int len=maxlen; // int len=minlen+rand.nextInt(maxlen-minlen+1); int len=minlen+Tools.min(rand.nextInt(maxlen-minlen+1), rand.nextInt(maxlen-minlen+1)); // System.err.println("Made del len "+len); dif[0]+=len; // int index=rand.nextInt(s.length()-len); int index=1+rand.nextInt(readlen-1); //Assures there will never be a deletion of the first base, which would not technically be a deletion. // System.err.println("Added deletion "+len+" at "+index); String s2=s.substring(0, index)+s.substring(index+len); return s2; } public Read[] makeRandomReads(int readlen, int number, int minChrom, int maxChrom){ Read[] out=new Read[number]; for(int i=0; imaxChrom){ x=randy.nextInt(); chrom=randomChrom[(x&0x7FFFFFFF)%randomChrom.length]; // if(chrom>25 && Data.GENOME_BUILD==36){chrom=-1;} } byte strand=(byte) (x>=0 ? 0 : 1); // strand=0; //TODO // System.err.println("Chose chrom "+chrom+", strand "+strand); return makeRandomRead2(readlen, chrom, strand); } public Read makeRandomRead2(int readlen, int chrom, byte strand){ byte[] s=null; ChromosomeArray cha=Data.getChromosome(chrom); int loc=-1; while(s==null){ // loc=randy.nextInt(cha.maxIndex-40000); loc=randy.nextInt(cha.maxIndex-readlen); // loc=10180206; s=cha.getBytes(loc, loc+readlen-1); assert(s.length==readlen); // System.out.println(new String(s)); if(AminoAcid.countUndefined(s)>5){ s=null; // System.out.println("Tossed out string."); } } // System.err.println("Chose loc="+loc); assert(strand==Gene.MINUS || strand==Gene.PLUS); if(strand==Gene.MINUS){ s=AminoAcid.reverseComplementBases(s); } long id=nextReadID; nextReadID++; Read r=new Read(s, chrom, strand, loc, loc+s.length-1, id, null, false); r.setSynthetic(true); // System.err.println("Made read "+r.start+", "+r.stop); // assert(readlen==20); assert(r.bases.length==readlen); // assert(false) : r.start+", "+r.stop; return r; } public Read[] makeRandomReadsX(int numReads, int readlen, int maxSnps, int maxInss, int maxDels, int maxSubs, float snpRate, float insRate, float delRate, float subRate, int maxInsertionLen, int maxDeletionLen, int maxSubLen, int minChrom, int maxChrom, boolean colorspace, int minQual, int midQual, int maxQual){ assert(minQual<=midQual); assert(midQual<=maxQual); assert(minQual>=0 && maxQual<48); // System.err.println("Called makeRandomReadsX("+numReads+", "+readlen+", "+maxSnps+", "+maxDels+", "+maxInss+", "+ // snpRate+", "+delRate+", "+insRate+", "+maxInsertionLen+", "+maxDeletionLen+", "+minChrom+", "+maxChrom+")"); // if(colorspace){readlen++;} Read[] reads=new Read[numReads]; // assert(Index2.maxIndel==maxIndel); //Temporary final int maxQualP=Tools.max(35, maxQual); final int midQualP=30; final int minQualP=Tools.min(25, maxQual); for(int i=0; ireadlen){s=s.substring(0, readlen);} assert(s.length()==readlen); if(verbose){System.err.println("After length adjust 1 to "+readlen+": dif="+dif[0]+"\n"+s+"\n");} // int preInsertLength=s.length(); for(int j=0; jreadlen); s=s.substring(0, readlen); } if(verbose){System.err.println("After length adjust 2 to "+readlen+": dif="+dif[0]+"\n"+s+"\n");} // if(s.length()!=readlen){ // assert(s.length()>readlen); // boolean start=randyCutPos.nextBoolean(); // if(start){//Take first part of string // s=s.substring(0, readlen); // }else{//take last part of string // s=s.substring(s.length()-readlen); // } // } for(int j=0; j0){ int range=(perfect ? maxQualP-midQualP+1 : maxQual-midQual+1); int delta=Tools.min(randyQual.nextInt(range), randyQual.nextInt(range)); baseQuality=(byte)((perfect ? midQualP : midQual)+delta); }else{ int range=perfect ? midQualP-minQualP+1 : midQual-minQual+1; int delta=Tools.min(randyQual.nextInt(range), randyQual.nextInt(range)); baseQuality=(byte)((perfect ? midQualP : midQual)-delta); } } if(USE_FIXED_QUALITY){ r.quality=getFixedQualityRead(r.bases.length); }else{ if(perfect){ r.quality=QualityTools.makeQualityArray( r.bases.length, randyQual, minQualP, maxQualP, baseQuality, slant); }else{ r.quality=QualityTools.makeQualityArray( r.bases.length, randyQual, minQual, maxQual, baseQuality, slant); } } for(int j=0; jr.start) : "\n"+Read.header()+"\n"+r+"\n"+SNPs+", "+SUBs+", "+INSs+", "+DELs+"\n"+s+"\n"; if(colorspace){ r=reads[i]=r.translateToColorspace(true); r.obj=new String(r.bases); //TODO - for testing } r.mapLength=r.bases.length; if(paired){ Read r2=makeMate(r, mateLen, maxSnps, maxInss, maxDels, maxSubs, snpRate, insRate, delRate, subRate, maxInsertionLen, maxDeletionLen, maxSubLen, mateMiddleMin, mateMiddleMax, mateSameStrand, minQual, maxQual, baseQuality, slant, perfect); while(r2==null){ r2=makeMate(r, mateLen, maxSnps, maxInss, maxDels, maxSubs, snpRate, insRate, delRate, subRate, maxInsertionLen, maxDeletionLen, maxSubLen, mateMiddleMin, mateMiddleMax, mateSameStrand, minQual, maxQual, baseQuality, slant, perfect); } r.mate=r2; r2.mate=r; } // System.err.println("Made "+r.start+" ~ "+r.stop+" = "+(r.stop-r.start)); } // if(colorspace){ // for(int i=0; i=minMiddle); // assert(minMiddle>=0); int midRange=maxMiddle-minMiddle+1; int middle=(randyMate.nextInt(midRange)+randyMate.nextInt(midRange))/2+minMiddle; byte strand=(byte) (sameStrand ? other.strand() : other.strand()^1); // System.out.println(sameStrand+": "+other.strand+" -> "+strand); if(other.strand()==Gene.PLUS){ x=other.stop+middle; }else{ x=other.start-middle-readlen; } y=x+readlen+(maxDeletionLen*maxDels); if(x<0){x=0; y=readlen-1; maxDels=0;} if(y>Data.getChromosome(chrom).maxIndex){y=Data.getChromosome(chrom).maxIndex; x=y-readlen+1; maxDels=0;} String s=Data.getChromosome(chrom).getString(x, y); // System.out.println("Making string length "+s.length()+" from "+x+"-"+y+" of "+Data.getChromosome(chrom).maxIndex); //I already do this later. // if(strand==Gene.MINUS){ // s=AminoAcid.reverseComplementBases(s); // } long id=other.numericID; int SNPs=0; int INSs=0; int DELs=0; int SUBs=0; // assert(maxSnps==0 || (snpRate>.0001 && snpRate<=1)) : maxSnps+", "+snpRate; // assert(maxInss==0 || (snpRate>.0001 && insRate<=1)) : maxInss+", "+insRate; // assert(maxDels==0 || (snpRate>.0001 && delRate<=1)) : maxDels+", "+delRate; // assert(maxSubs==0 || (snpRate>.0001 && subRate<=1)) : maxSubs+", "+subRate; while(SNPsreadlen){s=s.substring(0, readlen);} assert(s.length()==readlen); // int preInsertLength=s.length(); int insLen=0; for(int j=0; jreadlen); s=s.substring(0, readlen); } // if(s.length()!=readlen){ // assert(s.length()>readlen); // boolean start=randyCutPos.nextBoolean(); // if(start){//Take first part of string // s=s.substring(0, readlen); // }else{//take last part of string // s=s.substring(s.length()-readlen); // } // } for(int j=0; jr.start) : "DELs="+DELs+", INSs="+INSs+", SUBs="+SUBs+", SNPs="+SNPs+ ", r.start="+r.start+", r.stop="+r.stop+", "+Data.getChromosome(chrom).maxIndex+"\n"+ (other==null ? "" : "\n\n"+other.toText(false)+"\n\n"); if(other.colorspace()){ r=r.translateToColorspace(true); r.obj=new String(r.bases); //TODO - for testing } assert(sameStrand == (r.strand()==other.strand())) : "\n"+r.toText(false)+"\n"+other.toText(false)+"\n\n"+ sameStrand+", "+r.strand()+", "+other.strand()+"\n"+r.pairnum()+", "+other.pairnum()+"\n"; return r; } public void addColorspaceErrors(Read r, int errors){ assert(r.colorspace()); for(int i=0; i=4); r.quality[loc]=(byte) (4+randyCSError.nextInt(r.quality[loc])); r.bases[loc]=(byte)((r.bases[loc]+randyCSError.nextInt(3)+1)&3); } } } // public static int[] approxChromLengths=new int[] { // 0, // 600 // }; // public static final int[] randomChrom=fillRandomChrom(approxChromLengths); public static int[] randomChrom; private static int[] fillRandomChrom(){ assert(Data.chromLengths!=null); int[] in=Arrays.copyOf(Data.chromLengths, Data.chromLengths.length); long total=Tools.sum(in); int div=(int)(total/1000); for(int i=0; i