#!/usr/bin/env python # written by Ka Ming Nip import glob import itertools import math import operator import os import shutil from .common_utils import run_shell_cmd from .common_utils import run_multi_shell_cmds from . import psl_cid_extractor from . import sam_cid_extractor def concat_fastas(path_prefix_map, cat): """Concatenate multiple fasta files together, giving the contigs of each set the specified prefix. """ with open(cat, 'w') as fout: for prefix, path in path_prefix_map.items(): line_start = '>' + prefix with open(path, 'r') as fin: for line in fin: if line[0] == '>': #header fout.write(line.replace('>', line_start, 1)) else: fout.write(line) #endif #endfor #endwith #endfor #endwith #enddef def blat_self_align(fasta, outputpsl, percent_id=0.95, max_consecutive_edits=1, min_seq_len=32, threads=1, skip_psl_self_awk=None): """Align all fasta sequences to each other with BLAT. """ files = bin_by_base_and_length(fasta, 20, fasta) index = 0 psls = [] cmds = [] # bin-to-bin alignments (both inter-bin and intra-bin) for a,b in itertools.combinations_with_replacement(files.keys(), 2): lower = min(a, b) upper = max(a, b) #print '%d => %d' % (lower, upper) psl = outputpsl + '.' + str(index) cmd_params = ['blat', '-noHead', '-t=dna', '-q=dna', '-out=psl'] minscore = int(math.ceil(percent_id * lower)) cmd_params.extend(['-maxGap=%d' % max_consecutive_edits, '-maxIntron=%d' % max_consecutive_edits, '-minScore=%d' % minscore, files[upper], files[lower]]) if not skip_psl_self_awk is None and os.path.isfile(skip_psl_self_awk): cmd_params.append('>(%s > %s) >&2' % (skip_psl_self_awk, psl)) else: cmd_params.append(psl) #endif #run_shell_cmd(' '.join(cmd_params)) cmds.append(' '.join(cmd_params)) psls.append(psl) index += 1 #endfor run_multi_shell_cmds(cmds, max_parallel=threads) # remove the tmp fasta files for f in files.values(): if os.path.isfile(f): os.remove(f) #endif #endfor # concatenate PSLs num_psls = len(psls) assert num_psls > 0 if num_psls == 1: shutil.move(psls[0], outputpsl) else: with open(outputpsl, 'w') as fout: for p in psls: with open(p, 'r') as fin: for line in fin: if len(line) > 0: fout.write(line) #endif #endfor #endwith # remove the tmp PSL os.remove(p) #endfor #endwith #endif #enddef def bin_by_base_and_length(fasta, bins, out_prefix): lengths = [] seqs = [] currentseq = None with open(fasta, 'r') as fin: for line in fin: if line[0] == '>': if currentseq is not None: assert currentseq.header is not None and currentseq.seq is not None # The current fasta sequence is now complete lengths.append(len(currentseq.seq)) #endif currentseq = FastaSeq(header=line.rstrip()) seqs.append(currentseq) else: assert currentseq is not None if currentseq.seq is not None: # concatenate the sequence lines currentseq.seq += line.rstrip() else: currentseq.seq = line.rstrip() #endif #endif #endfor # The final fasta seq if currentseq is not None: assert currentseq.header is not None and currentseq.seq is not None # The current fasta sequence is now complete lengths.append(len(currentseq.seq)) #endif #endwith lengths.sort() expected_bin_size = sum(lengths)/bins intervals = [] curr_bin_size = 0 lower = lengths[0] upper = lower for l in lengths: if curr_bin_size < expected_bin_size or l == upper: curr_bin_size += l upper = l else: intervals.append((lower, upper+1)) curr_bin_size = l lower = l upper = lower #endif #endfor intervals.append((lower, upper+1)) files = {} fhs = [] for i in range(len(intervals)): outpath = out_prefix + '.' + str(i) lower = intervals[i][0] #print str(intervals[i]) + ' : ' + outpath files[lower] = outpath fhs.append(open(outpath, 'w')) #endfor # [lower, upper) for s in seqs: length = len(s.seq) for index, (lower, upper) in enumerate(intervals): if length >= lower and length < upper: fhs[index].write('%s\n%s\n' % (s.header, s.seq)) break #endif #endfor #endfor for fh in fhs: fh.close() #endfor return files #enddef def bin_by_length(fasta, bins, out_prefix): lengths = [] seqs = [] currentseq = None with open(fasta, 'r') as fin: for line in fin: if line[0] == '>': if currentseq is not None: assert currentseq.header is not None and currentseq.seq is not None # The current fasta sequence is now complete lengths.append(len(currentseq.seq)) #endif currentseq = FastaSeq(header=line.rstrip()) seqs.append(currentseq) else: assert currentseq is not None if currentseq.seq is not None: # concatenate the sequence lines currentseq.seq += line.rstrip() else: currentseq.seq = line.rstrip() #endif #endif #endfor # The final fasta seq if currentseq is not None: assert currentseq.header is not None and currentseq.seq is not None # The current fasta sequence is now complete lengths.append(len(currentseq.seq)) #endif #endwith lengths.sort() count = len(lengths) expected_bin_size = count/bins intervals = [] lower = lengths[0] curr_bin_size = 0 upper = lower for l in lengths: if curr_bin_size < expected_bin_size or l == upper: curr_bin_size += 1 upper = l else: intervals.append((lower, upper+1)) lower = l upper = lower curr_bin_size = 1 #endif #endfor intervals.append((lower, upper+1)) files = {} fhs = [] for i in range(len(intervals)): outpath = out_prefix + '.' + str(i) lower = intervals[i][0] print(str(intervals[i]) + ' : ' + outpath) files[lower] = outpath fhs.append(open(outpath, 'w')) #endfor # [lower, upper) for s in seqs: length = len(s.seq) for index, (lower, upper) in enumerate(intervals): if length >= lower and length < upper: fhs[index].write('%s\n%s\n' % (s.header, s.seq)) break #endif #endfor #endfor for fh in fhs: fh.close() #endfor return files #enddef def blat_merge_fastas(path_prefix_map, merged_fa, concat_fa=None, concat_fa_selfalign_psl=None, percent_identity=0.95, strand_specific=False, cleanup=False, minoverlap=0, threads=1, indel_size_tolerance=1, min_seq_len=32, skip_psl_self_awk=None): """Merge fasta files into a single fasta file by removing redundant sequences. Redundancy is determined by BLAT alignments. """ if concat_fa is None: concat_fa = merged_fa + '.tmp.concat.fa' #endif if concat_fa_selfalign_psl is None: concat_fa_selfalign_psl = merged_fa + '.tmp.concat.psl' #endif # Concatenate the fastas together and give the contigs of each set a prefix concat_fastas(path_prefix_map, concat_fa) # Self-align concatenated fasta blat_self_align(concat_fa, concat_fa_selfalign_psl, percent_id=percent_identity, max_consecutive_edits=indel_size_tolerance, min_seq_len=min_seq_len, threads=threads, skip_psl_self_awk=skip_psl_self_awk) # Identify redundant contigs rrefs = psl_cid_extractor.extract_cids(psl=concat_fa_selfalign_psl, samestrand=strand_specific, min_percent_identity=percent_identity, max_consecutive_edits=indel_size_tolerance, report_redundant=True) tmpfiles = [] nr_fa_long = merged_fa + '.tmp.long.fa' nr_fa_short = None if minoverlap > 0: nr_fa_short = merged_fa + '.tmp.short.fa' tmpfiles.append(nr_fa_short) #endif # Gather the non-contained sequences and split into 2 partitions: # 1. shorter than (min overlap + 1) # 2. longer than or equal to (min overlap + 1) filter_fasta(concat_fa, nr_fa_long, min_length=minoverlap+1, remove_set=rrefs, fasta_out_st=nr_fa_short) # overlap-layout the long sequences if minoverlap > 0: # generate the sequence overlap graph overlap_dot = merged_fa + '.tmp.long.dot' overlap_cmd_params = ['abyss-overlap', '--threads=%d' % threads, '--min=%d' % minoverlap] if strand_specific: overlap_cmd_params.append('--SS') #endif overlap_cmd_params.append(nr_fa_long) overlap_cmd_params.append('>' + overlap_dot) run_shell_cmd(' '.join(overlap_cmd_params)) # layout contigs using the overlap graph layout_path = merged_fa + '.tmp.long.path' layout_cmd_params = ['abyss-layout', '--kmer=%d' % (minoverlap+1), '--out=%s' % layout_path] if strand_specific: layout_cmd_params.append('--SS') #endif layout_cmd_params.append(overlap_dot) run_shell_cmd(' '.join(layout_cmd_params)) # generate fasta for O-L overlap_fa = merged_fa + '.incomplete' mergecontigs_cmd_params = ['MergeContigs', '--kmer=%d' % (minoverlap+1), '--out=%s' % overlap_fa, nr_fa_long, overlap_dot, layout_path] run_shell_cmd(' '.join(mergecontigs_cmd_params)) # append the short sequences to the same fasta with open(overlap_fa, 'a') as fout: with open(nr_fa_short, 'r') as fin: for line in fin: fout.write(line) #endfor #endwith #endwith shutil.move(overlap_fa, merged_fa) tmpfiles.extend([concat_fa, concat_fa_selfalign_psl, nr_fa_long, nr_fa_short, overlap_dot, layout_path]) else: shutil.move(nr_fa_long, merged_fa) tmpfiles.extend([concat_fa, concat_fa_selfalign_psl]) #endif if cleanup and tmpfiles is not None: for t in tmpfiles: if t is not None and os.path.isfile(t): os.remove(t) #endif #endfor #endif #enddef def bowtie2_self_align(fasta, outputsam, threads=1, strand_specific=False, path_strip_sam_seq_qual=None, preset='--sensitive', k=2): """Align all fasta sequences to each other with Bowtie2. """ # Build index files for the concatenated fasta bt2_index_cmd_params = ['bowtie2-build --quiet', fasta, fasta] run_shell_cmd(' '.join(bt2_index_cmd_params)) # Self-align concatenated fasta with Bowtie2 bt2_align_cmd_params = ['bowtie2'] if strand_specific: bt2_align_cmd_params.append('--norc') #endif bt2_align_cmd_params.extend([preset, '-k %d' % k, '--omit-sec-seq --end-to-end -f', '-p %d' % threads, fasta, fasta]) if path_strip_sam_seq_qual: bt2_align_cmd_params.append('|' + path_strip_sam_seq_qual) #endif bt2_align_cmd_params.append('|gzip -c >' + outputsam) run_shell_cmd(' '.join(bt2_align_cmd_params)) #enddef def bowtie2_merge_fastas(path_prefix_map, merged_fa, concat_fa=None, concat_fa_selfalign_sam=None, bt2_threads=1, percent_identity=0.95, strand_specific=False, path_strip_sam_seq_qual=None, cleanup=False, bowtie2_preset='--sensitive', bowtie2_k=2, threads=1, indel_size_tolerance=1): """Merge fasta files into a single fasta file by removing redundant sequences. Redundancy is determined by Bowtie2 alignments. """ tmpfiles = [] if concat_fa is None: concat_fa = merged_fa + '.tmp.concat.fa' #endif if concat_fa_selfalign_sam is None: concat_fa_selfalign_sam = concat_fa + '.selfalign.sam.gz' #endif # Concatenate the fastas together and give the contigs of each set a prefix concat_fastas(path_prefix_map, concat_fa) # Self-align concatenated fasta with Bowtie2 bowtie2_self_align(concat_fa, concat_fa_selfalign_sam, threads=bt2_threads, strand_specific=strand_specific, path_strip_sam_seq_qual=path_strip_sam_seq_qual, preset=bowtie2_preset) # Identify NON-redundant contigs nrrefs = cid_extractor.extract_nr(sam=concat_fa_selfalign_sam, require_samestrand=strand_specific, min_percent_identity=percent_identity, max_consecutive_edits=indel_size_tolerance, report_redundant=False) nr_fa = merged_fa + '.tmp.nr.fa' # Gather the non-contained sequences filter_fasta(concat_fa, nr_fa, keep_set=nrrefs) shutil.move(nr_fa, merged_fa) tmpfiles.extend([concat_fa, concat_fa_selfalign_sam]) if cleanup and tmpfiles is not None: for t in tmpfiles: if t is not None and os.path.isfile(t): os.remove(t) #endif #endfor #endif #enddef def abyssmap_rmdups(in_fa, out_fa, strand_specific=False, cleanup=False, threads=1): ids_file = in_fa + '.dup_ids' # run abyssmap cmd_params = ['abyss-map', '--dup'] if strand_specific: cmd_params.append('--SS') #endif if threads > 1: cmd_params.append('--threads=%d' % threads) #endif cmd_params.extend([in_fa, in_fa]) cmd_params.append('> %s' % ids_file) run_shell_cmd(' '.join(cmd_params)) cids_set = set() with open(ids_file, 'r') as fh: for line in fh: line_stripped = line.strip() if len(line_stripped) > 0: cids_set.add(line_stripped) #endif #endfor #endwith filter_fasta(in_fa, out_fa, remove_set=cids_set) if cleanup: os.remove(ids_file) #endif #enddef def append_with_prefix(core_fa, additional_fa, prefix=''): """Append sequences from additional_fa to core_fa and give each a prefix """ with open(core_fa, 'a') as fout: with open(additional_fa, 'r') as fin: line_start = '>' + prefix for line in fin: if line[0] == '>': #header fout.write(line.replace('>', line_start, 1)) else: fout.write(line) #endif #endfor #endwith #endwith #enddef def abyssmap_merge_fastas(path_prefix_map, merged_fa, concat_fa=None, strand_specific=False, cleanup=False, threads=1, iterative=False): """Merge fasta files into a single fasta file by removing redundant sequences. Only completely contained sequences with exact match are removed. """ tmpfiles = [] if concat_fa is None: concat_fa = merged_fa + '.tmp.concat.fa' #endif tmpfiles.append(concat_fa) if iterative: tmp_merged_fa = merged_fa + '.tmp.fa' # get the file size for each fasta prefix_mem_tuples = [] for prefix, path in path_prefix_map.iteritems(): prefix_mem_tuples.append( (prefix, os.path.getsize(path)) ) #endfor # Sort by file size in ascending order prefix_mem_tuples.sort(key=operator.itemgetter(1)) # Add sequences from one additional fasta file to the merge pool in each iteration iteration = 0 for prefix, filesize in prefix_mem_tuples: if os.path.isfile(concat_fa): os.remove(concat_fa) #endif if iteration > 0: shutil.move(tmp_merged_fa, concat_fa) #endif append_with_prefix(concat_fa, path_prefix_map[prefix], prefix=prefix) abyssmap_rmdups(concat_fa, tmp_merged_fa, strand_specific=strand_specific, cleanup=cleanup, threads=threads) iteration += 1 #endfor shutil.move(tmp_merged_fa, merged_fa) tmpfiles.append(tmp_merged_fa) else: # Concatenate all fastas together and give the contigs of each set a prefix concat_fastas(path_prefix_map, concat_fa) # remove duplicates abyssmap_rmdups(concat_fa, merged_fa, strand_specific=strand_specific, cleanup=cleanup, threads=threads) #endif if cleanup and tmpfiles is not None: for t in tmpfiles: if t is not None and os.path.isfile(t): os.remove(t) #endif #endfor #endif #enddef class FastaSeq: """Data structure for one FASTA sequence. """ def __init__(self, header=None, seq=None): self.header = header self.seq = seq #enddef #endclass def filter_fasta(fasta_in, fasta_out, min_length=0, keep_set=None, fasta_out_st=None, remove_set=None): """Filter a FASTA file by selecting contigs by a length cutoff and/or a set of ids of contigs to keep/remove. """ currentseq = None fin = open(fasta_in, 'r') fout = open(fasta_out, 'w') fout_st = None if fasta_out_st is not None: # FASTA file for short sequences fout_st = open(fasta_out_st, 'w') #endif count = 0 count_st = 0 for line in fin: if line[0] == '>': if currentseq is not None: assert currentseq.header is not None and currentseq.seq is not None # The current fasta sequence is now complete if (keep_set is None or currentseq.header.split()[0][1:] in keep_set) and (remove_set is None or not currentseq.header.split()[0][1:] in remove_set): current_seq_length = len(currentseq.seq) if min_length is None or current_seq_length >= min_length: fout.write('%s\n%s\n' % (currentseq.header, currentseq.seq)) count += 1 elif fout_st is not None: fout_st.write('%s\n%s\n' % (currentseq.header, currentseq.seq)) count_st += 1 #endif #endif #endif currentseq = FastaSeq(header=line.rstrip()) else: assert currentseq is not None if currentseq.seq is not None: # concatenate the sequence lines currentseq.seq += line.rstrip() else: currentseq.seq = line.rstrip() #endif #endif #endfor # The final fasta seq if currentseq is not None: assert currentseq.header is not None and currentseq.seq is not None # The current fasta sequence is now complete if (keep_set is None or currentseq.header.split()[0][1:] in keep_set) and (remove_set is None or not currentseq.header.split()[0][1:] in remove_set): current_seq_length = len(currentseq.seq) if min_length is None or current_seq_length >= min_length: fout.write('%s\n%s\n' % (currentseq.header, currentseq.seq)) count += 1 elif fout_st is not None: fout_st.write('%s\n%s\n' % (currentseq.header, currentseq.seq)) count_st += 1 #endif #endif #endif fin.close() fout.close() if fout_st is not None: fout_st.close() #endif return count, count_st #enddef def partition_fasta(fasta_file, outdir, format='seq.%s.fa', max_bases=None, max_contigs=None): """Partition a FASTA into smaller chunks based on number of bases and/or number of contigs. """ path_format = os.path.join(outdir, format) for fa in glob.glob(path_format % '*'): #This could be a re-run; remove the existing output file. os.remove(fa) #endfor total_num_contigs = 0 total_num_bases = 0 currentseq = None output_num = 1 num_contigs = 0 num_bases = 0 fin = open(fasta_file, 'r') fout = open(path_format % str(output_num), 'w') for line in fin: if line[0] == '>': if currentseq is not None: assert currentseq.header is not None and currentseq.seq is not None # The current fasta sequence is now complete # write this fasta seq to file current_seq_length = len(currentseq.seq) if (max_contigs is not None and num_contigs + 1 > max_contigs) or (max_bases is not None and num_bases + current_seq_length > max_bases): # adding the current sequence to this batch would exceed the thresholds # this batch is done total_num_contigs += num_contigs total_num_bases += num_bases fout.close() # start the next batch output_num += 1 fout = open(path_format % str(output_num), 'w') num_contigs = 0 num_bases = 0 #endif fout.write('%s\n%s\n' % (currentseq.header, currentseq.seq)) currentseq = FastaSeq(header=line.rstrip()) num_contigs += 1 num_bases += current_seq_length else: currentseq = FastaSeq(header=line.rstrip()) #endif else: assert currentseq is not None if currentseq.seq is not None: # concatenate the sequence lines currentseq.seq += line.rstrip() else: currentseq.seq = line.rstrip() #endif #endif #endfor fin.close() # Write the final fasta seq to file current_seq_length = len(currentseq.seq) if (max_contigs is not None and num_contigs + 1 > max_contigs) or (max_bases is not None and num_bases + current_seq_length > max_bases): # adding the last sequence to this batch would exceed the thresholds # this batch is done total_num_contigs += num_contigs total_num_bases += num_bases fout.close() # start the next batch output_num += 1 fout = open(path_format % str(output_num), 'w') num_contigs = 0 num_bases = 0 #endif fout.write('%s\n%s\n' % (currentseq.header, currentseq.seq)) num_contigs += 1 num_bases += current_seq_length total_num_contigs += num_contigs total_num_bases += num_bases fout.close() return total_num_contigs, total_num_bases, output_num #enddef #EOF