############################################################################ # Copyright (c) 2011-2014 Saint-Petersburg Academic University # All Rights Reserved # See file LICENSE for details. ############################################################################ # # Some comments in this script was kindly provided by Plantagora team and # modified by QUAST team. It is not licensed under GPL as other parts of QUAST, # but it can be distributed and used in QUAST pipeline with current remarks and # citation. For more details about assess_assembly.pl please refer to # http://www.plantagora.org website and to the following paper: # # Barthelson R, McFarlin AJ, Rounsley SD, Young S (2011) Plantagora: Modeling # Whole Genome Sequencing and Assembly of Plant Genomes. PLoS ONE 6(12): # e28436. doi:10.1371/journal.pone.0028436 ############################################################################ from __future__ import with_statement import os import platform import datetime import fastaparser import shutil from libs import reporting, qconfig, qutils from libs.log import get_logger logger = get_logger(qconfig.LOGGER_DEFAULT_NAME) required_binaries = ['nucmer', 'delta-filter', 'show-coords', 'show-snps', 'mummer', 'mgaps'] if platform.system() == 'Darwin': mummer_dirpath = os.path.join(qconfig.LIBS_LOCATION, 'MUMmer3.23-osx') else: mummer_dirpath = os.path.join(qconfig.LIBS_LOCATION, 'MUMmer3.23-linux') def bin_fpath(fname): return os.path.join(mummer_dirpath, fname) class Misassembly: LOCAL = 0 RELOCATION = 1 TRANSLOCATION = 2 INVERSION = 3 class Mapping(object): def __init__(self, s1, e1, s2, e2, len1, len2, idy, ref, contig): self.s1, self.e1, self.s2, self.e2, self.len1, self.len2, self.idy, self.ref, self.contig = s1, e1, s2, e2, len1, len2, idy, ref, contig @classmethod def from_line(cls, line): # line from coords file,e.g. # 4324128 4496883 | 112426 285180 | 172755 172756 | 99.9900 | gi|48994873|gb|U00096.2| NODE_333_length_285180_cov_221082 line = line.split() assert line[2] == line[5] == line[8] == line[10] == '|', line contig = line[12] ref = line[11] s1, e1, s2, e2, len1, len2 = [int(line[i]) for i in [0, 1, 3, 4, 6, 7]] idy = float(line[9]) return Mapping(s1, e1, s2, e2, len1, len2, idy, ref, contig) def __str__(self): return ' '.join(str(x) for x in [self.s1, self.e1, '|', self.s2, self.e2, '|', self.len1, self.len2, '|', self.idy, '|', self.ref, self.contig]) def clone(self): return Mapping.from_line(str(self)) class Mappings(object): def __init__(self): self.aligns = {} # contig -> [mapping] self.cnt = 0 def add(self, mapping): self.aligns.setdefault(mapping.contig, []).append(mapping) @classmethod def from_coords(cls, fpath): f = open(fpath, 'w') f.close() def clear_files(fpath, nucmer_fpath): if qconfig.debug: return # delete temporary files for ext in ['.delta', '.coords_tmp', '.coords.headless']: if os.path.isfile(nucmer_fpath + ext): os.remove(nucmer_fpath + ext) if os.path.isfile('nucmer.error'): os.remove('nucmer.error') if os.path.isfile(fpath + '.clean'): os.remove(fpath + '.clean') class NucmerStatus: FAILED = 0 OK = 1 NOT_ALIGNED = 2 def run_nucmer(prefix, ref_fpath, contigs_fpath, log_out_fpath, log_err_fpath, index, planta_err_f): # additional GAGE params of Nucmer: '-l', '30', '-banded' return_code = qutils.call_subprocess( [bin_fpath('nucmer'), '-c', str(qconfig.mincluster), '-l', str(qconfig.mincluster), '--maxmatch', '-p', prefix, ref_fpath, contigs_fpath], stdout=open(log_out_fpath, 'a'), stderr=open(log_err_fpath, 'a'), indent=' ' + qutils.index_to_str(index)) if return_code != 0: print >> planta_err_f, qutils.index_to_str(index) + 'Nucmer failed for', contigs_fpath, '\n' return return_code def __fail(contigs_fpath, index): logger.error(' ' + qutils.index_to_str(index) + 'Failed aligning contigs ' + qutils.label_from_fpath(contigs_fpath) + ' to the reference. ' + ('Run with the --debug flag to see additional information.' if not qconfig.debug else '')) return NucmerStatus.FAILED, {}, [] def create_nucmer_successful_check(fpath, contigs_fpath, ref_fpath): nucmer_successful_check_file = open(fpath, 'w') nucmer_successful_check_file.write("Assembly file size in bytes: %d\n" % os.path.getsize(contigs_fpath)) nucmer_successful_check_file.write("Reference file size in bytes: %d\n" % os.path.getsize(ref_fpath)) nucmer_successful_check_file.write("Successfully finished on " + datetime.datetime.now().strftime('%Y/%m/%d %H:%M:%S') + '\n') nucmer_successful_check_file.close() def check_nucmer_successful_check(fpath, contigs_fpath, ref_fpath): successful_check_content = open(fpath).read().split('\n') if len(successful_check_content) < 2: return False if not successful_check_content[0].strip().endswith(str(os.path.getsize(contigs_fpath))): return False if not successful_check_content[1].strip().endswith(str(os.path.getsize(ref_fpath))): return False return True def plantakolya(cyclic, index, contigs_fpath, nucmer_fpath, output_dirpath, ref_fpath): assembly_name = qutils.name_from_fpath(contigs_fpath) assembly_label = qutils.label_from_fpath(contigs_fpath) logger.info(' ' + qutils.index_to_str(index) + assembly_label) # run plantakolya tool log_out_fpath = os.path.join(output_dirpath, "contigs_report_" + assembly_name + '.stdout') log_err_fpath = os.path.join(output_dirpath, "contigs_report_" + assembly_name + '.stderr') planta_out_f = open(log_out_fpath, 'w') planta_err_f = open(log_err_fpath, 'w') logger.info(' ' + qutils.index_to_str(index) + 'Logging to files ' + log_out_fpath + ' and ' + os.path.basename(log_err_fpath) + '...') maxun = 10 epsilon = 0.99 smgap = 1000 umt = 0.5 # threshold for misassembled contigs with aligned less than $umt * 100% (Unaligned Missassembled Threshold) ort = 0.9 # threshold for skipping aligns that significantly overlaps with adjacent aligns (Overlap Relative Threshold) oat = 25 # threshold for skipping aligns that significantly overlaps with adjacent aligns (Overlap Absolute Threshold) nucmer_successful_check_fpath = nucmer_fpath + '.sf' coords_fpath = nucmer_fpath + '.coords' delta_fpath = nucmer_fpath + '.delta' filtered_delta_fpath = nucmer_fpath + '.fdelta' coords_filtered_fpath = nucmer_fpath + '.coords.filtered' unaligned_fpath = nucmer_fpath + '.unaligned' show_snps_fpath = nucmer_fpath + '.all_snps' used_snps_fpath = nucmer_fpath + '.used_snps' print >> planta_out_f, 'Aligning contigs to reference...' # Checking if there are existing previous nucmer alignments. # If they exist, using them to save time. using_existing_alignments = False if os.path.isfile(nucmer_successful_check_fpath) and\ os.path.isfile(coords_fpath) and\ os.path.isfile(show_snps_fpath): if check_nucmer_successful_check(nucmer_successful_check_fpath, contigs_fpath, ref_fpath): print >> planta_out_f, '\tUsing existing Nucmer alignments...' logger.info(' ' + qutils.index_to_str(index) + 'Using existing Nucmer alignments... ') using_existing_alignments = True if not using_existing_alignments: print >> planta_out_f, '\tRunning Nucmer' logger.info(' ' + qutils.index_to_str(index) + 'Running Nucmer') nucmer_failed = False if not qconfig.splitted_ref: nucmer_exit_code = run_nucmer(nucmer_fpath, ref_fpath, contigs_fpath, log_out_fpath, log_err_fpath, index, planta_err_f) if nucmer_exit_code != 0: return __fail(contigs_fpath, index) else: prefixes_and_chr_files = [(nucmer_fpath + "_" + os.path.basename(chr_fname), chr_fname) for chr_fname in qconfig.splitted_ref] # Daemonic processes are not allowed to have children, # so if we are already one of parallel processes # (i.e. daemonic) we can't start new daemonic processes if qconfig.assemblies_num == 1: n_jobs = min(qconfig.max_threads, len(prefixes_and_chr_files)) else: n_jobs = 1 if n_jobs > 1: logger.info(' ' + 'Aligning to different chromosomes in parallel' ' (' + str(n_jobs) + ' threads)') # processing each chromosome separately (if we can) from joblib import Parallel, delayed nucmer_exit_codes = Parallel(n_jobs=n_jobs)(delayed(run_nucmer)( prefix, chr_file, contigs_fpath, log_out_fpath, log_err_fpath, index, planta_err_f) for (prefix, chr_file) in prefixes_and_chr_files) if 0 not in nucmer_exit_codes: return __fail(contigs_fpath, index) else: # filling common delta file delta_file = open(delta_fpath, 'w') delta_file.write(ref_fpath + " " + contigs_fpath + "\n") delta_file.write("NUCMER\n") for i, (prefix, chr_fname) in enumerate(prefixes_and_chr_files): if nucmer_exit_codes[i] != 0: continue chr_delta_fpath = prefix + '.delta' if os.path.isfile(chr_delta_fpath): chr_delta_file = open(chr_delta_fpath) chr_delta_file.readline() chr_delta_file.readline() for line in chr_delta_file: delta_file.write(line) chr_delta_file.close() delta_file.close() # Filtering by IDY% = 95 (as GAGE did) return_code = qutils.call_subprocess( [bin_fpath('delta-filter'), '-i', '95', delta_fpath], stdout=open(filtered_delta_fpath, 'w'), stderr=planta_err_f, indent=' ' + qutils.index_to_str(index)) if return_code != 0: print >> planta_err_f, qutils.index_to_str(index) + 'Delta filter failed for', contigs_fpath, '\n' return __fail(contigs_fpath, index) shutil.move(filtered_delta_fpath, delta_fpath) tmp_coords_fpath = coords_fpath + '_tmp' return_code = qutils.call_subprocess( [bin_fpath('show-coords'), delta_fpath], stdout=open(tmp_coords_fpath, 'w'), stderr=planta_err_f, indent=' ' + qutils.index_to_str(index)) if return_code != 0: print >> planta_err_f, qutils.index_to_str(index) + 'Show-coords failed for', contigs_fpath, '\n' return __fail(contigs_fpath, index) # removing waste lines from coords file coords_file = open(coords_fpath, 'w') header = [] tmp_coords_file = open(tmp_coords_fpath) for line in tmp_coords_file: header.append(line) if line.startswith('====='): break coords_file.write(header[-2]) coords_file.write(header[-1]) for line in tmp_coords_file: coords_file.write(line) coords_file.close() tmp_coords_file.close() if not os.path.isfile(coords_fpath): print >> planta_err_f, qutils.index_to_str(index) + 'Nucmer failed for', contigs_fpath + ':', coords_fpath, 'doesn\'t exist.' logger.info(' ' + qutils.index_to_str(index) + 'Nucmer failed for ' + '\'' + assembly_name + '\'.') return NucmerStatus.FAILED, {}, [] if len(open(coords_fpath).readlines()[-1].split()) < 13: print >> planta_err_f, qutils.index_to_str(index) + 'Nucmer: nothing aligned for', contigs_fpath logger.info(' ' + qutils.index_to_str(index) + 'Nucmer: nothing aligned for ' + '\'' + assembly_name + '\'.') return NucmerStatus.NOT_ALIGNED, {}, [] with open(coords_fpath) as coords_file: headless_coords_fpath = coords_fpath + '.headless' headless_coords_f = open(headless_coords_fpath, 'w') coords_file.readline() coords_file.readline() headless_coords_f.write(coords_file.read()) headless_coords_f.close() headless_coords_f = open(headless_coords_fpath) return_code = qutils.call_subprocess( [bin_fpath('show-snps'), '-S', '-T', '-H', delta_fpath], stdin=headless_coords_f, stdout=open(show_snps_fpath, 'w'), stderr=planta_err_f, indent=' ' + qutils.index_to_str(index)) if return_code != 0: print >> planta_err_f, qutils.index_to_str(index) + 'Show-snps failed for', contigs_fpath, '\n' return __fail(contigs_fpath, index) create_nucmer_successful_check(nucmer_successful_check_fpath, contigs_fpath, ref_fpath) # Loading the alignment files print >> planta_out_f, 'Parsing coords...' aligns = {} coords_file = open(coords_fpath) coords_filtered_file = open(coords_filtered_fpath, 'w') coords_filtered_file.write(coords_file.readline()) coords_filtered_file.write(coords_file.readline()) sum_idy = 0.0 num_idy = 0 for line in coords_file: if line.strip() == '': break assert line[0] != '=' #Clear leading spaces from nucmer output #Store nucmer lines in an array mapping = Mapping.from_line(line) sum_idy += mapping.idy num_idy += 1 aligns.setdefault(mapping.contig, []).append(mapping) avg_idy = sum_idy / num_idy if num_idy else 0 #### auxiliary functions #### def distance_between_alignments(align1, align2, pos_strand, cyclic_ref_len=None): ''' returns distance (in reference) between two alignments ''' if pos_strand: # alignment 1 should be earlier in reference distance = align2.s1 - align1.e1 - 1 else: # alignment 2 should be earlier in reference distance = align1.s1 - align2.e1 - 1 cyclic_moment = False if cyclic_ref_len is not None: if align1.e1 < align2.e1 and (cyclic_ref_len - align2.e1 + align1.s1 - 1) < smgap: distance += cyclic_ref_len * (-1 if pos_strand else 1) cyclic_moment = True elif align1.e1 >= align2.e1 and (cyclic_ref_len - align1.e1 + align2.s1 - 1) < smgap: distance += cyclic_ref_len * (1 if pos_strand else -1) cyclic_moment = True return distance, cyclic_moment def is_misassembly(align1, align2, cyclic_ref_lens=None): #Calculate inconsistency between distances on the reference and on the contig distance_on_contig = min(align2.e2, align2.s2) - max(align1.e2, align1.s2) - 1 if cyclic_ref_lens is not None and align1.ref == align2.ref: distance_on_reference, cyclic_moment = distance_between_alignments(align1, align2, align1.s2 < align1.e2, cyclic_ref_lens[align1.ref]) else: distance_on_reference, cyclic_moment = distance_between_alignments(align1, align2, align1.s2 < align1.e2) misassembly_internal_overlap = 0 if distance_on_contig < 0: if distance_on_reference >= 0: misassembly_internal_overlap = (-distance_on_contig) elif (-distance_on_reference) < (-distance_on_contig): misassembly_internal_overlap = (distance_on_reference - distance_on_contig) strand1 = (align1.s2 < align1.e2) strand2 = (align2.s2 < align2.e2) inconsistency = distance_on_reference - distance_on_contig aux_data = {"inconsistency": inconsistency, "distance_on_contig": distance_on_contig, "misassembly_internal_overlap": misassembly_internal_overlap, "cyclic_moment": cyclic_moment} # different chromosomes or large inconsistency (a gap or an overlap) or different strands if align1.ref != align2.ref or abs(inconsistency) > smgap or (strand1 != strand2): return True, aux_data else: return False, aux_data def process_misassembled_contig(sorted_aligns, cyclic, aligned_lengths, region_misassemblies, reg_lens, ref_aligns, ref_features): misassembly_internal_overlap = 0 prev = sorted_aligns[0].clone() cur_aligned_length = prev.len2 is_misassembled = False for i in range(len(sorted_aligns) - 1): print >> planta_out_f, '\t\t\tReal Alignment %d: %s' % (i+1, str(sorted_aligns[i])) is_extensive_misassembly, aux_data = is_misassembly(sorted_aligns[i], sorted_aligns[i+1], reg_lens if cyclic else None) inconsistency = aux_data["inconsistency"] distance_on_contig = aux_data["distance_on_contig"] misassembly_internal_overlap += aux_data["misassembly_internal_overlap"] cyclic_moment = aux_data["cyclic_moment"] ref_aligns.setdefault(sorted_aligns[i].ref, []).append(sorted_aligns[i]) print >> coords_filtered_file, str(prev) if is_extensive_misassembly: is_misassembled = True aligned_lengths.append(cur_aligned_length) cur_aligned_length = 0 print >> planta_out_f, '\t\t\t Extensive misassembly (', if sorted_aligns[i].ref != sorted_aligns[i+1].ref: region_misassemblies.append(Misassembly.TRANSLOCATION) print >> planta_out_f, 'translocation', elif abs(inconsistency) > smgap: region_misassemblies.append(Misassembly.RELOCATION) print >> planta_out_f, 'relocation, inconsistency =', inconsistency, else: #if strand1 != strand2: region_misassemblies.append(Misassembly.INVERSION) print >> planta_out_f, 'inversion', print >> planta_out_f, ') between these two alignments' ref_features.setdefault(sorted_aligns[i].ref, {})[sorted_aligns[i].e1] = 'M' ref_features.setdefault(sorted_aligns[i+1].ref, {})[sorted_aligns[i+1].e1] = 'M' else: if cyclic_moment and inconsistency == 0: print >> planta_out_f, '\t\t\t Fake misassembly (caused by linear representation of circular genome) between these two alignments' else: if qconfig.strict_NA: aligned_lengths.append(cur_aligned_length) cur_aligned_length = 0 if inconsistency < 0: #There is an overlap between the two alignments, a local misassembly print >> planta_out_f, '\t\t\t Overlap between these two alignments (local misassembly).', else: #There is a small gap between the two alignments, a local misassembly print >> planta_out_f, '\t\t\t Gap between these two alignments (local misassembly).', #print >> plantafile_out, 'Distance on contig =', distance_on_contig, ', distance on reference =', distance_on_reference print >> planta_out_f, 'Inconsistency =', inconsistency, "(linear representation of circular genome)" if cyclic_moment else "" region_misassemblies.append(Misassembly.LOCAL) prev = sorted_aligns[i+1].clone() cur_aligned_length += prev.len2 - (-distance_on_contig if distance_on_contig < 0 else 0) #Record the very last alignment i = len(sorted_aligns) - 1 print >> planta_out_f, '\t\t\tReal Alignment %d: %s' % (i + 1, str(sorted_aligns[i])) ref_aligns.setdefault(sorted_aligns[i].ref, []).append(sorted_aligns[i]) print >> coords_filtered_file, str(prev) aligned_lengths.append(cur_aligned_length) return is_misassembled, misassembly_internal_overlap #### end of aux. functions ### # Loading the assembly contigs print >> planta_out_f, 'Loading Assembly...' assembly = {} assembly_ns = {} for name, seq in fastaparser.read_fasta(contigs_fpath): assembly[name] = seq if 'N' in seq: assembly_ns[name] = [pos for pos in xrange(len(seq)) if seq[pos] == 'N'] # Loading the reference sequences print >> planta_out_f, 'Loading reference...' # TODO: move up references = {} ref_aligns = {} ref_features = {} for name, seq in fastaparser.read_fasta(ref_fpath): name = name.split()[0] # no spaces in reference header references[name] = seq print >> planta_out_f, '\tLoaded [%s]' % name #Loading the SNP calls print >> planta_out_f, 'Loading SNPs...' class SNP(): def __init__(self, ref=None, ctg=None, ref_pos=None, ctg_pos=None, ref_nucl=None, ctg_nucl=None): self.ref = ref self.ctg = ctg self.ref_pos = ref_pos self.ctg_pos = ctg_pos self.ref_nucl = ref_nucl self.ctg_nucl = ctg_nucl self.type = 'I' if self.ref_nucl == '.' else ('D' if ctg_nucl == '.' else 'S') snps = {} prev_line = None for line in open(show_snps_fpath): #print "$line"; line = line.split() if not line[0].isdigit(): continue if prev_line and line == prev_line: continue ref = line[10] ctg = line[11] pos = int(line[0]) # Kolya: python don't convert int<->str types automatically loc = int(line[3]) # Kolya: same as above # if (! exists $line[11]) { die "Malformed line in SNP file. Please check that show-snps has completed succesfully.\n$line\n[$line[9]][$line[10]][$line[11]]\n"; } if pos in snps.setdefault(ref, {}).setdefault(ctg, {}): snps.setdefault(ref, {}).setdefault(ctg, {})[pos].append(SNP(ref=ref, ctg=ctg, ref_pos=pos, ctg_pos=loc, ref_nucl=line[1], ctg_nucl=line[2])) else: snps.setdefault(ref, {}).setdefault(ctg, {})[pos] = [SNP(ref=ref, ctg=ctg, ref_pos=pos, ctg_pos=loc, ref_nucl=line[1], ctg_nucl=line[2])] prev_line = line used_snps_file = open(used_snps_fpath, 'w') # Loading the regions (if any) regions = {} reg_lens = {} total_reg_len = 0 total_regions = 0 print >> planta_out_f, 'Loading regions...' # TODO: gff print >> planta_out_f, '\tNo regions given, using whole reference.' for name, seq in references.iteritems(): regions.setdefault(name, []).append([1, len(seq)]) reg_lens[name] = len(seq) total_regions += 1 total_reg_len += len(seq) print >> planta_out_f, '\tTotal Regions: %d' % total_regions print >> planta_out_f, '\tTotal Region Length: %d' % total_reg_len unaligned = 0 partially_unaligned = 0 fully_unaligned_bases = 0 partially_unaligned_bases = 0 ambiguous_contigs = 0 ambiguous_contigs_extra_bases = 0 uncovered_regions = 0 uncovered_region_bases = 0 total_redundant = 0 partially_unaligned_with_misassembly = 0 partially_unaligned_with_significant_parts = 0 misassembly_internal_overlap = 0 region_misassemblies = [] misassembled_contigs = {} aligned_lengths = [] print >> planta_out_f, 'Analyzing contigs...' unaligned_file = open(unaligned_fpath, 'w') for contig, seq in assembly.iteritems(): #Recording contig stats ctg_len = len(seq) print >> planta_out_f, 'CONTIG: %s (%dbp)' % (contig, ctg_len) #Check if this contig aligned to the reference if contig in aligns: #Pull all aligns for this contig num_aligns = len(aligns[contig]) #Sort aligns by length and identity sorted_aligns = sorted(aligns[contig], key=lambda x: (x.len2 * x.idy, x.len2), reverse=True) top_len = sorted_aligns[0].len2 top_id = sorted_aligns[0].idy top_aligns = [] print >> planta_out_f, 'Top Length: %s Top ID: %s' % (top_len, top_id) #Check that top hit captures most of the contig if top_len > ctg_len * epsilon or ctg_len - top_len < maxun: #Reset top aligns: aligns that share the same value of longest and highest identity top_aligns.append(sorted_aligns[0]) sorted_aligns = sorted_aligns[1:] #Continue grabbing alignments while length and identity are identical #while sorted_aligns and top_len == sorted_aligns[0].len2 and top_id == sorted_aligns[0].idy: while sorted_aligns and ((sorted_aligns[0].len2 * sorted_aligns[0].idy) / (top_len * top_id) > epsilon): top_aligns.append(sorted_aligns[0]) sorted_aligns = sorted_aligns[1:] #Mark other alignments as ambiguous while sorted_aligns: ambig = sorted_aligns.pop() print >> planta_out_f, '\t\tMarking as insignificant: %s' % str(ambig) # former ambiguous # Kolya: removed redundant code about $ref (for gff AFAIU) if len(top_aligns) == 1: #There is only one top align, life is good print >> planta_out_f, '\t\tOne align captures most of this contig: %s' % str(top_aligns[0]) ref_aligns.setdefault(top_aligns[0].ref, []).append(top_aligns[0]) print >> coords_filtered_file, str(top_aligns[0]) aligned_lengths.append(top_aligns[0].len2) else: #There is more than one top align print >> planta_out_f, '\t\tThis contig has %d significant alignments. [An ambiguously mapped contig]' % len( top_aligns) #Increment count of ambiguously mapped contigs and bases in them ambiguous_contigs += 1 # we count only extra bases, so we shouldn't include bases in the first alignment # in case --allow-ambiguity is not set the number of extra bases will be negative! ambiguous_contigs_extra_bases -= top_aligns[0].len2 # Alex: skip all alignments or count them as normal (just different aligns of one repeat). Depend on --allow-ambiguity option if qconfig.ambiguity_usage == "none": print >> planta_out_f, '\t\tSkipping these alignments (option --ambiguity-usage is set to "none"):' for align in top_aligns: print >> planta_out_f, '\t\tSkipping alignment ', align elif qconfig.ambiguity_usage == "one": print >> planta_out_f, '\t\tUsing only first of these alignment (option --ambiguity-usage is set to "one"):' print >> planta_out_f, '\t\tAlignment: %s' % str(top_aligns[0]) ref_aligns.setdefault(top_aligns[0].ref, []).append(top_aligns[0]) aligned_lengths.append(top_aligns[0].len2) print >> coords_filtered_file, str(top_aligns[0]) top_aligns = top_aligns[1:] for align in top_aligns: print >> planta_out_f, '\t\tSkipping alignment ', align elif qconfig.ambiguity_usage == "all": print >> planta_out_f, '\t\tUsing all these alignments (option --ambiguity-usage is set to "all"):' # we count only extra bases, so we shouldn't include bases in the first alignment first_alignment = True while len(top_aligns): print >> planta_out_f, '\t\tAlignment: %s' % str(top_aligns[0]) ref_aligns.setdefault(top_aligns[0].ref, []).append(top_aligns[0]) if first_alignment: first_alignment = False aligned_lengths.append(top_aligns[0].len2) ambiguous_contigs_extra_bases += top_aligns[0].len2 print >> coords_filtered_file, str(top_aligns[0]), "ambiguous" top_aligns = top_aligns[1:] #Record these alignments as ambiguous on the reference # for align in top_aligns: # print >> plantafile_out, '\t\t\tAmbiguous Alignment: %s' % str(align) # ref = align.ref # for i in xrange(align.s1, align.e1+1): # if (ref not in ref_features) or (i not in ref_features[ref]): # ref_features.setdefault(ref, {})[i] = 'A' #Increment count of ambiguous contigs and bases #ambiguous += 1 #total_ambiguous += ctg_len else: #Sort all aligns by position on contig, then length sorted_aligns = sorted(sorted_aligns, key=lambda x: (x.len2, x.idy), reverse=True) sorted_aligns = sorted(sorted_aligns, key=lambda x: min(x.s2, x.e2)) #Push first alignment on to real aligns real_aligns = [sorted_aligns[0]] last_end = max(sorted_aligns[0].s2, sorted_aligns[0].e2) last_real = sorted_aligns[0] #Walk through alignments, if not fully contained within previous, record as real real_groups = dict() for i in xrange(1, num_aligns): cur_group = (last_end - last_real.len2 + 1, last_end) #If this alignment extends past last alignment's endpoint, add to real, else skip extension = max(sorted_aligns[i].s2, sorted_aligns[i].e2) - last_end # negative if no extension if (extension > maxun) and (float(extension) / min(sorted_aligns[i].len2, last_real.len2) > 1.0 - epsilon): # check whether previous alignment is almost contained in this extension prev_extension = min(sorted_aligns[i].s2, sorted_aligns[i].e2) - min(last_real.s2, last_real.e2) if (prev_extension <= maxun) or (float(prev_extension) / min(sorted_aligns[i].len2, last_real.len2) <= 1.0 - epsilon): if cur_group in real_groups: for align in real_groups[cur_group]: print >> planta_out_f, '\t\tSkipping redundant alignment %s' % (str(align)) del real_groups[cur_group] else: real_aligns = real_aligns[:-1] print >> planta_out_f, '\t\tSkipping redundant alignment %s' % (str(last_real)) real_aligns = real_aligns + [sorted_aligns[i]] last_end = max(sorted_aligns[i].s2, sorted_aligns[i].e2) last_real = sorted_aligns[i] else: if (sorted_aligns[i].len2 * sorted_aligns[i].idy) / (last_real.idy * last_real.len2) > epsilon: if cur_group not in real_groups: real_groups[cur_group] = [ real_aligns[-1] ] real_aligns = real_aligns[:-1] real_groups[cur_group].append(sorted_aligns[i]) else: print >> planta_out_f, '\t\tSkipping redundant alignment %s' % (str(sorted_aligns[i])) # Kolya: removed redundant code about $ref (for gff AFAIU) # choose appropriate alignments (to minimize total size of contig alignment and reduce # misassemblies if len(real_groups) > 0: # auxiliary function def get_group_id_of_align(align): for k,v in real_groups.items(): if align in v: return k return None # adding degenerate groups for single real aligns if len(real_aligns) > 0: for align in real_aligns: cur_group = (min(align.s2, align.e2), max(align.s2, align.e2)) real_groups[cur_group] = [align] sorted_aligns = sorted((align for group in real_groups.values() for align in group), key=lambda x: x.s1) min_selection = [] min_selection_distance = None cur_selection = [] cur_selection_group_ids = [] for cur_align in sorted_aligns: cur_align_group_id = get_group_id_of_align(cur_align) if cur_align_group_id not in cur_selection_group_ids: cur_selection.append(cur_align) cur_selection_group_ids.append(cur_align_group_id) else: for align in cur_selection: if get_group_id_of_align(align) == cur_align_group_id: cur_selection.remove(align) break cur_selection.append(cur_align) if len(cur_selection_group_ids) == len(real_groups.keys()): cur_selection_distance = cur_selection[-1].e1 - cur_selection[0].s1 if (not min_selection) or (cur_selection_distance < min_selection_distance): min_selection = list(cur_selection) min_selection_distance = cur_selection_distance # save min selection to real aligns and skip others (as redundant) real_aligns = list(min_selection) print >> planta_out_f, '\t\t\tSkipping redundant alignments after choosing the best set of alignments' for align in sorted_aligns: if align not in real_aligns: print >> planta_out_f, '\t\tSkipping redundant alignment %s' % (str(align)) if len(real_aligns) == 1: #There is only one alignment of this contig to the reference print >> coords_filtered_file, str(real_aligns[0]) aligned_lengths.append(real_aligns[0].len2) #Is the contig aligned in the reverse compliment? #Record beginning and end of alignment in contig if sorted_aligns[0].s2 > sorted_aligns[0].e2: end, begin = sorted_aligns[0].s2, sorted_aligns[0].e2 else: end, begin = sorted_aligns[0].e2, sorted_aligns[0].s2 if (begin - 1) or (ctg_len - end): #Increment tally of partially unaligned contigs partially_unaligned += 1 #Increment tally of partially unaligned bases unaligned_bases = (begin - 1) + (ctg_len - end) partially_unaligned_bases += unaligned_bases print >> planta_out_f, '\t\tThis contig is partially unaligned. (%d out of %d)' % (top_len, ctg_len) print >> planta_out_f, '\t\tAlignment: %s' % str(sorted_aligns[0]) if begin - 1: print >> planta_out_f, '\t\tUnaligned bases: 1 to %d (%d)' % (begin - 1, begin - 1) if ctg_len - end: print >> planta_out_f, '\t\tUnaligned bases: %d to %d (%d)' % (end + 1, ctg_len, ctg_len - end) # check if both parts (aligned and unaligned) have significant length if (unaligned_bases >= qconfig.min_contig) and (ctg_len - unaligned_bases >= qconfig.min_contig): partially_unaligned_with_significant_parts += 1 print >> planta_out_f, '\t\tThis contig has both significant aligned and unaligned parts ' \ '(of length >= min-contig)!' ref_aligns.setdefault(sorted_aligns[0].ref, []).append(sorted_aligns[0]) else: #Sort real alignments by position on the contig sorted_aligns = sorted(real_aligns, key=lambda x: (min(x.s2, x.e2), max(x.s2, x.e2))) #Extra skipping of redundant alignments (fully or almost fully covered by adjacent alignments) if len(sorted_aligns) >= 3: was_extra_skip = False prev_end = max(sorted_aligns[0].s2, sorted_aligns[0].e2) for i in range(1, len(sorted_aligns) - 1): succ_start = min(sorted_aligns[i + 1].s2, sorted_aligns[i + 1].e2) gap = succ_start - prev_end - 1 if gap > smgap: prev_end = max(sorted_aligns[i].s2, sorted_aligns[i].e2) continue overlap = 0 if prev_end - min(sorted_aligns[i].s2, sorted_aligns[i].e2) + 1 > 0: overlap += prev_end - min(sorted_aligns[i].s2, sorted_aligns[i].e2) + 1 if max(sorted_aligns[i].s2, sorted_aligns[i].e2) - succ_start + 1 > 0: overlap += max(sorted_aligns[i].s2, sorted_aligns[i].e2) - succ_start + 1 if gap < oat or (float(overlap) / sorted_aligns[i].len2) > ort: if not was_extra_skip: was_extra_skip = True print >> planta_out_f, '\t\t\tSkipping redundant alignments which significantly overlap with adjacent alignments' print >> planta_out_f, '\t\tSkipping redundant alignment %s' % (str(sorted_aligns[i])) real_aligns.remove(sorted_aligns[i]) else: prev_end = max(sorted_aligns[i].s2, sorted_aligns[i].e2) if was_extra_skip: sorted_aligns = sorted(real_aligns, key=lambda x: (min(x.s2, x.e2), max(x.s2, x.e2))) #There is more than one alignment of this contig to the reference print >> planta_out_f, '\t\tThis contig is misassembled. %d total aligns.' % num_aligns # Counting misassembled contigs which are mostly partially unaligned # counting aligned and unaligned bases of a contig aligned_bases_in_contig = 0 last_e2 = 0 for cur_align in sorted_aligns: if max(cur_align.s2, cur_align.e2) <= last_e2: continue elif min(cur_align.s2, cur_align.e2) > last_e2: aligned_bases_in_contig += (abs(cur_align.e2 - cur_align.s2) + 1) else: aligned_bases_in_contig += (max(cur_align.s2, cur_align.e2) - last_e2) last_e2 = max(cur_align.s2, cur_align.e2) #aligned_bases_in_contig = sum(x.len2 for x in sorted_aligns) if aligned_bases_in_contig < umt * ctg_len: print >> planta_out_f, '\t\t\tWarning! This contig is more unaligned than misassembled. ' + \ 'Contig length is %d and total length of all aligns is %d' % (ctg_len, aligned_bases_in_contig) partially_unaligned_with_misassembly += 1 for align in sorted_aligns: print >> planta_out_f, '\t\tAlignment: %s' % str(align) print >> coords_filtered_file, str(align) aligned_lengths.append(align.len2) #Increment tally of partially unaligned contigs partially_unaligned += 1 #Increment tally of partially unaligned bases partially_unaligned_bases += ctg_len - aligned_bases_in_contig print >> planta_out_f, '\t\tUnaligned bases: %d' % (ctg_len - aligned_bases_in_contig) # check if both parts (aligned and unaligned) have significant length if (aligned_bases_in_contig >= qconfig.min_contig) and (ctg_len - aligned_bases_in_contig >= qconfig.min_contig): partially_unaligned_with_significant_parts += 1 print >> planta_out_f, '\t\tThis contig has both significant aligned and unaligned parts '\ '(of length >= min-contig)!' continue ### processing misassemblies is_misassembled, current_mio = process_misassembled_contig(sorted_aligns, cyclic, aligned_lengths, region_misassemblies, reg_lens, ref_aligns, ref_features) misassembly_internal_overlap += current_mio if is_misassembled: misassembled_contigs[contig] = len(assembly[contig]) else: #No aligns to this contig print >> planta_out_f, '\t\tThis contig is unaligned. (%d bp)' % ctg_len print >> unaligned_file, contig #Increment unaligned contig count and bases unaligned += 1 fully_unaligned_bases += ctg_len print >> planta_out_f, '\t\tUnaligned bases: %d total: %d' % (ctg_len, fully_unaligned_bases) print >> planta_out_f coords_filtered_file.close() unaligned_file.close() print >> planta_out_f, 'Analyzing coverage...' print >> planta_out_f, 'Writing SNPs into', used_snps_fpath region_covered = 0 region_ambig = 0 region_snp = 0 region_insertion = 0 region_deletion = 0 gaps = [] neg_gaps = [] redundant = [] snip_left = 0 snip_right = 0 # for counting short and long indels indels_list = [] prev_snp = None cur_indel = 0 nothing_aligned = True #Go through each header in reference file for ref, value in regions.iteritems(): #Check to make sure this reference ID contains aligns. if ref not in ref_aligns: print >> planta_out_f, 'ERROR: Reference [$ref] does not have any alignments! Check that this is the same file used for alignment.' print >> planta_out_f, 'ERROR: Alignment Reference Headers: %s' % ref_aligns.keys() continue nothing_aligned = False #Sort all alignments in this reference by start location sorted_aligns = sorted(ref_aligns[ref], key=lambda x: x.s1) total_aligns = len(sorted_aligns) print >> planta_out_f, '\tReference %s: %d total alignments. %d total regions.' % (ref, total_aligns, len(regions[ref])) #Walk through each region on this reference sequence for region in regions[ref]: end = 0 reg_length = region[1] - region[0] + 1 print >> planta_out_f, '\t\tRegion: %d to %d (%d bp)' % (region[0], region[1], reg_length) #Skipping alignments not in the next region while sorted_aligns and sorted_aligns[0].e1 < region[0]: skipped = sorted_aligns[0] sorted_aligns = sorted_aligns[1:] # Kolya: slooow, but should never happens without gff :) print >> planta_out_f, '\t\t\tThis align occurs before our region of interest, skipping: %s' % skipped if not sorted_aligns: print >> planta_out_f, '\t\t\tThere are no more aligns. Skipping this region.' continue #If region starts in a contig, ignore portion of contig prior to region start if sorted_aligns and region and sorted_aligns[0].s1 < region[0]: print >> planta_out_f, '\t\t\tSTART within alignment : %s' % sorted_aligns[0] #Track number of bases ignored at the start of the alignment snip_left = region[0] - sorted_aligns[0].s1 #Modify to account for any insertions or deletions that are present for z in xrange(sorted_aligns[0].s1, region[0] + 1): if (ref in snps) and (sorted_aligns[0].contig in snps[ref]) and (z in snps[ref][sorted_aligns[0].contig]) and \ (ref in ref_features) and (z in ref_features[ref]) and (ref_features[ref][z] != 'A'): # Kolya: never happened before because of bug: z -> i for cur_snp in snps[ref][sorted_aligns[0].contig][z]: if cur_snp.type == 'I': snip_left += 1 elif cur_snp.type == 'D': snip_left -= 1 #Modify alignment to start at region print >> planta_out_f, '\t\t\t\tMoving reference start from %d to %d' % (sorted_aligns[0].s1, region[0]) sorted_aligns[0].s1 = region[0] #Modify start position in contig if sorted_aligns[0].s2 < sorted_aligns[0].e2: print >> planta_out_f, '\t\t\t\tMoving contig start from %d to %d.' % (sorted_aligns[0].s2, sorted_aligns[0].s2 + snip_left) sorted_aligns[0].s2 += snip_left else: print >> planta_out_f, '\t\t\t\tMoving contig start from %d to %d.' % (sorted_aligns[0].s2, sorted_aligns[0].s2 - snip_left) sorted_aligns[0].s2 -= snip_left #No aligns in this region if sorted_aligns[0].s1 > region[1]: print >> planta_out_f, '\t\t\tThere are no aligns within this region.' gaps.append([reg_length, 'START', 'END']) #Increment uncovered region count and bases uncovered_regions += 1 uncovered_region_bases += reg_length continue #Record first gap, and first ambiguous bases within it if sorted_aligns[0].s1 > region[0]: size = sorted_aligns[0].s1 - region[0] print >> planta_out_f, '\t\t\tSTART in gap: %d to %d (%d bp)' % (region[0], sorted_aligns[0].s1, size) gaps.append([size, 'START', sorted_aligns[0].contig]) #Increment any ambiguously covered bases in this first gap for i in xrange(region[0], sorted_aligns[0].e1): if (ref in ref_features) and (i in ref_features[ref]) and (ref_features[ref][i] == 'A'): region_ambig += 1 #For counting number of alignments counter = 0 negative = False current = None while sorted_aligns and sorted_aligns[0].s1 < region[1] and not end: #Increment alignment count counter += 1 if counter % 1000 == 0: print >> planta_out_f, '\t...%d of %d' % (counter, total_aligns) end = False #Check to see if previous gap was negative if negative: print >> planta_out_f, '\t\t\tPrevious gap was negative, modifying coordinates to ignore overlap' #Ignoring OL part of next contig, no SNPs or N's will be recorded snip_left = current.e1 + 1 - sorted_aligns[0].s1 #Account for any indels that may be present for z in xrange(sorted_aligns[0].s1, current.e1 + 2): if (ref in snps) and (sorted_aligns[0].contig in snps[ref]) and (z in snps[ref][sorted_aligns[0].contig]): for cur_snp in snps[ref][sorted_aligns[0].contig][z]: if cur_snp.type == 'I': snip_left += 1 elif cur_snp.type == 'D': snip_left -= 1 #Modifying position in contig of next alignment sorted_aligns[0].s1 = current.e1 + 1 if sorted_aligns[0].s2 < sorted_aligns[0].e2: print >> planta_out_f, '\t\t\t\tMoving contig start from %d to %d.' % (sorted_aligns[0].s2, sorted_aligns[0].s2 + snip_left) sorted_aligns[0].s2 += snip_left else: print >> planta_out_f, '\t\t\t\tMoving contig start from %d to %d.' % (sorted_aligns[0].s2, sorted_aligns[0].s2 - snip_left) sorted_aligns[0].s2 -= snip_left negative = False #Pull top alignment current = sorted_aligns[0] sorted_aligns = sorted_aligns[1:] #print >>plantafile_out, '\t\t\tAlign %d: %s' % (counter, current) #(self, s1, e1, s2, e2, len1, len2, idy, ref, contig): print >>planta_out_f, '\t\t\tAlign %d: %s' % (counter, '%d %d %s %d %d' % (current.s1, current.e1, current.contig, current.s2, current.e2)) #Check if: # A) We have no more aligns to this reference # B) The current alignment extends to or past the end of the region # C) The next alignment starts after the end of the region if not sorted_aligns or current.e1 >= region[1] or sorted_aligns[0].s1 > region[1]: #Check if last alignment ends before the regions does (gap at end of the region) if current.e1 >= region[1]: #print "Ends inside current alignment.\n"; print >> planta_out_f, '\t\t\tEND in current alignment. Modifying %d to %d.' % (current.e1, region[1]) #Pushing the rest of the alignment back on the stack sorted_aligns = [current] + sorted_aligns #Flag to end loop through alignment end = True #Clip off right side of contig alignment snip_right = current.e1 - region[1] #End current alignment in region current.e1 = region[1] else: #Region ends in a gap size = region[1] - current.e1 print >> planta_out_f, '\t\t\tEND in gap: %d to %d (%d bp)' % (current.e1, region[1], size) #Record gap if not sorted_aligns: #No more alignments, region ends in gap. gaps.append([size, current.contig, 'END']) else: #Gap between end of current and beginning of next alignment. gaps.append([size, current.contig, sorted_aligns[0].contig]) #Increment any ambiguous bases within this gap for i in xrange(current.e1, region[1]): if (ref in ref_features) and (i in ref_features[ref]) and (ref_features[ref][i] == 'A'): region_ambig += 1 else: #Grab next alignment next = sorted_aligns[0] #print >> plantafile_out, '\t\t\t\tNext Alignment: %d %d %s %d %d' % (next.s1, next.e1, next.contig, next.s2, next.e2) if next.e1 <= current.e1: #The next alignment is redundant to the current alignmentt while next.e1 <= current.e1 and sorted_aligns: total_redundant += next.e1 - next.s1 + 1 print >> planta_out_f, '\t\t\t\tThe next alignment (%d %d %s %d %d) is redundant. Skipping.' \ % (next.s1, next.e1, next.contig, next.s2, next.e2) redundant.append(current.contig) sorted_aligns = sorted_aligns[1:] if sorted_aligns: next = sorted_aligns[0] counter += 1 else: #Flag to end loop through alignment end = True if not end: if next.s1 > current.e1 + 1: #There is a gap beetween this and the next alignment size = next.s1 - current.e1 - 1 gaps.append([size, current.contig, next.contig]) print >> planta_out_f, '\t\t\t\tGap between this and next alignment: %d to %d (%d bp)' % (current.e1, next.s1, size) #Record ambiguous bases in current gap for i in xrange(current.e1, next.s1): if (ref in ref_features) and (i in ref_features[ref]) and (ref_features[ref][i] == 'A'): region_ambig += 1 elif next.s1 <= current.e1: #This alignment overlaps with the next alignment, negative gap #If contig extends past the region, clip if current.e1 > region[1]: current.e1 = region[1] #Record gap size = next.s1 - current.e1 neg_gaps.append([size, current.contig, next.contig]) print >>planta_out_f, '\t\t\t\tNegative gap (overlap) between this and next alignment: %d to %d (%d bp)' % (current.e1, next.s1, size) #Mark this alignment as negative so overlap region can be ignored negative = True print >> planta_out_f, '\t\t\t\tNext Alignment: %d %d %s %d %d' % (next.s1, next.e1, next.contig, next.s2, next.e2) #Initiate location of SNP on assembly to be first or last base of contig alignment contig_estimate = current.s2 enable_SNPs_output = False if enable_SNPs_output: print >> planta_out_f, '\t\t\t\tContig start coord: %d' % contig_estimate #Assess each reference base of the current alignment for i in xrange(current.s1, current.e1 + 1): #Mark as covered region_covered += 1 if current.s2 < current.e2: pos_strand = True else: pos_strand = False #If there is a misassembly, increment count and contig length #if (exists $ref_features{$ref}[$i] && $ref_features{$ref}[$i] eq "M") { # $region_misassemblies++; # $misassembled_contigs{$current[2]} = length($assembly{$current[2]}); #} #If there is a SNP, and no alternative alignments over this base, record SNPs if (ref in snps) and (current.contig in snps[ref]) and (i in snps[ref][current.contig]): cur_snps = snps[ref][current.contig][i] # sorting by pos in contig if pos_strand: cur_snps = sorted(cur_snps, key=lambda x: x.ctg_pos) else: # for reverse complement cur_snps = sorted(cur_snps, key=lambda x: x.ctg_pos, reverse=True) for cur_snp in cur_snps: if enable_SNPs_output: print >> planta_out_f, '\t\t\t\tSNP: %s, reference coord: %d, contig coord: %d, estimated contig coord: %d' % \ (cur_snp.type, i, cur_snp.ctg_pos, contig_estimate) #Capture SNP base snp = cur_snp.type #Check that there are not multiple alignments at this location ### Alex: obsolete, we changed algorithm for ambiguous contigs #if (ref in ref_features) and (i in ref_features[ref]): # print >> plantafile_out, '\t\t\t\t\tERROR: SNP at a position where there are multiple alignments (%s). Skipping.\n' % ref_features[ref][i] # if current.s2 < current.e2: contig_estimate += 1 # else: contig_estimate -= 1 # continue #Check that the position of the SNP in the contig is close to the position of this SNP if abs(contig_estimate - cur_snp.ctg_pos) > 2: if enable_SNPs_output: print >> planta_out_f, '\t\t\t\t\tERROR: SNP position in contig was off by %d bp! (%d vs %d)' \ % (abs(contig_estimate - cur_snp.ctg_pos), contig_estimate, cur_snp.ctg_pos) continue print >> used_snps_file, '%s\t%s\t%d\t%s\t%s\t%d' % (ref, current.contig, cur_snp.ref_pos, cur_snp.ref_nucl, cur_snp.ctg_nucl, cur_snp.ctg_pos) #If SNP is an insertion, record if snp == 'I': region_insertion += 1 if pos_strand: contig_estimate += 1 else: contig_estimate -= 1 #If SNP is a deletion, record if snp == 'D': region_deletion += 1 if pos_strand: contig_estimate -= 1 else: contig_estimate += 1 #If SNP is a mismatch, record if snp == 'S': region_snp += 1 if cur_snp.type == 'D' or cur_snp.type == 'I': if prev_snp and (prev_snp.ref == cur_snp.ref) and (prev_snp.ctg == cur_snp.ctg) and \ ((cur_snp.type == 'D' and (prev_snp.ref_pos == cur_snp.ref_pos - 1) and (prev_snp.ctg_pos == cur_snp.ctg_pos)) or (cur_snp.type == 'I' and ((pos_strand and (prev_snp.ctg_pos == cur_snp.ctg_pos - 1)) or (not pos_strand and (prev_snp.ctg_pos == cur_snp.ctg_pos + 1))) and (prev_snp.ref_pos == cur_snp.ref_pos))): cur_indel += 1 else: if cur_indel: indels_list.append(cur_indel) cur_indel = 1 prev_snp = cur_snp if pos_strand: contig_estimate += 1 else: contig_estimate -= 1 #Record Ns in current alignment if current.s2 < current.e2: #print "\t\t(forward)Recording Ns from $current[3]+$snip_left to $current[4]-$snip_right...\n"; for i in (current.s2 + snip_left, current.e2 - snip_right + 1): if (current.contig in assembly_ns) and (i in assembly_ns[current.contig]): region_ambig += 1 else: #print "\t\t(reverse)Recording Ns from $current[4]+$snip_right to $current[3]-$snip_left...\n"; for i in (current.e2 + snip_left, current.s2 - snip_right + 1): if (current.contig in assembly_ns) and (i in assembly_ns[current.contig]): region_ambig += 1 snip_left = 0 snip_right = 0 if cur_indel: indels_list.append(cur_indel) prev_snp = None cur_indel = 0 print >> planta_out_f ##### getting results from Plantagora's algorithm SNPs = region_snp indels = region_insertion + region_deletion total_aligned_bases = region_covered print >> planta_out_f, 'Analysis is finished!' print >> planta_out_f, 'Founded SNPs were written into', used_snps_fpath print >> planta_out_f, '\nResults:' print >> planta_out_f, '\tLocal Misassemblies: %d' % region_misassemblies.count(Misassembly.LOCAL) print >> planta_out_f, '\tMisassemblies: %d' % (len(region_misassemblies) - region_misassemblies.count(Misassembly.LOCAL)) print >> planta_out_f, '\t\tRelocations: %d' % region_misassemblies.count(Misassembly.RELOCATION) print >> planta_out_f, '\t\tTranslocations: %d' % region_misassemblies.count(Misassembly.TRANSLOCATION) print >> planta_out_f, '\t\tInversions: %d' % region_misassemblies.count(Misassembly.INVERSION) print >> planta_out_f, '\tMisassembled Contigs: %d' % len(misassembled_contigs) misassembled_bases = sum(misassembled_contigs.itervalues()) print >> planta_out_f, '\tMisassembled Contig Bases: %d' % misassembled_bases print >> planta_out_f, '\tMisassmblies Inter-Contig Overlap: %d' % misassembly_internal_overlap print >> planta_out_f, 'Uncovered Regions: %d (%d)' % (uncovered_regions, uncovered_region_bases) print >> planta_out_f, 'Unaligned Contigs: %d + %d part' % (unaligned, partially_unaligned) print >> planta_out_f, 'Partially Unaligned Contigs with Misassemblies: %d' % partially_unaligned_with_misassembly print >> planta_out_f, 'Unaligned Contig Bases: %d' % (fully_unaligned_bases + partially_unaligned_bases) print >> planta_out_f, '' print >> planta_out_f, 'Ambiguously Mapped Contigs: %d' % ambiguous_contigs if qconfig.ambiguity_usage == "all": print >> planta_out_f, 'Extra Bases in Ambiguously Mapped Contigs: %d' % ambiguous_contigs_extra_bases print >> planta_out_f, 'Note that --allow-ambiguity option was set to "all" and each contig was used several times.' else: print >> planta_out_f, 'Total Bases in Ambiguously Mapped Contigs: %d' % (-ambiguous_contigs_extra_bases) if qconfig.ambiguity_usage == "none": print >> planta_out_f, 'Note that --allow-ambiguity option was set to "none" and these contigs were skipped.' else: print >> planta_out_f, 'Note that --allow-ambiguity option was set to "one" and only first alignment per each contig was used.' ambiguous_contigs_extra_bases = 0 # this variable is used in Duplication ratio but we don't need it in this case #print >> plantafile_out, 'Mismatches: %d' % SNPs #print >> plantafile_out, 'Single Nucleotide Indels: %d' % indels print >> planta_out_f, '' print >> planta_out_f, '\tCovered Bases: %d' % region_covered #print >> plantafile_out, '\tAmbiguous Bases (e.g. N\'s): %d' % region_ambig print >> planta_out_f, '' print >> planta_out_f, '\tSNPs: %d' % region_snp print >> planta_out_f, '\tInsertions: %d' % region_insertion print >> planta_out_f, '\tDeletions: %d' % region_deletion #print >> plantafile_out, '\tList of indels lengths:', indels_list print >> planta_out_f, '' print >> planta_out_f, '\tPositive Gaps: %d' % len(gaps) internal = 0 external = 0 summ = 0 for gap in gaps: if gap[1] == gap[2]: internal += 1 else: external += 1 summ += gap[0] print >> planta_out_f, '\t\tInternal Gaps: % d' % internal print >> planta_out_f, '\t\tExternal Gaps: % d' % external print >> planta_out_f, '\t\tExternal Gap Total: % d' % summ if external: avg = summ * 1.0 / external else: avg = 0.0 print >> planta_out_f, '\t\tExternal Gap Average: %.0f' % avg print >> planta_out_f, '\tNegative Gaps: %d' % len(neg_gaps) internal = 0 external = 0 summ = 0 for gap in neg_gaps: if gap[1] == gap[2]: internal += 1 else: external += 1 summ += gap[0] print >> planta_out_f, '\t\tInternal Overlaps: % d' % internal print >> planta_out_f, '\t\tExternal Overlaps: % d' % external print >> planta_out_f, '\t\tExternal Overlaps Total: % d' % summ if external: avg = summ * 1.0 / external else: avg = 0.0 print >> planta_out_f, '\t\tExternal Overlaps Average: %.0f' % avg redundant = list(set(redundant)) print >> planta_out_f, '\tContigs with Redundant Alignments: %d (%d)' % (len(redundant), total_redundant) result = {'avg_idy': avg_idy, 'region_misassemblies': region_misassemblies, 'misassembled_contigs': misassembled_contigs, 'misassembled_bases': misassembled_bases, 'misassembly_internal_overlap': misassembly_internal_overlap, 'unaligned': unaligned, 'partially_unaligned': partially_unaligned, 'partially_unaligned_bases': partially_unaligned_bases, 'fully_unaligned_bases': fully_unaligned_bases, 'ambiguous_contigs': ambiguous_contigs, 'ambiguous_contigs_extra_bases': ambiguous_contigs_extra_bases, 'SNPs': SNPs, 'indels_list': indels_list, 'total_aligned_bases': total_aligned_bases, 'partially_unaligned_with_misassembly': partially_unaligned_with_misassembly, 'partially_unaligned_with_significant_parts': partially_unaligned_with_significant_parts} ## outputting misassembled contigs to separate file fasta = [(name, seq) for name, seq in fastaparser.read_fasta(contigs_fpath) if name in misassembled_contigs.keys()] fastaparser.write_fasta( os.path.join(output_dirpath, qutils.name_from_fpath(contigs_fpath) + '.mis_contigs.fa'), fasta) alignment_tsv_fpath = os.path.join(output_dirpath, "alignments_" + assembly_name + '.tsv') logger.debug(' ' + qutils.index_to_str(index) + 'Alignments: ' + qutils.relpath(alignment_tsv_fpath)) alignment_tsv_f = open(alignment_tsv_fpath, 'w') for ref_name, aligns in ref_aligns.iteritems(): alignment_tsv_f.write(ref_name) for align in aligns: alignment_tsv_f.write('\t' + align.contig) alignment_tsv_f.write('\n') alignment_tsv_f.close() planta_out_f.close() planta_err_f.close() used_snps_file.close() logger.info(' ' + qutils.index_to_str(index) + 'Analysis is finished.') logger.debug('') if nothing_aligned: return NucmerStatus.NOT_ALIGNED, result, aligned_lengths else: return NucmerStatus.OK, result, aligned_lengths def plantakolya_process(cyclic, nucmer_output_dirpath, contigs_fpath, i, output_dirpath, ref_fpath): assembly_name = qutils.name_from_fpath(contigs_fpath) nucmer_fname = os.path.join(nucmer_output_dirpath, assembly_name) nucmer_is_ok, result, aligned_lengths = plantakolya( cyclic, i, contigs_fpath, nucmer_fname, output_dirpath, ref_fpath) clear_files(contigs_fpath, nucmer_fname) return nucmer_is_ok, result, aligned_lengths def all_required_binaries_exist(mummer_dirpath): for required_binary in required_binaries: if not os.path.isfile(os.path.join(mummer_dirpath, required_binary)): return False return True def do(reference, contigs_fpaths, cyclic, output_dir): if not os.path.isdir(output_dir): os.mkdir(output_dir) logger.print_timestamp() logger.info('Running Contig analyzer...') if not all_required_binaries_exist(mummer_dirpath): # making logger.info('Compiling MUMmer (details are in ' + os.path.join(mummer_dirpath, 'make.log') + " and make.err)") return_code = qutils.call_subprocess( ['make', '-C', mummer_dirpath], stdout=open(os.path.join(mummer_dirpath, 'make.log'), 'w'), stderr=open(os.path.join(mummer_dirpath, 'make.err'), 'w'),) if return_code != 0 or not all_required_binaries_exist(mummer_dirpath): logger.error("Failed to compile MUMmer (" + mummer_dirpath + ")! " "Try to compile it manually. " + ("You can restart Quast with the --debug flag " "to see the command line." if not qconfig.debug else "")) logger.info('Failed aligning the contigs for all the assemblies. Only basic stats are going to be evaluated.') return dict(zip(contigs_fpaths, [NucmerStatus.FAILED] * len(contigs_fpaths))), None nucmer_output_dir = os.path.join(output_dir, 'nucmer_output') if not os.path.isdir(nucmer_output_dir): os.mkdir(nucmer_output_dir) n_jobs = min(len(contigs_fpaths), qconfig.max_threads) from joblib import Parallel, delayed statuses_results_lengths_tuples = Parallel(n_jobs=n_jobs)(delayed(plantakolya_process)( cyclic, nucmer_output_dir, fname, i, output_dir, reference) for i, fname in enumerate(contigs_fpaths)) # unzipping statuses, results, aligned_lengths = [x[0] for x in statuses_results_lengths_tuples], \ [x[1] for x in statuses_results_lengths_tuples], \ [x[2] for x in statuses_results_lengths_tuples] def save_result(result): report = reporting.get(fname) avg_idy = result['avg_idy'] region_misassemblies = result['region_misassemblies'] misassembled_contigs = result['misassembled_contigs'] misassembled_bases = result['misassembled_bases'] misassembly_internal_overlap = result['misassembly_internal_overlap'] unaligned = result['unaligned'] partially_unaligned = result['partially_unaligned'] partially_unaligned_bases = result['partially_unaligned_bases'] fully_unaligned_bases = result['fully_unaligned_bases'] ambiguous_contigs = result['ambiguous_contigs'] ambiguous_contigs_extra_bases = result['ambiguous_contigs_extra_bases'] SNPs = result['SNPs'] indels_list = result['indels_list'] total_aligned_bases = result['total_aligned_bases'] partially_unaligned_with_misassembly = result['partially_unaligned_with_misassembly'] partially_unaligned_with_significant_parts = result['partially_unaligned_with_significant_parts'] report.add_field(reporting.Fields.AVGIDY, '%.3f' % avg_idy) report.add_field(reporting.Fields.MISLOCAL, region_misassemblies.count(Misassembly.LOCAL)) report.add_field(reporting.Fields.MISASSEMBL, len(region_misassemblies) - region_misassemblies.count(Misassembly.LOCAL)) report.add_field(reporting.Fields.MISCONTIGS, len(misassembled_contigs)) report.add_field(reporting.Fields.MISCONTIGSBASES, misassembled_bases) report.add_field(reporting.Fields.MISINTERNALOVERLAP, misassembly_internal_overlap) report.add_field(reporting.Fields.UNALIGNED, '%d + %d part' % (unaligned, partially_unaligned)) report.add_field(reporting.Fields.UNALIGNEDBASES, (fully_unaligned_bases + partially_unaligned_bases)) report.add_field(reporting.Fields.AMBIGUOUS, ambiguous_contigs) report.add_field(reporting.Fields.AMBIGUOUSEXTRABASES, ambiguous_contigs_extra_bases) report.add_field(reporting.Fields.MISMATCHES, SNPs) # different types of indels: report.add_field(reporting.Fields.INDELS, len(indels_list)) report.add_field(reporting.Fields.INDELSBASES, sum(indels_list)) report.add_field(reporting.Fields.MIS_SHORT_INDELS, len([i for i in indels_list if i <= qconfig.SHORT_INDEL_THRESHOLD])) report.add_field(reporting.Fields.MIS_LONG_INDELS, len([i for i in indels_list if i > qconfig.SHORT_INDEL_THRESHOLD])) if total_aligned_bases: report.add_field(reporting.Fields.SUBSERROR, "%.2f" % (float(SNPs) * 100000.0 / float(total_aligned_bases))) report.add_field(reporting.Fields.INDELSERROR, "%.2f" % (float(report.get_field(reporting.Fields.INDELS)) * 100000.0 / float(total_aligned_bases))) # for misassemblies report: report.add_field(reporting.Fields.MIS_ALL_EXTENSIVE, len(region_misassemblies) - region_misassemblies.count(Misassembly.LOCAL)) report.add_field(reporting.Fields.MIS_RELOCATION, region_misassemblies.count(Misassembly.RELOCATION)) report.add_field(reporting.Fields.MIS_TRANSLOCATION, region_misassemblies.count(Misassembly.TRANSLOCATION)) report.add_field(reporting.Fields.MIS_INVERTION, region_misassemblies.count(Misassembly.INVERSION)) report.add_field(reporting.Fields.MIS_EXTENSIVE_CONTIGS, len(misassembled_contigs)) report.add_field(reporting.Fields.MIS_EXTENSIVE_BASES, misassembled_bases) report.add_field(reporting.Fields.MIS_LOCAL, region_misassemblies.count(Misassembly.LOCAL)) # for unaligned report: report.add_field(reporting.Fields.UNALIGNED_FULL_CNTGS, unaligned) report.add_field(reporting.Fields.UNALIGNED_FULL_LENGTH, fully_unaligned_bases) report.add_field(reporting.Fields.UNALIGNED_PART_CNTGS, partially_unaligned) report.add_field(reporting.Fields.UNALIGNED_PART_WITH_MISASSEMBLY, partially_unaligned_with_misassembly) report.add_field(reporting.Fields.UNALIGNED_PART_SIGNIFICANT_PARTS, partially_unaligned_with_significant_parts) report.add_field(reporting.Fields.UNALIGNED_PART_LENGTH, partially_unaligned_bases) def save_result_for_unaligned(result): report = reporting.get(fname) unaligned_ctgs = report.get_field(reporting.Fields.CONTIGS) unaligned_length = report.get_field(reporting.Fields.TOTALLEN) report.add_field(reporting.Fields.UNALIGNED, '%d + %d part' % (unaligned_ctgs, 0)) report.add_field(reporting.Fields.UNALIGNEDBASES, unaligned_length) report.add_field(reporting.Fields.UNALIGNED_FULL_CNTGS, unaligned_ctgs) report.add_field(reporting.Fields.UNALIGNED_FULL_LENGTH, unaligned_length) for index, fname in enumerate(contigs_fpaths): if statuses[index] == NucmerStatus.OK: save_result(results[index]) elif statuses[index] == NucmerStatus.NOT_ALIGNED: save_result_for_unaligned(results[index]) nucmer_statuses = dict(zip(contigs_fpaths, statuses)) aligned_lengths_per_fpath = dict(zip(contigs_fpaths, aligned_lengths)) # nucmer_statuses = {} # # for id, filename in enumerate(filenames): # nucmer_status = plantakolya_process(cyclic, filename, id, myenv, output_dir, reference) # nucmer_statuses[filename] = nucmer_status if NucmerStatus.OK in nucmer_statuses.values(): reporting.save_misassemblies(output_dir) reporting.save_unaligned(output_dir) oks = nucmer_statuses.values().count(NucmerStatus.OK) not_aligned = nucmer_statuses.values().count(NucmerStatus.NOT_ALIGNED) failed = nucmer_statuses.values().count(NucmerStatus.FAILED) problems = not_aligned + failed all = len(nucmer_statuses) if oks == all: logger.info('Done.') if oks < all and problems < all: logger.info('Done for ' + str(all - problems) + 'out of ' + str(all) + '. For the rest, only basic stats are going to be evaluated.') if problems == all: logger.info('Failed aligning the contigs for all the assemblies. Only basic stats are going to be evaluated.') # if NucmerStatus.FAILED in nucmer_statuses.values(): # log.info(' ' + str(failed) + 'file' + (' ' if failed == 1 else 's ') + 'failed to align to the reference. Only basic stats have been evaluated.') # if NucmerStatus.NOT_ALIGNED in nucmer_statuses.values(): # log.info(' ' + str(not_aligned) + ' file' + (' was' if not_aligned == 1 else 's were') + ' not aligned to the reference. Only basic stats have been evaluated.') # if problems == all: # log.info(' Nucmer failed.') return nucmer_statuses, aligned_lengths_per_fpath