#!/usr/bin/env python # coding: utf-8 """ .. module:: BuscoAnalysis :synopsis: BuscoAnalysis implements general BUSCO analysis specifics .. versionadded:: 3.0.0 .. versionchanged:: 3.0.1 Copyright (c) 2016-2017, Evgeny Zdobnov (ez@ezlab.org) Licensed under the MIT license. See LICENSE.md file. """ from abc import ABCMeta, abstractmethod from quast_libs.busco.BuscoConfig import BuscoConfig import copy import inspect import os from quast_libs.busco.pipebricks.PipeHelper import Analysis from quast_libs.busco.pipebricks.PipeLogger import PipeLogger from quast_libs.busco.pipebricks.Toolset import Tool import random import subprocess from collections import deque # from overrides import overrides # useful fro dev, but don't want all user to install this try: import queue as Queue except ImportError: import Queue # Python 2.7 try: from configparser import NoOptionError from configparser import NoSectionError except ImportError: from ConfigParser import NoOptionError # Python 2.7 from ConfigParser import NoSectionError # Python 2.7 class BuscoAnalysis(Analysis): """ This abstract class defines methods required for most of BUSCO analyses and has to be extended by each specific analysis class It extends the pipebricks.PipeHelper Analysis """ # declare a metaclass ABCMeta, which means that this class is abstract __metaclass__ = ABCMeta _logger = PipeLogger.get_logger(__name__) # # magic or public, meant to be accessed by external scripts [instance] # def __init__(self, config): """ :param config: Values of all parameters to be used during the analysis :type config: BuscoConfig """ # 1) load parameters # 2) load and validate tools # 3) check data and dataset integrity # 4) Ready for analysis # See also parent __init__ super(BuscoAnalysis, self).__init__(config) # _init tool called here try: # this both detect if a subclass did not properly declare the mode before calling parent __init__ # and maintains PEP8 compliance checks if self._mode is None: self._mode = '' except AttributeError as e: BuscoAnalysis._logger.critical(e) raise SystemExit self._random = "_"+str(random.getrandbits(32)) # to have a unique component for temporary file names # runtime directory self.mainout = None self._root_folder = config.get('busco', 'out_path') self._out = config.get('busco', 'out') self._tmp = config.get('busco', 'tmp_path') self._force = config.getboolean('busco', 'force') self._long = config.getboolean('busco', 'long') self._restart = config.getboolean('busco', 'restart') self._cpus = config.getint('busco', 'cpu') self._blast_single_core = config.getboolean('busco', 'blast_single_core') self._sequences = config.get('busco', 'in') self._lineage_path = config.get('busco', 'lineage_path') self._lineage_name = config.get('busco', 'clade_name') self._domain = config.get('busco', 'domain') self._ev_cutoff = config.getfloat('busco', 'evalue') self._region_limit = config.getint('busco', 'limit') self._has_variants_file = False self._missing_busco_list = [] self._fragmented_busco_list = [] # -- end -- self._tarzip = config.getboolean('busco', 'tarzip') self._dataset_creation_date = config.get('busco', 'dataset_creation_date') self._dataset_nb_species = config.get('busco', 'dataset_nb_species') self._dataset_nb_buscos = config.get('busco', 'dataset_nb_buscos') # self._totalbuscos = 0 self._total = 0 self._cutoff_dictionary = {} self._thread_list = None self._no_prediction = None self._exit_flag = None self._queue_lock = None self._work_queue = None self._location_dic = {} self._single_copy_files = None self._contig_length = {} # check data integrity self._check_dataset() BuscoAnalysis._logger.info('Check input file...') for line in open(self._sequences): if line.startswith('>'): self._check_fasta_header(line) self._set_rerun_busco_command() BuscoAnalysis._logger.info('To reproduce this run: %s' % self._rerun_cmd) @abstractmethod # @override def run_analysis(self): """ Abstract method, override to call all needed steps for running the child analysis. """ BuscoAnalysis._logger.info('Mode is: %s' % self._mode) BuscoAnalysis._logger.info('The lineage dataset is: %s (%s)' % (self._lineage_name, self._domain)) # create the run(main) and the temporary directories self._create_main_and_tmp_dirs() def cleanup(self): """ This function cleans temporary files. \ It has to be overriden by subclasses when needed """ self._p_open(['rm', '%stemp_%s%s' % (self._tmp, self._out, self._random)], 'bash', shell=False) def get_checkpoint(self, reset_random_suffix=False): """ This function return the checkpoint if the checkpoint.tmp file exits or None if absent :param reset_random_suffix: to tell whether to reset the self._random value with the one found the checkpoint :type reset_random_suffix: bool :return: the checkpoint name :rtype: int """ if os.path.exists('%scheckpoint.tmp' % self.mainout): line = open('%scheckpoint.tmp' % self.mainout, 'r').readline() if reset_random_suffix: BuscoAnalysis._logger.debug('Resetting random suffix to %s' % self._random) self._random = line.split('.')[-1] return int(int(line.split('.')[0])) else: return None # # public, meant to be accessed by external scripts [class] # @staticmethod def get_analysis(config): """ This method returns the appropriate instance for the BUSCO mode specifed in parameters :param config: :type config: BuscoConfig :return: """ # Need to import the sublcasses here, not sure this is a good design pattern then. (?) from quast_libs.busco.GenomeAnalysis import GenomeAnalysis from quast_libs.busco.TranscriptomeAnalysis import TranscriptomeAnalysis from quast_libs.busco.GeneSetAnalysis import GeneSetAnalysis mode = config.get('busco', 'mode') if mode == 'genome' or mode == 'geno': return GenomeAnalysis(config) elif mode == 'transcriptome' or mode == 'tran': return TranscriptomeAnalysis(config) elif mode == 'proteins' or mode == 'prot': return GeneSetAnalysis(config) else: BuscoAnalysis._logger.error('Unknown mode %s, use genome, transcriptome, or proteins', mode) raise SystemExit # # method that should be used as if protected, for internal use [instance] # to move to public and rename if meaningful # # @override def _init_tools(self): """ Init the tools needed for the analysis """ BuscoAnalysis._logger.info('Init tools...') self._hmmer = Tool('hmmsearch', self._params) self._mkblast = Tool('makeblastdb', self._params) self._tblastn = Tool('tblastn', self._params) BuscoAnalysis._logger.info('Check dependencies...') self._check_tool_dependencies() def _check_tool_dependencies(self): """ check dependencies on tools :raises SystemExit: if a Tool is not available """ # check 'tblastn' command availability if not Tool.check_tool_available('tblastn', self._params): BuscoAnalysis._logger.error( '\"tblastn\" is not accessible, please ' 'add or modify its path in the config file. Do not include the commmand in the path !') raise SystemExit # check 'makeblastdb' command availability if not Tool.check_tool_available('makeblastdb', self._params): BuscoAnalysis._logger.error( '\"makeblastdb\" is not accessible, please ' 'add or modify its path in the config file. Do not include the commmand in the path !') raise SystemExit # check 'hmmersearch' command availability if not Tool.check_tool_available('hmmsearch', self._params): BuscoAnalysis._logger.error( '\"hmmsearch\" is not accessible, ' 'add or modify its path in the config file. Do not include the commmand in the path !') raise SystemExit # check hmm version if self._get_hmmer_version(self._hmmer.cmd[0]) >= BuscoConfig.HMMER_VERSION: pass else: BuscoAnalysis._logger.error( 'HMMer version detected is not supported, please use HMMer ' ' v.%s +' % BuscoConfig.HMMER_VERSION) raise SystemExit def _check_fasta_header(self, header): """ This function checks problematic characters in fasta headers, and warns the user and stops the execution :param header: a fasta header to check :type header: str :raises SystemExit: if a problematic character is found """ for char in BuscoConfig.FORBIDDEN_HEADER_CHARS: if char in header: BuscoAnalysis._logger.error( 'The character \'%s\' is present in the fasta header %s, ' 'which will crash BUSCO. Please clean the header of your ' 'input file.' % (char, header.strip())) raise SystemExit for char in BuscoConfig.FORBIDDEN_HEADER_CHARS_BEFORE_SPLIT: if char in header.split()[0]: BuscoAnalysis._logger.error( 'The character \'%s\' is present in the fasta header %s, ' 'which will crash Reader. Please clean the header of your' ' input file.' % (char, header.split()[0].strip())) raise SystemExit if header.split()[0] == '>': BuscoAnalysis._logger.error( 'A space is present in the fasta header %s, directly after ' '\'>\' which will crash Reader. Please clean the header of ' 'your input file.' % (header.strip())) raise SystemExit def _check_dataset(self): """ Check the input dataset integrity, both files and folder are available :raises SystemExit: if the dataset miss files or folders """ # hmm folder flag = False for dirpath, dirnames, files in os.walk('%shmms' % self._lineage_path): if files: flag = True if not flag: BuscoAnalysis._logger.error( 'The dataset you provided lacks hmm profiles in %shmms' % self._lineage_path) raise SystemExit # note: score and length cutoffs are checked when read, # see _load_scores and _load_lengths @abstractmethod def _run_hmmer(self): """ This function runs hmmsearch. """ pass def _write_output_header(self, out): """ This function adds a header to the provided file :param out: a file to which the header will be added :type out: file """ out.write( '# BUSCO version is: %s \n# The lineage dataset is: %s (Creation ' 'date: %s, number of species: %s, number of BUSCOs: %s)\n' % (BuscoConfig.VERSION, self._lineage_name, self._dataset_creation_date, self._dataset_nb_species, self._dataset_nb_buscos)) out.write('# To reproduce this run: %s\n#\n' % self._rerun_cmd) def _extract_missing_and_frag_buscos_ancestral(self, ancestral_variants=False): """ This function extracts from the file ancestral the sequences that match missing or fragmented buscos :param ancestral_variants: tell whether to use the ancestral_variants file :type ancestral_variants: bool """ if self._has_variants_file: BuscoAnalysis._logger.info( 'Extracting missing and fragmented buscos from ' 'the ancestral_variants file...') else: BuscoAnalysis._logger.info( 'Extracting missing and fragmented buscos from ' 'the ancestral file...') if ancestral_variants: ancestral = open('%sancestral_variants' % self._lineage_path, 'r') output = open( '%sblast_output/missing_and_frag_ancestral_variants' % self.mainout, 'w') else: ancestral = open('%sancestral' % self._lineage_path, 'r') output = open( '%sblast_output/missing_and_frag_ancestral' % self.mainout, 'w') result = '' buscos_to_retrieve = \ self._missing_busco_list + self._fragmented_busco_list buscos_retrieved = [] add = False for line in ancestral: if line.startswith('>'): if ancestral_variants: line = '_'.join(line.split("_")[:-1]) # This pattern can support name like EOG00_1234_1 busco_id = line.strip().strip('>') if busco_id in buscos_to_retrieve: BuscoAnalysis._logger.debug('Found contig %s' % busco_id) add = True buscos_retrieved.append(busco_id) else: add = False if add: if line.endswith('\n'): result += line else: result += line + '\n' if len(list(set(buscos_to_retrieve) - set(buscos_retrieved))) > 0: if self._has_variants_file: BuscoAnalysis._logger.warning( 'The busco id(s) %s were not found in the ' 'ancestral_variants file' % list(set(buscos_to_retrieve) - set(buscos_retrieved))) else: BuscoAnalysis._logger.warning( 'The busco id(s) %s were not found in the ancestral file' % list(set(buscos_to_retrieve) - set(buscos_retrieved))) output.write(result) output.close() ancestral.close() output.close() def _produce_short_summary(self): """ This function reads the result files and produces the final short summary file """ hmmer_results = os.listdir('%shmmer_output' % self.mainout) hmmer_results.sort() hmmer_results_files = [] for entry in hmmer_results: hmmer_results_files.append(entry) results_from_hmmer = self._parse_hmmer(hmmer_results_files) single_copy = results_from_hmmer[0] # int multi_copy = results_from_hmmer[1] # int only_fragments = results_from_hmmer[2] # int self._missing_busco_list = results_from_hmmer[3] # list of BUSCO ids # list of BUSCO ids self._fragmented_busco_list = results_from_hmmer[4] self._single_copy_files = results_from_hmmer[5] summary_file = open('%sshort_summary_%s.txt' % (self.mainout, self._out), 'w') self._write_output_header(summary_file) summary_file.write( '# Summarized benchmarking in BUSCO notation for file %s\n# BUSCO ' 'was run in mode: %s\n\n' % (self._sequences, self._mode)) s_percent = round((single_copy/float(self._totalbuscos))*100, 1) d_percent = round((multi_copy/float(self._totalbuscos))*100, 1) f_percent = round((only_fragments/float(self._totalbuscos))*100, 1) BuscoAnalysis._logger.info('Results:') out_line = ('\tC:%s%%[S:%s%%,D:%s%%],F:%s%%,M:%s%%,n:%s\n\n' % (round(s_percent + d_percent, 1), s_percent, d_percent, f_percent, round(100 - s_percent - d_percent - f_percent, 1), self._totalbuscos)) summary_file.write(out_line) BuscoAnalysis._logger.info(out_line.replace('\t', '').strip()) out_line = ('\t%s\tComplete BUSCOs (C)\n' % (single_copy + multi_copy)) summary_file.write(out_line) BuscoAnalysis._logger.info(out_line.replace('\t', ' ').strip()) out_line = ('\t%s\tComplete and single-copy BUSCOs (S)\n' % single_copy) summary_file.write(out_line) BuscoAnalysis._logger.info(out_line.replace('\t', ' ').strip()) out_line = ('\t%s\tComplete and duplicated BUSCOs (D)\n' % multi_copy) summary_file.write(out_line) BuscoAnalysis._logger.info(out_line.replace('\t', ' ').strip()) out_line = ('\t%s\tFragmented BUSCOs (F)\n' % only_fragments) summary_file.write(out_line) BuscoAnalysis._logger.info(out_line.replace('\t', ' ').strip()) out_line = ('\t%s\tMissing BUSCOs (M)\n' % str(self._totalbuscos - single_copy - multi_copy - only_fragments)) summary_file.write(out_line) BuscoAnalysis._logger.info(out_line.replace('\t', ' ').strip()) out_line = ('\t%s\tTotal BUSCO groups searched\n' % self._totalbuscos) summary_file.write(out_line) BuscoAnalysis._logger.info(out_line.replace('\t', ' ').strip()) summary_file.close() def _load_score(self): """ This function loads the score cutoffs file :raises SystemExit: if the scores_cutoff file cannot be read """ try: # open target scores file score_file = open('%sscores_cutoff' % self._lineage_path) except IOError: BuscoAnalysis._logger.error( 'Impossible to read the scores in %sscores_cutoff' % self._lineage_path) raise SystemExit score_dic = {} for entry in score_file: entry = entry.strip().split() try: score_dic[entry[0]] = float(entry[1]) # name : score self._cutoff_dictionary[entry[0]] = {'score': float(entry[1])} except(IndexError, KeyError): pass self._totalbuscos = len(list(self._cutoff_dictionary.keys())) score_file.close() def _load_length(self): """ This function loads the length cutoffs file """ leng_dic = {} sd_dic = {} try: f = open('%slengths_cutoff' % self._lineage_path) except IOError: BuscoAnalysis._logger.error( 'Impossible to read the lengths in %slengths_cutoff' % self._lineage_path) raise SystemExit for line in f: line = line.strip().split() leng_dic[line[0]] = float(line[3]) # legacy sd_dic[line[0]] = float(line[2]) # legacy self._cutoff_dictionary[line[0]]['sigma'] = float(line[2]) # there is an arthropod profile with sigma 0 # that causes a crash on divisions if float(line[2]) == 0.0: self._cutoff_dictionary[line[0]]['sigma'] = 1 self._cutoff_dictionary[line[0]]['length'] = float(line[3]) f.close() def _parse_hmmer(self, hmmer_results_files): """ This function parses the hmmsearch output files and produces the full_table output file :param hmmer_results_files: the list of all output files :type hmmer_results_files: list """ # TODO: replace self._mode ==... by a proper parent-child behavior # that overrides code when needed env = [] everything = {} # all info from hit_dic + lengths for file_name in hmmer_results_files: f = open('%shmmer_output/%s' % (self.mainout, file_name)) hit_dic = {} bit_score_list = [] busco_query = None for line in f: if line.startswith('#'): pass else: line = line.strip().split() if self._mode == 'genome': prot_id = line[0] + '-' + file_name else: prot_id = line[0] busco_query = line[3] bit_score = float(line[7]) bit_score_list.append(bit_score) hmm_start = int(line[15]) hmm_end = int(line[16]) # new protein that passes score cutoff if bit_score >= \ self._cutoff_dictionary[busco_query]['score']: if prot_id not in hit_dic.keys(): hit_dic[prot_id] = [[hmm_start, hmm_end, bit_score]] else: hit_dic[prot_id].append([hmm_start, hmm_end, bit_score]) f.close() length = self._measuring(hit_dic) length_count = 0 if busco_query: if busco_query not in everything: everything[busco_query] = hit_dic else: for part in hit_dic: everything[busco_query][part] = hit_dic[part] for hit in hit_dic: everything[busco_query][hit][0].append( length[length_count]) length_count += 1 # classify genes using sigmas for entry in everything[busco_query]: size = everything[busco_query][entry][0][3] sigma = (self._cutoff_dictionary[busco_query]['length'] - size) / \ self._cutoff_dictionary[busco_query]['sigma'] everything[busco_query][entry][0].append(sigma) everything[busco_query][entry][0].append(file_name) # REFINE CLASSIFICATION # separate complete into multi and single-copy, and keep gene # over 2 sigma in a separate dict is_complete = {} is_fragment = {} is_very_large = {} for thing in everything: for sequence in everything[thing]: all_data = everything[thing][sequence][0] sigma = everything[thing][sequence][0][-2] seq_name = sequence if -2 <= sigma <= 2: if thing not in is_complete: is_complete[thing] = {} is_complete[thing][seq_name] = all_data elif sigma > 2: if thing not in is_fragment: is_fragment[thing] = {} is_fragment[thing][seq_name] = all_data else: if thing not in is_very_large: is_very_large[thing] = {} is_very_large[thing][seq_name] = all_data the_sc = {} the_mc = {} the_fg = {} sc_count = 0 mc_count = 0 fg_count = 0 has_complete_match = [] # filter gene matching two BUSCOs is_complete = self._filter_multi_match_genes(is_complete) is_very_large = self._filter_multi_match_genes(is_very_large) # filter duplicated gene that have a bad ratio compared to # the top scoring match is_complete = self._remove_bad_ratio_genes(is_complete) is_very_large = self._remove_bad_ratio_genes(is_very_large) for entity in is_complete: # single copy if len(is_complete[entity]) == 1: # e.g. BUSCOaEOG7QCM97 the_sc[entity] = is_complete[entity] sc_count += 1 has_complete_match.append(entity) elif len(is_complete[entity]) >= 2: the_mc[entity] = is_complete[entity] mc_count += 1 has_complete_match.append(entity) # consider the very large genes as true findings only # if there is no BUSCO < 2 sigma already found. for entity in is_very_large: if entity not in the_sc and entity not in the_mc: if len(is_very_large[entity]) >= 2: the_mc[entity] = is_very_large[entity] else: the_sc[entity] = is_very_large[entity] has_complete_match.append(entity) for entity in is_fragment: if entity not in has_complete_match: best_fragment_key = list(is_fragment[entity].keys())[0] for fragment_key in list(is_fragment[entity].keys()): # best score if is_fragment[entity][fragment_key][2] > \ is_fragment[entity][best_fragment_key][2]: best_fragment_key = fragment_key fg_count += 1 the_fg[entity] = {best_fragment_key: is_fragment[entity] [best_fragment_key]} sc_count = len(the_sc) mc_count = len(the_mc) env.append(sc_count) env.append(mc_count) env.append(fg_count) out = open('%sfull_table_%s.tsv' % (self.mainout, self._out), 'w') self._write_output_header(out) self._write_full_table_header(out) out_lines = [] not_missing = [] fragmented = [] csc = {} for entity in the_sc: for seq_id in the_sc[entity]: bit_score = the_sc[entity][seq_id][2] seq_len = the_sc[entity][seq_id][3] not_missing.append(entity) if self._mode == 'proteins' or self._mode == 'transcriptome': out_lines.append('%s\tComplete\t%s\t%s\t%s\n' % (entity, self._reformats_seq_id(seq_id), bit_score, seq_len)) elif self._mode == 'genome': scaff = self._split_seq_id(seq_id) out_lines.append( '%s\tComplete\t%s\t%s\t%s\t%s\t%s\n' % (entity, self._reformats_seq_id(scaff['id']), scaff['start'], scaff['end'], bit_score, seq_len)) csc[entity] = seq_id else: BuscoAnalysis._logger.debug(self._mode) raise SystemExit for entity in the_mc: for seq_id in the_mc[entity]: bit_score = the_mc[entity][seq_id][2] seq_len = the_mc[entity][seq_id][3] not_missing.append(entity) if self._mode == 'proteins' or self._mode == 'transcriptome': out_lines.append('%s\tDuplicated\t%s\t%s\t%s\n' % (entity, self._reformats_seq_id(seq_id), bit_score, seq_len)) elif self._mode == 'genome': scaff = self._split_seq_id(seq_id) out_lines.append( '%s\tDuplicated\t%s\t%s\t%s\t%s\t%s\n' % (entity, self._reformats_seq_id(scaff['id']), scaff['start'], scaff['end'], bit_score, seq_len)) for entity in the_fg: for seq_id in the_fg[entity]: bit_score = the_fg[entity][seq_id][2] seq_len = the_fg[entity][seq_id][3] not_missing.append(entity) fragmented.append(entity) if self._mode == 'proteins' or self._mode == 'transcriptome': out_lines.append( '%s\tFragmented\t%s\t%s\t%s\n' % (entity, self._reformats_seq_id(seq_id), bit_score, seq_len)) elif self._mode == 'genome': scaff = self._split_seq_id(seq_id) out_lines.append( '%s\tFragmented\t%s\t%s\t%s\t%s\t%s\n' % (entity, self._reformats_seq_id(scaff['id']), scaff['start'], scaff['end'], bit_score, seq_len)) missing = [] miss_file = open( '%smissing_busco_list_%s.tsv' % (self.mainout, self._out), 'w') self._write_output_header(miss_file) for busco_group in self._cutoff_dictionary: if busco_group in not_missing: pass else: out_lines.append('%s\tMissing\n' % busco_group) missing.append(busco_group) env.append(missing) env.append(fragmented) env.append(csc) for line in sorted(missing): miss_file.write('%s\n' % line) miss_file.close() for line in sorted(out_lines): out.write(line) out.close() return env def _create_main_and_tmp_dirs(self): """ This function creates the run(main) and the temporary directories :raises SystemExit: if a run with the same name already exists and the force option is not set :raises SystemExit: if the user cannot write in the tmp directory """ # final output directory self.mainout = self._root_folder+'run_%s/' % self._out # complain about the -r option if there is no checkpoint.tmp file if not self.get_checkpoint() and self._restart: BuscoAnalysis._logger.warning( 'This is not an uncompleted run that can be restarted') self._restart = False if not os.path.exists(self.mainout) and self._out: try: os.makedirs('%s' % self.mainout) except OSError: BuscoAnalysis._logger.error( 'Cannot write to the output directory, please make sure ' 'you have write permissions to %s' % self.mainout) raise SystemExit else: if not self._force and not self._restart: restart_msg1 = '' restart_msg2 = '' if self.get_checkpoint(): restart_msg1 = ' and seems uncompleted' restart_msg2 = ', or use the -r option to continue '\ 'an uncompleted run' BuscoAnalysis._logger.error( 'A run with that name already exists%s...\n\tIf you ' 'are sure you wish to overwrite existing files, please ' 'use the -f option%s' % (restart_msg1, restart_msg2)) raise SystemExit elif not self._restart: BuscoAnalysis._logger.info( 'Delete the current result folder and start a new run') self._p_open(['rm -rf %s*' % self.mainout], 'bash', shell=True) # create the tmp directory try: if self._tmp != './': if not os.path.exists(self._tmp): os.makedirs(self._tmp) if self._tmp[-1] != '/': self._tmp += '/' BuscoAnalysis._logger.info('Temp directory is %s' % self._tmp) except OSError: BuscoAnalysis._logger.error( 'Cannot write to the temp directory, please make sure ' 'you have write permissions to %s' % self._tmp) raise SystemExit if not os.access(self._tmp, os.W_OK): BuscoAnalysis._logger.error( 'Cannot write to the temp directory, please make sure ' 'you have write permissions to %s' % self._tmp) raise SystemExit def _set_checkpoint(self, nb=None): """ This function update the checkpoint file with the provided id or delete it if none is provided :param nb: the id of the checkpoint :type nb: int """ if nb: open('%scheckpoint.tmp' % self.mainout, 'w').write('%s.%s.%s' % (nb, self._mode, self._random)) else: if os.path.exists('%scheckpoint.tmp' % self.mainout): os.remove('%scheckpoint.tmp' % self.mainout) def _run_tblastn(self, missing_and_frag_only=False, ancestral_variants=False): """ This function runs tblastn :param missing_and_frag_only: to tell whether to blast only missing and fragmented buscos :type missing_and_frag_only: bool :param ancestral_variants: to tell whether to use the ancestral_variants file :type ancestral_variants: bool """ if ancestral_variants: ancestral_sfx = '_variants' else: ancestral_sfx = '' if missing_and_frag_only: self._extract_missing_and_frag_buscos_ancestral(ancestral_variants) output_suffix = '_missing_and_frag_rerun' query_file = \ '%sblast_output/missing_and_frag_ancestral%s' % (self.mainout, ancestral_sfx) else: output_suffix = '' query_file = '%sancestral%s' % (self._lineage_path, ancestral_sfx) if not missing_and_frag_only: BuscoAnalysis._logger.info('Create blast database...') blast_job = self._mkblast.create_job(BuscoAnalysis._logger) blast_job.add_parameter('-in') blast_job.add_parameter('%s' % self._sequences) blast_job.add_parameter('-dbtype') blast_job.add_parameter('nucl') blast_job.add_parameter('-out') blast_job.add_parameter('%s%s%s' % (self._tmp, self._out, self._random)) self._mkblast.run_jobs(self._cpus, BuscoAnalysis._logger) if not os.path.exists('%sblast_output' % self.mainout): os.makedirs('%sblast_output' % self.mainout) BuscoAnalysis._logger.info( 'Running tblastn, writing output to ' '%sblast_output/tblastn_%s%s.tsv...' % (self.mainout, self._out, output_suffix)) tblastn_job = self._tblastn.create_job() tblastn_job.add_parameter('-evalue') tblastn_job.add_parameter(str(self._ev_cutoff)) tblastn_job.add_parameter('-num_threads') if not self._blast_single_core: tblastn_job.add_parameter(str(self._cpus)) else: tblastn_job.add_parameter('1') tblastn_job.add_parameter('-query') tblastn_job.add_parameter(query_file) tblastn_job.add_parameter('-db') tblastn_job.add_parameter('%s%s%s' % (self._tmp, self._out, self._random)) tblastn_job.add_parameter('-out') tblastn_job.add_parameter('%sblast_output/tblastn_%s%s.tsv' % (self.mainout, self._out, output_suffix)) tblastn_job.add_parameter('-outfmt') tblastn_job.add_parameter('7') self._tblastn.run_jobs(1, BuscoAnalysis._logger) # tblastn manages available cpus by itself # check that blast worked if not os.path.exists('%sblast_output/tblastn_%s%s.tsv' % (self.mainout, self._out, output_suffix)): BuscoAnalysis._logger.error('tblastn failed !') raise SystemExit # check that the file is not truncated try: if "processed" not in open('%sblast_output/tblastn_%s%s.tsv' % (self.mainout, self._out, output_suffix), 'r').readlines()[-1]: BuscoAnalysis._logger.error('tblastn has ended prematurely ' '(the result file lacks the expected final line), ' 'which will produce incomplete results in the next steps ! ' 'This problem likely appeared in blast+ 2.4 and ' 'seems not fully fixed in 2.6. ' 'It happens only when using multiple cores. ' 'You can use a single core (-c 1) or downgrade to blast+ 2.2.x, ' 'a safe choice regarding this issue. ' 'See blast+ documentation for more information.') raise SystemExit except IndexError: # if the tblastn result file is empty, for example in phase 2 # if 100% was found in phase 1 pass def _set_rerun_busco_command(self): """ This function sets the command line to call to reproduce this run """ frame = inspect.stack()[-1] if inspect.getmodule(frame[0]) is not None: entry_point = inspect.getmodule(frame[0]).__file__ else: entry_point = '' self._rerun_cmd = 'python %s -i %s -o %s -l %s -m %s -c %s' % (entry_point, self._sequences, self._out, self._lineage_path, self._mode, self._cpus) if self._long: self._rerun_cmd += ' --long' if self._region_limit != BuscoConfig.DEFAULT_ARGS_VALUES['limit']: self._rerun_cmd += ' --limit %s' % self._region_limit if self._tmp != BuscoConfig.DEFAULT_ARGS_VALUES['tmp_path']: self._rerun_cmd += ' -t %s' % self._tmp if self._ev_cutoff != BuscoConfig.DEFAULT_ARGS_VALUES['evalue']: self._rerun_cmd += ' -e %s' % self._ev_cutoff if self._tarzip: self._rerun_cmd += ' -z' def _run_tarzip_hmmer_output(self): """ This function tarzips 'hmmer_output' results folder """ self._p_open(['tar', '-C', '%s' % self.mainout, '-zcf', '%shmmer_output.tar.gz' % self.mainout, 'hmmer_output', '--remove-files'], 'bash', shell=False) def _write_full_table_header(self, out): """ This function adds a header line to the full table file :param out: a full table file :type out: file """ out.write('# Busco id\tStatus\tSequence\tScore\tLength\n') def _reformats_seq_id(self, seq_id): """ This function reformats the sequence id to its original values, if it was somehow modified during the process It has to be overriden by subclasses when needed :param seq_id: the seq id to reformats :type seq_id: str :return: the reformatted seq_id :rtype: str """ return seq_id def _split_seq_id(self, seq_id): """ This function split the provided seq id into id, start and stop :param seq_id: the seq id to split :type seq_id: str :return: a dict containing the id, the start, the end found in seq_id :rtype: dict """ # -2,-1 instead of 0,1, if ':' in the fasta header, same for [,] name = seq_id.replace(']', '').split('[')[-1].split(':')[-2] start = \ seq_id.replace(']', '').split('[')[-1].split(':')[-1].split('-')[0] end = \ seq_id.replace(']', '').split('[')[-1].split(':')[-1].split('-')[1] return {'id': name, 'start': start, 'end': end} def _remove_bad_ratio_genes(self, original): """ This function removes duplicate positive results if the score is above a 0.85 threshold compared to the top scoring match :param original: a dict with BUSCO as key, and a dict of matching genes as values :type original: dict :return: the filtered dict :rtype: dict """ ratio = 0.85 filtered = copy.deepcopy(original) for k in original.keys(): max_score = 0 for k2 in original[k]: if original[k][k2][2] > max_score: max_score = original[k][k2][2] for k2 in original[k]: if original[k][k2][2] < max_score*ratio: del filtered[k][k2] if len(filtered[k]) == 0: del filtered[k] return filtered def _filter_multi_match_genes(self, original): """ This function identifies genes that match the same BUSCO, and keeps the one with the best score only :param original: a dict with BUSCO as key, and a dict of matching genes as values :type original: dict :return: the filtered dict :rtype: dict """ filtered = copy.deepcopy(original) # reorganize the key/value by gene gene_dict = {} for d in original.values(): for key in d.keys(): if key in gene_dict: gene_dict[key].append(d[key]) else: gene_dict[key] = [d[key]] # identify genes belonging to two or more buscos # keep the best score to_keep = {} for key in gene_dict.keys(): if len(gene_dict[key]) > 1: max_score = 0 busco_to_keep = "" for busco in gene_dict[key]: if max_score < busco[2]: max_score = busco[2] busco_to_keep = busco[5][0:11] to_keep[key] = busco_to_keep # clean the original list for k in original.keys(): for k2 in original[k].keys(): if k2 in to_keep.keys(): if to_keep[k2] != k: del filtered[k][k2] if len(filtered[k]) == 0: del filtered[k] return filtered def _measuring(self, nested): """ :param nested: :type nested: :return: :rtype: """ # TODO comment total_len = 0 if isinstance(nested, str): return '0' scaffolds = list(nested.keys()) if len(nested) == 1: total_len = [0] for hit in nested[scaffolds[0]]: total_len[0] += hit[1] - hit[0] elif len(nested) > 1: total_len = [0] * len(nested) for entry in range(0, len(scaffolds)): for hit in nested[scaffolds[entry]]: total_len[entry] += hit[1] - hit[0] return total_len def _get_hmmer_version(self, hmmer_exec_path): """ check the Tool has the correct version :raises SystemExit: if the version is not correct """ hmmer_version = 0.0 try: hmmer_version = subprocess.check_output('%s -h' % hmmer_exec_path, shell=True) hmmer_version = hmmer_version.decode('utf-8') hmmer_version = hmmer_version.split('\n')[1].split()[2] hmmer_version = float(hmmer_version[:3]) except ValueError: # to avoid a crash with super old version and notify the user, # will be useful hmmer_version = subprocess.check_output('%s -h' % hmmer_exec_path, shell=True) hmmer_version = hmmer_version.decode('utf-8') hmmer_version = hmmer_version.split('\n')[1].split()[1] hmmer_version = float(hmmer_version[:3]) finally: return hmmer_version def _check_overlap(self, a, b): """ This function checks whether two regions overlap :param a: first region, start and end :type a: list :param b: second region, start and end :type b: list :return: the number of overlapping positions :rtype: int """ return max(0, min(a[1], b[1]) - max(a[0], b[0])) def _define_boundary(self, a, b): """ This function defines the boundary of two overlapping regions :param a: first region, start and end :type a: list :param b: second region, start and end :type b: list :return: the boundaries, i.e. start and end :rtype: collections.deque """ temp_start = a[0] temp_end = a[1] current_start = b[0] current_end = b[1] boundary = None if temp_start < current_start and temp_end < current_start: # i.e. entry is fully before # append left, IF entry is the first one; otherwise put into # the proper position boundary = deque([a, b]) elif temp_start < current_start <= temp_end <= current_end: # i.e. overlap starts before, but ends inside boundary = deque([temp_start, current_end]) elif current_start <= temp_start <= current_end < temp_end: # overlap starts inside, but ends outside boundary = deque([current_start, temp_end]) elif temp_start > current_end and temp_end > current_end: # i.e. query is fully after # append right; otherwise put into the proper position boundary = deque([b, a]) elif current_start <= temp_start <= temp_end <= current_end: # i.e. query is fully inside, no further operations needed boundary = deque(b) elif temp_start == current_start and temp_end == current_end: boundary = deque(a) elif temp_start <= current_start and temp_end >= current_end: # i.e. query is longer and contains all coordinates # replace by the query boundary = deque(a) return boundary def _gargantua(self, deck): """ :param deck: :type deck: list :return: :rtype: int """ # TODO comment total = 0 for entry in deck: total += entry[1] - entry[0] return total def _p_open(self, cmd, name, shell=False): """ This function call subprocess.Popen for the provided command and logs the results with the provided name NOTE: to replace by pipebricks.Tool when possible :param cmd: the command to execute :type cmd: list :param name: the name to use in the log :type name: str :param shell: whether to use the shell parameter to Popen. Needed if wildcard charcter used (*?). See on web :type shell: bool """ process = subprocess.Popen(cmd, stderr=subprocess.PIPE, stdout=subprocess.PIPE, shell=shell) process_out = process.stderr.readlines() + process.stdout.readlines() for line in process_out: BuscoAnalysis._logger.info_external_tool(name, line.decode("utf-8").strip())