#!/usr/bin/env python
# coding: utf-8
"""
.. module:: GenomeAnalysis
   :synopsis: GenomeAnalysis implements genome analysis specifics
.. versionadded:: 3.0.0
.. versionchanged:: 3.0.0

Copyright (c) 2016-2017, Evgeny Zdobnov (ez@ezlab.org)
Licensed under the MIT license. See LICENSE.md file.
"""

from quast_libs.busco.BuscoAnalysis import BuscoAnalysis
from quast_libs.busco.BuscoConfig import BuscoConfig
from collections import deque
import heapq
import os
from quast_libs.busco.pipebricks.PipeLogger import PipeLogger
from quast_libs.busco.pipebricks.Toolset import Tool
import time
# from overrides import overrides  # useful for dev, but we don't want to force users to install it


class GenomeAnalysis(BuscoAnalysis):
    """
    This class runs a BUSCO analysis on a genome.
    """

    _logger = PipeLogger.get_logger(__name__)

    #
    # magic or public, meant to be accessed by external scripts [instance]
    #

    def __init__(self, params):
        """
        Initialize an instance.
        :param params: Values of all parameters that have to be defined
        :type params: PipeConfig
        """
        # 1) load parameters
        # 2) load and validate tools
        # 3) check data and dataset integrity
        # 4) ready for analysis
        # See the parent __init__ where most of the calls and actions are made.

        # Retrieve the augustus config path, which is mandatory for genome mode.
        # It cannot be specified through the config file because some augustus
        # perl scripts use it as well. BUSCO could export it if absent, but we
        # do not want to mess with the user env, so for now the user is asked
        # to set it.
        self._augustus_config_path = BuscoConfig.nice_path(
            os.environ.get('AUGUSTUS_CONFIG_PATH'))
        self._target_species = params.get('busco', 'species')
        self._augustus_parameters = params.get('busco', 'augustus_parameters')
        self._mode = 'genome'
        super(GenomeAnalysis, self).__init__(params)  # _init_tools is called here
        self._long = params.getboolean('busco', 'long')
        self._flank = self._define_flank()

        # data integrity checks not done by the parent class
        if self.check_nucleotide_file() is False:
            BuscoAnalysis._logger.error('Please provide a nucleotide file as input')
            raise SystemExit

    # @overrides
    def run_analysis(self):
        """
        This function calls all needed steps for running the analysis.
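        The run is organized in two phases ('initial predictions', then
        'predictions using species specific training') of three logged steps
        each; checkpoints written after completed steps let an interrupted
        run be restarted (-r) without redoing those steps.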
""" super(GenomeAnalysis, self).run_analysis() if self._restart: checkpoint = self.get_checkpoint(reset_random_suffix=True) BuscoAnalysis._logger.warning('Restarting an uncompleted run') else: checkpoint = 0 # all steps will be done BuscoAnalysis._logger.info( '****** Phase 1 of 2, initial predictions ******') if checkpoint < 1: BuscoAnalysis._logger.info( '****** Step 1/3, current time: %s ******' % time.strftime("%m/%d/%Y %H:%M:%S")) self._run_tblastn() self._set_checkpoint(1) if checkpoint < 2: BuscoAnalysis._logger.info( '****** Step 2/3, current time: %s ******' % time.strftime("%m/%d/%Y %H:%M:%S")) self._get_coordinates() self._run_augustus() BuscoAnalysis._logger.info( '****** Step 3/3, current time: %s ******' % time.strftime("%m/%d/%Y %H:%M:%S")) self._run_hmmer() self._set_checkpoint(2) self._load_score() self._load_length() if checkpoint == 2 or checkpoint == 3: BuscoAnalysis._logger.info('Phase 1 was already completed.') if checkpoint == 3: self._fix_restart_augustus_folder() self._produce_short_summary() BuscoAnalysis._logger.info( '****** Phase 2 of 2, predictions using species specific training ' '******') if checkpoint < 3: BuscoAnalysis._logger.info( '****** Step 1/3, current time: %s ******' % time.strftime("%m/%d/%Y %H:%M:%S")) if self._has_variants_file: self._run_tblastn(missing_and_frag_only=True, ancestral_variants=True) self._get_coordinates(missing_and_frag_only=True) else: self._run_tblastn(missing_and_frag_only=True, ancestral_variants=False) self._get_coordinates(missing_and_frag_only=True) self._set_checkpoint(3) BuscoAnalysis._logger.info( '****** Step 2/3, current time: %s ******' % time.strftime("%m/%d/%Y %H:%M:%S")) self._rerun_augustus() self._move_retraining_parameters() self.cleanup() if self._tarzip: self._run_tarzip_augustus_output() self._run_tarzip_hmmer_output() # remove the checkpoint, run is done self._set_checkpoint() # @overrides def cleanup(self): """ This function cleans temporary files """ super(GenomeAnalysis, self).cleanup() self._p_open(['rm %s*%s%s_.temp' % (self._tmp, self._out, self._random)], 'bash', shell=True) self._p_open( ['rm %(tmp)s%(abrev)s.*ns? %(tmp)s%(abrev)s.*nin ' '%(tmp)s%(abrev)s.*nhr' % {'tmp': self._tmp, 'abrev': self._out + str(self._random)}], 'bash', shell=True) # @overrides def _check_dataset(self): """ Check if the dataset integrity, if files and folder are present :raises SystemExit: if the dataset miss files or folders """ super(GenomeAnalysis, self)._check_dataset() # prfl folder flag = False for dirpath, dirnames, files in os.walk('%sprfl' % self._lineage_path): if files: flag = True if not flag: BuscoAnalysis._logger.error( 'The dataset you provided lacks elements in %sprfl' % self._lineage_path) raise SystemExit # note: score and length cutoffs are checked when read, # see _load_scores and _load_lengths # ancestral would cause blast to fail, and be detected, see _blast() # dataset.cfg is not mandatory # check whether the ancestral_variants file is present if os.path.exists('%sancestral_variants' % self._lineage_path): self._has_variants_file = True else: self._has_variants_file = False BuscoAnalysis._logger.warning( 'The dataset you provided does not contain the file ' 'ancestral_variants, likely because it is an old version. 

    #
    # public, meant to be accessed by external scripts [class]
    #

    # Nothing

    #
    # methods that should be used as if protected, for internal use [instance]
    # to move to public and rename if meaningful
    #

    # @overrides
    def _init_tools(self):
        """
        Set up the tools needed for a genome analysis, on top of those
        required by the parent class.
        """
        super(GenomeAnalysis, self)._init_tools()  # call to _check_tool_dependencies made here
        self._check_file_dependencies()
        self._augustus = Tool('augustus', self._params)
        self._etraining = Tool('etraining', self._params)
        self._gff2gbSmallDNA_pl = Tool('gff2gbSmallDNA.pl', self._params)
        self._new_species_pl = Tool('new_species.pl', self._params)
        self._optimize_augustus_pl = Tool('optimize_augustus.pl', self._params)

    def _define_flank(self):
        """
        Define the flank size to add on each side of the blast hits when
        defining candidate regions, proportionally to the assembly size.
        :return: the flank size in bp
        """
        try:
            f = open(self._sequences, 'r')
            size = 0
            for line in f:
                if line.startswith('>'):
                    pass
                else:
                    size += len(line.strip())
            size /= 1000  # size in kb
            flank = int(size / 50)  # proportional flank size
            if flank < 5000:
                flank = 5000
            elif flank > BuscoConfig.MAX_FLANK:
                flank = BuscoConfig.MAX_FLANK
            f.close()
        except IOError:
            BuscoAnalysis._logger.error(
                'Impossible to read the fasta file %s' % self._sequences)
            raise SystemExit
        return flank

    def _fix_restart_augustus_folder(self):
        """
        This function resets and checks the augustus folder to make a restart
        possible in phase 2
        :raises SystemExit: if it is not possible to fix the folders
        """
        if os.path.exists('%saugustus_output/predicted_genes_run1' % self.mainout) \
                and os.path.exists('%shmmer_output_run1' % self.mainout):
            self._p_open(['rm', '-fr',
                          '%saugustus_output/predicted_genes' % self.mainout],
                         'bash', shell=False)
            self._p_open(['mv',
                          '%saugustus_output/predicted_genes_run1' % self.mainout,
                          '%saugustus_output/predicted_genes' % self.mainout],
                         'bash', shell=False)
            self._p_open(['rm', '-fr', '%shmmer_output' % self.mainout],
                         'bash', shell=False)
            self._p_open(['mv', '%shmmer_output_run1/' % self.mainout,
                          '%shmmer_output/' % self.mainout],
                         'bash', shell=False)
        elif os.path.exists('%saugustus_output/predicted_genes' % self.mainout) \
                and os.path.exists('%shmmer_output' % self.mainout):
            pass
        else:
            BuscoAnalysis._logger.error(
                'Impossible to restart the run, necessary folders are missing.'
                ' Use the -f option instead of -r')
            raise SystemExit
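
    # Note on the parsing done in _get_coordinates below: the tblastn results
    # are read as standard tabular blast output, i.e. each non-comment line is
    # assumed to hold the columns query id, subject id, % identity, alignment
    # length, mismatches, gap opens, q.start, q.end, s.start, s.end, e-value
    # and bit score; only the ids (0, 1), the alignment length (3), the hit
    # positions (6-9) and the e-value (10) are used.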
    def _get_coordinates(self, missing_and_frag_only=False):
        """
        This function gets the coordinates of candidate regions from the
        tblastn result file
        :param missing_and_frag_only: tell whether to use the
            missing_and_frag_rerun tblastn file
        :type missing_and_frag_only: bool
        """
        if missing_and_frag_only:
            blast_file = open(
                '%sblast_output/tblastn_%s_missing_and_frag_rerun.tsv' %
                (self.mainout, self._out))
        else:
            blast_file = open('%sblast_output/tblastn_%s.tsv' %
                              (self.mainout, self._out))
        BuscoAnalysis._logger.info(
            'Maximum number of candidate contig per BUSCO limited to: %s' %
            self._region_limit)
        BuscoAnalysis._logger.info('Getting coordinates for candidate regions...')

        dic = {}
        coords = {}
        for line in blast_file:
            if line.startswith('#'):
                pass
            else:
                try:
                    line = line.strip().split()
                    # busco_og and contig name, respectively
                    busco_name = line[0]
                    contig = line[1]
                    # busco hit positions
                    busco_start = int(line[6])
                    busco_end = int(line[7])
                    # contig positions
                    contig_start = int(line[8])
                    contig_end = int(line[9])
                    aln_len = int(line[3])  # alignment length
                    blast_eval = float(line[10])  # e-value
                    # for minus-strand genes, invert coordinates for convenience
                    if contig_end < contig_start:
                        temp = contig_end
                        contig_end = contig_start
                        contig_start = temp
                    # create a new entry in the dictionary for the current BUSCO
                    if busco_name not in dic.keys():
                        dic[busco_name] = [contig]
                        coords[busco_name] = {}
                        coords[busco_name][contig] = [contig_start, contig_end,
                                                      deque([[busco_start,
                                                              busco_end]]),
                                                      aln_len, blast_eval]
                    # keep just the top scoring regions, according to the
                    # region limit
                    elif contig not in dic[busco_name] and \
                            len(dic[busco_name]) < self._region_limit:
                        dic[busco_name].append(contig)
                        coords[busco_name][contig] = [contig_start, contig_end,
                                                      deque([[busco_start,
                                                              busco_end]]),
                                                      aln_len, blast_eval]
                    # replace the lowest scoring region if the current one has
                    # a better score.
                    # This is needed because multiple blast queries can have
                    # the same name when using ancestral_variants, and the
                    # tblastn result file is not sorted by e-value.
                    elif contig not in dic[busco_name] and \
                            len(dic[busco_name]) >= self._region_limit:
                        to_replace = None
                        for entry in list(coords[busco_name].keys()):
                            if coords[busco_name][entry][4] > blast_eval:
                                # check if there is already a to_replace entry
                                # and compare the e-values
                                if (to_replace and coords[busco_name][entry][4] >
                                        list(to_replace.values())[0]) or not to_replace:
                                    # use a single-entry dictionary to store
                                    # the id to replace and its e-value
                                    to_replace = \
                                        {entry: coords[busco_name][entry][4]}
                        if to_replace:
                            dic[busco_name].remove(list(to_replace.keys())[0])
                            dic[busco_name].append(contig)
                            coords[busco_name][contig] = [contig_start,
                                                          contig_end,
                                                          deque([[busco_start,
                                                                  busco_end]]),
                                                          aln_len, blast_eval]
                    # contig already listed for this BUSCO: update coordinates
                    elif contig in dic[busco_name]:
                        if contig_start < coords[busco_name][contig][0] and \
                                coords[busco_name][contig][0] - contig_start <= 50000:
                            # starts before, and within 50 kb of the current pos
                            coords[busco_name][contig][0] = contig_start
                            coords[busco_name][contig][2].append([busco_start,
                                                                  busco_end])
                        if contig_end > coords[busco_name][contig][1] and \
                                contig_end - coords[busco_name][contig][1] <= 50000:
                            # ends after, and within 50 kb of the current pos
                            coords[busco_name][contig][1] = contig_end
                            coords[busco_name][contig][3] = busco_end
                            coords[busco_name][contig][2].append([busco_start,
                                                                  busco_end])
                        elif coords[busco_name][contig][1] > contig_start > \
                                coords[busco_name][contig][0]:
                            # starts inside the current coordinates
                            if contig_end < coords[busco_name][contig][1]:
                                # if it ends inside, just add the alignment
                                # positions to the deque
                                coords[busco_name][contig][2].append(
                                    [busco_start, busco_end])
                            # if it ends after the current coordinates, extend
                            elif contig_end > coords[busco_name][contig][1]:
                                coords[busco_name][contig][2][1] = contig_end
                                coords[busco_name][contig][2].append(
                                    [busco_start, busco_end])
                except (IndexError, ValueError):
                    pass
        blast_file.close()

        final_locations = {}
        if missing_and_frag_only:
            # coordinates output file
            out = open(
                '%s/blast_output/coordinates_%s_missing_and_frag_rerun.tsv' %
                (self.mainout, self._out), 'w')
        else:
            # coordinates output file
            out = open('%s/blast_output/coordinates_%s.tsv' %
                       (self.mainout, self._out), 'w')
        for busco_group in coords:
            final_locations[busco_group] = []
            # list of candidate contigs
            candidate_contigs = list(coords[busco_group].keys())
            size_lists = []

            for contig in candidate_contigs:
                potential_locations = coords[busco_group][contig][2]
                max_iterations = len(potential_locations)
                iter_count = 0

                final_regions = []  # nested list of regions
                used_pieces = []
                non_used = []

                while iter_count < max_iterations:
                    currently = potential_locations[iter_count]
                    if len(final_regions) == 0:
                        final_regions.append(currently)
                    else:
                        for region in final_regions:
                            if self._check_overlap(currently, region) != 0:
                                gg = self._define_boundary(currently, region)
                                region_index = final_regions.index(region)
                                final_regions[region_index] = gg
                                used_pieces.append(iter_count)
                            else:
                                non_used.append(iter_count)
                    iter_count += 1

                # done for this contig, now consolidate
                for entry_index in non_used:
                    entry = potential_locations[entry_index]
                    if entry in used_pieces:
                        pass  # already used
                    else:
                        ok = []
                        for region in final_regions:
                            checking = self._check_overlap(entry, region)
                            if checking == 0:
                                # i.e. no overlap
                                pass
                            else:
                                ok.append([region, entry])
                        if len(ok) == 0:
                            # no overlap at all, i.e. a unique boundary
                            final_regions.append(entry)
                        else:
                            region = ok[0][0]
                            currently = ok[0][1]
                            gg = self._define_boundary(currently, region)
                            final_regions[final_regions.index(region)] = gg
                size_lists.append(self._gargantua(final_regions))

            max_size = max(size_lists)
            size_cutoff = int(0.7 * max_size)
            index_passed_cutoffs = heapq.nlargest(int(self._region_limit),
                                                  range(len(size_lists)),
                                                  size_lists.__getitem__)

            for candidate in index_passed_cutoffs:
                if size_lists[candidate] >= size_cutoff:
                    seq_name = candidate_contigs[candidate]
                    seq_start = int(coords[busco_group][seq_name][0]) - self._flank
                    if seq_start < 0:
                        seq_start = 0
                    seq_end = int(coords[busco_group][seq_name][1]) + self._flank
                    final_locations[busco_group].append([seq_name, seq_start,
                                                         seq_end])
                    out.write('%s\t%s\t%s\t%s\n' % (busco_group, seq_name,
                                                    seq_start, seq_end))
        out.close()

    def _extract_scaffolds(self, missing_and_frag_only=False):
        """
        This function extracts the scaffolds that have blast results
        :param missing_and_frag_only: to tell which coordinate file to look
            for, the complete one or the missing-and-fragmented rerun one
        :type missing_and_frag_only: bool
        """
        BuscoAnalysis._logger.info('Pre-Augustus scaffold extraction...')
        if missing_and_frag_only:
            coord = open(
                '%s/blast_output/coordinates_%s_missing_and_frag_rerun.tsv' %
                (self.mainout, self._out))
        else:
            coord = open('%s/blast_output/coordinates_%s.tsv' %
                         (self.mainout, self._out))
        dic = {}
        scaff_list = []
        for i in coord:
            i = i.strip().split()
            if len(i) != 2:
                dic[i[0]] = [i[1], i[2], i[3]]
                if i[1] not in scaff_list:
                    scaff_list.append(i[1])
        coord.close()

        f = open(self._sequences)
        check = 0
        out = None
        contig_len = 0
        contig_id = None
        for i in f:
            if i.startswith('>'):
                i = i.split()
                i = i[0][1:]
                if i in scaff_list:
                    out = open('%s%s%s%s_.temp' % (self._tmp, i, self._out,
                                                   self._random), 'w')
                    out.write('>%s\n' % i)
                    check = 1
                    contig_id = i
                else:
                    check = 0
            elif check == 1:
                out.write(i)
                if i != '\n':
                    contig_len += len(i)
        f.close()
        if out:
            out.close()
        # Keep track of the contig max length; not used yet but might be useful
        self._contig_length.update({contig_id: contig_len})

    def _run_augustus(self):
        """
        This function runs Augustus
        """
        # Run Augustus on all candidate regions
        # 1- Get the temporary sequence files (no multi-fasta support in
        #    Augustus)
        # 2- Build a list with the running commands (for threading)
        # 3- Launch Augustus in parallel using threading
        # 4- Prepare the sequence files to be analysed with HMMer 3.1

        # Extract candidate contigs/scaffolds from the genome assembly.
        # Augustus can't handle multi-fasta files, so each sequence has to be
        # written to its own temporary file.

        # First, delete the augustus result folder, in case we are in
        # restart mode at this point
        self._p_open(['rm -rf %saugustus_output/*' % self.mainout], 'bash',
                     shell=True)

        # get all scaffolds with blast results
        self._extract_scaffolds()

        # Now run Augustus on each candidate region with its respective
        # block-profile
        BuscoAnalysis._logger.info(
            'Running Augustus prediction using %s as species:' %
            self._target_species)
        if self._augustus_parameters:
            BuscoAnalysis._logger.info(
                'Additional parameters for Augustus are %s: ' %
                self._augustus_parameters)
        augustus_log = '%saugustus_output/augustus.log' % self.mainout
        BuscoAnalysis._logger.info(
            '[augustus]\tPlease find all logs related to Augustus errors here: %s' %
            augustus_log)
        if not os.path.exists('%saugustus_output/predicted_genes' % self.mainout):
            os.makedirs('%saugustus_output/predicted_genes' % self.mainout)
        # coordinates of hits by BUSCO
        self._location_dic = {}
        f = open('%s/blast_output/coordinates_%s.tsv' % (self.mainout, self._out))
        for line in f:
            line = line.strip().split('\t')
            scaff_id = line[1]
            scaff_start = line[2]
            scaff_end = line[3]
            group_name = line[0]
            if group_name not in self._location_dic:
                # scaffold, start and end
                self._location_dic[group_name] = [[scaff_id, scaff_start,
                                                   scaff_end]]
            elif group_name in self._location_dic:
                self._location_dic[group_name].append([scaff_id, scaff_start,
                                                       scaff_end])
        f.close()

        # Make a list containing the commands to be executed in parallel
        # with threading.
        for entry in self._location_dic:
            for location in self._location_dic[entry]:
                scaff = location[0] + self._out + str(self._random) + '_.temp'
                scaff_start = location[1]
                scaff_end = location[2]
                output_index = self._location_dic[entry].index(location) + 1
                out_name = '%saugustus_output/predicted_genes/%s.out.%s' % \
                           (self.mainout, entry, output_index)

                augustus_job = self._augustus.create_job()
                augustus_job.add_parameter('--codingseq=1')
                augustus_job.add_parameter('--proteinprofile=%sprfl/%s.prfl' %
                                           (self._lineage_path, entry))
                augustus_job.add_parameter('--predictionStart=%s ' % scaff_start)
                augustus_job.add_parameter('--predictionEnd=%s ' % scaff_end)
                augustus_job.add_parameter('--species=%s' % self._target_species)
                for p in self._augustus_parameters.split(' '):
                    if len(p) > 2:
                        augustus_job.add_parameter(p)
                augustus_job.add_parameter('%s%s' % (self._tmp, scaff))
                augustus_job.stdout_file = [out_name, 'w']
                augustus_job.stderr_file = [augustus_log, 'a']
        self._augustus.run_jobs(self._cpus, BuscoAnalysis._logger)

        # Prepare the sequences for use with HMMer: parse the Augustus output
        # files in 'run_XXXX/augustus_output/predicted_genes' and extract the
        # protein sequences to FASTA files in
        # 'run_XXXX/augustus_output/extracted_proteins'.
        BuscoAnalysis._logger.info('Extracting predicted proteins...')
        files = os.listdir('%saugustus_output/predicted_genes' % self.mainout)
        files.sort()
        for entry in files:
            self._p_open(
                ['sed -i.bak \'1,3d\' %saugustus_output/predicted_genes/%s;'
                 'rm %saugustus_output/predicted_genes/%s.bak' %
                 (self.mainout, entry, self.mainout, entry)],
                'bash', shell=True)
        if not os.path.exists(self.mainout + 'augustus_output/extracted_proteins'):
            os.makedirs('%saugustus_output/extracted_proteins' % self.mainout)

        self._no_predictions = []
        for entry in files:
            self._extract(self.mainout, entry)
            self._extract(self.mainout, entry, aa=False)
        self._p_open(['find %saugustus_output/extracted_proteins -size 0 -delete' %
                      self.mainout], 'bash', shell=True)

    def _set_rerun_busco_command(self):
        """
        This function sets the command line to call to reproduce this run
        """
        super(GenomeAnalysis, self)._set_rerun_busco_command()
        self._rerun_cmd += ' -sp %s' % self._target_species
        if self._augustus_parameters:
            self._rerun_cmd += ' --augustus_parameters \'%s\'' % \
                               self._augustus_parameters
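
    # The retraining pass below proceeds as follows: write GFFs for the
    # complete single-copy BUSCOs found in phase 1, convert them to short
    # GenBank records (gff2gbSmallDNA.pl), create and train a new Augustus
    # species profile (new_species.pl + etraining, plus optimize_augustus.pl
    # when --long is set), then re-run Augustus and hmmsearch on the BUSCOs
    # that are still missing or fragmented.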
    def _rerun_augustus(self):
        """
        This function runs Augustus and hmmsearch for the second time
        """
        # TODO: augustus and augustus rerun should reuse the shared code
        # when possible, and hmmer should not be here
        augustus_log = '%saugustus_output/augustus.log' % self.mainout
        if not os.path.exists('%saugustus_output/gffs' % self.mainout):
            os.makedirs('%saugustus_output/gffs' % self.mainout)
        if not os.path.exists('%saugustus_output/gb' % self.mainout):
            os.makedirs('%saugustus_output/gb' % self.mainout)
        BuscoAnalysis._logger.info(
            'Training Augustus using Single-Copy Complete BUSCOs:')
        BuscoAnalysis._logger.info(
            'Converting predicted genes to short genbank files at %s...' %
            time.strftime("%m/%d/%Y %H:%M:%S"))

        for entry in self._single_copy_files:
            check = 0
            file_name = self._single_copy_files[entry].split('-')[-1]
            target_seq_name = self._single_copy_files[entry].split('[')[0]
            group_name = file_name.split('.')[0]

            # create GFFs containing only the "single-copy" BUSCO sequences
            pred_file = open('%saugustus_output/predicted_genes/%s' %
                             (self.mainout, file_name))
            gff_file = open('%saugustus_output/gffs/%s.gff' %
                            (self.mainout, group_name), 'w')
            BuscoAnalysis._logger.debug(
                'creating %s' % ('%saugustus_output/gffs/%s.gff' %
                                 (self.mainout, group_name)))
            for line in pred_file:
                if line.startswith('# start gene'):
                    pred_name = line.strip().split()[-1]
                    if pred_name == target_seq_name:
                        check = 1
                elif line.startswith('#'):
                    check = 0
                elif check == 1:
                    gff_file.write(line)
            pred_file.close()
            gff_file.close()

            gff2_gb_small_dna_pl_job = self._gff2gbSmallDNA_pl.create_job()
            gff2_gb_small_dna_pl_job.add_parameter(
                '%saugustus_output/gffs/%s.gff' % (self.mainout, entry))
            gff2_gb_small_dna_pl_job.add_parameter('%s' % self._sequences)
            gff2_gb_small_dna_pl_job.add_parameter('1000')
            gff2_gb_small_dna_pl_job.add_parameter(
                '%saugustus_output/gb/%s.raw.gb' % (self.mainout, entry))
            gff2_gb_small_dna_pl_job.stderr_file = [augustus_log, 'a']
            gff2_gb_small_dna_pl_job.stdout_file = \
                gff2_gb_small_dna_pl_job.stderr_file
        self._gff2gbSmallDNA_pl.run_jobs(self._cpus, BuscoAnalysis._logger,
                                         log_it=False)

        BuscoAnalysis._logger.info(
            'All files converted to short genbank files, now running '
            'the training scripts at %s...' %
            time.strftime("%m/%d/%Y %H:%M:%S"))

        # create a new species config file from the template
        new_species_pl_job = self._new_species_pl.create_job()
        # the bacteria clade needs to be flagged as "prokaryotic"
        if self._domain == 'prokaryota':
            new_species_pl_job.add_parameter('--prokaryotic')
        new_species_pl_job.add_parameter('--species=BUSCO_%s%s' %
                                         (self._out, self._random))
        new_species_pl_job.stderr_file = [augustus_log, 'a']
        new_species_pl_job.stdout_file = new_species_pl_job.stderr_file
        self._new_species_pl.run_jobs(self._cpus, BuscoAnalysis._logger, False)

        BuscoAnalysis._logger.debug('concat all gb files...')
        self._p_open(['find %saugustus_output/gb/ -type f -name "*.gb" '
                      '-exec cat {} \; > %saugustus_output/training_set_%s.txt' %
                      (self.mainout, self.mainout, self._out)],
                     'bash', shell=True)

        BuscoAnalysis._logger.debug('run etraining...')
        # train on the new training set (complete single-copy BUSCOs)
        etraining_job = self._etraining.create_job()
        etraining_job.add_parameter('--species=BUSCO_%s%s' %
                                    (self._out, self._random))
        etraining_job.add_parameter('%saugustus_output/training_set_%s.txt' %
                                    (self.mainout, self._out))
        etraining_job.stderr_file = [augustus_log, 'a']
        etraining_job.stdout_file = new_species_pl_job.stderr_file
        self._etraining.run_jobs(self._cpus, BuscoAnalysis._logger, False)

        # long mode (--long) option: runs all the Augustus optimization
        # scripts (adds ~1 day of runtime)
        if self._long:
            BuscoAnalysis._logger.warning(
                'Optimizing augustus metaparameters, this may take a '
                'very long time, started at %s' %
                time.strftime("%m/%d/%Y %H:%M:%S"))
            optimize_augustus_pl_job = self._optimize_augustus_pl.create_job()
            optimize_augustus_pl_job.add_parameter('--cpus=%s' % self._cpus)
            optimize_augustus_pl_job.add_parameter('--species=BUSCO_%s%s' %
                                                   (self._out, self._random))
            optimize_augustus_pl_job.add_parameter(
                '%saugustus_output/training_set_%s.txt' %
                (self.mainout, self._out))
            optimize_augustus_pl_job.stderr_file = [augustus_log, 'a']
            optimize_augustus_pl_job.stdout_file = \
                optimize_augustus_pl_job.stderr_file
            self._optimize_augustus_pl.run_jobs(self._cpus,
                                                BuscoAnalysis._logger, False)
            # train on the new training set (complete single-copy BUSCOs)
            etraining_job = self._etraining.create_job()
            etraining_job.add_parameter('--species=BUSCO_%s%s' %
                                        (self._out, self._random))
            etraining_job.add_parameter('%saugustus_output/training_set_%s.txt' %
                                        (self.mainout, self._out))
            etraining_job.stderr_file = [augustus_log, 'a']
            etraining_job.stdout_file = new_species_pl_job.stderr_file
            self._etraining.run_jobs(self._cpus, BuscoAnalysis._logger, False)

        #######################################################

        # get all scaffolds with blast results
        self._extract_scaffolds(missing_and_frag_only=True)

        BuscoAnalysis._logger.info(
            'Re-running Augustus with the new metaparameters, number of '
            'target BUSCOs: %s' %
            len(self._missing_busco_list + self._fragmented_busco_list))

        augustus_rerun_seds = []

        # Move the first run and create a folder for the rerun
        self._p_open(['mv', '%saugustus_output/predicted_genes' % self.mainout,
                      '%saugustus_output/predicted_genes_run1' % self.mainout],
                     'bash', shell=False)
        os.makedirs('%saugustus_output/predicted_genes' % self.mainout)
        self._p_open(['mv', '%shmmer_output/' % self.mainout,
                      '%shmmer_output_run1/' % self.mainout],
                     'bash', shell=False)
        os.makedirs('%shmmer_output/' % self.mainout)

        # Update the location dict with the new blast search:
        # coordinates of hits by BUSCO
        self._location_dic = {}
        f = open('%s/blast_output/coordinates_%s_missing_and_frag_rerun.tsv' %
                 (self.mainout, self._out))
        for line in f:
            line = line.strip().split('\t')
            scaff_id = line[1]
            scaff_start = line[2]
            scaff_end = line[3]
            group_name = line[0]
            if group_name not in self._location_dic:
                # scaffold, start and end
                self._location_dic[group_name] = [[scaff_id, scaff_start,
                                                   scaff_end]]
            elif group_name in self._location_dic:
                self._location_dic[group_name].append([scaff_id, scaff_start,
                                                       scaff_end])
        f.close()

        for entry in self._missing_busco_list + self._fragmented_busco_list:
            if entry in self._location_dic:
                for location in self._location_dic[entry]:
                    scaff = location[0] + self._out + str(self._random) + '_.temp'
                    scaff_start = location[1]
                    scaff_end = location[2]
                    output_index = self._location_dic[entry].index(location) + 1
                    out_name = '%saugustus_output/predicted_genes/%s.out.%s' % \
                               (self.mainout, entry, output_index)

                    augustus_job = self._augustus.create_job()
                    augustus_job.add_parameter('--codingseq=1')
                    augustus_job.add_parameter('--proteinprofile=%sprfl/%s.prfl' %
                                               (self._lineage_path, entry))
                    augustus_job.add_parameter('--predictionStart=%s ' %
                                               scaff_start)
                    augustus_job.add_parameter('--predictionEnd=%s ' % scaff_end)
                    augustus_job.add_parameter('--species=BUSCO_%s' % self._out +
                                               str(self._random))
                    for p in self._augustus_parameters.split(' '):
                        if len(p) > 2:
                            augustus_job.add_parameter(p)
                    augustus_job.add_parameter('%s%s' % (self._tmp, scaff))
                    augustus_job.stdout_file = [out_name, 'w']
                    augustus_job.stderr_file = [augustus_log, 'a']

                    sed_call = 'sed -i.bak \'1,3d\' %s;rm %s.bak' % (out_name,
                                                                     out_name)
                    augustus_rerun_seds.append(sed_call)

                    out_name = '%shmmer_output/%s.out.%s' % (self.mainout, entry,
                                                             output_index)
                    augustus_fasta = \
                        '%saugustus_output/extracted_proteins/%s.faa.%s' % \
                        (self.mainout, entry, output_index)
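                    # The hmmsearch job assembled below is equivalent to:
                    #   hmmsearch --domtblout <hmmer_output/BUSCO.out.N>
                    #             -o <tmp file> --cpu 1
                    #             <lineage hmms/BUSCO.hmm> <extracted .faa.N>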
                    hmmer_job = self._hmmer.create_job()
                    hmmer_job.add_parameter('--domtblout')
                    hmmer_job.add_parameter('%s' % out_name)
                    hmmer_job.add_parameter('-o')
                    hmmer_job.add_parameter('%stemp_%s%s' % (self._tmp, self._out,
                                                             self._random))
                    hmmer_job.add_parameter('--cpu')
                    hmmer_job.add_parameter('1')
                    hmmer_job.add_parameter('%shmms/%s.hmm' %
                                            (self._lineage_path, entry))
                    hmmer_job.add_parameter('%s' % augustus_fasta)
            else:
                pass
        self._augustus.run_jobs(self._cpus, BuscoAnalysis._logger)
        for sed_string in augustus_rerun_seds:
            self._p_open(['%s' % sed_string], 'bash', shell=True)

        # Extract fasta files from the augustus output
        BuscoAnalysis._logger.info('Extracting predicted proteins...')
        self._no_predictions = []
        for entry in self._missing_busco_list + self._fragmented_busco_list:
            if entry in self._location_dic:
                for location in self._location_dic[entry]:
                    output_index = self._location_dic[entry].index(location) + 1
                    # when _extract gets reworked to not need mainout,
                    # change to out_name
                    plain_name = '%s.out.%s' % (entry, output_index)
                    self._extract(self.mainout, plain_name)
                    self._extract(self.mainout, plain_name, aa=False)
            else:
                pass

        # filter out the jobs that have no augustus prediction
        for job in self._hmmer.jobs_to_run:
            word = job.cmd_line
            target_seq = word[-1].split('/')[-1]
            if target_seq in self._no_predictions:
                self._hmmer.remove_job(job)

        BuscoAnalysis._logger.info('****** Step 3/3, current time: %s ******' %
                                   time.strftime("%m/%d/%Y %H:%M:%S"))
        BuscoAnalysis._logger.info(
            'Running HMMER to confirm orthology of predicted proteins:')
        self._hmmer.run_jobs(self._cpus, BuscoAnalysis._logger)

        # Fuse the run1 and rerun folders
        self._p_open(['mv %saugustus_output/predicted_genes/*.* '
                      '%saugustus_output/predicted_genes_run1/ 2> /dev/null' %
                      (self.mainout, self.mainout)], 'bash', shell=True)
        self._p_open(['mv %shmmer_output/*.* %shmmer_output_run1/ 2> /dev/null' %
                      (self.mainout, self.mainout)], 'bash', shell=True)
        self._p_open(['rm', '-r', '%saugustus_output/predicted_genes' %
                      self.mainout], 'bash', shell=False)
        self._p_open(['rm', '-r', '%shmmer_output' % self.mainout],
                     'bash', shell=False)
        self._p_open(['mv', '%saugustus_output/predicted_genes_run1' %
                      self.mainout,
                      '%saugustus_output/predicted_genes' % self.mainout],
                     'bash', shell=False)
        self._p_open(['mv', '%shmmer_output_run1' % self.mainout,
                      '%shmmer_output' % self.mainout], 'bash', shell=False)

        # Compute the final results
        self._produce_short_summary()

        if len(self._missing_busco_list) == self._totalbuscos:
            BuscoAnalysis._logger.warning(
                'BUSCO did not find any match. Do not forget to check '
                'the file %s to rule out a problem with Augustus' %
                augustus_log)
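        # The values in self._single_copy_files (filled by the parent class
        # from the HMMER results) are assumed to follow the layout
        # '<predicted sequence name>[<coordinates>]-<predicted_genes file name>';
        # the split() calls below rely on that layout.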
        # get the single-copy files as fasta
        if not os.path.exists('%ssingle_copy_busco_sequences' % self.mainout):
            os.makedirs('%ssingle_copy_busco_sequences' % self.mainout)
        BuscoAnalysis._logger.debug('Getting single-copy files...')
        for entry in self._single_copy_files:
            check = 0
            file_name = \
                self._single_copy_files[entry].split('-')[-1].replace('out', 'faa')
            file_name_nucl = \
                self._single_copy_files[entry].split('-')[-1].replace('out', 'fna')
            target_seq_name = self._single_copy_files[entry].split('[')[0]
            group_name = file_name.split('.')[0]
            seq_coord_start = \
                self._single_copy_files[entry].split(']-')[0].split('[')[1]

            pred_fasta_file = open('%saugustus_output/extracted_proteins/%s' %
                                   (self.mainout, file_name))
            single_copy_outfile = open('%ssingle_copy_busco_sequences/%s.faa' %
                                       (self.mainout, group_name), 'w')
            pred_fasta_file_nucl = open('%saugustus_output/extracted_proteins/%s' %
                                        (self.mainout, file_name_nucl))
            single_copy_outfile_nucl = \
                open('%ssingle_copy_busco_sequences/%s.fna' %
                     (self.mainout, group_name), 'w')

            for line in pred_fasta_file:
                if line.startswith('>%s' % target_seq_name):
                    single_copy_outfile.write(
                        '>%s:%s:%s\n' % (group_name, self._sequences,
                                         seq_coord_start))
                    check = 1
                elif line.startswith('>'):
                    check = 0
                elif check == 1:
                    single_copy_outfile.write(line)

            for line in pred_fasta_file_nucl:
                if line.startswith('>%s' % target_seq_name):
                    single_copy_outfile_nucl.write(
                        '>%s:%s:%s\n' % (group_name, self._sequences,
                                         seq_coord_start))
                    check = 1
                elif line.startswith('>'):
                    check = 0
                elif check == 1:
                    single_copy_outfile_nucl.write(line)

            pred_fasta_file.close()
            single_copy_outfile.close()
            pred_fasta_file_nucl.close()
            single_copy_outfile_nucl.close()

    def _move_retraining_parameters(self):
        """
        This function moves the retraining parameters from the augustus
        species folder to the run folder
        """
        if os.path.exists(self._augustus_config_path +
                          ('species/BUSCO_%s%s' % (self._out, self._random))):
            self._p_open(['cp', '-r', '%sspecies/BUSCO_%s%s' %
                          (self._augustus_config_path, self._out, self._random),
                          '%saugustus_output/retraining_parameters' %
                          self.mainout], 'bash', shell=False)
            self._p_open(['rm', '-rf', '%sspecies/BUSCO_%s%s' %
                          (self._augustus_config_path, self._out, self._random)],
                         'bash', shell=False)
        else:
            BuscoAnalysis._logger.warning(
                'Augustus did not produce a retrained species folder, '
                'please check the augustus log file in the run folder '
                'to ensure that nothing went wrong '
                '(%saugustus_output/augustus.log)' % self.mainout)

    # @overrides
    def _run_hmmer(self):
        """
        This function runs hmmsearch.
        :raises SystemExit: if the hmmsearch result folder is empty
            after the run
        """
        BuscoAnalysis._logger.info(
            'Running HMMER to confirm orthology of predicted proteins:')
        files = os.listdir('%saugustus_output/extracted_proteins' % self.mainout)
        files.sort()
        if not os.path.exists(self.mainout + 'hmmer_output'):
            os.makedirs('%shmmer_output' % self.mainout)

        count = 0
        for entry in files:
            if entry.split('.')[-2] == 'faa':
                count += 1
                group_name = entry.split('.')[0]
                group_index = entry.split('.')[-1]
                hmmer_job = self._hmmer.create_job()
                hmmer_job.add_parameter('--domtblout')
                hmmer_job.add_parameter('%shmmer_output/%s.out.%s' %
                                        (self.mainout, group_name, group_index))
                hmmer_job.add_parameter('-o')
                hmmer_job.add_parameter('%stemp_%s%s' % (self._tmp, self._out,
                                                         self._random))
                hmmer_job.add_parameter('--cpu')
                hmmer_job.add_parameter('1')
                hmmer_job.add_parameter('%shmms/%s.hmm' %
                                        (self._lineage_path, group_name))
                hmmer_job.add_parameter('%saugustus_output/extracted_proteins/%s' %
                                        (self.mainout, entry))
        self._hmmer.run_jobs(self._cpus, BuscoAnalysis._logger)

    def _run_tarzip_augustus_output(self):
        """
        This function tarzips the Augustus result folders
        """
        # augustus_output/predicted_genes
        self._p_open(['tar', '-C', '%saugustus_output' % self.mainout,
                      '-zcf', '%saugustus_output/predicted_genes.tar.gz' %
                      self.mainout, 'predicted_genes', '--remove-files'],
                     'bash', shell=False)
        # augustus_output/extracted_proteins
        self._p_open(['tar', '-C', '%saugustus_output' % self.mainout,
                      '-zcf', '%saugustus_output/extracted_proteins.tar.gz' %
                      self.mainout, 'extracted_proteins', '--remove-files'],
                     'bash', shell=False)
        # augustus_output/gb
        self._p_open(['tar', '-C', '%saugustus_output' % self.mainout,
                      '-zcf', '%saugustus_output/gb.tar.gz' % self.mainout,
                      'gb', '--remove-files'], 'bash', shell=False)
        # augustus_output/gffs
        self._p_open(['tar', '-C', '%saugustus_output' % self.mainout,
                      '-zcf', '%saugustus_output/gffs.tar.gz' % self.mainout,
                      'gffs', '--remove-files'], 'bash', shell=False)
        # single_copy_busco_sequences
        self._p_open(['tar', '-C', '%s' % self.mainout,
                      '-zcf', '%ssingle_copy_busco_sequences.tar.gz' %
                      self.mainout, 'single_copy_busco_sequences',
                      '--remove-files'], 'bash', shell=False)

    def _extract(self, path, group, aa=True):
        """
        This function extracts fasta files from the augustus output
        :param path: the path to the BUSCO run folder
        :type path: str
        :param group: the BUSCO group id
        :type group: str
        :param aa: to tell whether to extract the amino acid instead of the
            nucleotide sequence
        :type aa: bool
        """
        ext = 'fna'
        start_str = '# coding sequence'
        end_str = '# protein'
        if aa:
            ext = 'faa'
            start_str = '# protein'
            end_str = '# end'
        count = 0

        group_name = group.split('.')[0]
        try:
            group_index = int(group.split('.')[-1])
        except (IndexError, ValueError):
            # the group id carries no numeric suffix; default to the first output
            group_index = '1'
            group += '.out.1'
        group_index = str(group_index)

        f = open('%saugustus_output/predicted_genes/%s' % (path, group))
        written_check = 0
        check = 0
        while True:
            line = f.readline()
            if not line:
                break
            if line.startswith('# start gene'):
                line = f.readline()
                line = line.split()
                places = [line[0], line[3], line[4]]
            elif line.startswith(start_str):
                line = line.strip().split('[')
                count += 1
                if written_check == 0:
                    out = open('%saugustus_output/extracted_proteins/%s.%s.%s' %
                               (path, group_name, ext, group_index), 'w')
                    written_check = 1
                out.write('>g%s[%s:%s-%s]\n' % (count, places[0], places[1],
                                                places[2]))
                if line[1][-1] == ']':
                    line[1] = line[1][:-1]
                out.write(line[1])
                check = 1
            else:
                if line.startswith('# sequence of block'):
                    check = 0
                elif line.startswith(end_str):
                    check = 0
                    out.write('\n')
                elif check == 1:
                    line = line.split()[1]
                    if line[-1] == ']':
                        line = line[:-1]
                    out.write(line)
        f.close()
        if written_check == 1:
            out.close()
        else:
            self._no_predictions.append('%s.faa.%s' % (group_name, group_index))

    # @overrides
    def _check_tool_dependencies(self):
        """
        Check the dependencies on tools
        :raises SystemExit: if a tool is not available
        """
        super(GenomeAnalysis, self)._check_tool_dependencies()
        # check 'augustus' command availability
        if not Tool.check_tool_available('augustus', self._params):
            BuscoAnalysis._logger.error(
                '"augustus" is not accessible, check its path in the '
                'config.ini.default file (do not include the command in the '
                'path!), and add it to your $PATH environment variable '
                '(the entry in config.ini.default is not sufficient for '
                'retraining to work properly)')
            raise SystemExit
        # check the availability of the augustus-related commands and scripts
        if not Tool.check_tool_available('etraining', self._params):
            BuscoAnalysis._logger.error(
                '"etraining" is not accessible, check its path in the '
                'config.ini.default file (do not include the command in the '
                'path!), and add it to your $PATH environment variable '
                '(the entry in config.ini.default is not sufficient for '
                'retraining to work properly)')
            raise SystemExit
        if not Tool.check_tool_available('gff2gbSmallDNA.pl', self._params):
            BuscoAnalysis._logger.error(
                'Impossible to locate the required script gff2gbSmallDNA.pl. '
                'Check that you properly declared the path to the augustus '
                'scripts folder in your config.ini.default file '
                '(do not include the script name in the path!)')
            raise SystemExit
        if not Tool.check_tool_available('new_species.pl', self._params):
            BuscoAnalysis._logger.error(
                'Impossible to locate the required script new_species.pl. '
                'Check that you properly declared the path to the augustus '
                'scripts folder in your config.ini.default file '
                '(do not include the script name in the path!)')
            raise SystemExit
        if not Tool.check_tool_available('optimize_augustus.pl', self._params):
            BuscoAnalysis._logger.error(
                'Impossible to locate the required script optimize_augustus.pl. '
                'Check that you properly declared the path to the augustus '
                'scripts folder in your config.ini.default file '
                '(do not include the script name in the path!)')
            raise SystemExit

    def _check_file_dependencies(self):
        """
        Check that the dependencies on files and folders are properly
        configured.
        :raises SystemExit: if the Augustus config path is not writable or
            not set at all
        :raises SystemExit: if the Augustus config path does not contain
            the needed species
        """
        try:
            if not os.access(self._augustus_config_path, os.W_OK):
                BuscoAnalysis._logger.error(
                    'Cannot write to the Augustus config path, '
                    'please make sure you have write permissions to %s' %
                    self._augustus_config_path)
                raise SystemExit
        except TypeError:
            BuscoAnalysis._logger.error(
                'The environment variable AUGUSTUS_CONFIG_PATH is not set')
            raise SystemExit

        if not os.path.exists(self._augustus_config_path +
                              '/species/%s' % self._target_species):
            BuscoAnalysis._logger.error(
                'Impossible to locate the species "%s" in the Augustus config'
                ' path (%sspecies), check that AUGUSTUS_CONFIG_PATH is'
                ' properly set and contains this species.'
                '\n\t\tSee the help if you want '
                'to provide an alternative species' %
                (self._target_species, self._augustus_config_path))
            raise SystemExit

    def _write_full_table_header(self, out):
        """
        This function adds a header line to the full table file
        :param out: a full table file
        :type out: file
        """
        out.write('# Busco id\tStatus\tContig\tStart\tEnd\tScore\tLength\n')

    #
    # methods that should be considered as if protected, for internal use [class]
    # to move to public and rename if meaningful
    #

    # Nothing
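
#
# Illustrative usage sketch (an assumption for documentation purposes, not part
# of the pipeline itself): an external driver script, such as the BUSCO/QUAST
# wrapper, is expected to build a PipeConfig-like `params` object exposing the
# 'busco' section read in __init__ above, and then run something like:
#
#     from quast_libs.busco.GenomeAnalysis import GenomeAnalysis
#
#     analysis = GenomeAnalysis(params)  # checks tools, dataset and input file
#     analysis.run_analysis()            # phases 1 and 2, writes the run folder
#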