#!/usr/bin/env python3 # coding: utf-8 """ .. module:: TranscriptomeAnalysis :synopsis:TranscriptomeAnalysis implements genome analysis specifics .. versionadded:: 3.0.0 .. versionchanged:: 3.0.0 Copyright (c) 2016-2017, Evgeny Zdobnov (ez@ezlab.org) Licensed under the MIT license. See LICENSE.md file. """ import os import time from quast_libs.busco.BuscoAnalysis import BuscoAnalysis from quast_libs.busco.pipebricks.PipeLogger import PipeLogger # from overrides import overrides # useful fro dev, but don't want all user to install this class TranscriptomeAnalysis(BuscoAnalysis): """ Analysis on a transcriptome. """ _logger = PipeLogger.get_logger(__name__) # # magic or public, meant to be accessed by external scripts [instance] # def __init__(self, params): """ Initialize an instance. :param params: Values of all parameters that have to be defined :type params: BuscoConfig """ self._mode = 'transcriptome' super(TranscriptomeAnalysis, self).__init__(params) self._transcriptome_by_scaff = {} # data integrity checks not done by the parent class if self.check_nucleotide_file() is False: TranscriptomeAnalysis._logger.error('Please provide a nucleotide file as input') raise SystemExit # @overrides def run_analysis(self): """ This function calls all needed steps for running the analysis. """ super(TranscriptomeAnalysis, self).run_analysis() if self._restart: checkpoint = self.get_checkpoint( reset_random_suffix=True) TranscriptomeAnalysis._logger.warning( 'Restarting an uncompleted run') else: checkpoint = 0 # all steps will be done if checkpoint < 1: TranscriptomeAnalysis._logger.info( '****** Step 1/2, current time: %s ******' % time.strftime("%m/%d/%Y %H:%M:%S")) if self._has_variants_file: self._run_tblastn( ancestral_variants=True) else: self._run_tblastn( ancestral_variants=False) self._set_checkpoint(1) TranscriptomeAnalysis._logger.info( '****** Step 2/2, current time: %s ******' % time.strftime("%m/%d/%Y %H:%M:%S")) self._load_score() self._load_length() self._get_coordinates() self._run_hmmer() self._produce_short_summary() self.cleanup() if self._tarzip: self._run_tarzip_hmmer_output() self._run_tarzip_translated_proteins() # remove the checkpoint, run is done self._set_checkpoint() # @overrides def cleanup(self): """ This function cleans temporary files. """ super(TranscriptomeAnalysis, self).cleanup() self._p_open(['rm %s*%s%s_.temp' % (self._tmp, self._out, self._random)], 'bash', shell=True) self._p_open(['rm %(tmp)s%(abrev)s.*ns? %(tmp)s%(abrev)s.*nin %(tmp)s%(abrev)s.*nhr' % {'tmp': self._tmp, 'abrev': self._out + str(self._random)}], 'bash', shell=True) # @overrides def _check_dataset(self): """ Check if the dataset integrity, if files and folder are present :raises SystemExit: if the dataset miss files or folders """ super(TranscriptomeAnalysis, self)._check_dataset() # note: score and length cutoffs are checked when read, # see _load_scores and _load_lengths # ancestral would cause blast to fail, and be detected, see _blast() # dataset.cfg is not mandatory # check whether the ancestral_variants file is present if os.path.exists('%sancestral_variants' % self._lineage_path): self._has_variants_file = True else: self._has_variants_file = False BuscoAnalysis._logger.warning( 'The dataset you provided does not contain the file ' 'ancestral_variants, likely because it is an old version. ' 'All blast steps will use the file ancestral instead') # # public, meant to be accessed by external scripts [class] # # Nothing # # method that should be used as if protected, for internal use [instance] # to move to public and rename if meaningful # def _sixpack(self, seq): """ Gets the sixframe translation for the provided sequence :param seq: the sequence to be translated :type seq: str :return: the six translated sequences :rtype: list """ s1 = seq s2 = seq[1:] s3 = seq[2:] rev = '' for letter in seq[::-1]: try: rev += BuscoAnalysis.COMP[letter] except KeyError: rev += BuscoAnalysis.COMP['N'] r1 = rev r2 = rev[1:] r3 = rev[2:] transc = [] frames = [s1, s2, s3, r1, r3, r2] for sequence in frames: part = '' new = '' for letter in sequence: if len(part) == 3: try: new += BuscoAnalysis.CODONS[part] except KeyError: new += 'X' part = '' part += letter else: part += letter if len(part) == 3: try: new += BuscoAnalysis.CODONS[part] except KeyError: new += 'X' transc.append(new) return transc def _reformats_seq_id(self, seq_id): """ This function reformats the sequence id to its original values :param seq_id: the seq id to reformats :type seq_id: str :return: the reformatted seq_id :rtype: str """ return "_".join(seq_id.split('_')[:-1]) def _get_coordinates(self): """ This function gets coordinates for candidate regions from tblastn result file """ TranscriptomeAnalysis._logger.info('Maximum number of candidate transcript per BUSCO limited to: %s' % self._region_limit) TranscriptomeAnalysis._logger.info( 'Getting coordinates for candidate transcripts...') # open input file f = open('%sblast_output/tblastn_%s.tsv' % (self.mainout, self._out)) transcriptome_by_busco = {} self._transcriptome_by_scaff = {} maxi = 0 for i in f: # get a dictionary of BUSCO matches vs candidate scaffolds if i.startswith('#'): pass else: line = i.strip().split() if self._has_variants_file: busco = '_'.join(line[0].split("_")[:-1]) # This pattern else: # can support busco = line[0] # name like scaff = line[1] # EOG00_1234_1 leng = int(line[3]) blast_eval = float(line[10]) if busco not in transcriptome_by_busco.keys(): # Simply add it # Use a single entry dict to keep scaffs id and their # blast eval, for each busco transcriptome_by_busco[busco] = [{scaff: blast_eval}] # and keep a list of each busco by scaff try: self._transcriptome_by_scaff[scaff].append(busco) except KeyError: self._transcriptome_by_scaff[scaff] = [busco] maxi = leng elif len(transcriptome_by_busco[busco]) < self._region_limit \ and leng >= 0.7 * maxi: # check that this transcript is not already in, and update # its eval if needed add = True for scaff_dict in transcriptome_by_busco[busco]: if list(scaff_dict.keys())[0] == scaff: add = False # update the eval for this scaff if blast_eval < list(scaff_dict.values())[0]: scaff_dict[scaff] = blast_eval if add: transcriptome_by_busco[busco].append( {scaff: blast_eval}) try: self._transcriptome_by_scaff[scaff].append(busco) except KeyError: self._transcriptome_by_scaff[scaff] = [busco] if leng > maxi: maxi = leng elif len(transcriptome_by_busco[busco]) >= self._region_limit \ and leng >= 0.7 * maxi: # replace the lowest scoring transcript if the current has # a better score. needed because of multiple blast query # having the same name when using ancestral_variants and # not sorted by eval in the tblastn result file to_replace = None # Define if something has to be replaced for entry in transcriptome_by_busco[busco]: if list(entry.values())[0] > blast_eval: # check if there is already a to_replace entry and # compare the eval if (to_replace and list(entry.values())[0] > list(to_replace.values())[0]) or \ not to_replace: to_replace = {list(entry.keys())[0]: list(entry.values())[0]} if to_replace: # try to add the new one # check that this scaffold is not already in, # and update the eval if needed # if the scaff was already in, do not replace # the to_replace entry to keep the max number of # candidate regions add = True for scaff_dict in transcriptome_by_busco[busco]: if list(scaff_dict.keys())[0] == scaff: add = False if blast_eval < list(scaff_dict.values())[0]: # update the eval for this scaff scaff_dict[scaff] = blast_eval if add: # add the new one transcriptome_by_busco[busco].append({ scaff: blast_eval}) try: self._transcriptome_by_scaff[scaff].append( busco) except KeyError: self._transcriptome_by_scaff[scaff] = [busco] # remove the old one for entry in transcriptome_by_busco[busco]: if list(entry.keys())[0] == \ list(to_replace.keys())[0]: scaff_to_remove = list(entry.keys())[0] break transcriptome_by_busco[busco].remove(entry) for entry in self._transcriptome_by_scaff[scaff_to_remove]: if entry == busco: break self._transcriptome_by_scaff[scaff_to_remove].remove(entry) if leng > maxi: maxi = leng TranscriptomeAnalysis._logger.info( 'Extracting candidate transcripts...') f = open(self._sequences) check = 0 out = None for i in f: if i.startswith('>'): i = i.strip().split() i = i[0][1:] if i in list(self._transcriptome_by_scaff.keys()): out = open('%s%s%s%s_.temp' % (self._tmp, i, self._out, self._random), 'w') out.write('>%s\n' % i) check = 1 else: check = 0 elif check == 1: out.write(i) f.close() if out: out.close() if not os.path.exists('%stranslated_proteins' % self.mainout): os.makedirs('%stranslated_proteins' % self.mainout) files = os.listdir(self._tmp) files.sort() lista = [] for entry in files: if entry.endswith(self._out + str(self._random) + '_.temp'): lista.append(entry) TranscriptomeAnalysis._logger.info( 'Translating candidate transcripts...') for entry in lista: raw_seq = open(self._tmp + entry) # this works even if the runname is in the header name = self._out.join(entry.replace('_.temp', '').split(self._out)[:-1]) trans_seq = open(self.mainout + 'translated_proteins/' + name + '.faa', 'w') nucl_seq = '' header = '' for line in raw_seq: if line.startswith('>'): header = line.strip() + '_' else: nucl_seq += line.strip() seq_count = 0 for translation in self._sixpack(nucl_seq): seq_count += 1 trans_seq.write( '%s%s\n%s\n' % (header, seq_count, translation)) raw_seq.close() trans_seq.close() # open target scores file f2 = open('%sscores_cutoff' % self._lineage_path) # Load dictionary of HMM expected scores and full list of groups score_dic = {} for i in f2: i = i.strip().split() try: # float values: [1]=mean; [2]=minimum score_dic[i[0]] = float(i[1]) except IndexError: pass f2.close() self._totalbuscos = len(list(score_dic.keys())) # # method that should be considered as if protected, for internal use [class] # to move to public and rename if meaningful # # Nothing def _run_tarzip_translated_proteins(self): """ This function tarzips results folder """ # translated_proteins self._p_open(['tar', '-C', '%s' % self.mainout, '-zcf', '%stranslated_proteins.tar.gz' % self.mainout, 'translated_proteins', '--remove-files'], 'bash', shell=False) # @overrides def _run_hmmer(self): """ This function runs hmmsearch. """ TranscriptomeAnalysis._logger.info( 'Running HMMER to confirm transcript orthology:') files = os.listdir('%stranslated_proteins/' % self.mainout) files.sort() if not os.path.exists('%shmmer_output' % self.mainout): os.makedirs('%shmmer_output' % self.mainout) count = 0 busco_index = {} for f in files: if f.endswith('.faa'): count += 1 scaff = f[:-4] scaff_buscos = self._transcriptome_by_scaff[scaff] for busco in scaff_buscos: try: busco_index[busco] += 1 except KeyError: busco_index[busco] = 1 hmmer_job = self._hmmer.create_job() hmmer_job.add_parameter('--domtblout') hmmer_job.add_parameter('%shmmer_output/%s.out.%s' % (self.mainout, busco, busco_index[busco])) hmmer_job.add_parameter('-o') hmmer_job.add_parameter('%stemp_%s%s' % (self._tmp, self._out, str(self._random))) hmmer_job.add_parameter('--cpu') hmmer_job.add_parameter('1') hmmer_job.add_parameter('%shmms/%s.hmm' % (self._lineage_path, busco)) hmmer_job.add_parameter('%stranslated_proteins/%s' % (self.mainout, f)) # Run hmmer self._hmmer.run_jobs(self._cpus)