#!/usr/bin/env python import birchscript import re import os import os.path import subprocess import sys # mrtrans.csh September 1, 2010 # This is a front end for mrtrans. It makes sure that the names of # the sequences in PROTFILE and DNAFILE are the same, and re-orders # the sequences in DNAFILE, if necessary, to be in the same order # as in PROTFILE. # This script assumes that sequence names in PROTFILE are IDENTICAL to # the corresponding names in DNAFILE. PROTFILE = sys.argv[1] DNAFILE = sys.argv[2] JOBID = str(os.getpid()) # ---------------- Reformat files for use by mrtrans binaries ---------------------------------- # Modified for compatability with both GDE and biolegato. bioLegato # can't read pseudo-Genbank files created by readseq, so output now # goes to a GDE flat file, which both can read. # sed edits out ":CDS" or "_CDS" that is added by FEATURES # tr gets rid of extra > and ASCII 128 added by mrtrans # sed lops off the comma and extra junk readseq adds to the name line # including the " from >name..." that needs to be removed. # mrtrans has been modified to allow sequence names longer than # 9 char. Install mrtrans from mrtrans.tar. # Finally, we have to translate the protein sequence to # uppercase, which is required by mrtrans. #readseq -a -f=Pearson -pipe $DNAFILE |sed "s/[:_]CDS/_/" |sed "s/,.*//" | cut -f1 -d" " > $JOBID.dna; #grep '>' $JOBID.dna | cut -c2-80 | cut -f1 -d" " > $JOBID.dna.nam dnanam_lines = [] p_dna = subprocess.Popen(["readseq", "-a", "-f=Pearson", "-pipe", DNAFILE ], stdout=subprocess.PIPE) for line in p_dna.stdout: line = re.sub(",.*", "", re.sub("[:_]CDS", "_", line)).split(" ")[0] if len(line) > 0 and line[0] == '>': dnanam_lines += line[1:] #readseq -a -f=Pearson -C -pipe $PROTFILE |sed "s/[:_]CDS/_/" | sed "s/,.*//" >$JOBID.prot; #grep '>' $JOBID.prot | cut -c2-80 | cut -f1 -d" " > $JOBID.pro.nam pronam_lines = [] missing_lines = [] p_pro = subprocess.Popen(["readseq", "-a", "-f=Pearson", "-C", "-pipe", PROTFILE ], stdout=subprocess.PIPE) for line in p_pro.stdout: line = re.sub(",.*", "", re.sub("[:_]CDS", "_", line)) if len(line) > 0 and line[0] == '>': pronam_lines += line[1:] # ----------------- Make sure names are consistent between DNA and Protein files ------------------- #Make a list of sequence names from each file. #sort < $JOBID.pro.nam > $JOBID.pro.nam.sorted #sort < $JOBID.dna.nam > $JOBID.dna.nam.sorted #comm -23 $JOBID.pro.nam.sorted $JOBID.dna.nam.sorted > $JOBID.missing if line not in dnanam_lines: missing_lines += line #p_pro.close() print "Making sure all names in protein file have a counterpart in dna file..." if len(missing_lines) == 0: # All protein sequences have a DNA counterpart. # Re-order the DNA file into the same order as the protein file. # A $JOBID.dna.num is the same as $JOBID.dna.nam, except that # the former begins with a line number for each name in the # latter. For each protein sequence, the appropriate line # number is chosen using grep, and readseq pulls out that # sequence from $DNAFILE. print "Re-ordering the DNA file to correspond to order of sequences" print "in protein file." dna_jobid_lines = [] # cat -n $JOBID.dna.nam |tr -s " " " " > $JOBID.dna.num line_number = 1 for line in dnanam_lines: dna_jobid_lines += re.sub(" +", " ", line) line_number += 1 dnanam_lines = None h_dnareordered = open(JOBID + ".dna.reordered", "a") for name in pronam_lines: #set LINENUM = `grep -w "$name" $JOBID.dna.num | cut -f1 -d" " |tr -d ' ' ` LINENUM = 1 for line in dna_jobid_lines: if re.search("\b" + name + "\b", line): break else: LINENUM += 1 subprocess.call(["readseq", "-pipe", "-C", "-i" + LINENUM, "-fPearson", JOBID + ".dna"], stdout=h_dnareordered) h_dnareordered.close() else: print 'The following sequences are present in the protein file' print 'but missing from the DNA file:' sys.stdout.writelines(missing_lines) # ------------------------ Run mrtrans binary and cleanup output so bioLegato can read it #mrtrans $JOBID.prot $JOBID.dna.reordered | sed "s/> from >/>/" |tr -d '' | readseq -pipe -a -f8 | sed "s/>>/#/"|sed 's/ from.*//' > $JOBID.flat; p_mrtrans = subprocess.Popen(["mrtrans", JOBID + ".prot", JOBID + ".dna.reordered"], stdout=subprocess.PIPE) p_readseq = subprocess.Popen(["readseq", "-pipe", "-a", "-f8"], stdin=subprocess.PIPE, stdout=subprocess.PIPE) for line in p_mrtrans.stdout: #re.sub("> from >", ">", line).translate(None, '') re.sub("> from >", ">", line).translate(None, chr(128)) p_readseq.stdin.write(line) p_readseq.stdin.close() h_output = open(JOBID + ".flat", "w") for line in p_readseq.stdout: h_output.write(re.sub(" from.*", "", re.sub(">>", "#", line))) h_output.close() # Launch bioLegato in the background ls_dir = [filename for filename in os.listdir(os.getcwd()) if os.path.isfile(filename) and os.path.splitext(filename)[0] == JOBID] birchscript.Cleanrun([["blnalign", JOBID + ".flat"]], ls_dir, True) # Delete temporary files #$RM_CMD $JOBID.prot $JOBID.dna* $JOBID.*.nam $JOBID.*.sorted $JOBID.missing