#!/usr/bin/python
# uniqid.py - Read a source file and replace each definition line with a unique
#             identifier. Store the unique ID and original definition line
#             in a .csv file as a key-value pair.
# Version 7 Feb 2010
# Synopsis:
#     uniqid.py [options] -encode sourcein sourceout csvout
#     uniqid.py [options] -decode textin textout csvin
#
#   -encode (default)  options begin with a dash; filenames do not.
#                      The first three filenames on the command line
#                      are read as sourcein, the original source file;
#                      sourceout, the source file of sequences in which the
#                      definition line is replaced with a unique ID;
#                      and csvout, a comma-separated value file containing
#                      the unique identifier and the corresponding
#                      definition line.
#
#   -decode            options begin with a dash; filenames do not.
#                      The first three filenames on the command line
#                      are read as textin, any text file containing
#                      unique IDs generated from a previous run using
#                      -encode; textout, the output file in which the
#                      unique ID is replaced by the original name, or
#                      the name plus parts of the definition line; csvin,
#                      the csv file generated by a previous run using
#                      -encode.
#
#   -f list_of_fields  similar to -f in the Unix cut
#                      command. A comma-separated list of fields to be
#                      written to textout when decoding files.
#
#   -s separator       separator is a character to use as the separator
#                      when parsing a definition line into fields.
#                      default = " ", a blank space
#
#   -nf string         string is one or more characters to begin the
#                      unique identifier, with which the definition
#                      line is replaced.
#
# Idea for a more general version of the program:
# An option lets you input a regular expression that is used for
# finding the original ID, rather than just hardwiring fasta format
# into the program. The program will still default to search for fasta
# sequence names, but by employing regular expressions, uniqid.py
# can perform substitutions in ANY type of file. Probably not hard
# to implement, either.
# Several of these modules are not used below; the import list is kept
# exactly as it was in the original file.
import operator
import random
import bisect
import math
import sys
import string
import re

# Unique IDs are NAMEFLAG followed by a random integer in [1, _MAX_ID].
_MAX_ID = 9999999


def _unquote(text):
    """Remove leading and trailing quotes from a string.

    If *text* starts with a single or double quote, that quote character is
    stripped from both ends; quotes inside the string are left alone.
    An empty string is returned unchanged.
    """
    if text[:1] in ('"', "'"):
        return text.strip(text[0])
    return text


# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
class Parameters(object):
    """Wrapper class for command line parameters."""

    def __init__(self):
        self.FIELDS = [1]       # list of fields to parse with f option
        self.SEP = " "          # separator for parsing fields from the def. line
        self.NAMEFLAG = "!_"    # all IDs begin with this string
        self.ENCODE = True      # True = -encode (default), False = -decode
        self.SOURCEIN = ""
        self.SOURCEOUT = ""
        self.CSVIN = ""
        self.TEXTIN = ""
        self.TEXTOUT = ""
        self.CSVOUT = ""
        self.CSVSEP = "\t"      # separator between ID and definition in the csv file

    def ReadArgs(self):
        """Read command line arguments (sys.argv) into this Parameters object.

        Recognized options are -f, -s, -nf (each followed by a value),
        -encode and -decode.  Every other argument is taken as a filename.
        The first three filenames fill SOURCEIN/SOURCEOUT/CSVOUT when
        encoding, or TEXTIN/TEXTOUT/CSVIN when decoding; the mode used is
        the one in effect after all arguments have been read.  Additional
        filenames are reported and ignored.  An option given without a
        value is reported and the corresponding default is kept.

        The resulting settings are echoed to standard output.
        """
        args = sys.argv[1:]
        count = len(args)
        filenames = []
        i = 0
        while i < count:
            arg = args[i]
            if arg in ("-f", "-s", "-nf"):
                # These options consume the following argument as a value.
                if i + 1 >= count:
                    print("uniqid.py: No value specified for " + arg)
                    i += 1
                    continue
                value = args[i + 1]
                i += 2
                if arg == "-f":
                    # Fields stay strings here; they are converted with
                    # int() when a definition line is split.
                    self.FIELDS = value.split(',')
                elif arg == "-s":
                    sep = _unquote(value)
                    if sep:
                        self.SEP = sep[0]   # only one character is used
                    else:
                        print("uniqid.py: No value specified for -s")
                else:
                    self.NAMEFLAG = _unquote(value)
                continue
            if arg == "-encode":
                self.ENCODE = True
            elif arg == "-decode":
                self.ENCODE = False
            else:
                filenames.append(arg)
            i += 1

        if self.ENCODE:
            slots = ("SOURCEIN", "SOURCEOUT", "CSVOUT")
        else:
            slots = ("TEXTIN", "TEXTOUT", "CSVIN")
        for slot, filename in zip(slots, filenames):
            setattr(self, slot, filename)
        for extra in filenames[len(slots):]:
            print("uniqid.py: Ignoring extra argument " + extra)

        # Echo the settings that will be used.
        print(self.FIELDS)
        print(self.SEP)
        print(self.NAMEFLAG)
        print(self.ENCODE)
        print(self.SOURCEIN)
        print(self.SOURCEOUT)
        print(self.CSVOUT)
        print(self.TEXTIN)
        print(self.TEXTOUT)
        print(self.CSVIN)


def _new_unique_id(used):
    """Return a random numeric ID string not present in the set *used*.

    The chosen ID is added to *used* before returning.  A set is used so
    that the "already taken?" test stays fast however many IDs exist.
    Raises RuntimeError when every possible ID has been handed out.
    """
    if len(used) >= _MAX_ID:
        raise RuntimeError("uniqid.py: no unused identifiers left")
    while True:
        candidate = str(random.randint(1, _MAX_ID))
        if candidate not in used:
            used.add(candidate)
            return candidate


# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
# For each sequence in sourcein, write the sequence to sourceout
# replacing the definition line with a unique identifier.
# Write the unique identifier and the definition line to csvout
#
def EncodeNames(NAMEFLAG, SOURCEIN, SOURCEOUT, CSVOUT, CSVSEP):
    """Replace each definition line of SOURCEIN with a unique identifier.

    NAMEFLAG  -- string with which every generated identifier begins
    SOURCEIN  -- name of the input file; lines starting with '>' are
                 definition lines
    SOURCEOUT -- name of the output file; a copy of SOURCEIN in which each
                 definition line is replaced by '>' + identifier
    CSVOUT    -- name of the output file receiving one line per definition:
                 identifier, CSVSEP, definition line without the '>'
    CSVSEP    -- separator written between identifier and definition

    Every line is stripped of surrounding whitespace and empty lines are
    not copied.  All three files are closed even if an error occurs.
    """
    used_ids = set()
    with open(SOURCEIN, 'r') as fin:
        with open(SOURCEOUT, 'w') as fout:
            with open(CSVOUT, 'w') as cout:
                for line in fin:
                    line = line.strip()
                    if not line:
                        continue
                    if line.startswith('>'):
                        # new sequence
                        uniqname = NAMEFLAG + _new_unique_id(used_ids)
                        fout.write('>' + uniqname + '\n')
                        # > is not considered as part of the definition
                        # line, so we drop the first char. of the line
                        cout.write(uniqname + CSVSEP + line[1:] + '\n')
                    else:
                        # copy the line to output file
                        fout.write(line + '\n')


def _read_def_lines(CSVIN, CSVSEP):
    """Read unique IDs and definition lines from CSVIN into a dictionary.

    Each line is split at the first CSVSEP only, so a definition may itself
    contain the separator.  Lines without a separator are skipped.
    """
    iddict = {}
    with open(CSVIN, 'r') as cin:
        for line in cin:
            key, found, definition = line.rstrip('\r\n').partition(CSVSEP)
            if found:
                iddict[key] = definition
    return iddict


def _get_fields(definition, fields, sep):
    """Return the requested fields of a definition line, joined by *sep*.

    *fields* holds 1-based field numbers (ints or numeric strings), as for
    the -f option of the Unix cut command.  Field numbers that do not
    exist in the definition line are skipped.
    """
    tokens = definition.split(sep)
    chosen = []
    for field in fields:
        index = int(field) - 1
        if 0 <= index < len(tokens):
            chosen.append(tokens[index])
    return sep.join(chosen)


# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
# For each occurrence of a unique identifier in textin, write the line
# to textout with the identifier replaced by the original name, or by
# the selected fields of the original definition line.
#
def DecodeNames(SEP, NF, FIELDS, TEXTIN, TEXTOUT, CSVIN, CSVSEP="\t"):
    """Replace unique identifiers in TEXTIN with their original names.

    SEP     -- field separator used to split a definition line
    NF      -- string with which every identifier begins
    FIELDS  -- 1-based numbers of the definition-line fields to write
    TEXTIN  -- name of the text file containing identifiers
    TEXTOUT -- name of the output file
    CSVIN   -- name of the csv file written by EncodeNames
    CSVSEP  -- separator used in CSVIN (default: tab)

    An identifier is NF followed by digits.  Each one is matched as a
    whole, so an ID that is a prefix of another (e.g. !_12 and !_123) is
    never substituted inside the longer one.  Identifiers not listed in
    CSVIN are left unchanged.  Every line is stripped of surrounding
    whitespace before it is written.
    """
    # Read in key-value pairs of unique IDs and definition lines
    iddict = _read_def_lines(CSVIN, CSVSEP)
    id_pattern = re.compile(re.escape(NF) + r"\d+")
    restored = {}   # ID -> replacement text, computed once per ID

    def restore(match):
        key = match.group(0)
        if key not in iddict:
            return key
        if key not in restored:
            restored[key] = _get_fields(iddict[key], FIELDS, SEP)
        return restored[key]

    # for each line in the file, replace the ID with the original
    # definition line
    with open(TEXTIN, 'r') as tin:
        with open(TEXTOUT, 'w') as tout:
            for line in tin:
                outputline = line.strip()
                if NF in outputline:
                    print('Original: ' + outputline)
                    outputline = id_pattern.sub(restore, outputline)
                    print('Modified: ' + outputline)
                tout.write(outputline + '\n')


#======================== MAIN PROCEDURE ==========================
def main():
    """Parse the command line and run the encode or decode step."""
    P = Parameters()
    P.ReadArgs()
    if P.ENCODE:
        EncodeNames(P.NAMEFLAG, P.SOURCEIN, P.SOURCEOUT, P.CSVOUT, P.CSVSEP)
    else:
        DecodeNames(P.SEP, P.NAMEFLAG, P.FIELDS, P.TEXTIN, P.TEXTOUT,
                    P.CSVIN, P.CSVSEP)


if __name__ == "__main__":
    main()