#!/usr/bin/env python3
"""
June 18, 2020, Dr. Brian Fristensky, University of Manitoba

uniqid.py - Read a source file and replace each definition line with a
    unique identifier. Store the unique ID and original definition line
    in a .csv file as a key-value pair.

Synopsis: uniqid.py [options] --encode sourcein sourceout csvout
          uniqid.py [options] --encodesame textin textout csvin
          uniqid.py [options] --decode textin textout csvin

    options begin with a dash; filenames do not

    --encode (default)
        The first three filenames on the command line are read as
        sourcein, the original source file; sourceout, the source file
        in which each definition line is replaced with a unique ID; and
        csvout, a comma-separated value file containing the unique
        identifier and the corresponding definition line.

    --encodesame
        Encode another file, substituting the same random names from a
        previous run using --encode. This makes it possible to encode
        two or more files using the same random names, so that all
        output files generated can be decoded with a single csv file.
        The first three filenames on the command line are read as
        sourcein, the original source file; sourceout, the source file
        in which each definition line is replaced with a unique ID
        generated previously by --encode; and csvin, a comma-separated
        value file containing the unique identifier and the
        corresponding definition line.

    --decode
        The first three filenames on the command line are read as
        textin, any text file containing unique IDs generated by a
        previous run using --encode; textout, the output file in which
        each unique ID is replaced by the original name, or the name
        plus parts of the definition line; and csvin, the csv file
        generated by a previous run using --encode.

    -f list_of_fields
        Similar to -f in the Unix cut command. A comma-separated list
        of fields to be written to textout when decoding files.

    -s separator
        separator is a character to use as the separator when parsing
        a definition line into fields. Default = " ", a blank space.

    -nf string
        string is one or more characters that begin the unique
        identifier with which the definition line is replaced.
        (default '!_')

Idea for a more general version of the program: an option lets you
input a regular expression that is used for finding the original ID,
rather than hardwiring fasta format into the program. The program
would still default to searching for fasta sequence names, but by
employing regular expressions, uniqid.py could perform substitutions
in ANY type of file. Probably not hard to implement, either.

@modified: October 28 2020
@author: Dale Hamel
@contact: brian.fristensky@umanitoba.ca
"""

import argparse
import bisect
import math
import operator
import os
import random
import re
import string
import sys

blib = os.environ.get("BIRCHPYLIB")
sys.path.append(blib)
from birchlib import Birchmod
from birchlib import Argument

PROGRAM = "uniqid.py: "
USAGE = "\n\t USAGE: uniqid.py [options] --encode sourcein sourceout csvout\n\t\tuniqid.py [options] --decode textin textout csvin"

BM = Birchmod(PROGRAM, USAGE)

DEBUG = True

# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -


class Parameters:
    """Wrapper class for command line parameters"""

    def __init__(self):
        """
        Initializes arguments:
            FIELDS = [1]
            SEP = " " (whitespace)
            NAMEFLAG = "!_"
            TASK = ""
            SOURCEIN = ""
            SOURCEOUT = ""
            CSVIN = ""
            TEXTIN = ""
            TEXTOUT = ""
            CSVOUT = ""
            CSVSEP = "\\t"
        Then calls read_args() to fill in their values from the
        command line.
        """
        self.FIELDS = [1]  # list of fields to parse with the -f option
        self.SEP = " "     # separator for parsing fields from the definition line
        self.NAMEFLAG = "!_"  # all IDs begin with this string
        self.TASK = ""
        self.SOURCEIN = ""
        self.SOURCEOUT = ""
        self.CSVIN = ""
        self.TEXTIN = ""
        self.TEXTOUT = ""
        self.CSVOUT = ""
        self.CSVSEP = "\t"
        self.read_args()

    def unquote(self, S):
        """
        Remove leading and trailing quotes from a string.
        @param S: The string to clean up
        @type S: str
        """
        if not S == "":
            if S.startswith('"'):
                S = S.replace('"', '')
            else:
                S = S.replace("'", "")
        return S

    def read_args(self):
        """
        Read command line arguments into a Parameters object.
        """
        parser = argparse.ArgumentParser()
        parser.add_argument("-f", action="store", default="",
                            help="comma separated list of field numbers")
        parser.add_argument("-nf", action="store", default="",
                            help="prefix for randomized IDs")
        parser.add_argument("-s", action="store", default="",
                            help="field separator for csv files")
        task = parser.add_mutually_exclusive_group()
        task.add_argument("--encode", action="store_true")
        task.add_argument("--decode", action="store_true")
        task.add_argument("--encodesame", action="store_true")
        parser.add_argument("infile", action="store", default="",
                            help="input file")
        parser.add_argument("outfile", action="store", default="",
                            help="output file")
        parser.add_argument("csvfile", action="store", default="",
                            help="csv file")

        try:
            args = parser.parse_args()
            if not args.f == "":
                self.FIELDS = args.f.split(",")
            if not args.nf == "":
                self.NAMEFLAG = self.unquote(args.nf)
            if not args.s == "":
                self.SEP = self.unquote(args.s)
            if args.decode:
                self.TASK = "decode"
                self.TEXTIN = args.infile
                self.TEXTOUT = args.outfile
                self.CSVIN = args.csvfile
            elif args.encodesame:
                self.TASK = "encodesame"
                self.TEXTIN = args.infile
                self.TEXTOUT = args.outfile
                self.CSVIN = args.csvfile
            else:
                # --encode is the documented default when no task flag is given
                self.TASK = "encode"
                self.SOURCEIN = args.infile
                self.SOURCEOUT = args.outfile
                self.CSVOUT = args.csvfile
        except ValueError:
            BM.printusage()

        if DEBUG:
            print("FIELDS: " + str(self.FIELDS))
            print("SEP: " + self.SEP)
            print("NAMEFLAG: " + self.NAMEFLAG)
            print("TASK: " + self.TASK)
            print("SOURCEIN: " + self.SOURCEIN)
            print("SOURCEOUT: " + self.SOURCEOUT)
            print("CSVOUT: " + self.CSVOUT)
            print("TEXTIN: " + self.TEXTIN)
            print("TEXTOUT: " + self.TEXTOUT)
            print("CSVIN: " + self.CSVIN)

# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -


def EncodeNames(NAMEFLAG, SOURCEIN, SOURCEOUT, CSVOUT, CSVSEP):
    """
    For each sequence in sourcein, write the sequence to sourceout,
    replacing the definition line with a unique identifier. Write the
    unique identifier and the definition line to csvout.
    """

    def GetUniqID(NAMELIST):
        """
        Choose a random number and make sure it hasn't already been
        recorded in NAMELIST. If we just kept appending to NAMELIST,
        checking whether a number was already used would become
        inefficient for large lists. We keep adding numbers to the list
        so that it stays sorted: bisect.insort_left inserts each new
        value into an already-sorted list to the left of the
        next-highest value, so the list always stays sorted.
        See http://www.doughellmann.com/PyMOTW/bisect/index.html
        """
        DONE = False
        ID = str(random.randint(1, 9999999))
        while not DONE:
            if ID in NAMELIST:
                ID = str(random.randint(1, 9999999))
            else:
                bisect.insort_left(NAMELIST, ID)
                DONE = True
        return ID

    # create a dummy file just so that we have a file to close
    # the first time the loop is executed.
    # This also takes care of files in which the first sequence
    # begins after the first line.
    try:
        FIN = open(SOURCEIN, 'r')
    except:
        BM.file_error(SOURCEIN)
    FOUT = open(SOURCEOUT, 'w')
    COUT = open(CSVOUT, 'w')

    NAMELIST = []
    for LINE in FIN:
        LINE = LINE.strip()
        if len(LINE) > 0:
            if LINE[0] == '>':
                # new sequence
                UNIQNAME = NAMEFLAG + GetUniqID(NAMELIST)
                FOUT.write('>' + UNIQNAME + '\n')
                # > is not considered part of the definition line,
                # so we drop the first char. of LINE
                COUT.write(UNIQNAME + CSVSEP + LINE[1:] + '\n')
            else:
                # copy the line to the output file
                FOUT.write(LINE + '\n')
    FIN.close()
    FOUT.close()
    COUT.close()

# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -


def DecodeNames(SEP, NF, FIELDS, TEXTIN, TEXTOUT, CSVIN):
    """
    For each occurrence of a unique ID in textin, replace the ID with
    the requested fields of the corresponding definition line read
    from csvin, and write the result to textout.
    """
    try:
        TIN = open(TEXTIN, 'r')
    except:
        BM.file_error(TEXTIN)
    TOUT = open(TEXTOUT, 'w')

    def ReadDefLines(CSVIN):
        """
        Read unique IDs and the corresponding definition lines from
        CSVIN into a dictionary.
        @param CSVIN: The name of the file containing the CSV data
        @type CSVIN: str
        """
        DICT = {}
        CIN = open(CSVIN, 'r')
        for LINE in CIN:
            TOKENS = LINE.strip().split('\t')
            DICT[TOKENS[0]] = TOKENS[1]
        CIN.close()
        return DICT

    def GetFields(DEF, FIELDS, SEP):
        """
        Get one or more fields from a definition line, using SEP as
        the field separator.
        @param DEF: The definition line
        @type DEF: str
        @param FIELDS: A list of fields
        @type FIELDS: list
        @param SEP: The string to split on
        @type SEP: str
        """
        STR = ""
        TOKENS = DEF.split(SEP)
        if len(TOKENS) > 0:
            # Join the requested fields (1-based, as in Unix cut) with SEP.
            STR = TOKENS[int(FIELDS[0]) - 1]
            J = 1
            while J < len(FIELDS):
                STR = STR + SEP + TOKENS[int(FIELDS[J]) - 1]
                J = J + 1
        return STR

    # Read in key-value pairs of unique IDs and definition lines
    IDDICT = ReadDefLines(CSVIN)

    # for each line in the file, replace the ID with the original
    # definition line
    for LINE in TIN:
        OUTPUTLINE = LINE.strip()
        if OUTPUTLINE.find(NF) != -1:
            # The line contains at least one unique ID; replace each
            # occurrence with the decoded name.
            if DEBUG:
                print('Original: ' + OUTPUTLINE)
            for K in list(IDDICT.keys()):
                STR = GetFields(IDDICT[K], FIELDS, SEP)
                OUTPUTLINE = OUTPUTLINE.replace(K, STR)
            if DEBUG:
                print('Modified: ' + OUTPUTLINE)
        TOUT.write(OUTPUTLINE + '\n')
    TIN.close()
    TOUT.close()

#======================== MAIN PROCEDURE ==========================


def main():
    """
    Called when not in documentation mode.
    """
    P = Parameters()
    if P.TASK == "encode":
        EncodeNames(P.NAMEFLAG, P.SOURCEIN, P.SOURCEOUT, P.CSVOUT, P.CSVSEP)
    elif P.TASK == "decode":
        DecodeNames(P.SEP, P.NAMEFLAG, P.FIELDS, P.TEXTIN, P.TEXTOUT, P.CSVIN)
    #elif P.TASK == "encodesame" :
    #    pass
    #else :
    #    print(USAGE)


if (BM.documentor() or "-test" in sys.argv):
    pass
else:
    main()
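
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
# Example invocations, a minimal sketch based on the synopsis above.
# The file names are hypothetical and not part of this program.
#
#   Replace every FASTA definition line in seqs.fasta with a random ID
#   prefixed by '!_', recording ID/definition-line pairs in names.csv:
#
#       uniqid.py --encode seqs.fasta seqs_coded.fasta names.csv
#
#   Restore the original names in any text file that contains the coded
#   IDs (for example, a tree file built from seqs_coded.fasta), keeping
#   only fields 1 and 2 of each definition line:
#
#       uniqid.py -f 1,2 --decode tree_coded.nwk tree_named.nwk names.csv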
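
# Decoding illustration with hypothetical data. --encode writes one
# TAB-separated key-value pair per sequence to the csv file, e.g.:
#
#   !_4812345<TAB>AB012345 Arabidopsis thaliana phyA gene
#
# During --decode, each occurrence of !_4812345 in the input text is
# replaced by the requested fields of that definition line; with
# "-f 1,2" and the default separator " ", the replacement string
# would be "AB012345 Arabidopsis".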