"""
studentlink.py - read fields for student data, clean the input, and write
to an output file.

Assumptions:
    1) Every file has an ID field to identify each student. Other files
       contain different sorts of information on students, all keyed by
       ID number.
    2) The ID field is always the first column.

SYNOPSIS

    studentlink.py filelist outfile
"""
import re
import sys


# -------------------- Parameters ----------------------
class Parameters:
    """Wrapper for command line parameters."""

    def __init__(self):
        self.listfile = ""
        self.outfile = ""
        # We could add a step to read this as a command line parameter.
        self.SEP = ","

    def read_args(self):
        """Copy the two positional command line arguments into this object.

        sys.argv[1] becomes ``listfile`` and sys.argv[2] becomes ``outfile``.
        Raises IndexError when fewer than two arguments were supplied.
        """
        # For a production script, this code should be re-written
        # using the Python argparse class.
        self.listfile = sys.argv[1]
        self.outfile = sys.argv[2]


# ------------------------- Filelist ------------------
class FileList:
    """List of files to be read."""

    def __init__(self):
        self.filenames = []

    def ReadFileList(self, FN):
        """Append every non-blank line of the text file FN to ``filenames``.

        Each line is stripped of surrounding whitespace first; lines that
        are empty after stripping are ignored.
        """
        # The context manager closes the file even if reading fails.
        with open(FN, 'r') as lfile:
            for line in lfile:
                name = line.strip()
                if name:
                    self.filenames.append(name)


# ------------------------- Header ------------------
class Header:
    """
    The header is a dictionary of key,value pairs. The keys are the column
    headings found. For consistency, we force column headings into
    uppercase. The values are column numbers.

    eg. if the header was
        ID,MET,gender
    then the dictionary would be
        {'ID': 0, 'MET': 1, 'GENDER': 2}
    """

    def __init__(self):
        self.FieldName = {}

    def GetHeader(self, firstline, SEP):
        """
        Given a raw input line, parse out the column headings using SEP
        as separator.

        Headings are stored in uppercase. A heading that is already known
        keeps the column number recorded the first time it was seen.
        Empty headings are skipped, but still occupy a column position.
        """
        for colnum, raw in enumerate(firstline.split(SEP)):
            # strip removes leading and trailing whitespace characters
            capname = raw.strip().upper()
            if capname and capname not in self.FieldName:
                # Store under the uppercased name so that later lookups
                # such as 'ID' succeed whatever case the file used.
                self.FieldName[capname] = colnum
        print('Headers read: ' + str(self.FieldName))


# ------------------------- Student ------------------
class Student:
    """Data for a given student."""

    def __init__(self, ID=""):
        # ID defaults to "" so that existing Student() calls keep working.
        self.ID = ID
        self.MET = "NA"
        self.gender = "9"
        self.score = -1

    def WriteStudent(self, outfile, SEP=","):
        """
        Write data for a student to output file.

        One line is written: ID, MET, gender and score joined by SEP and
        terminated by a newline. ``outfile`` is any object with a
        ``write(str)`` method.
        """
        # score is an int by default, so every field goes through str().
        fields = (self.ID, self.MET, self.gender, self.score)
        outfile.write(SEP.join(str(value) for value in fields) + "\n")


# ------------------------- Table of Students ------------------
class STable:
    """Table of Student objects, keyed by student ID."""

    def __init__(self):
        self.students = {}

    # The three field cleaners use no instance state. Declaring them static
    # lets both STable.GetX(field) and self.GetX(field) work.
    @staticmethod
    def GetID(Field):
        """
        Given a field, return an ID.
        In this case, all we do is return the field stripped of
        leading and trailing whitespace.
        """
        return Field.strip()

    @staticmethod
    def GetMET(Field):
        """
        Given a field, return a MET value.
        Hyphens are replaced by spaces, then leading and trailing
        whitespace is removed.
        """
        return Field.replace("-", " ").strip()

    @staticmethod
    def GetGender(Field):
        """
        Given a field, return a gender.

        After stripping whitespace:
          * an empty field gives '?'
          * a value starting with '1' gives 'F'
          * the value '2' gives 'M'
          * anything else is uppercased and kept if it starts with
            'F' or 'M', otherwise '?' is returned.
        """
        Gender = Field.strip()
        if not Gender:
            # Nothing to classify; indexing Gender[0] would fail here.
            return '?'
        # NOTE(review): '1' is matched on the first character but '2' on the
        # whole value; kept as found - confirm whether that is intended.
        if Gender[0] == '1':
            return 'F'
        if Gender == '2':
            return 'M'
        Gender = Gender.upper()
        if Gender[0] not in ('F', 'M'):
            return '?'
        return Gender

    def ReadStudentFile(self, sfile, LegalHeadings, SEP):
        """
        Read one student file and merge its rows into ``self.students``.

        sfile         - open, readable text file object
        LegalHeadings - Header holding the headings collected from all
                        files; currently not consulted
        SEP           - field separator

        The first line is the header. Each following row is keyed by its
        first column (the ID). Values in the MET and GENDER columns are
        cleaned and stored on the matching Student. Blank rows and rows
        with an empty ID are skipped. A file without an ID heading is
        ignored, with a warning on stderr.
        """
        lines = sfile.readlines()
        if not lines:
            return  # empty file: no header, nothing to read

        # Header is the first line, a special case
        HeadingsFound = Header()
        HeadingsFound.GetHeader(lines[0], SEP)
        if 'ID' not in HeadingsFound.FieldName:
            print('No ID column found in header; file skipped',
                  file=sys.stderr)
            return

        # Invert the header so a column position gives its heading.
        heading_at = {col: name
                      for name, col in HeadingsFound.FieldName.items()}

        # process all remaining lines
        for line in lines[1:]:
            if not line.strip():
                continue  # blank line, e.g. trailing newline at end of file
            tokens = line.split(SEP)
            print(tokens)
            IDnum = self.GetID(tokens[0])
            if not IDnum:
                continue  # a row without an ID cannot be keyed
            if IDnum not in self.students:
                self.students[IDnum] = Student(IDnum)
            student = self.students[IDnum]
            for col in range(1, len(tokens)):
                # Decide by the column's heading, not by the cell's value.
                heading = heading_at.get(col)
                if heading == 'MET':
                    student.MET = self.GetMET(tokens[col])
                elif heading == 'GENDER':
                    student.gender = self.GetGender(tokens[col])


# --------------------------- main procedure ---------------
def main():
    """Run the two-pass read over every file named in the list file."""
    if len(sys.argv) < 3:
        sys.exit("usage: studentlink.py filelist outfile")

    # read command line parameters
    P = Parameters()
    P.read_args()
    print("listfile= " + P.listfile)
    print("outfile= " + P.outfile)

    # read a list of files to process
    infiles = FileList()
    infiles.ReadFileList(P.listfile)

    # Make a first pass through the files. In this pass, we read
    # the first line of each file to get a complete list of which
    # headers are present.
    ColumnHeadings = Header()
    for FN in infiles.filenames:
        print(FN)
        with open(FN, 'r') as F:
            firstline = F.readline()
        ColumnHeadings.GetHeader(firstline, P.SEP)
    print(ColumnHeadings.FieldName)

    # Make a second pass through the files, now reading the data
    # indicated by the column headings.
    Students = STable()
    for FN in infiles.filenames:
        print('Processing ' + FN)
        with open(FN, 'r') as F:
            Students.ReadStudentFile(F, ColumnHeadings, P.SEP)

    # TODO: write the merged table to P.outfile. This step was disabled in
    # the original script and is left disabled here. To enable it, open
    # P.outfile for writing and call WriteStudent(outfile, P.SEP) on each
    # value of Students.students.


if __name__ == "__main__":
    main()