#======================================================================= # # Python Lexical Analyser # # Regular Expressions # #======================================================================= from __future__ import absolute_import import types try: from sys import maxsize as maxint except ImportError: from sys import maxint from . import Errors # # Constants # BOL = 'bol' EOL = 'eol' EOF = 'eof' nl_code = ord('\n') # # Helper functions # def chars_to_ranges(s): """ Return a list of character codes consisting of pairs [code1a, code1b, code2a, code2b,...] which cover all the characters in |s|. """ char_list = list(s) char_list.sort() i = 0 n = len(char_list) result = [] while i < n: code1 = ord(char_list[i]) code2 = code1 + 1 i += 1 while i < n and code2 >= ord(char_list[i]): code2 += 1 i += 1 result.append(code1) result.append(code2) return result def uppercase_range(code1, code2): """ If the range of characters from code1 to code2-1 includes any lower case letters, return the corresponding upper case range. """ code3 = max(code1, ord('a')) code4 = min(code2, ord('z') + 1) if code3 < code4: d = ord('A') - ord('a') return (code3 + d, code4 + d) else: return None def lowercase_range(code1, code2): """ If the range of characters from code1 to code2-1 includes any upper case letters, return the corresponding lower case range. """ code3 = max(code1, ord('A')) code4 = min(code2, ord('Z') + 1) if code3 < code4: d = ord('a') - ord('A') return (code3 + d, code4 + d) else: return None def CodeRanges(code_list): """ Given a list of codes as returned by chars_to_ranges, return an RE which will match a character in any of the ranges. """ re_list = [CodeRange(code_list[i], code_list[i + 1]) for i in range(0, len(code_list), 2)] return Alt(*re_list) def CodeRange(code1, code2): """ CodeRange(code1, code2) is an RE which matches any character with a code |c| in the range |code1| <= |c| < |code2|. """ if code1 <= nl_code < code2: return Alt(RawCodeRange(code1, nl_code), RawNewline, RawCodeRange(nl_code + 1, code2)) else: return RawCodeRange(code1, code2) # # Abstract classes # class RE(object): """RE is the base class for regular expression constructors. The following operators are defined on REs: re1 + re2 is an RE which matches |re1| followed by |re2| re1 | re2 is an RE which matches either |re1| or |re2| """ nullable = 1 # True if this RE can match 0 input symbols match_nl = 1 # True if this RE can match a string ending with '\n' str = None # Set to a string to override the class's __str__ result def build_machine(self, machine, initial_state, final_state, match_bol, nocase): """ This method should add states to |machine| to implement this RE, starting at |initial_state| and ending at |final_state|. If |match_bol| is true, the RE must be able to match at the beginning of a line. If nocase is true, upper and lower case letters should be treated as equivalent. """ raise NotImplementedError("%s.build_machine not implemented" % self.__class__.__name__) def build_opt(self, m, initial_state, c): """ Given a state |s| of machine |m|, return a new state reachable from |s| on character |c| or epsilon. """ s = m.new_state() initial_state.link_to(s) initial_state.add_transition(c, s) return s def __add__(self, other): return Seq(self, other) def __or__(self, other): return Alt(self, other) def __str__(self): if self.str: return self.str else: return self.calc_str() def check_re(self, num, value): if not isinstance(value, RE): self.wrong_type(num, value, "Plex.RE instance") def check_string(self, num, value): if type(value) != type(''): self.wrong_type(num, value, "string") def check_char(self, num, value): self.check_string(num, value) if len(value) != 1: raise Errors.PlexValueError("Invalid value for argument %d of Plex.%s." "Expected a string of length 1, got: %s" % ( num, self.__class__.__name__, repr(value))) def wrong_type(self, num, value, expected): if type(value) == types.InstanceType: got = "%s.%s instance" % ( value.__class__.__module__, value.__class__.__name__) else: got = type(value).__name__ raise Errors.PlexTypeError("Invalid type for argument %d of Plex.%s " "(expected %s, got %s" % ( num, self.__class__.__name__, expected, got)) # # Primitive RE constructors # ------------------------- # # These are the basic REs from which all others are built. # ## class Char(RE): ## """ ## Char(c) is an RE which matches the character |c|. ## """ ## nullable = 0 ## def __init__(self, char): ## self.char = char ## self.match_nl = char == '\n' ## def build_machine(self, m, initial_state, final_state, match_bol, nocase): ## c = self.char ## if match_bol and c != BOL: ## s1 = self.build_opt(m, initial_state, BOL) ## else: ## s1 = initial_state ## if c == '\n' or c == EOF: ## s1 = self.build_opt(m, s1, EOL) ## if len(c) == 1: ## code = ord(self.char) ## s1.add_transition((code, code+1), final_state) ## if nocase and is_letter_code(code): ## code2 = other_case_code(code) ## s1.add_transition((code2, code2+1), final_state) ## else: ## s1.add_transition(c, final_state) ## def calc_str(self): ## return "Char(%s)" % repr(self.char) def Char(c): """ Char(c) is an RE which matches the character |c|. """ if len(c) == 1: result = CodeRange(ord(c), ord(c) + 1) else: result = SpecialSymbol(c) result.str = "Char(%s)" % repr(c) return result class RawCodeRange(RE): """ RawCodeRange(code1, code2) is a low-level RE which matches any character with a code |c| in the range |code1| <= |c| < |code2|, where the range does not include newline. For internal use only. """ nullable = 0 match_nl = 0 range = None # (code, code) uppercase_range = None # (code, code) or None lowercase_range = None # (code, code) or None def __init__(self, code1, code2): self.range = (code1, code2) self.uppercase_range = uppercase_range(code1, code2) self.lowercase_range = lowercase_range(code1, code2) def build_machine(self, m, initial_state, final_state, match_bol, nocase): if match_bol: initial_state = self.build_opt(m, initial_state, BOL) initial_state.add_transition(self.range, final_state) if nocase: if self.uppercase_range: initial_state.add_transition(self.uppercase_range, final_state) if self.lowercase_range: initial_state.add_transition(self.lowercase_range, final_state) def calc_str(self): return "CodeRange(%d,%d)" % (self.code1, self.code2) class _RawNewline(RE): """ RawNewline is a low-level RE which matches a newline character. For internal use only. """ nullable = 0 match_nl = 1 def build_machine(self, m, initial_state, final_state, match_bol, nocase): if match_bol: initial_state = self.build_opt(m, initial_state, BOL) s = self.build_opt(m, initial_state, EOL) s.add_transition((nl_code, nl_code + 1), final_state) RawNewline = _RawNewline() class SpecialSymbol(RE): """ SpecialSymbol(sym) is an RE which matches the special input symbol |sym|, which is one of BOL, EOL or EOF. """ nullable = 0 match_nl = 0 sym = None def __init__(self, sym): self.sym = sym def build_machine(self, m, initial_state, final_state, match_bol, nocase): # Sequences 'bol bol' and 'bol eof' are impossible, so only need # to allow for bol if sym is eol if match_bol and self.sym == EOL: initial_state = self.build_opt(m, initial_state, BOL) initial_state.add_transition(self.sym, final_state) class Seq(RE): """Seq(re1, re2, re3...) is an RE which matches |re1| followed by |re2| followed by |re3|...""" def __init__(self, *re_list): nullable = 1 for i, re in enumerate(re_list): self.check_re(i, re) nullable = nullable and re.nullable self.re_list = re_list self.nullable = nullable i = len(re_list) match_nl = 0 while i: i -= 1 re = re_list[i] if re.match_nl: match_nl = 1 break if not re.nullable: break self.match_nl = match_nl def build_machine(self, m, initial_state, final_state, match_bol, nocase): re_list = self.re_list if len(re_list) == 0: initial_state.link_to(final_state) else: s1 = initial_state n = len(re_list) for i, re in enumerate(re_list): if i < n - 1: s2 = m.new_state() else: s2 = final_state re.build_machine(m, s1, s2, match_bol, nocase) s1 = s2 match_bol = re.match_nl or (match_bol and re.nullable) def calc_str(self): return "Seq(%s)" % ','.join(map(str, self.re_list)) class Alt(RE): """Alt(re1, re2, re3...) is an RE which matches either |re1| or |re2| or |re3|...""" def __init__(self, *re_list): self.re_list = re_list nullable = 0 match_nl = 0 nullable_res = [] non_nullable_res = [] i = 1 for re in re_list: self.check_re(i, re) if re.nullable: nullable_res.append(re) nullable = 1 else: non_nullable_res.append(re) if re.match_nl: match_nl = 1 i += 1 self.nullable_res = nullable_res self.non_nullable_res = non_nullable_res self.nullable = nullable self.match_nl = match_nl def build_machine(self, m, initial_state, final_state, match_bol, nocase): for re in self.nullable_res: re.build_machine(m, initial_state, final_state, match_bol, nocase) if self.non_nullable_res: if match_bol: initial_state = self.build_opt(m, initial_state, BOL) for re in self.non_nullable_res: re.build_machine(m, initial_state, final_state, 0, nocase) def calc_str(self): return "Alt(%s)" % ','.join(map(str, self.re_list)) class Rep1(RE): """Rep1(re) is an RE which matches one or more repetitions of |re|.""" def __init__(self, re): self.check_re(1, re) self.re = re self.nullable = re.nullable self.match_nl = re.match_nl def build_machine(self, m, initial_state, final_state, match_bol, nocase): s1 = m.new_state() s2 = m.new_state() initial_state.link_to(s1) self.re.build_machine(m, s1, s2, match_bol or self.re.match_nl, nocase) s2.link_to(s1) s2.link_to(final_state) def calc_str(self): return "Rep1(%s)" % self.re class SwitchCase(RE): """ SwitchCase(re, nocase) is an RE which matches the same strings as RE, but treating upper and lower case letters according to |nocase|. If |nocase| is true, case is ignored, otherwise it is not. """ re = None nocase = None def __init__(self, re, nocase): self.re = re self.nocase = nocase self.nullable = re.nullable self.match_nl = re.match_nl def build_machine(self, m, initial_state, final_state, match_bol, nocase): self.re.build_machine(m, initial_state, final_state, match_bol, self.nocase) def calc_str(self): if self.nocase: name = "NoCase" else: name = "Case" return "%s(%s)" % (name, self.re) # # Composite RE constructors # ------------------------- # # These REs are defined in terms of the primitive REs. # Empty = Seq() Empty.__doc__ = \ """ Empty is an RE which matches the empty string. """ Empty.str = "Empty" def Str1(s): """ Str1(s) is an RE which matches the literal string |s|. """ result = Seq(*tuple(map(Char, s))) result.str = "Str(%s)" % repr(s) return result def Str(*strs): """ Str(s) is an RE which matches the literal string |s|. Str(s1, s2, s3, ...) is an RE which matches any of |s1| or |s2| or |s3|... """ if len(strs) == 1: return Str1(strs[0]) else: result = Alt(*tuple(map(Str1, strs))) result.str = "Str(%s)" % ','.join(map(repr, strs)) return result def Any(s): """ Any(s) is an RE which matches any character in the string |s|. """ #result = apply(Alt, tuple(map(Char, s))) result = CodeRanges(chars_to_ranges(s)) result.str = "Any(%s)" % repr(s) return result def AnyBut(s): """ AnyBut(s) is an RE which matches any character (including newline) which is not in the string |s|. """ ranges = chars_to_ranges(s) ranges.insert(0, -maxint) ranges.append(maxint) result = CodeRanges(ranges) result.str = "AnyBut(%s)" % repr(s) return result AnyChar = AnyBut("") AnyChar.__doc__ = \ """ AnyChar is an RE which matches any single character (including a newline). """ AnyChar.str = "AnyChar" def Range(s1, s2=None): """ Range(c1, c2) is an RE which matches any single character in the range |c1| to |c2| inclusive. Range(s) where |s| is a string of even length is an RE which matches any single character in the ranges |s[0]| to |s[1]|, |s[2]| to |s[3]|,... """ if s2: result = CodeRange(ord(s1), ord(s2) + 1) result.str = "Range(%s,%s)" % (s1, s2) else: ranges = [] for i in range(0, len(s1), 2): ranges.append(CodeRange(ord(s1[i]), ord(s1[i + 1]) + 1)) result = Alt(*ranges) result.str = "Range(%s)" % repr(s1) return result def Opt(re): """ Opt(re) is an RE which matches either |re| or the empty string. """ result = Alt(re, Empty) result.str = "Opt(%s)" % re return result def Rep(re): """ Rep(re) is an RE which matches zero or more repetitions of |re|. """ result = Opt(Rep1(re)) result.str = "Rep(%s)" % re return result def NoCase(re): """ NoCase(re) is an RE which matches the same strings as RE, but treating upper and lower case letters as equivalent. """ return SwitchCase(re, nocase=1) def Case(re): """ Case(re) is an RE which matches the same strings as RE, but treating upper and lower case letters as distinct, i.e. it cancels the effect of any enclosing NoCase(). """ return SwitchCase(re, nocase=0) # # RE Constants # Bol = Char(BOL) Bol.__doc__ = \ """ Bol is an RE which matches the beginning of a line. """ Bol.str = "Bol" Eol = Char(EOL) Eol.__doc__ = \ """ Eol is an RE which matches the end of a line. """ Eol.str = "Eol" Eof = Char(EOF) Eof.__doc__ = \ """ Eof is an RE which matches the end of the file. """ Eof.str = "Eof"