Module textwalker.pattern_parser
Contains classes for parsing pattern and matching text against the pattern
Expand source code
"""
Contains classes for parsing pattern and matching text against the pattern
"""
from enum import Enum
from typing import List, Optional, Union
from .utils import arr2str
# Globals
ESCAPABLE_CHARS = {"(", ")", "[", "]", "{", "}", "-", "+", "*", "?"}
# Enums
# R(ange(Quantifier)State explicitly tracks the state of parsing, i.e.
# which symbolic token is being parsed since interpretation of space character
# depends on current state.
# DevNote: This could be achieved via flags that could be used to determine
# when we've reached an invalid state. And in fact, they encode the same thing,
# the tradeoff is in clarity of intention vs. the more code.
# states:
# OpenBrace- the opening '{' is seen but not yet started parsing first num
# ParsingLower - first lower bound number char has been seen
# PostLower - lower bound number boundary reached
# PreUpper - comma has been reached
# ParsingUpper - first upper bound number char is seen
RQState = Enum('State', 'OpenBrace ParsingLower PostLower PreUpper ParsingUpper PostUpper')
# Exceptions
class UnescapedChar(Exception):
"""
These represent the constraint that special chars be escaped
"""
pass
class UnrecognizedEscapedChar(Exception):
"""
This represents that random characters should not be escaped
"""
pass
class UnclosedCharSet(Exception):
"""
A char set was not closed, i.e. missing ']'
"""
pass
class UnassociatedQuantifier(Exception):
"""
An unassociated quantifier was found.
NOTE: This is only raised, e.g. if the pattern was {1,2},
but not for other quantifier. This is because the correct
error messages, depends on the intentions of the user.
From my assessment, a free '+' is likely to be an unescaped
char. But '{1,2}' is almost certainly a missing body
"""
class UnexpectedChar(Exception):
"""
generic exception for unexpected char and where one
of the other exceptions doesn't seem appropriate
"""
# Quantifier classes
class Quantifier:
"""
base quantifier class
"""
pass
class ZeroOrMore(Quantifier):
"""
Represents quantifier zero or more repetitions
"""
def __str__(self):
return "*"
class OneOrMore(Quantifier):
"""
Represents quantifier one or more repetitions
"""
def __str__(self):
return "+"
def __repr__(self):
return str(self)
class ZeroOrOne(Quantifier):
"""
Represents quantifier zero or one repetitions
"""
def __str__(self):
return "?"
class RangeQuantifier(Quantifier):
"""
Defines a inclusive range for the number
of repetitions
"""
def __init__(self, lower_bound: int, upper_bound: int):
self.lbound = lower_bound
self.ubound = upper_bound
def __str__(self):
return f'{{{self.lbound},{self.ubound}}}'
def __repr__(self):
return str(self)
class Token:
"""
base token type
"""
def __init__(self, quantifier=None):
self.quantifier = quantifier
class Literal(Token):
"""
Represents a literal token
"""
def __init__(self, value, quantifier=None):
super().__init__(quantifier)
self.value = value
def __repr__(self):
if self.quantifier is not None:
return f"L[{self.value}{self.quantifier}]"
return f"L[{self.value}]"
def __str__(self):
return self.__repr__()
class CharRange(Token):
"""
Represents a child range in a charset, e.g. a-z
"""
def __init__(self, range_start, range_end, quantifier=None):
super().__init__(quantifier)
self.range_start = range_start
self.range_end = range_end
def __repr__(self):
if self.quantifier is not None:
return f"CR[{self.range_start}-{self.range_end}{self.quantifier}]"
return f"CR({self.range_start}-{self.range_end})"
def __str__(self):
return self.__repr__()
class Charset(Token):
"""
Represents a charset e.g. [a-z01]
"""
def __init__(self, matches: list, quantifier=None):
super().__init__(quantifier)
# matches is composed of either literals or ranges
self.matches = matches
def __repr__(self):
if self.quantifier is not None:
return f"CS[{self.matches}{self.quantifier}]"
return f"CS[{self.matches}]"
def __str__(self):
return self.__repr__()
class Grouping(Token):
"""
Represents a grouping e.g. (foo)
"""
def __init__(self, groups: list, quantifier=None):
self.groups = groups
super().__init__(quantifier)
def __repr__(self):
if self.quantifier is not None:
return f"G[{self.groups}{self.quantifier}]"
return f"G[{self.groups}]"
class MatchResult:
"""
Result for match
"""
def __init__(self, matched: bool, content: str = ""):
self.matched = matched
self.content = content
def __str__(self):
if self.matched is False:
return "NoMatch"
return f"Match({self.content})"
def __repr__(self):
return str(self)
class PatternParser:
"""
Class responsible for parsing logic.
"""
def __init__(self, pattern: str):
# when parsing a chunk, `start` is the start of the chunk
# and `current` points to the `current` char
self.start = 0
self.current = 0
self.pattern = pattern
self.compiled = None
self.compile()
def is_at_end(self) -> bool:
"""
Determine whether parser is at the end of the pattern text
"""
return self.current >= len(self.pattern)
def advance(self) -> str:
"""
Consume and return the current char.
Consumption increments `current`
"""
char = self.pattern[self.current]
self.current += 1
return char
def has_next(self) -> bool:
"""
Determines whether there is a next element to consume
"""
return self.current < len(self.pattern) - 1
def has_next_next(self) -> bool:
"""
Determines whether there is a next to next element to consume
"""
return self.current < len(self.pattern) - 2
def peek(self) -> str:
"""
View the current element without consuming it
"""
return self.pattern[self.current]
def peek_next(self) -> str:
"""
View the next element without consuming it.
NOTE: This call is only valid if `has_next` is True
"""
return self.pattern[self.current + 1]
def peek_next_next(self) -> str:
"""
View the next to next element without consuming it
NOTE: This call is only valid if `has_next_next` is True
"""
return self.pattern[self.current + 2]
@staticmethod
def try_chars_to_num(chars: List[str]) -> Optional[int]:
"""
Attempt to parse a list of numeric chars into an int;
if input is empty, returns None
"""
num_str = ''.join(chars)
if len(num_str) == 0:
return None
return int(num_str)
@staticmethod
def coalesce_literals(tokens: List[Token]) -> List[Token]:
"""
Combine adjacent literal chars with no quantifier into a literal word;
e.g. L[a]L[b] -> L[ab]
This is needed because the parsing pass, does not peek to the next
char and treats each char as a single-char literal. This coalescing is needed
to match word level repetitions.
The relative ordering of non-`Literal` tokens and `Literal` tokens with
quantifiers is unchanged.
"""
coalesced = []
partials = (
[]
) # partial list of chars that will be coalesced into in a single Literal
for idx, token in enumerate(tokens):
# if quantifier is not None, can't coalesce into one literal
if isinstance(token, Literal) and token.quantifier is None:
partials.append(token)
elif len(partials) > 0:
# we're at coalesce boundary; coalesce `partials` into a single
# Literal and add to result
value = "".join([literal.value for literal in partials])
coalesced.append(Literal(value))
partials = []
# add all other tokens, as-is
if (
not isinstance(token, Literal)
or isinstance(token, Literal)
and token.quantifier is not None
):
coalesced.append(token)
if len(partials) > 0:
# add remaining chars as a literal
value = "".join([literal.value for literal in partials])
coalesced.append(Literal(value))
return coalesced
def compile(self):
"""
compile the user supplied pattern
"""
if self.compiled is None:
self.compiled = self.compile_grouping()
def compile_subgroups(self, subgroups: List[Token]) -> Token:
"""
Compile subgroups.
When compiling a pattern, a pattern may consist of
one or more subgroups. If there is a single sub-group, return
the subgroup as is. This is needed to avoid excessive nesting,
which leads to incorrect behavior. If there are multiple sub-groups, then
wrap them in a coalesced `Grouping`
"""
coalesced = self.coalesce_literals(subgroups)
if len(coalesced) == 1:
# return single sub group as is
return coalesced.pop()
# wrap multiple sub-groups in a grouping
return Grouping(coalesced)
def compile_grouping(self, is_nested: bool = False) -> Grouping:
"""
Compile the pattern/grouping. A grouping consists of any number of
literals, charsets, and sub-groups, and is the most general
representation of a pattern. Hence, the user input is treated as a `Grouping`
args:
is_nested: whether this is nested grouping
pattern can be of form:
foo : literal
[a-z]: charset (range)
[3-9]
[az] charsets (enumeration)
fox* : single-char quantifier: *,+,?
[3-9]{3,4}: quantity/numeric range
(foo)*: groupings
(foo_[a-z]*)+: groupings
NB: quantifiers can apply to any other element, except another quantifier
special chars, namely:
-, {, }, [, ], *, +, ?, (, ) must be escaped
escape via backslash prepended to char
An unescaped special char is an error
An escaped char, that should not be escaped in an error.
Returns:
compiled grouping; for the root call this is a `Grouping`
object; for a non-root call this could be any other `Token`
"""
compiled = []
while self.is_at_end() is False:
ch = self.advance()
if ch == "(":
# handle grouping
grouping = self.compile_grouping(is_nested=True)
compiled.append(grouping)
elif ch == ")":
if is_nested:
# this terminates the grouping
return self.compile_subgroups(compiled)
raise UnescapedChar(ch)
elif ch == "[":
# this will either succeed and consume and return
# entire charset, or raise exception
charset = self.compile_charset()
compiled.append(charset)
# handle quantifiers
elif ch == "*" or ch == "+" or ch == "?":
# find matchable to attach quantifier to
matchable = compiled[-1] if len(compiled) > 0 else None
if matchable is None:
raise UnescapedChar(ch)
if ch == "*":
matchable.quantifier = ZeroOrMore()
elif ch == "+":
matchable.quantifier = OneOrMore()
else:
assert ch == "?", "unknown quantifier"
matchable.quantifier = ZeroOrOne()
elif ch == "{": # range quantifier
# this will either succeed and consume the entire quantifier or raise
range_quantifier = self.compile_range_quantifier()
matchable = compiled[-1] if len(compiled) > 0 else None
if matchable is None:
# some error
raise UnassociatedQuantifier(str(range_quantifier))
matchable.quantifier = range_quantifier
elif ch == '-':
# this is an unescaped dash
raise UnescapedChar(ch)
# handle escape char
elif ch == "\\":
next_char = self.advance()
if next_char not in ESCAPABLE_CHARS:
# NOTE: currently not supporting all escape chars
raise UnrecognizedEscapedChar(next_char)
# add the escaped char as a literal
compiled.append(Literal(next_char))
else:
compiled.append(Literal(ch))
if is_nested:
# this is error; since this was a nested call we should have found
# a closing bracket; the choice of exception is imprecise
# because the user's intention could be to: 1) create a group
# and perhaps forgot the closing bracket; or 2) a literal match
# on open bracket "(" and forgot to escape; for now bracket "(" must be escaped
raise UnescapedChar("Unclosed bracket")
grouping = self.compile_subgroups(compiled)
if not is_nested and not isinstance(grouping, Grouping):
# this is the root call- the returned must be wrapped in a Grouping
grouping = Grouping([grouping])
return grouping
def compile_range_quantifier(self) -> RangeQuantifier:
"""
Compile a range quantifier, e.g. '{1, 2}'
This is invoked after consuming the opening '{'
"""
current = RQState.OpenBrace
lower = -1
upper = -1
chars = [] # number chars
while self.is_at_end() is False:
ch = self.advance()
if ch.isnumeric():
chars.append(ch)
if current == RQState.OpenBrace:
current = RQState.ParsingLower
elif current == RQState.PostLower or current == RQState.PostUpper:
# error: a number's word boundary was already reached
# there is a break between numeric characters
raise UnexpectedChar(ch, "Invalid quantifier range")
elif current == RQState.PreUpper:
current = RQState.ParsingUpper
elif ch.isspace():
# how to interpret a space depends on what is being parsed
if current == RQState.OpenBrace:
continue # skip leading space
elif current == RQState.ParsingLower:
# this is a word boundary
lower = self.try_chars_to_num(chars)
if lower is None:
raise UnexpectedChar("Unexpected space character word boundary; Invalid quantifier range")
chars = []
current = RQState.PostLower
elif current == RQState.ParsingUpper:
current = RQState.PostUpper
elif ch == ',':
current = RQState.PreUpper
# the first word
lower = self.try_chars_to_num(chars)
if lower is None:
# ill formed expression
raise UnexpectedChar(',', " '{' must be either escaped, or be properly constructed e.g. {M, N}; Invalid quantifier range") # noqa E501
chars = []
elif ch == '}':
upper = self.try_chars_to_num(chars)
if upper is None:
raise UnexpectedChar(',', " '{' must be either escaped, or be properly constructed e.g. {M, N}; Invalid quantifier range") # noqa E501
break
else:
# unexpected char
raise UnexpectedChar
return RangeQuantifier(lower, upper)
def compile_charset(self) -> Charset:
"""
Consume characters in the pattern, corresponding to a charset,
i.e. started with '[', terminated with ']' with literals and char ranges
in between.
Returns:
compiled `Charset`
Raises:
- UnclosedSet and UnescapedChar
"""
result = []
closed = False # whether the charset is closed
while self.is_at_end() is False:
ch = self.advance() # consume char
# handle escape char
if ch == "\\":
next_char = self.advance()
if next_char not in ESCAPABLE_CHARS:
# NOTE: currently not supporting all escape chars
raise UnrecognizedEscapedChar
# add the escaped char as a literal
result.append(Literal(next_char))
# handle range dash
if ch == "-":
# an unescaped dash, should only appear between a range
# this simplifies the case, where it's the first or last char in set
raise UnescapedChar(ch)
if ch == "]":
# closing bracket found
closed = True
break
else: # ch is non-special
# check if it's part of a range
if self.has_next() and self.peek() == "-":
if not self.has_next_next():
# unescaped dash's are only supported in range
raise UnescapedChar(self.peek())
self.advance() # consume the dash
rng_end = self.advance()
rng = CharRange(ch, rng_end)
result.append(rng)
else: # handle literal
result.append(Literal(ch))
if not closed:
raise UnclosedCharSet()
return Charset(result)
def match(self, string: str, startidx: int = 0) -> Optional[str]:
"""
Attempt to match `string` starting at `startidx`
Args:
string: string to match
startidx: location to start search
Return
None: no-match
str: match (possibly zero length)
"""
matcher = PatternMatcher(self.compiled)
return matcher.match(string, startidx)
class PatternMatcher:
"""
Encapsulates pattern matching logic
"""
def __init__(self, compiled: Grouping):
self.compiled = compiled
@staticmethod
def can_consume(to_run_iteration: int, quantifier: Quantifier) -> bool:
"""
Determines whether the `quantifier` allows consuming another repetition
NOTE: this should be invoked before consuming
to_run_iteration is 0-based
"""
if isinstance(quantifier, ZeroOrMore) or isinstance(quantifier, OneOrMore):
return True
elif isinstance(quantifier, ZeroOrOne):
return to_run_iteration < 1
elif isinstance(quantifier, RangeQuantifier):
# NB: quantifier values are 1-based and inclusive, and to_run_iteration is 0-based
return to_run_iteration < quantifier.ubound
elif quantifier is None:
# interpret None as 1
return to_run_iteration < 1
return True
@staticmethod
def sufficient_consumed(last_repetition: int, quantifier: Quantifier) -> bool:
"""
Determines whether the minimum number of elements was consumed
NOTE: this should be invoked after consuming
"""
if isinstance(quantifier, ZeroOrOne):
return True
elif isinstance(quantifier, ZeroOrMore):
return True
elif isinstance(quantifier, OneOrMore):
return last_repetition >= 1
elif isinstance(quantifier, RangeQuantifier):
# NB: quantifier values are 1-based and inclusive, and last_repetition is 0-based
return quantifier.lbound <= last_repetition
elif quantifier is None:
# TODO: not sure if this correct
return True
else:
return True
@staticmethod
def accepts_empty(quantifier: Quantifier) -> bool:
"""
Determines whether the quantifier allows for an empty match
"""
# TODO: add check for quantity range
return isinstance(quantifier, ZeroOrMore) or isinstance(quantifier, ZeroOrOne)
def check_and_update_empty(
self, result: MatchResult, quantifier: Quantifier
) -> MatchResult:
"""
In cases where the quantifier, permits 0 matches, e.g. *, ?,
a "no-match" of a sub-group is to be treated as a match of length zero.
Returns:
If there is a no-match, and the quantifier allows zero matches,
then this will update the result to be an empty match;
Otherwise, it will return the `result`
"""
# if quantifier is [0,..] and no-match, this is treated
# as a match of len 0;
if result.matched is False and self.accepts_empty(quantifier):
result = MatchResult(True, "")
# else return original result
return result
@staticmethod
def match_literal(literal: Literal, string: str, stridx: int = 0) -> MatchResult:
"""
Attempt to match a literal
"""
for idx, lch in enumerate(literal.value):
if stridx + idx == len(string):
return MatchResult(False)
if lch != string[stridx + idx]:
return MatchResult(False)
return MatchResult(True, literal.value)
@staticmethod
def match_charset(charset: Charset, string: str, stridx: int = 0) -> MatchResult:
"""
Attempt to match a charset
"""
for charset_member in charset.matches:
if stridx == len(string):
# handle empty string
return MatchResult(False)
if isinstance(charset_member, Literal):
if charset_member.value == string[stridx]:
return MatchResult(True, string[stridx])
elif isinstance(charset_member, CharRange):
# NOTE: this comparison relies on python's lexical ordering
if (
charset_member.range_start
<= string[stridx]
<= charset_member.range_end
):
return MatchResult(True, string[stridx])
return MatchResult(False)
def match_sub_groups(self, groups: List[Token], string: str, stridx: int = 0):
"""
Attempt to match a list of sub groups.
"""
sgptr = 0 # sub-group ptr
sgroup_repetition = 0 # count of repetitions of current sub group
matched = []
# the last gptr location where a match was found
# needed to determine if the pattern was fully consumed
last_matched_sgptr = -1
while sgptr < len(groups):
# there are 2 things to check:
# 1) is there a match
# 2) is the number of repetitions of match as expected
# consume as much of string (maximum munch) using subgroup
subgroup = groups[sgptr]
# does the quantifier on this subgroup, allow it to consume more chars
if self.can_consume(sgroup_repetition, subgroup.quantifier):
# invoke the right handler
res = None
if isinstance(subgroup, Literal):
res = self.match_literal(subgroup, string, stridx)
res = self.check_and_update_empty(res, subgroup.quantifier)
elif isinstance(subgroup, Charset):
res = self.match_charset(subgroup, string, stridx)
res = self.check_and_update_empty(res, subgroup.quantifier)
else:
assert isinstance(
subgroup, Grouping
), "unexpected sub-expression type"
# groupings can be nested
# so the matching algorithm must be recursive
res = self.match_grouping(subgroup, string, stridx)
res = self.check_and_update_empty(res, subgroup.quantifier)
# current component matched
if res.matched:
sgroup_repetition += 1
matched.append(res.content)
# increment string pointer
stridx += len(res.content)
last_matched_sgptr = sgptr
# increment the gptr; this represents
# something not matching with [0,...] quantifier
# we treat this as a match of length 0
if len(res.content) == 0:
sgptr += 1
sgroup_repetition = 0
# current component did not match
else:
# no match, move to next matchable
# check minimum match cond was violated
if not self.sufficient_consumed(
sgroup_repetition, subgroup.quantifier
):
return MatchResult(False)
sgroup_repetition = 0
# increment component pointer
sgptr += 1
else:
# quantifier restricts further consume
# consider next subgroup
sgptr += 1
sgroup_repetition = 0
continue
# we want the pattern to be fully consumed and at least one
# group has not been matched
if last_matched_sgptr < len(groups) - 1:
return MatchResult(False)
content = arr2str(matched)
return MatchResult(True, content)
def match_grouping(
self, grouping: Grouping, string: str, stridx: int = 0
) -> MatchResult:
"""
Finds the longest match by greedily matching characters.
Greedy here implies, that when matching text on a sub-group, it will consume
as many characters from the text, as the sub-group can match. This is
noteworthy since a non-greedy sub-group match may result in a overall pattern match; whereas
the greedy approach results in a non-match; e.g.:
for pattern = (aa)*a, text = aaaa, the greedy algorithm would consider this a
non-match since, the pattern must be fully consumed, and the final 'a' does not get
consumed.
The user string may be partially consumed. However, if the pattern
is not consumed, then this is considered a non-match.
An empty string is a valid input to match. Further an empty string can
match an arbitrary pattern, as long as each child pattern allows
zero matches. Thus all matching handlers must handle zero-length input string
Args:
stridx: idx where to start matching string
Returns:
MatchResult.matched is True if there is a complete match; else False
"""
groups = grouping.groups
group_repetition = 0 # count of repetitions of group
matched = [] # chars that have matched
sg_matched = True
while self.can_consume(group_repetition, grouping.quantifier):
res = self.match_sub_groups(groups, string, stridx)
if res.matched:
matched.append(res.content)
group_repetition += 1
stridx += len(res.content)
if len(res.content) == 0:
break
else:
# sub group did not match
sg_matched = False
break
content = arr2str(matched)
# the following distinguishes a non-match from an empty match
# i.e. content length is 0 and the quantifier does not allow a zero match
if (len(content) == 0 and sg_matched is False) and self.accepts_empty(
grouping.quantifier
) is False:
return MatchResult(False)
return MatchResult(True, content)
def match(self, string: str, stridx: int = 0) -> Optional[str]:
"""
Attempt to match `string` starting at `startidx`
Args:
string: string to match
startidx: location to start search
Return
None: no-match
str: match (possibly zero length)
"""
matched = None
result = self.match_grouping(self.compiled, string, stridx)
if result.matched:
matched = result.content
return matched
Classes
class CharRange (range_start, range_end, quantifier=None)
-
Represents a child range in a charset, e.g. a-z
Expand source code
class CharRange(Token): """ Represents a child range in a charset, e.g. a-z """ def __init__(self, range_start, range_end, quantifier=None): super().__init__(quantifier) self.range_start = range_start self.range_end = range_end def __repr__(self): if self.quantifier is not None: return f"CR[{self.range_start}-{self.range_end}{self.quantifier}]" return f"CR({self.range_start}-{self.range_end})" def __str__(self): return self.__repr__()
Ancestors
class Charset (matches: list, quantifier=None)
-
Represents a charset e.g. [a-z01]
Expand source code
class Charset(Token): """ Represents a charset e.g. [a-z01] """ def __init__(self, matches: list, quantifier=None): super().__init__(quantifier) # matches is composed of either literals or ranges self.matches = matches def __repr__(self): if self.quantifier is not None: return f"CS[{self.matches}{self.quantifier}]" return f"CS[{self.matches}]" def __str__(self): return self.__repr__()
Ancestors
class Grouping (groups: list, quantifier=None)
-
Represents a grouping e.g. (foo)
Expand source code
class Grouping(Token): """ Represents a grouping e.g. (foo) """ def __init__(self, groups: list, quantifier=None): self.groups = groups super().__init__(quantifier) def __repr__(self): if self.quantifier is not None: return f"G[{self.groups}{self.quantifier}]" return f"G[{self.groups}]"
Ancestors
class Literal (value, quantifier=None)
-
Represents a literal token
Expand source code
class Literal(Token): """ Represents a literal token """ def __init__(self, value, quantifier=None): super().__init__(quantifier) self.value = value def __repr__(self): if self.quantifier is not None: return f"L[{self.value}{self.quantifier}]" return f"L[{self.value}]" def __str__(self): return self.__repr__()
Ancestors
class MatchResult (matched: bool, content: str = '')
-
Result for match
Expand source code
class MatchResult: """ Result for match """ def __init__(self, matched: bool, content: str = ""): self.matched = matched self.content = content def __str__(self): if self.matched is False: return "NoMatch" return f"Match({self.content})" def __repr__(self): return str(self)
class OneOrMore
-
Represents quantifier one or more repetitions
Expand source code
class OneOrMore(Quantifier): """ Represents quantifier one or more repetitions """ def __str__(self): return "+" def __repr__(self): return str(self)
Ancestors
class PatternMatcher (compiled: Grouping)
-
Encapsulates pattern matching logic
Expand source code
class PatternMatcher: """ Encapsulates pattern matching logic """ def __init__(self, compiled: Grouping): self.compiled = compiled @staticmethod def can_consume(to_run_iteration: int, quantifier: Quantifier) -> bool: """ Determines whether the `quantifier` allows consuming another repetition NOTE: this should be invoked before consuming to_run_iteration is 0-based """ if isinstance(quantifier, ZeroOrMore) or isinstance(quantifier, OneOrMore): return True elif isinstance(quantifier, ZeroOrOne): return to_run_iteration < 1 elif isinstance(quantifier, RangeQuantifier): # NB: quantifier values are 1-based and inclusive, and to_run_iteration is 0-based return to_run_iteration < quantifier.ubound elif quantifier is None: # interpret None as 1 return to_run_iteration < 1 return True @staticmethod def sufficient_consumed(last_repetition: int, quantifier: Quantifier) -> bool: """ Determines whether the minimum number of elements was consumed NOTE: this should be invoked after consuming """ if isinstance(quantifier, ZeroOrOne): return True elif isinstance(quantifier, ZeroOrMore): return True elif isinstance(quantifier, OneOrMore): return last_repetition >= 1 elif isinstance(quantifier, RangeQuantifier): # NB: quantifier values are 1-based and inclusive, and last_repetition is 0-based return quantifier.lbound <= last_repetition elif quantifier is None: # TODO: not sure if this correct return True else: return True @staticmethod def accepts_empty(quantifier: Quantifier) -> bool: """ Determines whether the quantifier allows for an empty match """ # TODO: add check for quantity range return isinstance(quantifier, ZeroOrMore) or isinstance(quantifier, ZeroOrOne) def check_and_update_empty( self, result: MatchResult, quantifier: Quantifier ) -> MatchResult: """ In cases where the quantifier, permits 0 matches, e.g. *, ?, a "no-match" of a sub-group is to be treated as a match of length zero. Returns: If there is a no-match, and the quantifier allows zero matches, then this will update the result to be an empty match; Otherwise, it will return the `result` """ # if quantifier is [0,..] and no-match, this is treated # as a match of len 0; if result.matched is False and self.accepts_empty(quantifier): result = MatchResult(True, "") # else return original result return result @staticmethod def match_literal(literal: Literal, string: str, stridx: int = 0) -> MatchResult: """ Attempt to match a literal """ for idx, lch in enumerate(literal.value): if stridx + idx == len(string): return MatchResult(False) if lch != string[stridx + idx]: return MatchResult(False) return MatchResult(True, literal.value) @staticmethod def match_charset(charset: Charset, string: str, stridx: int = 0) -> MatchResult: """ Attempt to match a charset """ for charset_member in charset.matches: if stridx == len(string): # handle empty string return MatchResult(False) if isinstance(charset_member, Literal): if charset_member.value == string[stridx]: return MatchResult(True, string[stridx]) elif isinstance(charset_member, CharRange): # NOTE: this comparison relies on python's lexical ordering if ( charset_member.range_start <= string[stridx] <= charset_member.range_end ): return MatchResult(True, string[stridx]) return MatchResult(False) def match_sub_groups(self, groups: List[Token], string: str, stridx: int = 0): """ Attempt to match a list of sub groups. """ sgptr = 0 # sub-group ptr sgroup_repetition = 0 # count of repetitions of current sub group matched = [] # the last gptr location where a match was found # needed to determine if the pattern was fully consumed last_matched_sgptr = -1 while sgptr < len(groups): # there are 2 things to check: # 1) is there a match # 2) is the number of repetitions of match as expected # consume as much of string (maximum munch) using subgroup subgroup = groups[sgptr] # does the quantifier on this subgroup, allow it to consume more chars if self.can_consume(sgroup_repetition, subgroup.quantifier): # invoke the right handler res = None if isinstance(subgroup, Literal): res = self.match_literal(subgroup, string, stridx) res = self.check_and_update_empty(res, subgroup.quantifier) elif isinstance(subgroup, Charset): res = self.match_charset(subgroup, string, stridx) res = self.check_and_update_empty(res, subgroup.quantifier) else: assert isinstance( subgroup, Grouping ), "unexpected sub-expression type" # groupings can be nested # so the matching algorithm must be recursive res = self.match_grouping(subgroup, string, stridx) res = self.check_and_update_empty(res, subgroup.quantifier) # current component matched if res.matched: sgroup_repetition += 1 matched.append(res.content) # increment string pointer stridx += len(res.content) last_matched_sgptr = sgptr # increment the gptr; this represents # something not matching with [0,...] quantifier # we treat this as a match of length 0 if len(res.content) == 0: sgptr += 1 sgroup_repetition = 0 # current component did not match else: # no match, move to next matchable # check minimum match cond was violated if not self.sufficient_consumed( sgroup_repetition, subgroup.quantifier ): return MatchResult(False) sgroup_repetition = 0 # increment component pointer sgptr += 1 else: # quantifier restricts further consume # consider next subgroup sgptr += 1 sgroup_repetition = 0 continue # we want the pattern to be fully consumed and at least one # group has not been matched if last_matched_sgptr < len(groups) - 1: return MatchResult(False) content = arr2str(matched) return MatchResult(True, content) def match_grouping( self, grouping: Grouping, string: str, stridx: int = 0 ) -> MatchResult: """ Finds the longest match by greedily matching characters. Greedy here implies, that when matching text on a sub-group, it will consume as many characters from the text, as the sub-group can match. This is noteworthy since a non-greedy sub-group match may result in a overall pattern match; whereas the greedy approach results in a non-match; e.g.: for pattern = (aa)*a, text = aaaa, the greedy algorithm would consider this a non-match since, the pattern must be fully consumed, and the final 'a' does not get consumed. The user string may be partially consumed. However, if the pattern is not consumed, then this is considered a non-match. An empty string is a valid input to match. Further an empty string can match an arbitrary pattern, as long as each child pattern allows zero matches. Thus all matching handlers must handle zero-length input string Args: stridx: idx where to start matching string Returns: MatchResult.matched is True if there is a complete match; else False """ groups = grouping.groups group_repetition = 0 # count of repetitions of group matched = [] # chars that have matched sg_matched = True while self.can_consume(group_repetition, grouping.quantifier): res = self.match_sub_groups(groups, string, stridx) if res.matched: matched.append(res.content) group_repetition += 1 stridx += len(res.content) if len(res.content) == 0: break else: # sub group did not match sg_matched = False break content = arr2str(matched) # the following distinguishes a non-match from an empty match # i.e. content length is 0 and the quantifier does not allow a zero match if (len(content) == 0 and sg_matched is False) and self.accepts_empty( grouping.quantifier ) is False: return MatchResult(False) return MatchResult(True, content) def match(self, string: str, stridx: int = 0) -> Optional[str]: """ Attempt to match `string` starting at `startidx` Args: string: string to match startidx: location to start search Return None: no-match str: match (possibly zero length) """ matched = None result = self.match_grouping(self.compiled, string, stridx) if result.matched: matched = result.content return matched
Static methods
def accepts_empty(quantifier: Quantifier) ‑> bool
-
Determines whether the quantifier allows for an empty match
Expand source code
@staticmethod def accepts_empty(quantifier: Quantifier) -> bool: """ Determines whether the quantifier allows for an empty match """ # TODO: add check for quantity range return isinstance(quantifier, ZeroOrMore) or isinstance(quantifier, ZeroOrOne)
def can_consume(to_run_iteration: int, quantifier: Quantifier) ‑> bool
-
Determines whether the
quantifier
allows consuming another repetition NOTE: this should be invoked before consuming to_run_iteration is 0-basedExpand source code
@staticmethod def can_consume(to_run_iteration: int, quantifier: Quantifier) -> bool: """ Determines whether the `quantifier` allows consuming another repetition NOTE: this should be invoked before consuming to_run_iteration is 0-based """ if isinstance(quantifier, ZeroOrMore) or isinstance(quantifier, OneOrMore): return True elif isinstance(quantifier, ZeroOrOne): return to_run_iteration < 1 elif isinstance(quantifier, RangeQuantifier): # NB: quantifier values are 1-based and inclusive, and to_run_iteration is 0-based return to_run_iteration < quantifier.ubound elif quantifier is None: # interpret None as 1 return to_run_iteration < 1 return True
def match_charset(charset: Charset, string: str, stridx: int = 0) ‑> MatchResult
-
Attempt to match a charset
Expand source code
@staticmethod def match_charset(charset: Charset, string: str, stridx: int = 0) -> MatchResult: """ Attempt to match a charset """ for charset_member in charset.matches: if stridx == len(string): # handle empty string return MatchResult(False) if isinstance(charset_member, Literal): if charset_member.value == string[stridx]: return MatchResult(True, string[stridx]) elif isinstance(charset_member, CharRange): # NOTE: this comparison relies on python's lexical ordering if ( charset_member.range_start <= string[stridx] <= charset_member.range_end ): return MatchResult(True, string[stridx]) return MatchResult(False)
def match_literal(literal: Literal, string: str, stridx: int = 0) ‑> MatchResult
-
Attempt to match a literal
Expand source code
@staticmethod def match_literal(literal: Literal, string: str, stridx: int = 0) -> MatchResult: """ Attempt to match a literal """ for idx, lch in enumerate(literal.value): if stridx + idx == len(string): return MatchResult(False) if lch != string[stridx + idx]: return MatchResult(False) return MatchResult(True, literal.value)
def sufficient_consumed(last_repetition: int, quantifier: Quantifier) ‑> bool
-
Determines whether the minimum number of elements was consumed NOTE: this should be invoked after consuming
Expand source code
@staticmethod def sufficient_consumed(last_repetition: int, quantifier: Quantifier) -> bool: """ Determines whether the minimum number of elements was consumed NOTE: this should be invoked after consuming """ if isinstance(quantifier, ZeroOrOne): return True elif isinstance(quantifier, ZeroOrMore): return True elif isinstance(quantifier, OneOrMore): return last_repetition >= 1 elif isinstance(quantifier, RangeQuantifier): # NB: quantifier values are 1-based and inclusive, and last_repetition is 0-based return quantifier.lbound <= last_repetition elif quantifier is None: # TODO: not sure if this correct return True else: return True
Methods
def check_and_update_empty(self, result: MatchResult, quantifier: Quantifier) ‑> MatchResult
-
In cases where the quantifier, permits 0 matches, e.g. *, ?, a "no-match" of a sub-group is to be treated as a match of length zero.
Returns
If there is a no-match, and the quantifier allows zero matches, then this will update the result to be an empty match; Otherwise, it will return the
result
Expand source code
def check_and_update_empty( self, result: MatchResult, quantifier: Quantifier ) -> MatchResult: """ In cases where the quantifier, permits 0 matches, e.g. *, ?, a "no-match" of a sub-group is to be treated as a match of length zero. Returns: If there is a no-match, and the quantifier allows zero matches, then this will update the result to be an empty match; Otherwise, it will return the `result` """ # if quantifier is [0,..] and no-match, this is treated # as a match of len 0; if result.matched is False and self.accepts_empty(quantifier): result = MatchResult(True, "") # else return original result return result
def match(self, string: str, stridx: int = 0) ‑> Union[str, NoneType]
-
Attempt to match
string
starting atstartidx
Args
string
- string to match
startidx
- location to start search
Return None: no-match str: match (possibly zero length)
Expand source code
def match(self, string: str, stridx: int = 0) -> Optional[str]: """ Attempt to match `string` starting at `startidx` Args: string: string to match startidx: location to start search Return None: no-match str: match (possibly zero length) """ matched = None result = self.match_grouping(self.compiled, string, stridx) if result.matched: matched = result.content return matched
def match_grouping(self, grouping: Grouping, string: str, stridx: int = 0) ‑> MatchResult
-
Finds the longest match by greedily matching characters. Greedy here implies, that when matching text on a sub-group, it will consume as many characters from the text, as the sub-group can match. This is noteworthy since a non-greedy sub-group match may result in a overall pattern match; whereas the greedy approach results in a non-match; e.g.:
for pattern = (aa)*a, text = aaaa, the greedy algorithm would consider this a non-match since, the pattern must be fully consumed, and the final 'a' does not get consumed.
The user string may be partially consumed. However, if the pattern is not consumed, then this is considered a non-match.
An empty string is a valid input to match. Further an empty string can match an arbitrary pattern, as long as each child pattern allows zero matches. Thus all matching handlers must handle zero-length input string
Args
stridx
- idx where to start matching string
Returns
MatchResult.matched is True if there is a complete match; else False
Expand source code
def match_grouping( self, grouping: Grouping, string: str, stridx: int = 0 ) -> MatchResult: """ Finds the longest match by greedily matching characters. Greedy here implies, that when matching text on a sub-group, it will consume as many characters from the text, as the sub-group can match. This is noteworthy since a non-greedy sub-group match may result in a overall pattern match; whereas the greedy approach results in a non-match; e.g.: for pattern = (aa)*a, text = aaaa, the greedy algorithm would consider this a non-match since, the pattern must be fully consumed, and the final 'a' does not get consumed. The user string may be partially consumed. However, if the pattern is not consumed, then this is considered a non-match. An empty string is a valid input to match. Further an empty string can match an arbitrary pattern, as long as each child pattern allows zero matches. Thus all matching handlers must handle zero-length input string Args: stridx: idx where to start matching string Returns: MatchResult.matched is True if there is a complete match; else False """ groups = grouping.groups group_repetition = 0 # count of repetitions of group matched = [] # chars that have matched sg_matched = True while self.can_consume(group_repetition, grouping.quantifier): res = self.match_sub_groups(groups, string, stridx) if res.matched: matched.append(res.content) group_repetition += 1 stridx += len(res.content) if len(res.content) == 0: break else: # sub group did not match sg_matched = False break content = arr2str(matched) # the following distinguishes a non-match from an empty match # i.e. content length is 0 and the quantifier does not allow a zero match if (len(content) == 0 and sg_matched is False) and self.accepts_empty( grouping.quantifier ) is False: return MatchResult(False) return MatchResult(True, content)
def match_sub_groups(self, groups: List[Token], string: str, stridx: int = 0)
-
Attempt to match a list of sub groups.
Expand source code
def match_sub_groups(self, groups: List[Token], string: str, stridx: int = 0): """ Attempt to match a list of sub groups. """ sgptr = 0 # sub-group ptr sgroup_repetition = 0 # count of repetitions of current sub group matched = [] # the last gptr location where a match was found # needed to determine if the pattern was fully consumed last_matched_sgptr = -1 while sgptr < len(groups): # there are 2 things to check: # 1) is there a match # 2) is the number of repetitions of match as expected # consume as much of string (maximum munch) using subgroup subgroup = groups[sgptr] # does the quantifier on this subgroup, allow it to consume more chars if self.can_consume(sgroup_repetition, subgroup.quantifier): # invoke the right handler res = None if isinstance(subgroup, Literal): res = self.match_literal(subgroup, string, stridx) res = self.check_and_update_empty(res, subgroup.quantifier) elif isinstance(subgroup, Charset): res = self.match_charset(subgroup, string, stridx) res = self.check_and_update_empty(res, subgroup.quantifier) else: assert isinstance( subgroup, Grouping ), "unexpected sub-expression type" # groupings can be nested # so the matching algorithm must be recursive res = self.match_grouping(subgroup, string, stridx) res = self.check_and_update_empty(res, subgroup.quantifier) # current component matched if res.matched: sgroup_repetition += 1 matched.append(res.content) # increment string pointer stridx += len(res.content) last_matched_sgptr = sgptr # increment the gptr; this represents # something not matching with [0,...] quantifier # we treat this as a match of length 0 if len(res.content) == 0: sgptr += 1 sgroup_repetition = 0 # current component did not match else: # no match, move to next matchable # check minimum match cond was violated if not self.sufficient_consumed( sgroup_repetition, subgroup.quantifier ): return MatchResult(False) sgroup_repetition = 0 # increment component pointer sgptr += 1 else: # quantifier restricts further consume # consider next subgroup sgptr += 1 sgroup_repetition = 0 continue # we want the pattern to be fully consumed and at least one # group has not been matched if last_matched_sgptr < len(groups) - 1: return MatchResult(False) content = arr2str(matched) return MatchResult(True, content)
class PatternParser (pattern: str)
-
Class responsible for parsing logic.
Expand source code
class PatternParser: """ Class responsible for parsing logic. """ def __init__(self, pattern: str): # when parsing a chunk, `start` is the start of the chunk # and `current` points to the `current` char self.start = 0 self.current = 0 self.pattern = pattern self.compiled = None self.compile() def is_at_end(self) -> bool: """ Determine whether parser is at the end of the pattern text """ return self.current >= len(self.pattern) def advance(self) -> str: """ Consume and return the current char. Consumption increments `current` """ char = self.pattern[self.current] self.current += 1 return char def has_next(self) -> bool: """ Determines whether there is a next element to consume """ return self.current < len(self.pattern) - 1 def has_next_next(self) -> bool: """ Determines whether there is a next to next element to consume """ return self.current < len(self.pattern) - 2 def peek(self) -> str: """ View the current element without consuming it """ return self.pattern[self.current] def peek_next(self) -> str: """ View the next element without consuming it. NOTE: This call is only valid if `has_next` is True """ return self.pattern[self.current + 1] def peek_next_next(self) -> str: """ View the next to next element without consuming it NOTE: This call is only valid if `has_next_next` is True """ return self.pattern[self.current + 2] @staticmethod def try_chars_to_num(chars: List[str]) -> Optional[int]: """ Attempt to parse a list of numeric chars into an int; if input is empty, returns None """ num_str = ''.join(chars) if len(num_str) == 0: return None return int(num_str) @staticmethod def coalesce_literals(tokens: List[Token]) -> List[Token]: """ Combine adjacent literal chars with no quantifier into a literal word; e.g. L[a]L[b] -> L[ab] This is needed because the parsing pass, does not peek to the next char and treats each char as a single-char literal. This coalescing is needed to match word level repetitions. The relative ordering of non-`Literal` tokens and `Literal` tokens with quantifiers is unchanged. """ coalesced = [] partials = ( [] ) # partial list of chars that will be coalesced into in a single Literal for idx, token in enumerate(tokens): # if quantifier is not None, can't coalesce into one literal if isinstance(token, Literal) and token.quantifier is None: partials.append(token) elif len(partials) > 0: # we're at coalesce boundary; coalesce `partials` into a single # Literal and add to result value = "".join([literal.value for literal in partials]) coalesced.append(Literal(value)) partials = [] # add all other tokens, as-is if ( not isinstance(token, Literal) or isinstance(token, Literal) and token.quantifier is not None ): coalesced.append(token) if len(partials) > 0: # add remaining chars as a literal value = "".join([literal.value for literal in partials]) coalesced.append(Literal(value)) return coalesced def compile(self): """ compile the user supplied pattern """ if self.compiled is None: self.compiled = self.compile_grouping() def compile_subgroups(self, subgroups: List[Token]) -> Token: """ Compile subgroups. When compiling a pattern, a pattern may consist of one or more subgroups. If there is a single sub-group, return the subgroup as is. This is needed to avoid excessive nesting, which leads to incorrect behavior. If there are multiple sub-groups, then wrap them in a coalesced `Grouping` """ coalesced = self.coalesce_literals(subgroups) if len(coalesced) == 1: # return single sub group as is return coalesced.pop() # wrap multiple sub-groups in a grouping return Grouping(coalesced) def compile_grouping(self, is_nested: bool = False) -> Grouping: """ Compile the pattern/grouping. A grouping consists of any number of literals, charsets, and sub-groups, and is the most general representation of a pattern. Hence, the user input is treated as a `Grouping` args: is_nested: whether this is nested grouping pattern can be of form: foo : literal [a-z]: charset (range) [3-9] [az] charsets (enumeration) fox* : single-char quantifier: *,+,? [3-9]{3,4}: quantity/numeric range (foo)*: groupings (foo_[a-z]*)+: groupings NB: quantifiers can apply to any other element, except another quantifier special chars, namely: -, {, }, [, ], *, +, ?, (, ) must be escaped escape via backslash prepended to char An unescaped special char is an error An escaped char, that should not be escaped in an error. Returns: compiled grouping; for the root call this is a `Grouping` object; for a non-root call this could be any other `Token` """ compiled = [] while self.is_at_end() is False: ch = self.advance() if ch == "(": # handle grouping grouping = self.compile_grouping(is_nested=True) compiled.append(grouping) elif ch == ")": if is_nested: # this terminates the grouping return self.compile_subgroups(compiled) raise UnescapedChar(ch) elif ch == "[": # this will either succeed and consume and return # entire charset, or raise exception charset = self.compile_charset() compiled.append(charset) # handle quantifiers elif ch == "*" or ch == "+" or ch == "?": # find matchable to attach quantifier to matchable = compiled[-1] if len(compiled) > 0 else None if matchable is None: raise UnescapedChar(ch) if ch == "*": matchable.quantifier = ZeroOrMore() elif ch == "+": matchable.quantifier = OneOrMore() else: assert ch == "?", "unknown quantifier" matchable.quantifier = ZeroOrOne() elif ch == "{": # range quantifier # this will either succeed and consume the entire quantifier or raise range_quantifier = self.compile_range_quantifier() matchable = compiled[-1] if len(compiled) > 0 else None if matchable is None: # some error raise UnassociatedQuantifier(str(range_quantifier)) matchable.quantifier = range_quantifier elif ch == '-': # this is an unescaped dash raise UnescapedChar(ch) # handle escape char elif ch == "\\": next_char = self.advance() if next_char not in ESCAPABLE_CHARS: # NOTE: currently not supporting all escape chars raise UnrecognizedEscapedChar(next_char) # add the escaped char as a literal compiled.append(Literal(next_char)) else: compiled.append(Literal(ch)) if is_nested: # this is error; since this was a nested call we should have found # a closing bracket; the choice of exception is imprecise # because the user's intention could be to: 1) create a group # and perhaps forgot the closing bracket; or 2) a literal match # on open bracket "(" and forgot to escape; for now bracket "(" must be escaped raise UnescapedChar("Unclosed bracket") grouping = self.compile_subgroups(compiled) if not is_nested and not isinstance(grouping, Grouping): # this is the root call- the returned must be wrapped in a Grouping grouping = Grouping([grouping]) return grouping def compile_range_quantifier(self) -> RangeQuantifier: """ Compile a range quantifier, e.g. '{1, 2}' This is invoked after consuming the opening '{' """ current = RQState.OpenBrace lower = -1 upper = -1 chars = [] # number chars while self.is_at_end() is False: ch = self.advance() if ch.isnumeric(): chars.append(ch) if current == RQState.OpenBrace: current = RQState.ParsingLower elif current == RQState.PostLower or current == RQState.PostUpper: # error: a number's word boundary was already reached # there is a break between numeric characters raise UnexpectedChar(ch, "Invalid quantifier range") elif current == RQState.PreUpper: current = RQState.ParsingUpper elif ch.isspace(): # how to interpret a space depends on what is being parsed if current == RQState.OpenBrace: continue # skip leading space elif current == RQState.ParsingLower: # this is a word boundary lower = self.try_chars_to_num(chars) if lower is None: raise UnexpectedChar("Unexpected space character word boundary; Invalid quantifier range") chars = [] current = RQState.PostLower elif current == RQState.ParsingUpper: current = RQState.PostUpper elif ch == ',': current = RQState.PreUpper # the first word lower = self.try_chars_to_num(chars) if lower is None: # ill formed expression raise UnexpectedChar(',', " '{' must be either escaped, or be properly constructed e.g. {M, N}; Invalid quantifier range") # noqa E501 chars = [] elif ch == '}': upper = self.try_chars_to_num(chars) if upper is None: raise UnexpectedChar(',', " '{' must be either escaped, or be properly constructed e.g. {M, N}; Invalid quantifier range") # noqa E501 break else: # unexpected char raise UnexpectedChar return RangeQuantifier(lower, upper) def compile_charset(self) -> Charset: """ Consume characters in the pattern, corresponding to a charset, i.e. started with '[', terminated with ']' with literals and char ranges in between. Returns: compiled `Charset` Raises: - UnclosedSet and UnescapedChar """ result = [] closed = False # whether the charset is closed while self.is_at_end() is False: ch = self.advance() # consume char # handle escape char if ch == "\\": next_char = self.advance() if next_char not in ESCAPABLE_CHARS: # NOTE: currently not supporting all escape chars raise UnrecognizedEscapedChar # add the escaped char as a literal result.append(Literal(next_char)) # handle range dash if ch == "-": # an unescaped dash, should only appear between a range # this simplifies the case, where it's the first or last char in set raise UnescapedChar(ch) if ch == "]": # closing bracket found closed = True break else: # ch is non-special # check if it's part of a range if self.has_next() and self.peek() == "-": if not self.has_next_next(): # unescaped dash's are only supported in range raise UnescapedChar(self.peek()) self.advance() # consume the dash rng_end = self.advance() rng = CharRange(ch, rng_end) result.append(rng) else: # handle literal result.append(Literal(ch)) if not closed: raise UnclosedCharSet() return Charset(result) def match(self, string: str, startidx: int = 0) -> Optional[str]: """ Attempt to match `string` starting at `startidx` Args: string: string to match startidx: location to start search Return None: no-match str: match (possibly zero length) """ matcher = PatternMatcher(self.compiled) return matcher.match(string, startidx)
Static methods
def coalesce_literals(tokens: List[Token]) ‑> List[Token]
-
Combine adjacent literal chars with no quantifier into a literal word; e.g. L[a]L[b] -> L[ab]
This is needed because the parsing pass, does not peek to the next char and treats each char as a single-char literal. This coalescing is needed to match word level repetitions.
The relative ordering of non-
Literal
tokens andLiteral
tokens with quantifiers is unchanged.Expand source code
@staticmethod def coalesce_literals(tokens: List[Token]) -> List[Token]: """ Combine adjacent literal chars with no quantifier into a literal word; e.g. L[a]L[b] -> L[ab] This is needed because the parsing pass, does not peek to the next char and treats each char as a single-char literal. This coalescing is needed to match word level repetitions. The relative ordering of non-`Literal` tokens and `Literal` tokens with quantifiers is unchanged. """ coalesced = [] partials = ( [] ) # partial list of chars that will be coalesced into in a single Literal for idx, token in enumerate(tokens): # if quantifier is not None, can't coalesce into one literal if isinstance(token, Literal) and token.quantifier is None: partials.append(token) elif len(partials) > 0: # we're at coalesce boundary; coalesce `partials` into a single # Literal and add to result value = "".join([literal.value for literal in partials]) coalesced.append(Literal(value)) partials = [] # add all other tokens, as-is if ( not isinstance(token, Literal) or isinstance(token, Literal) and token.quantifier is not None ): coalesced.append(token) if len(partials) > 0: # add remaining chars as a literal value = "".join([literal.value for literal in partials]) coalesced.append(Literal(value)) return coalesced
def try_chars_to_num(chars: List[str]) ‑> Union[int, NoneType]
-
Attempt to parse a list of numeric chars into an int; if input is empty, returns None
Expand source code
@staticmethod def try_chars_to_num(chars: List[str]) -> Optional[int]: """ Attempt to parse a list of numeric chars into an int; if input is empty, returns None """ num_str = ''.join(chars) if len(num_str) == 0: return None return int(num_str)
Methods
def advance(self) ‑> str
-
Consume and return the current char. Consumption increments
current
Expand source code
def advance(self) -> str: """ Consume and return the current char. Consumption increments `current` """ char = self.pattern[self.current] self.current += 1 return char
def compile(self)
-
compile the user supplied pattern
Expand source code
def compile(self): """ compile the user supplied pattern """ if self.compiled is None: self.compiled = self.compile_grouping()
def compile_charset(self) ‑> Charset
-
Consume characters in the pattern, corresponding to a charset, i.e. started with '[', terminated with ']' with literals and char ranges in between.
Returns
compiled
Charset
Raises
- UnclosedSet and UnescapedChar
Expand source code
def compile_charset(self) -> Charset: """ Consume characters in the pattern, corresponding to a charset, i.e. started with '[', terminated with ']' with literals and char ranges in between. Returns: compiled `Charset` Raises: - UnclosedSet and UnescapedChar """ result = [] closed = False # whether the charset is closed while self.is_at_end() is False: ch = self.advance() # consume char # handle escape char if ch == "\\": next_char = self.advance() if next_char not in ESCAPABLE_CHARS: # NOTE: currently not supporting all escape chars raise UnrecognizedEscapedChar # add the escaped char as a literal result.append(Literal(next_char)) # handle range dash if ch == "-": # an unescaped dash, should only appear between a range # this simplifies the case, where it's the first or last char in set raise UnescapedChar(ch) if ch == "]": # closing bracket found closed = True break else: # ch is non-special # check if it's part of a range if self.has_next() and self.peek() == "-": if not self.has_next_next(): # unescaped dash's are only supported in range raise UnescapedChar(self.peek()) self.advance() # consume the dash rng_end = self.advance() rng = CharRange(ch, rng_end) result.append(rng) else: # handle literal result.append(Literal(ch)) if not closed: raise UnclosedCharSet() return Charset(result)
def compile_grouping(self, is_nested: bool = False) ‑> Grouping
-
Compile the pattern/grouping. A grouping consists of any number of literals, charsets, and sub-groups, and is the most general representation of a pattern. Hence, the user input is treated as a
Grouping
args: is_nested: whether this is nested grouping
pattern can be of form: foo : literal
[3-9] [az] charsets (enumeration) fox : single-char quantifier: ,+,? [3-9]{3,4}: quantity/numeric range (foo): groupings (foo_a-z)+: groupings
NB: quantifiers can apply to any other element, except another quantifier
special chars, namely: -, {, }, [, ], *, +, ?, (, ) must be escaped
escape via backslash prepended to char
An unescaped special char is an error
An escaped char, that should not be escaped in an error.
Returns
compiled grouping; for the root call this is a
Grouping
object; for a non-root call this could be any otherToken
Expand source code
def compile_grouping(self, is_nested: bool = False) -> Grouping: """ Compile the pattern/grouping. A grouping consists of any number of literals, charsets, and sub-groups, and is the most general representation of a pattern. Hence, the user input is treated as a `Grouping` args: is_nested: whether this is nested grouping pattern can be of form: foo : literal [a-z]: charset (range) [3-9] [az] charsets (enumeration) fox* : single-char quantifier: *,+,? [3-9]{3,4}: quantity/numeric range (foo)*: groupings (foo_[a-z]*)+: groupings NB: quantifiers can apply to any other element, except another quantifier special chars, namely: -, {, }, [, ], *, +, ?, (, ) must be escaped escape via backslash prepended to char An unescaped special char is an error An escaped char, that should not be escaped in an error. Returns: compiled grouping; for the root call this is a `Grouping` object; for a non-root call this could be any other `Token` """ compiled = [] while self.is_at_end() is False: ch = self.advance() if ch == "(": # handle grouping grouping = self.compile_grouping(is_nested=True) compiled.append(grouping) elif ch == ")": if is_nested: # this terminates the grouping return self.compile_subgroups(compiled) raise UnescapedChar(ch) elif ch == "[": # this will either succeed and consume and return # entire charset, or raise exception charset = self.compile_charset() compiled.append(charset) # handle quantifiers elif ch == "*" or ch == "+" or ch == "?": # find matchable to attach quantifier to matchable = compiled[-1] if len(compiled) > 0 else None if matchable is None: raise UnescapedChar(ch) if ch == "*": matchable.quantifier = ZeroOrMore() elif ch == "+": matchable.quantifier = OneOrMore() else: assert ch == "?", "unknown quantifier" matchable.quantifier = ZeroOrOne() elif ch == "{": # range quantifier # this will either succeed and consume the entire quantifier or raise range_quantifier = self.compile_range_quantifier() matchable = compiled[-1] if len(compiled) > 0 else None if matchable is None: # some error raise UnassociatedQuantifier(str(range_quantifier)) matchable.quantifier = range_quantifier elif ch == '-': # this is an unescaped dash raise UnescapedChar(ch) # handle escape char elif ch == "\\": next_char = self.advance() if next_char not in ESCAPABLE_CHARS: # NOTE: currently not supporting all escape chars raise UnrecognizedEscapedChar(next_char) # add the escaped char as a literal compiled.append(Literal(next_char)) else: compiled.append(Literal(ch)) if is_nested: # this is error; since this was a nested call we should have found # a closing bracket; the choice of exception is imprecise # because the user's intention could be to: 1) create a group # and perhaps forgot the closing bracket; or 2) a literal match # on open bracket "(" and forgot to escape; for now bracket "(" must be escaped raise UnescapedChar("Unclosed bracket") grouping = self.compile_subgroups(compiled) if not is_nested and not isinstance(grouping, Grouping): # this is the root call- the returned must be wrapped in a Grouping grouping = Grouping([grouping]) return grouping
def compile_range_quantifier(self) ‑> RangeQuantifier
-
Compile a range quantifier, e.g. '{1, 2}' This is invoked after consuming the opening '{'
Expand source code
def compile_range_quantifier(self) -> RangeQuantifier: """ Compile a range quantifier, e.g. '{1, 2}' This is invoked after consuming the opening '{' """ current = RQState.OpenBrace lower = -1 upper = -1 chars = [] # number chars while self.is_at_end() is False: ch = self.advance() if ch.isnumeric(): chars.append(ch) if current == RQState.OpenBrace: current = RQState.ParsingLower elif current == RQState.PostLower or current == RQState.PostUpper: # error: a number's word boundary was already reached # there is a break between numeric characters raise UnexpectedChar(ch, "Invalid quantifier range") elif current == RQState.PreUpper: current = RQState.ParsingUpper elif ch.isspace(): # how to interpret a space depends on what is being parsed if current == RQState.OpenBrace: continue # skip leading space elif current == RQState.ParsingLower: # this is a word boundary lower = self.try_chars_to_num(chars) if lower is None: raise UnexpectedChar("Unexpected space character word boundary; Invalid quantifier range") chars = [] current = RQState.PostLower elif current == RQState.ParsingUpper: current = RQState.PostUpper elif ch == ',': current = RQState.PreUpper # the first word lower = self.try_chars_to_num(chars) if lower is None: # ill formed expression raise UnexpectedChar(',', " '{' must be either escaped, or be properly constructed e.g. {M, N}; Invalid quantifier range") # noqa E501 chars = [] elif ch == '}': upper = self.try_chars_to_num(chars) if upper is None: raise UnexpectedChar(',', " '{' must be either escaped, or be properly constructed e.g. {M, N}; Invalid quantifier range") # noqa E501 break else: # unexpected char raise UnexpectedChar return RangeQuantifier(lower, upper)
def compile_subgroups(self, subgroups: List[Token]) ‑> Token
-
Compile subgroups.
When compiling a pattern, a pattern may consist of one or more subgroups. If there is a single sub-group, return the subgroup as is. This is needed to avoid excessive nesting, which leads to incorrect behavior. If there are multiple sub-groups, then wrap them in a coalesced
Grouping
Expand source code
def compile_subgroups(self, subgroups: List[Token]) -> Token: """ Compile subgroups. When compiling a pattern, a pattern may consist of one or more subgroups. If there is a single sub-group, return the subgroup as is. This is needed to avoid excessive nesting, which leads to incorrect behavior. If there are multiple sub-groups, then wrap them in a coalesced `Grouping` """ coalesced = self.coalesce_literals(subgroups) if len(coalesced) == 1: # return single sub group as is return coalesced.pop() # wrap multiple sub-groups in a grouping return Grouping(coalesced)
def has_next(self) ‑> bool
-
Determines whether there is a next element to consume
Expand source code
def has_next(self) -> bool: """ Determines whether there is a next element to consume """ return self.current < len(self.pattern) - 1
def has_next_next(self) ‑> bool
-
Determines whether there is a next to next element to consume
Expand source code
def has_next_next(self) -> bool: """ Determines whether there is a next to next element to consume """ return self.current < len(self.pattern) - 2
def is_at_end(self) ‑> bool
-
Determine whether parser is at the end of the pattern text
Expand source code
def is_at_end(self) -> bool: """ Determine whether parser is at the end of the pattern text """ return self.current >= len(self.pattern)
def match(self, string: str, startidx: int = 0) ‑> Union[str, NoneType]
-
Attempt to match
string
starting atstartidx
Args
string
- string to match
startidx
- location to start search
Return None: no-match str: match (possibly zero length)
Expand source code
def match(self, string: str, startidx: int = 0) -> Optional[str]: """ Attempt to match `string` starting at `startidx` Args: string: string to match startidx: location to start search Return None: no-match str: match (possibly zero length) """ matcher = PatternMatcher(self.compiled) return matcher.match(string, startidx)
def peek(self) ‑> str
-
View the current element without consuming it
Expand source code
def peek(self) -> str: """ View the current element without consuming it """ return self.pattern[self.current]
def peek_next(self) ‑> str
-
View the next element without consuming it. NOTE: This call is only valid if
has_next
is TrueExpand source code
def peek_next(self) -> str: """ View the next element without consuming it. NOTE: This call is only valid if `has_next` is True """ return self.pattern[self.current + 1]
def peek_next_next(self) ‑> str
-
View the next to next element without consuming it NOTE: This call is only valid if
has_next_next
is TrueExpand source code
def peek_next_next(self) -> str: """ View the next to next element without consuming it NOTE: This call is only valid if `has_next_next` is True """ return self.pattern[self.current + 2]
class Quantifier
-
base quantifier class
Expand source code
class Quantifier: """ base quantifier class """ pass
Subclasses
class RangeQuantifier (lower_bound: int, upper_bound: int)
-
Defines a inclusive range for the number of repetitions
Expand source code
class RangeQuantifier(Quantifier): """ Defines a inclusive range for the number of repetitions """ def __init__(self, lower_bound: int, upper_bound: int): self.lbound = lower_bound self.ubound = upper_bound def __str__(self): return f'{{{self.lbound},{self.ubound}}}' def __repr__(self): return str(self)
Ancestors
class RQState (value, names=None, *, module=None, qualname=None, type=None, start=1)
-
An enumeration.
Ancestors
- enum.Enum
Class variables
var OpenBrace
var ParsingLower
var ParsingUpper
var PostLower
var PostUpper
var PreUpper
class Token (quantifier=None)
-
base token type
Expand source code
class Token: """ base token type """ def __init__(self, quantifier=None): self.quantifier = quantifier
Subclasses
class UnassociatedQuantifier (*args, **kwargs)
-
An unassociated quantifier was found.
NOTE: This is only raised, e.g. if the pattern was {1,2}, but not for other quantifier. This is because the correct error messages, depends on the intentions of the user. From my assessment, a free '+' is likely to be an unescaped char. But '{1,2}' is almost certainly a missing body
Expand source code
class UnassociatedQuantifier(Exception): """ An unassociated quantifier was found. NOTE: This is only raised, e.g. if the pattern was {1,2}, but not for other quantifier. This is because the correct error messages, depends on the intentions of the user. From my assessment, a free '+' is likely to be an unescaped char. But '{1,2}' is almost certainly a missing body """
Ancestors
- builtins.Exception
- builtins.BaseException
class UnclosedCharSet (*args, **kwargs)
-
A char set was not closed, i.e. missing ']'
Expand source code
class UnclosedCharSet(Exception): """ A char set was not closed, i.e. missing ']' """ pass
Ancestors
- builtins.Exception
- builtins.BaseException
class UnescapedChar (*args, **kwargs)
-
These represent the constraint that special chars be escaped
Expand source code
class UnescapedChar(Exception): """ These represent the constraint that special chars be escaped """ pass
Ancestors
- builtins.Exception
- builtins.BaseException
class UnexpectedChar (*args, **kwargs)
-
generic exception for unexpected char and where one of the other exceptions doesn't seem appropriate
Expand source code
class UnexpectedChar(Exception): """ generic exception for unexpected char and where one of the other exceptions doesn't seem appropriate """
Ancestors
- builtins.Exception
- builtins.BaseException
class UnrecognizedEscapedChar (*args, **kwargs)
-
This represents that random characters should not be escaped
Expand source code
class UnrecognizedEscapedChar(Exception): """ This represents that random characters should not be escaped """ pass
Ancestors
- builtins.Exception
- builtins.BaseException
class ZeroOrMore
-
Represents quantifier zero or more repetitions
Expand source code
class ZeroOrMore(Quantifier): """ Represents quantifier zero or more repetitions """ def __str__(self): return "*"
Ancestors
class ZeroOrOne
-
Represents quantifier zero or one repetitions
Expand source code
class ZeroOrOne(Quantifier): """ Represents quantifier zero or one repetitions """ def __str__(self): return "?"
Ancestors