| """ |
| Iterator based sre token scanner |
| """ |
import re
from re import VERBOSE, MULTILINE, DOTALL
import sre_parse
import sre_compile
from sre_constants import BRANCH, SUBPATTERN

__all__ = ['Scanner', 'pattern']

FLAGS = (VERBOSE | MULTILINE | DOTALL)

class Scanner(object):
    def __init__(self, lexicon, flags=FLAGS):
        self.actions = [None]
        # Combine phrases into a compound pattern
        s = sre_parse.Pattern()
        s.flags = flags
        p = []
        for idx, token in enumerate(lexicon):
            phrase = token.pattern
            # Wrap each phrase in its own numbered group so that
            # m.lastindex identifies which token matched.
            subpattern = sre_parse.SubPattern(s,
                [(SUBPATTERN, (idx + 1, sre_parse.parse(phrase, flags)))])
            p.append(subpattern)
            self.actions.append(token)

        s.groups = len(p) + 1  # NOTE(guido): Added to make SRE validation work
        p = sre_parse.SubPattern(s, [(BRANCH, (None, p))])
        self.scanner = sre_compile.compile(p)

    def iterscan(self, string, idx=0, context=None):
| """ |
| Yield match, end_idx for each match |
| """ |
| match = self.scanner.scanner(string, idx).match |
| actions = self.actions |
| lastend = idx |
| end = len(string) |
| while True: |
| m = match() |
| if m is None: |
| break |
| matchbegin, matchend = m.span() |
| if lastend == matchend: |
| break |
| action = actions[m.lastindex] |
| if action is not None: |
| rval, next_pos = action(m, context) |
| if next_pos is not None and next_pos != matchend: |
| # "fast forward" the scanner |
| matchend = next_pos |
| match = self.scanner.scanner(string, matchend).match |
| yield rval, matchend |
| lastend = matchend |
| |
| |
| def pattern(pattern, flags=FLAGS): |
    """Attach a regex pattern (and its compiled form) to a token handler
    so it can be used as an entry in a Scanner lexicon."""
    def decorator(fn):
        fn.pattern = pattern
        fn.regex = re.compile(pattern, flags)
        return fn
    return decorator
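

# A minimal usage sketch, not part of the original module: the handlers,
# patterns, and sample input below are illustrative assumptions showing how
# pattern() and Scanner fit together.  Each handler carries its regex via
# @pattern and must return a (value, next_position) pair; next_position may
# be None to keep scanning from the end of the current match.
if __name__ == '__main__':
    @pattern(r'[0-9]+')
    def number(match, context):
        return int(match.group()), None

    @pattern(r'[A-Za-z_][A-Za-z_0-9]*')
    def name(match, context):
        return match.group(), None

    @pattern(r'\s+')
    def whitespace(match, context):
        return None, None

    scanner = Scanner([number, name, whitespace])
    # Expected to yield roughly:
    #   ('spam', 4), (None, 5), (42, 7), (None, 8), ('eggs', 12)
    for value, end in scanner.iterscan('spam 42 eggs'):
        print(value, end)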