# NOT_RPYTHON """ A pure Python reimplementation of the _sre module from CPython 2.4 Copyright 2005 Nik Haldimann, licensed under the MIT license This code is based on material licensed under CNRI's Python 1.6 license and copyrighted by: Copyright (c) 1997-2001 by Secret Labs AB """ MAXREPEAT = 2147483648 #import array import operator, sys from sre_constants import ATCODES, OPCODES, CHCODES from sre_constants import SRE_INFO_PREFIX, SRE_INFO_LITERAL from sre_constants import SRE_FLAG_UNICODE, SRE_FLAG_LOCALE import sys # Identifying as _sre from Python 2.3 or 2.4 #if sys.version_info[:2] >= (2, 4): MAGIC = 20031017 #else: # MAGIC = 20030419 # In _sre.c this is bytesize of the code word type of the C implementation. # There it's 2 for normal Python builds and more for wide unicode builds (large # enough to hold a 32-bit UCS-4 encoded character). Since here in pure Python # we only see re bytecodes as Python longs, we shouldn't have to care about the # codesize. But sre_compile will compile some stuff differently depending on the # codesize (e.g., charsets). # starting with python 3.3 CODESIZE is 4 #if sys.maxunicode == 65535: # CODESIZE = 2 #else: CODESIZE = 4 copyright = "_sre.py 2.4c Copyright 2005 by Nik Haldimann" def getcodesize(): return CODESIZE def compile(pattern, flags, code, groups=0, groupindex={}, indexgroup=[None]): """Compiles (or rather just converts) a pattern descriptor to a SRE_Pattern object. 
Actual compilation to opcodes happens in sre_compile.""" return SRE_Pattern(pattern, flags, code, groups, groupindex, indexgroup) def getlower(char_ord, flags): if (char_ord < 128) or (flags & SRE_FLAG_UNICODE) \ or (flags & SRE_FLAG_LOCALE and char_ord < 256): #return ord(unichr(char_ord).lower()) return ord(chr(char_ord).lower()) else: return char_ord class SRE_Pattern: def __init__(self, pattern, flags, code, groups=0, groupindex={}, indexgroup=[None]): self.pattern = pattern self.flags = flags self.groups = groups self.groupindex = groupindex # Maps group names to group indices self._indexgroup = indexgroup # Maps indices to group names self._code = code def match(self, string, pos=0, endpos=sys.maxsize): """If zero or more characters at the beginning of string match this regular expression, return a corresponding MatchObject instance. Return None if the string does not match the pattern.""" state = _State(string, pos, endpos, self.flags) if state.match(self._code): return SRE_Match(self, state) return None def search(self, string, pos=0, endpos=sys.maxsize): """Scan through string looking for a location where this regular expression produces a match, and return a corresponding MatchObject instance. 
Return None if no position in the string matches the pattern.""" state = _State(string, pos, endpos, self.flags) if state.search(self._code): return SRE_Match(self, state) else: return None def findall(self, string, pos=0, endpos=sys.maxsize): """Return a list of all non-overlapping matches of pattern in string.""" matchlist = [] state = _State(string, pos, endpos, self.flags) while state.start <= state.end: state.reset() state.string_position = state.start if not state.search(self._code): break match = SRE_Match(self, state) if self.groups == 0 or self.groups == 1: item = match.group(self.groups) else: item = match.groups("") matchlist.append(item) if state.string_position == state.start: state.start += 1 else: state.start = state.string_position return matchlist def _subx(self, template, string, count=0, subn=False): filter = template if not callable(template) and "\\" in template: # handle non-literal strings ; hand it over to the template compiler #import sre #sre was renamed to re #fix me brython #print("possible issue at _sre.py line 116") import re as sre filter = sre._subx(self, template) state = _State(string, 0, sys.maxsize, self.flags) sublist = [] n = last_pos = 0 while not count or n < count: state.reset() state.string_position = state.start if not state.search(self._code): break if last_pos < state.start: sublist.append(string[last_pos:state.start]) if not (last_pos == state.start and last_pos == state.string_position and n > 0): # the above ignores empty matches on latest position if callable(filter): sublist.append(filter(SRE_Match(self, state))) else: sublist.append(filter) last_pos = state.string_position n += 1 if state.string_position == state.start: state.start += 1 else: state.start = state.string_position if last_pos < state.end: sublist.append(string[last_pos:state.end]) item = "".join(sublist) if subn: return item, n else: return item def sub(self, repl, string, count=0): """Return the string obtained by replacing the leftmost 
non-overlapping occurrences of pattern in string by the replacement repl.""" return self._subx(repl, string, count, False) def subn(self, repl, string, count=0): """Return the tuple (new_string, number_of_subs_made) found by replacing the leftmost non-overlapping occurrences of pattern with the replacement repl.""" return self._subx(repl, string, count, True) def split(self, string, maxsplit=0): """Split string by the occurrences of pattern.""" splitlist = [] state = _State(string, 0, sys.maxsize, self.flags) n = 0 last = state.start while not maxsplit or n < maxsplit: state.reset() state.string_position = state.start if not state.search(self._code): break if state.start == state.string_position: # zero-width match if last == state.end: # or end of string break state.start += 1 continue splitlist.append(string[last:state.start]) # add groups (if any) if self.groups: match = SRE_Match(self, state) splitlist.extend(list(match.groups(None))) n += 1 last = state.start = state.string_position splitlist.append(string[last:state.end]) return splitlist def finditer(self, string, pos=0, endpos=sys.maxsize): """Return a list of all non-overlapping matches of pattern in string.""" #scanner = self.scanner(string, pos, endpos) _list=[] _m=self.scanner(string, pos, endpos) _re=SRE_Scanner(self, string, pos, endpos) _m=_re.search() while _m: _list.append(_m) _m=_re.search() return _list #return iter(scanner.search, None) def scanner(self, string, start=0, end=sys.maxsize): return SRE_Scanner(self, string, start, end) def __copy__(self): raise TypeError("cannot copy this pattern object") def __deepcopy__(self): raise TypeError("cannot copy this pattern object") class SRE_Scanner: """Undocumented scanner interface of sre.""" def __init__(self, pattern, string, start, end): self.pattern = pattern self._state = _State(string, start, end, self.pattern.flags) def _match_search(self, matcher): state = self._state state.reset() state.string_position = state.start match = None if 
matcher(self.pattern._code): match = SRE_Match(self.pattern, state) if match is None or state.string_position == state.start: state.start += 1 else: state.start = state.string_position return match def match(self): return self._match_search(self._state.match) def search(self): return self._match_search(self._state.search) class SRE_Match: def __init__(self, pattern, state): self.re = pattern self.string = state.string self.pos = state.pos self.endpos = state.end self.lastindex = state.lastindex if self.lastindex < 0: self.lastindex = None self.regs = self._create_regs(state) #statement below is not valid under python3 ( 0 <= None) #if pattern._indexgroup and 0 <= self.lastindex < len(pattern._indexgroup): if self.lastindex is not None and pattern._indexgroup and 0 <= self.lastindex < len(pattern._indexgroup): # The above upper-bound check should not be necessary, as the re # compiler is supposed to always provide an _indexgroup list long # enough. But the re.Scanner class seems to screw up something # there, test_scanner in test_re won't work without upper-bound # checking. XXX investigate this and report bug to CPython. 
self.lastgroup = pattern._indexgroup[self.lastindex] else: self.lastgroup = None def _create_regs(self, state): """Creates a tuple of index pairs representing matched groups.""" regs = [(state.start, state.string_position)] for group in range(self.re.groups): mark_index = 2 * group if mark_index + 1 < len(state.marks) \ and state.marks[mark_index] is not None \ and state.marks[mark_index + 1] is not None: regs.append((state.marks[mark_index], state.marks[mark_index + 1])) else: regs.append((-1, -1)) return tuple(regs) def _get_index(self, group): if isinstance(group, int): if group >= 0 and group <= self.re.groups: return group else: if group in self.re.groupindex: return self.re.groupindex[group] raise IndexError("no such group") def _get_slice(self, group, default): group_indices = self.regs[group] if group_indices[0] >= 0: return self.string[group_indices[0]:group_indices[1]] else: return default def start(self, group=0): """Returns the indices of the start of the substring matched by group; group defaults to zero (meaning the whole matched substring). Returns -1 if group exists but did not contribute to the match.""" return self.regs[self._get_index(group)][0] def end(self, group=0): """Returns the indices of the end of the substring matched by group; group defaults to zero (meaning the whole matched substring). Returns -1 if group exists but did not contribute to the match.""" return self.regs[self._get_index(group)][1] def span(self, group=0): """Returns the 2-tuple (m.start(group), m.end(group)).""" return self.start(group), self.end(group) def expand(self, template): """Return the string obtained by doing backslash substitution and resolving group references on template.""" import sre return sre._expand(self.re, self, template) def groups(self, default=None): """Returns a tuple containing all the subgroups of the match. 
The default argument is used for groups that did not participate in the match (defaults to None).""" groups = [] for indices in self.regs[1:]: if indices[0] >= 0: groups.append(self.string[indices[0]:indices[1]]) else: groups.append(default) return tuple(groups) def groupdict(self, default=None): """Return a dictionary containing all the named subgroups of the match. The default argument is used for groups that did not participate in the match (defaults to None).""" groupdict = {} for key, value in self.re.groupindex.items(): groupdict[key] = self._get_slice(value, default) return groupdict def group(self, *args): """Returns one or more subgroups of the match. Each argument is either a group index or a group name.""" if len(args) == 0: args = (0,) grouplist = [] for group in args: grouplist.append(self._get_slice(self._get_index(group), None)) if len(grouplist) == 1: return grouplist[0] else: return tuple(grouplist) def __copy__(): raise TypeError("cannot copy this pattern object") def __deepcopy__(): raise TypeError("cannot copy this pattern object") class _State: def __init__(self, string, start, end, flags): self.string = string if start < 0: start = 0 if end > len(string): end = len(string) self.start = start self.string_position = self.start self.end = end self.pos = start self.flags = flags self.reset() def reset(self): self.marks = [] self.lastindex = -1 self.marks_stack = [] self.context_stack = [] self.repeat = None def match(self, pattern_codes): # Optimization: Check string length. pattern_codes[3] contains the # minimum length for a string to possibly match. # brython.. 
the optimization doesn't work #if pattern_codes[0] == OPCODES["info"] and pattern_codes[3]: # if self.end - self.string_position < pattern_codes[3]: # #_log("reject (got %d chars, need %d)" # # % (self.end - self.string_position, pattern_codes[3])) # return False dispatcher = _OpcodeDispatcher() self.context_stack.append(_MatchContext(self, pattern_codes)) has_matched = None while len(self.context_stack) > 0: context = self.context_stack[-1] has_matched = dispatcher.match(context) if has_matched is not None: # don't pop if context isn't done self.context_stack.pop() return has_matched def search(self, pattern_codes): flags = 0 if pattern_codes[0] == OPCODES["info"]: # optimization info block # <1=skip> <2=flags> <3=min> <4=max> <5=prefix info> if pattern_codes[2] & SRE_INFO_PREFIX and pattern_codes[5] > 1: return self.fast_search(pattern_codes) flags = pattern_codes[2] pattern_codes = pattern_codes[pattern_codes[1] + 1:] string_position = self.start if pattern_codes[0] == OPCODES["literal"]: # Special case: Pattern starts with a literal character. 
This is # used for short prefixes character = pattern_codes[1] while True: while string_position < self.end \ and ord(self.string[string_position]) != character: string_position += 1 if string_position >= self.end: return False self.start = string_position string_position += 1 self.string_position = string_position if flags & SRE_INFO_LITERAL: return True if self.match(pattern_codes[2:]): return True return False # General case while string_position <= self.end: self.reset() self.start = self.string_position = string_position if self.match(pattern_codes): return True string_position += 1 return False def fast_search(self, pattern_codes): """Skips forward in a string as fast as possible using information from an optimization info block.""" # pattern starts with a known prefix # <5=length> <6=skip> <7=prefix data> flags = pattern_codes[2] prefix_len = pattern_codes[5] prefix_skip = pattern_codes[6] # don't really know what this is good for prefix = pattern_codes[7:7 + prefix_len] overlap = pattern_codes[7 + prefix_len - 1:pattern_codes[1] + 1] pattern_codes = pattern_codes[pattern_codes[1] + 1:] i = 0 string_position = self.string_position while string_position < self.end: while True: if ord(self.string[string_position]) != prefix[i]: if i == 0: break else: i = overlap[i] else: i += 1 if i == prefix_len: # found a potential match self.start = string_position + 1 - prefix_len self.string_position = string_position + 1 \ - prefix_len + prefix_skip if flags & SRE_INFO_LITERAL: return True # matched all of pure literal pattern if self.match(pattern_codes[2 * prefix_skip:]): return True i = overlap[i] break string_position += 1 return False def set_mark(self, mark_nr, position): if mark_nr & 1: # This id marks the end of a group. 
# fix python 3 division incompatability #self.lastindex = mark_nr / 2 + 1 self.lastindex = mark_nr // 2 + 1 if mark_nr >= len(self.marks): self.marks.extend([None] * (mark_nr - len(self.marks) + 1)) self.marks[mark_nr] = position def get_marks(self, group_index): marks_index = 2 * group_index if len(self.marks) > marks_index + 1: return self.marks[marks_index], self.marks[marks_index + 1] else: return None, None def marks_push(self): self.marks_stack.append((self.marks[:], self.lastindex)) def marks_pop(self): self.marks, self.lastindex = self.marks_stack.pop() def marks_pop_keep(self): self.marks, self.lastindex = self.marks_stack[-1] def marks_pop_discard(self): self.marks_stack.pop() def lower(self, char_ord): return getlower(char_ord, self.flags) class _MatchContext: def __init__(self, state, pattern_codes): self.state = state self.pattern_codes = pattern_codes self.string_position = state.string_position self.code_position = 0 self.has_matched = None def push_new_context(self, pattern_offset): """Creates a new child context of this context and pushes it on the stack. 
pattern_offset is the offset off the current code position to start interpreting from.""" child_context = _MatchContext(self.state, self.pattern_codes[self.code_position + pattern_offset:]) #print("_sre.py:517:pushing new context") #, child_context.has_matched) #print(self.state.string_position) #print(self.pattern_codes[self.code_position + pattern_offset:]) #print(pattern_offset) self.state.context_stack.append(child_context) return child_context def peek_char(self, peek=0): return self.state.string[self.string_position + peek] def skip_char(self, skip_count): self.string_position += skip_count def remaining_chars(self): return self.state.end - self.string_position def peek_code(self, peek=0): return self.pattern_codes[self.code_position + peek] def skip_code(self, skip_count): self.code_position += skip_count def remaining_codes(self): return len(self.pattern_codes) - self.code_position def at_beginning(self): return self.string_position == 0 def at_end(self): return self.string_position == self.state.end def at_linebreak(self): return not self.at_end() and _is_linebreak(self.peek_char()) def at_boundary(self, word_checker): if self.at_beginning() and self.at_end(): return False that = not self.at_beginning() and word_checker(self.peek_char(-1)) this = not self.at_end() and word_checker(self.peek_char()) return this != that class _RepeatContext(_MatchContext): def __init__(self, context): _MatchContext.__init__(self, context.state, context.pattern_codes[context.code_position:]) self.count = -1 #print('569:repeat', context.state.repeat) self.previous = context.state.repeat self.last_position = None class _Dispatcher: DISPATCH_TABLE = None def dispatch(self, code, context): method = self.DISPATCH_TABLE.get(code, self.__class__.unknown) return method(self, context) def unknown(self, code, ctx): raise NotImplementedError() def build_dispatch_table(cls, code_dict, method_prefix): if cls.DISPATCH_TABLE is not None: return table = {} for key, value in 
code_dict.items(): if hasattr(cls, "%s%s" % (method_prefix, key)): table[value] = getattr(cls, "%s%s" % (method_prefix, key)) cls.DISPATCH_TABLE = table build_dispatch_table = classmethod(build_dispatch_table) class _OpcodeDispatcher(_Dispatcher): def __init__(self): self.executing_contexts = {} self.at_dispatcher = _AtcodeDispatcher() self.ch_dispatcher = _ChcodeDispatcher() self.set_dispatcher = _CharsetDispatcher() def match(self, context): """Returns True if the current context matches, False if it doesn't and None if matching is not finished, ie must be resumed after child contexts have been matched.""" while context.remaining_codes() > 0 and context.has_matched is None: opcode = context.peek_code() if not self.dispatch(opcode, context): return None if context.has_matched is None: context.has_matched = False return context.has_matched def dispatch(self, opcode, context): """Dispatches a context on a given opcode. Returns True if the context is done matching, False if it must be resumed when next encountered.""" #if self.executing_contexts.has_key(id(context)): if id(context) in self.executing_contexts: generator = self.executing_contexts[id(context)] del self.executing_contexts[id(context)] has_finished = next(generator) else: method = self.DISPATCH_TABLE.get(opcode, _OpcodeDispatcher.unknown) has_finished = method(self, context) if hasattr(has_finished, "__next__"): # avoid using the types module generator = has_finished has_finished = next(generator) if not has_finished: self.executing_contexts[id(context)] = generator return has_finished def op_success(self, ctx): # end of pattern #self._log(ctx, "SUCCESS") ctx.state.string_position = ctx.string_position ctx.has_matched = True return True def op_failure(self, ctx): # immediate failure #self._log(ctx, "FAILURE") ctx.has_matched = False return True def general_op_literal(self, ctx, compare, decorate=lambda x: x): #print(ctx.peek_char()) if ctx.at_end() or not compare(decorate(ord(ctx.peek_char())), 
decorate(ctx.peek_code(1))): ctx.has_matched = False ctx.skip_code(2) ctx.skip_char(1) def op_literal(self, ctx): # match literal string # #self._log(ctx, "LITERAL", ctx.peek_code(1)) self.general_op_literal(ctx, operator.eq) return True def op_not_literal(self, ctx): # match anything that is not the given literal character # #self._log(ctx, "NOT_LITERAL", ctx.peek_code(1)) self.general_op_literal(ctx, operator.ne) return True def op_literal_ignore(self, ctx): # match literal regardless of case # #self._log(ctx, "LITERAL_IGNORE", ctx.peek_code(1)) self.general_op_literal(ctx, operator.eq, ctx.state.lower) return True def op_not_literal_ignore(self, ctx): # match literal regardless of case # #self._log(ctx, "LITERAL_IGNORE", ctx.peek_code(1)) self.general_op_literal(ctx, operator.ne, ctx.state.lower) return True def op_at(self, ctx): # match at given position # #self._log(ctx, "AT", ctx.peek_code(1)) if not self.at_dispatcher.dispatch(ctx.peek_code(1), ctx): ctx.has_matched = False #print('_sre.py:line693, update context.has_matched variable') return True ctx.skip_code(2) return True def op_category(self, ctx): # match at given category # #self._log(ctx, "CATEGORY", ctx.peek_code(1)) if ctx.at_end() or not self.ch_dispatcher.dispatch(ctx.peek_code(1), ctx): ctx.has_matched = False #print('_sre.py:line703, update context.has_matched variable') return True ctx.skip_code(2) ctx.skip_char(1) return True def op_any(self, ctx): # match anything (except a newline) # #self._log(ctx, "ANY") if ctx.at_end() or ctx.at_linebreak(): ctx.has_matched = False #print('_sre.py:line714, update context.has_matched variable') return True ctx.skip_code(1) ctx.skip_char(1) return True def op_any_all(self, ctx): # match anything # #self._log(ctx, "ANY_ALL") if ctx.at_end(): ctx.has_matched = False #print('_sre.py:line725, update context.has_matched variable') return True ctx.skip_code(1) ctx.skip_char(1) return True def general_op_in(self, ctx, decorate=lambda x: x): #self._log(ctx, 
"OP_IN") #print('general_op_in') if ctx.at_end(): ctx.has_matched = False #print('_sre.py:line734, update context.has_matched variable') return skip = ctx.peek_code(1) ctx.skip_code(2) # set op pointer to the set code #print(ctx.peek_char(), ord(ctx.peek_char()), # decorate(ord(ctx.peek_char()))) if not self.check_charset(ctx, decorate(ord(ctx.peek_char()))): #print('_sre.py:line738, update context.has_matched variable') ctx.has_matched = False return ctx.skip_code(skip - 1) ctx.skip_char(1) #print('end:general_op_in') def op_in(self, ctx): # match set member (or non_member) # #self._log(ctx, "OP_IN") self.general_op_in(ctx) return True def op_in_ignore(self, ctx): # match set member (or non_member), disregarding case of current char # #self._log(ctx, "OP_IN_IGNORE") self.general_op_in(ctx, ctx.state.lower) return True def op_jump(self, ctx): # jump forward # #self._log(ctx, "JUMP", ctx.peek_code(1)) ctx.skip_code(ctx.peek_code(1) + 1) return True # skip info # op_info = op_jump def op_mark(self, ctx): # set mark # #self._log(ctx, "OP_MARK", ctx.peek_code(1)) ctx.state.set_mark(ctx.peek_code(1), ctx.string_position) ctx.skip_code(2) return True def op_branch(self, ctx): # alternation # <0=skip> code ... #self._log(ctx, "BRANCH") ctx.state.marks_push() ctx.skip_code(1) current_branch_length = ctx.peek_code(0) while current_branch_length: # The following tries to shortcut branches starting with a # (unmatched) literal. _sre.c also shortcuts charsets here. 
if not (ctx.peek_code(1) == OPCODES["literal"] and \ (ctx.at_end() or ctx.peek_code(2) != ord(ctx.peek_char()))): ctx.state.string_position = ctx.string_position child_context = ctx.push_new_context(1) #print("_sre.py:803:op_branch") yield False if child_context.has_matched: ctx.has_matched = True yield True ctx.state.marks_pop_keep() ctx.skip_code(current_branch_length) current_branch_length = ctx.peek_code(0) ctx.state.marks_pop_discard() ctx.has_matched = False #print('_sre.py:line805, update context.has_matched variable') yield True def op_repeat_one(self, ctx): # match repeated sequence (maximizing). # this operator only works if the repeated item is exactly one character # wide, and we're not already collecting backtracking points. # <1=min> <2=max> item tail mincount = ctx.peek_code(2) maxcount = ctx.peek_code(3) #print("repeat one", mincount, maxcount) #self._log(ctx, "REPEAT_ONE", mincount, maxcount) if ctx.remaining_chars() < mincount: ctx.has_matched = False yield True ctx.state.string_position = ctx.string_position count = self.count_repetitions(ctx, maxcount) ctx.skip_char(count) if count < mincount: ctx.has_matched = False yield True if ctx.peek_code(ctx.peek_code(1) + 1) == OPCODES["success"]: # tail is empty. we're finished ctx.state.string_position = ctx.string_position ctx.has_matched = True yield True ctx.state.marks_push() if ctx.peek_code(ctx.peek_code(1) + 1) == OPCODES["literal"]: # Special case: Tail starts with a literal. Skip positions where # the rest of the pattern cannot possibly match. 
char = ctx.peek_code(ctx.peek_code(1) + 2) while True: while count >= mincount and \ (ctx.at_end() or ord(ctx.peek_char()) != char): ctx.skip_char(-1) count -= 1 if count < mincount: break ctx.state.string_position = ctx.string_position child_context = ctx.push_new_context(ctx.peek_code(1) + 1) #print("_sre.py:856:push_new_context") yield False if child_context.has_matched: ctx.has_matched = True yield True ctx.skip_char(-1) count -= 1 ctx.state.marks_pop_keep() else: # General case: backtracking while count >= mincount: ctx.state.string_position = ctx.string_position child_context = ctx.push_new_context(ctx.peek_code(1) + 1) yield False if child_context.has_matched: ctx.has_matched = True yield True ctx.skip_char(-1) count -= 1 ctx.state.marks_pop_keep() ctx.state.marks_pop_discard() ctx.has_matched = False #ctx.has_matched = True # <== this should be True (so match object gets returned to program) yield True def op_min_repeat_one(self, ctx): # match repeated sequence (minimizing) # <1=min> <2=max> item tail mincount = ctx.peek_code(2) maxcount = ctx.peek_code(3) #self._log(ctx, "MIN_REPEAT_ONE", mincount, maxcount) if ctx.remaining_chars() < mincount: ctx.has_matched = False yield True ctx.state.string_position = ctx.string_position if mincount == 0: count = 0 else: count = self.count_repetitions(ctx, mincount) if count < mincount: ctx.has_matched = False #print('_sre.py:line891, update context.has_matched variable') yield True ctx.skip_char(count) if ctx.peek_code(ctx.peek_code(1) + 1) == OPCODES["success"]: # tail is empty. 
we're finished ctx.state.string_position = ctx.string_position ctx.has_matched = True yield True ctx.state.marks_push() while maxcount == MAXREPEAT or count <= maxcount: ctx.state.string_position = ctx.string_position child_context = ctx.push_new_context(ctx.peek_code(1) + 1) #print('_sre.py:916:push new context') yield False if child_context.has_matched: ctx.has_matched = True yield True ctx.state.string_position = ctx.string_position if self.count_repetitions(ctx, 1) == 0: break ctx.skip_char(1) count += 1 ctx.state.marks_pop_keep() ctx.state.marks_pop_discard() ctx.has_matched = False yield True def op_repeat(self, ctx): # create repeat context. all the hard work is done by the UNTIL # operator (MAX_UNTIL, MIN_UNTIL) # <1=min> <2=max> item tail #self._log(ctx, "REPEAT", ctx.peek_code(2), ctx.peek_code(3)) #if ctx.state.repeat is None: # print("951:ctx.state.repeat is None") # #ctx.state.repeat=_RepeatContext(ctx) repeat = _RepeatContext(ctx) ctx.state.repeat = repeat ctx.state.string_position = ctx.string_position child_context = ctx.push_new_context(ctx.peek_code(1) + 1) #print("_sre.py:941:push new context", id(child_context)) #print(child_context.state.repeat) #print(ctx.state.repeat) # are these two yields causing the issue? 
yield False ctx.state.repeat = repeat.previous ctx.has_matched = child_context.has_matched yield True def op_max_until(self, ctx): # maximizing repeat # <1=min> <2=max> item tail repeat = ctx.state.repeat #print("op_max_until") #, id(ctx.state.repeat)) if repeat is None: #print(id(ctx), id(ctx.state)) raise RuntimeError("Internal re error: MAX_UNTIL without REPEAT.") mincount = repeat.peek_code(2) maxcount = repeat.peek_code(3) ctx.state.string_position = ctx.string_position count = repeat.count + 1 #self._log(ctx, "MAX_UNTIL", count) if count < mincount: # not enough matches repeat.count = count child_context = repeat.push_new_context(4) yield False ctx.has_matched = child_context.has_matched if not ctx.has_matched: repeat.count = count - 1 ctx.state.string_position = ctx.string_position yield True if (count < maxcount or maxcount == MAXREPEAT) \ and ctx.state.string_position != repeat.last_position: # we may have enough matches, if we can match another item, do so repeat.count = count ctx.state.marks_push() save_last_position = repeat.last_position # zero-width match protection repeat.last_position = ctx.state.string_position child_context = repeat.push_new_context(4) yield False repeat.last_position = save_last_position if child_context.has_matched: ctx.state.marks_pop_discard() ctx.has_matched = True yield True ctx.state.marks_pop() repeat.count = count - 1 ctx.state.string_position = ctx.string_position # cannot match more repeated items here. 
make sure the tail matches ctx.state.repeat = repeat.previous child_context = ctx.push_new_context(1) #print("_sre.py:987:op_max_until") yield False ctx.has_matched = child_context.has_matched if not ctx.has_matched: ctx.state.repeat = repeat ctx.state.string_position = ctx.string_position yield True def op_min_until(self, ctx): # minimizing repeat # <1=min> <2=max> item tail repeat = ctx.state.repeat if repeat is None: raise RuntimeError("Internal re error: MIN_UNTIL without REPEAT.") mincount = repeat.peek_code(2) maxcount = repeat.peek_code(3) ctx.state.string_position = ctx.string_position count = repeat.count + 1 #self._log(ctx, "MIN_UNTIL", count) if count < mincount: # not enough matches repeat.count = count child_context = repeat.push_new_context(4) yield False ctx.has_matched = child_context.has_matched if not ctx.has_matched: repeat.count = count - 1 ctx.state.string_position = ctx.string_position yield True # see if the tail matches ctx.state.marks_push() ctx.state.repeat = repeat.previous child_context = ctx.push_new_context(1) #print('_sre.py:1022:push new context') yield False if child_context.has_matched: ctx.has_matched = True yield True ctx.state.repeat = repeat ctx.state.string_position = ctx.string_position ctx.state.marks_pop() # match more until tail matches if count >= maxcount and maxcount != MAXREPEAT: ctx.has_matched = False #print('_sre.py:line1022, update context.has_matched variable') yield True repeat.count = count child_context = repeat.push_new_context(4) yield False ctx.has_matched = child_context.has_matched if not ctx.has_matched: repeat.count = count - 1 ctx.state.string_position = ctx.string_position yield True def general_op_groupref(self, ctx, decorate=lambda x: x): group_start, group_end = ctx.state.get_marks(ctx.peek_code(1)) if group_start is None or group_end is None or group_end < group_start: ctx.has_matched = False return True while group_start < group_end: if ctx.at_end() or decorate(ord(ctx.peek_char())) \ != 
decorate(ord(ctx.state.string[group_start])): ctx.has_matched = False #print('_sre.py:line1042, update context.has_matched variable') return True group_start += 1 ctx.skip_char(1) ctx.skip_code(2) return True def op_groupref(self, ctx): # match backreference # #self._log(ctx, "GROUPREF", ctx.peek_code(1)) return self.general_op_groupref(ctx) def op_groupref_ignore(self, ctx): # match backreference case-insensitive # #self._log(ctx, "GROUPREF_IGNORE", ctx.peek_code(1)) return self.general_op_groupref(ctx, ctx.state.lower) def op_groupref_exists(self, ctx): # codeyes codeno ... #self._log(ctx, "GROUPREF_EXISTS", ctx.peek_code(1)) group_start, group_end = ctx.state.get_marks(ctx.peek_code(1)) if group_start is None or group_end is None or group_end < group_start: ctx.skip_code(ctx.peek_code(2) + 1) else: ctx.skip_code(3) return True def op_assert(self, ctx): # assert subpattern # #self._log(ctx, "ASSERT", ctx.peek_code(2)) ctx.state.string_position = ctx.string_position - ctx.peek_code(2) if ctx.state.string_position < 0: ctx.has_matched = False yield True child_context = ctx.push_new_context(3) yield False if child_context.has_matched: ctx.skip_code(ctx.peek_code(1) + 1) else: ctx.has_matched = False yield True def op_assert_not(self, ctx): # assert not subpattern # #self._log(ctx, "ASSERT_NOT", ctx.peek_code(2)) ctx.state.string_position = ctx.string_position - ctx.peek_code(2) if ctx.state.string_position >= 0: child_context = ctx.push_new_context(3) yield False if child_context.has_matched: ctx.has_matched = False yield True ctx.skip_code(ctx.peek_code(1) + 1) yield True def unknown(self, ctx): #self._log(ctx, "UNKNOWN", ctx.peek_code()) raise RuntimeError("Internal re error. Unknown opcode: %s" % ctx.peek_code()) def check_charset(self, ctx, char): """Checks whether a character matches set of arbitrary length. 
Assumes the code pointer is at the first member of the set.""" self.set_dispatcher.reset(char) save_position = ctx.code_position result = None while result is None: result = self.set_dispatcher.dispatch(ctx.peek_code(), ctx) ctx.code_position = save_position #print("_sre.py:1123:check_charset", result) return result def count_repetitions(self, ctx, maxcount): """Returns the number of repetitions of a single item, starting from the current string position. The code pointer is expected to point to a REPEAT_ONE operation (with the repeated 4 ahead).""" count = 0 real_maxcount = ctx.state.end - ctx.string_position if maxcount < real_maxcount and maxcount != MAXREPEAT: real_maxcount = maxcount # XXX could special case every single character pattern here, as in C. # This is a general solution, a bit hackisch, but works and should be # efficient. code_position = ctx.code_position string_position = ctx.string_position ctx.skip_code(4) reset_position = ctx.code_position while count < real_maxcount: # this works because the single character pattern is followed by # a success opcode ctx.code_position = reset_position self.dispatch(ctx.peek_code(), ctx) #print("count_repetitions", ctx.has_matched, count) if ctx.has_matched is False: # could be None as well break count += 1 ctx.has_matched = None ctx.code_position = code_position ctx.string_position = string_position return count def _log(self, context, opname, *args): arg_string = ("%s " * len(args)) % args _log("|%s|%s|%s %s" % (context.pattern_codes, context.string_position, opname, arg_string)) _OpcodeDispatcher.build_dispatch_table(OPCODES, "op_") class _CharsetDispatcher(_Dispatcher): def __init__(self): self.ch_dispatcher = _ChcodeDispatcher() def reset(self, char): self.char = char self.ok = True def set_failure(self, ctx): return not self.ok def set_literal(self, ctx): # if ctx.peek_code(1) == self.char: return self.ok else: ctx.skip_code(2) def set_category(self, ctx): # if 
self.ch_dispatcher.dispatch(ctx.peek_code(1), ctx): return self.ok else: ctx.skip_code(2) def set_charset(self, ctx): # (16 bits per code word) char_code = self.char ctx.skip_code(1) # point to beginning of bitmap if CODESIZE == 2: if char_code < 256 and ctx.peek_code(char_code >> 4) \ & (1 << (char_code & 15)): return self.ok ctx.skip_code(16) # skip bitmap else: if char_code < 256 and ctx.peek_code(char_code >> 5) \ & (1 << (char_code & 31)): return self.ok ctx.skip_code(8) # skip bitmap def set_range(self, ctx): # if ctx.peek_code(1) <= self.char <= ctx.peek_code(2): return self.ok ctx.skip_code(3) def set_negate(self, ctx): self.ok = not self.ok ctx.skip_code(1) #fixme brython. array module doesn't exist def set_bigcharset(self, ctx): raise NotImplementationError("_sre.py: set_bigcharset, array not implemented") # <256 blockindices> char_code = self.char count = ctx.peek_code(1) ctx.skip_code(2) if char_code < 65536: block_index = char_code >> 8 # NB: there are CODESIZE block indices per bytecode a = array.array("B") a.fromstring(array.array(CODESIZE == 2 and "H" or "I", [ctx.peek_code(block_index // CODESIZE)]).tostring()) block = a[block_index % CODESIZE] ctx.skip_code(256 // CODESIZE) # skip block indices block_value = ctx.peek_code(block * (32 // CODESIZE) + ((char_code & 255) >> (CODESIZE == 2 and 4 or 5))) if block_value & (1 << (char_code & ((8 * CODESIZE) - 1))): return self.ok else: ctx.skip_code(256 // CODESIZE) # skip block indices ctx.skip_code(count * (32 // CODESIZE)) # skip blocks def unknown(self, ctx): return False _CharsetDispatcher.build_dispatch_table(OPCODES, "set_") class _AtcodeDispatcher(_Dispatcher): def at_beginning(self, ctx): return ctx.at_beginning() at_beginning_string = at_beginning def at_beginning_line(self, ctx): return ctx.at_beginning() or _is_linebreak(ctx.peek_char(-1)) def at_end(self, ctx): return (ctx.remaining_chars() == 1 and ctx.at_linebreak()) or ctx.at_end() def at_end_line(self, ctx): return ctx.at_linebreak() or 
ctx.at_end() def at_end_string(self, ctx): return ctx.at_end() def at_boundary(self, ctx): return ctx.at_boundary(_is_word) def at_non_boundary(self, ctx): return not ctx.at_boundary(_is_word) def at_loc_boundary(self, ctx): return ctx.at_boundary(_is_loc_word) def at_loc_non_boundary(self, ctx): return not ctx.at_boundary(_is_loc_word) def at_uni_boundary(self, ctx): return ctx.at_boundary(_is_uni_word) def at_uni_non_boundary(self, ctx): return not ctx.at_boundary(_is_uni_word) def unknown(self, ctx): return False _AtcodeDispatcher.build_dispatch_table(ATCODES, "") class _ChcodeDispatcher(_Dispatcher): def category_digit(self, ctx): return _is_digit(ctx.peek_char()) def category_not_digit(self, ctx): return not _is_digit(ctx.peek_char()) def category_space(self, ctx): return _is_space(ctx.peek_char()) def category_not_space(self, ctx): return not _is_space(ctx.peek_char()) def category_word(self, ctx): return _is_word(ctx.peek_char()) def category_not_word(self, ctx): return not _is_word(ctx.peek_char()) def category_linebreak(self, ctx): return _is_linebreak(ctx.peek_char()) def category_not_linebreak(self, ctx): return not _is_linebreak(ctx.peek_char()) def category_loc_word(self, ctx): return _is_loc_word(ctx.peek_char()) def category_loc_not_word(self, ctx): return not _is_loc_word(ctx.peek_char()) def category_uni_digit(self, ctx): return ctx.peek_char().isdigit() def category_uni_not_digit(self, ctx): return not ctx.peek_char().isdigit() def category_uni_space(self, ctx): return ctx.peek_char().isspace() def category_uni_not_space(self, ctx): return not ctx.peek_char().isspace() def category_uni_word(self, ctx): return _is_uni_word(ctx.peek_char()) def category_uni_not_word(self, ctx): return not _is_uni_word(ctx.peek_char()) def category_uni_linebreak(self, ctx): return ord(ctx.peek_char()) in _uni_linebreaks def category_uni_not_linebreak(self, ctx): return ord(ctx.peek_char()) not in _uni_linebreaks def unknown(self, ctx): return False 
_ChcodeDispatcher.build_dispatch_table(CHCODES, "")


# Flag word for each of the 128 ASCII code points, 16 entries per row.
# The helpers below test bit 1 (digit), bit 2 (space) and bit 16 (word).
_ascii_char_info = [
    0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 6, 2, 2, 2, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 0, 0, 0, 0, 0, 0,
    0, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24,
    24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 0, 0, 0, 0, 16,
    0, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24,
    24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 0, 0, 0, 0, 0,
]


def _is_digit(char):
    """Return a true value when char is an ASCII digit.

    The result is False for non-ASCII characters, otherwise the masked
    table entry (0 or 1).
    """
    codepoint = ord(char)
    if codepoint >= 128:
        return False
    return _ascii_char_info[codepoint] & 1


def _is_space(char):
    """Return a true value when char is ASCII whitespace.

    The result is False for non-ASCII characters, otherwise the masked
    table entry (0 or 2).
    """
    codepoint = ord(char)
    if codepoint >= 128:
        return False
    return _ascii_char_info[codepoint] & 2


def _is_word(char):
    """Return a true value when char is an ASCII word character.

    The result is False for non-ASCII characters, otherwise the masked
    table entry (0 or 16).
    """
    # NB: non-ASCII chars aren't words according to _sre.c
    codepoint = ord(char)
    if codepoint >= 128:
        return False
    return _ascii_char_info[codepoint] & 16


def _is_loc_word(char):
    """Return True for '_' and for alphanumerics with a code below 256."""
    if char == '_':
        return True
    fits_in_one_byte = not (ord(char) & ~255)
    return fits_in_one_byte and char.isalnum()


def _is_uni_word(char):
    """Return True for '_' and for anything str.isalnum() accepts."""
    # unichr() is not valid in python 3, hence chr()
    if chr(ord(char)).isalnum():
        return True
    return char == '_'


def _is_linebreak(char):
    """Return True only for the newline character."""
    return "\n" == char


# Static list of all unicode codepoints reported by Py_UNICODE_ISLINEBREAK.
_uni_linebreaks = [
    10, 13, 28, 29, 30,
    133,
    8232, 8233,
]


def _log(message):
    """Trace hook; printing is switched off by the constant condition."""
    if False:
        print(message)