diff -r 6641e941ef1e -r ff1a9aa48cfd app/django/utils/regex_helper.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/app/django/utils/regex_helper.py Tue Oct 14 16:00:59 2008 +0000 @@ -0,0 +1,328 @@ +""" +Functions for reversing a regular expression (used in reverse URL resolving). +Used internally by Django and not intended for external use. + +This is not, and is not intended to be, a complete reg-exp decompiler. It +should be good enough for a large class of URLS, however. +""" + +# Mapping of an escape character to a representative of that class. So, e.g., +# "\w" is replaced by "x" in a reverse URL. A value of None means to ignore +# this sequence. Any missing key is mapped to itself. +ESCAPE_MAPPINGS = { + "A": None, + "b": None, + "B": None, + "d": u"0", + "D": u"x", + "s": u" ", + "S": u"x", + "w": u"x", + "W": u"!", + "Z": None, +} + +class Choice(list): + """ + Used to represent multiple possibilities at this point in a pattern string. + We use a distinguished type, rather than a list, so that the usage in the + code is clear. + """ + +class Group(list): + """ + Used to represent a capturing group in the pattern string. + """ + +class NonCapture(list): + """ + Used to represent a non-capturing group in the pattern string. + """ + +def normalize(pattern): + """ + Given a reg-exp pattern, normalizes it to a list of forms that suffice for + reverse matching. This does the following: + + (1) For any repeating sections, keeps the minimum number of occurrences + permitted (this means zero for optional groups). + (2) If an optional group includes parameters, include one occurrence of + that group (along with the zero occurrence case from step (1)). + (3) Select the first (essentially an arbitrary) element from any character + class. Select an arbitrary character for any unordered class (e.g. '.' + or '\w') in the pattern. + (5) Ignore comments and any of the reg-exp flags that won't change + what we construct ("iLmsu"). "(?x)" is an error, however. + (6) Raise an error on all other non-capturing (?...) forms (e.g. + look-ahead and look-behind matches) and any disjunctive ('|') + constructs. + + Django's URLs for forward resolving are either all positional arguments or + all keyword arguments. That is assumed here, as well. Although reverse + resolving can be done using positional args when keyword args are + specified, the two cannot be mixed in the same reverse() call. + """ + # Do a linear scan to work out the special features of this pattern. The + # idea is that we scan once here and collect all the information we need to + # make future decisions. + result = [] + non_capturing_groups = [] + consume_next = True + pattern_iter = next_char(iter(pattern)) + num_args = 0 + + # A "while" loop is used here because later on we need to be able to peek + # at the next character and possibly go around without consuming another + # one at the top of the loop. + try: + ch, escaped = pattern_iter.next() + except StopIteration: + return zip([u''], [[]]) + + try: + while True: + if escaped: + result.append(ch) + elif ch == '.': + # Replace "any character" with an arbitrary representative. + result.append(u".") + elif ch == '|': + # FIXME: One day we'll should do this, but not in 1.0. + raise NotImplementedError + elif ch == "^": + pass + elif ch == '$': + break + elif ch == ')': + # This can only be the end of a non-capturing group, since all + # other unescaped parentheses are handled by the grouping + # section later (and the full group is handled there). + # + # We regroup everything inside the capturing group so that it + # can be quantified, if necessary. + start = non_capturing_groups.pop() + inner = NonCapture(result[start:]) + result = result[:start] + [inner] + elif ch == '[': + # Replace ranges with the first character in the range. + ch, escaped = pattern_iter.next() + result.append(ch) + ch, escaped = pattern_iter.next() + while escaped or ch != ']': + ch, escaped = pattern_iter.next() + elif ch == '(': + # Some kind of group. + ch, escaped = pattern_iter.next() + if ch != '?' or escaped: + # A positional group + name = "_%d" % num_args + num_args += 1 + result.append(Group(((u"%%(%s)s" % name), name))) + walk_to_end(ch, pattern_iter) + else: + ch, escaped = pattern_iter.next() + if ch in "iLmsu#": + # All of these are ignorable. Walk to the end of the + # group. + walk_to_end(ch, pattern_iter) + elif ch == ':': + # Non-capturing group + non_capturing_groups.append(len(result)) + elif ch != 'P': + # Anything else, other than a named group, is something + # we cannot reverse. + raise ValueError("Non-reversible reg-exp portion: '(?%s'" % ch) + else: + ch, escaped = pattern_iter.next() + if ch != '<': + raise ValueError("Non-reversible reg-exp portion: '(?P%s'" % ch) + # We are in a named capturing group. Extra the name and + # then skip to the end. + name = [] + ch, escaped = pattern_iter.next() + while ch != '>': + name.append(ch) + ch, escaped = pattern_iter.next() + param = ''.join(name) + result.append(Group(((u"%%(%s)s" % param), param))) + walk_to_end(ch, pattern_iter) + elif ch in "*?+{": + # Quanitifers affect the previous item in the result list. + count, ch = get_quantifier(ch, pattern_iter) + if ch: + # We had to look ahead, but it wasn't need to compute the + # quanitifer, so use this character next time around the + # main loop. + consume_next = False + + if count == 0: + if contains(result[-1], Group): + # If we are quantifying a capturing group (or + # something containing such a group) and the minimum is + # zero, we must also handle the case of one occurrence + # being present. All the quantifiers (except {0,0}, + # which we conveniently ignore) that have a 0 minimum + # also allow a single occurrence. + result[-1] = Choice([None, result[-1]]) + else: + result.pop() + elif count > 1: + result.extend([result[-1]] * (count - 1)) + else: + # Anything else is a literal. + result.append(ch) + + if consume_next: + ch, escaped = pattern_iter.next() + else: + consume_next = True + except StopIteration: + pass + except NotImplementedError: + # A case of using the disjunctive form. No results for you! + return zip([u''], [[]]) + + return zip(*flatten_result(result)) + +def next_char(input_iter): + """ + An iterator that yields the next character from "pattern_iter", respecting + escape sequences. An escaped character is replaced by a representative of + its class (e.g. \w -> "x"). If the escaped character is one that is + skipped, it is not returned (the next character is returned instead). + + Yields the next character, along with a boolean indicating whether it is a + raw (unescaped) character or not. + """ + for ch in input_iter: + if ch != '\\': + yield ch, False + continue + ch = input_iter.next() + representative = ESCAPE_MAPPINGS.get(ch, ch) + if representative is None: + continue + yield representative, True + +def walk_to_end(ch, input_iter): + """ + The iterator is currently inside a capturing group. We want to walk to the + close of this group, skipping over any nested groups and handling escaped + parentheses correctly. + """ + if ch == '(': + nesting = 1 + else: + nesting = 0 + for ch, escaped in input_iter: + if escaped: + continue + elif ch == '(': + nesting += 1 + elif ch == ')': + if not nesting: + return + nesting -= 1 + +def get_quantifier(ch, input_iter): + """ + Parse a quantifier from the input, where "ch" is the first character in the + quantifier. + + Returns the minimum number of occurences permitted by the quantifier and + either None or the next character from the input_iter if the next character + is not part of the quantifier. + """ + if ch in '*?+': + try: + ch2, escaped = input_iter.next() + except StopIteration: + ch2 = None + if ch2 == '?': + ch2 = None + if ch == '+': + return 1, ch2 + return 0, ch2 + + quant = [] + while ch != '}': + ch, escaped = input_iter.next() + quant.append(ch) + quant = quant[:-1] + values = ''.join(quant).split(',') + + # Consume the trailing '?', if necessary. + try: + ch, escaped = input_iter.next() + except StopIteration: + ch = None + if ch == '?': + ch = None + return int(values[0]), ch + +def contains(source, inst): + """ + Returns True if the "source" contains an instance of "inst". False, + otherwise. + """ + if isinstance(source, inst): + return True + if isinstance(source, NonCapture): + for elt in source: + if contains(elt, inst): + return True + return False + +def flatten_result(source): + """ + Turns the given source sequence into a list of reg-exp possibilities and + their arguments. Returns a list of strings and a list of argument lists. + Each of the two lists will be of the same length. + """ + if source is None: + return [u''], [[]] + if isinstance(source, Group): + if source[1] is None: + params = [] + else: + params = [source[1]] + return [source[0]], [params] + result = [u''] + result_args = [[]] + pos = last = 0 + for pos, elt in enumerate(source): + if isinstance(elt, basestring): + continue + piece = u''.join(source[last:pos]) + if isinstance(elt, Group): + piece += elt[0] + param = elt[1] + else: + param = None + last = pos + 1 + for i in range(len(result)): + result[i] += piece + if param: + result_args[i].append(param) + if isinstance(elt, (Choice, NonCapture)): + if isinstance(elt, NonCapture): + elt = [elt] + inner_result, inner_args = [], [] + for item in elt: + res, args = flatten_result(item) + inner_result.extend(res) + inner_args.extend(args) + new_result = [] + new_args = [] + for item, args in zip(result, result_args): + for i_item, i_args in zip(inner_result, inner_args): + new_result.append(item + i_item) + new_args.append(args[:] + i_args) + result = new_result + result_args = new_args + if pos >= last: + piece = u''.join(source[last:]) + for i in range(len(result)): + result[i] += piece + return result, result_args +