app/django/utils/regex_helper.py
changeset 323 ff1a9aa48cfd
equal deleted inserted replaced
322:6641e941ef1e 323:ff1a9aa48cfd
       
     1 """
       
     2 Functions for reversing a regular expression (used in reverse URL resolving).
       
     3 Used internally by Django and not intended for external use.
       
     4 
       
     5 This is not, and is not intended to be, a complete reg-exp decompiler. It
       
     6 should be good enough for a large class of URLS, however.
       
     7 """
       
     8 
       
     9 # Mapping of an escape character to a representative of that class. So, e.g.,
       
    10 # "\w" is replaced by "x" in a reverse URL. A value of None means to ignore
       
    11 # this sequence. Any missing key is mapped to itself.
       
    12 ESCAPE_MAPPINGS = {
       
    13     "A": None,
       
    14     "b": None,
       
    15     "B": None,
       
    16     "d": u"0",
       
    17     "D": u"x",
       
    18     "s": u" ",
       
    19     "S": u"x",
       
    20     "w": u"x",
       
    21     "W": u"!",
       
    22     "Z": None,
       
    23 }
       
    24 
       
    25 class Choice(list):
       
    26     """
       
    27     Used to represent multiple possibilities at this point in a pattern string.
       
    28     We use a distinguished type, rather than a list, so that the usage in the
       
    29     code is clear.
       
    30     """
       
    31 
       
    32 class Group(list):
       
    33     """
       
    34     Used to represent a capturing group in the pattern string.
       
    35     """
       
    36 
       
    37 class NonCapture(list):
       
    38     """
       
    39     Used to represent a non-capturing group in the pattern string.
       
    40     """
       
    41 
       
    42 def normalize(pattern):
       
    43     """
       
    44     Given a reg-exp pattern, normalizes it to a list of forms that suffice for
       
    45     reverse matching. This does the following:
       
    46 
       
    47     (1) For any repeating sections, keeps the minimum number of occurrences
       
    48         permitted (this means zero for optional groups).
       
    49     (2) If an optional group includes parameters, include one occurrence of
       
    50         that group (along with the zero occurrence case from step (1)).
       
    51     (3) Select the first (essentially an arbitrary) element from any character
       
    52         class. Select an arbitrary character for any unordered class (e.g. '.'
       
    53         or '\w') in the pattern.
       
    54     (5) Ignore comments and any of the reg-exp flags that won't change
       
    55         what we construct ("iLmsu"). "(?x)" is an error, however.
       
    56     (6) Raise an error on all other non-capturing (?...) forms (e.g.
       
    57         look-ahead and look-behind matches) and any disjunctive ('|')
       
    58         constructs.
       
    59 
       
    60     Django's URLs for forward resolving are either all positional arguments or
       
    61     all keyword arguments. That is assumed here, as well. Although reverse
       
    62     resolving can be done using positional args when keyword args are
       
    63     specified, the two cannot be mixed in the same reverse() call.
       
    64     """
       
    65     # Do a linear scan to work out the special features of this pattern. The
       
    66     # idea is that we scan once here and collect all the information we need to
       
    67     # make future decisions.
       
    68     result = []
       
    69     non_capturing_groups = []
       
    70     consume_next = True
       
    71     pattern_iter = next_char(iter(pattern))
       
    72     num_args = 0
       
    73 
       
    74     # A "while" loop is used here because later on we need to be able to peek
       
    75     # at the next character and possibly go around without consuming another
       
    76     # one at the top of the loop.
       
    77     try:
       
    78         ch, escaped = pattern_iter.next()
       
    79     except StopIteration:
       
    80         return zip([u''],  [[]])
       
    81 
       
    82     try:
       
    83         while True:
       
    84             if escaped:
       
    85                 result.append(ch)
       
    86             elif ch == '.':
       
    87                 # Replace "any character" with an arbitrary representative.
       
    88                 result.append(u".")
       
    89             elif ch == '|':
       
    90                 # FIXME: One day we'll should do this, but not in 1.0.
       
    91                 raise NotImplementedError
       
    92             elif ch == "^":
       
    93                 pass
       
    94             elif ch == '$':
       
    95                 break
       
    96             elif ch == ')':
       
    97                 # This can only be the end of a non-capturing group, since all
       
    98                 # other unescaped parentheses are handled by the grouping
       
    99                 # section later (and the full group is handled there).
       
   100                 #
       
   101                 # We regroup everything inside the capturing group so that it
       
   102                 # can be quantified, if necessary.
       
   103                 start = non_capturing_groups.pop()
       
   104                 inner = NonCapture(result[start:])
       
   105                 result = result[:start] + [inner]
       
   106             elif ch == '[':
       
   107                 # Replace ranges with the first character in the range.
       
   108                 ch, escaped = pattern_iter.next()
       
   109                 result.append(ch)
       
   110                 ch, escaped = pattern_iter.next()
       
   111                 while escaped or ch != ']':
       
   112                     ch, escaped = pattern_iter.next()
       
   113             elif ch == '(':
       
   114                 # Some kind of group.
       
   115                 ch, escaped = pattern_iter.next()
       
   116                 if ch != '?' or escaped:
       
   117                     # A positional group
       
   118                     name = "_%d" % num_args
       
   119                     num_args += 1
       
   120                     result.append(Group(((u"%%(%s)s" % name), name)))
       
   121                     walk_to_end(ch, pattern_iter)
       
   122                 else:
       
   123                     ch, escaped = pattern_iter.next()
       
   124                     if ch in "iLmsu#":
       
   125                         # All of these are ignorable. Walk to the end of the
       
   126                         # group.
       
   127                         walk_to_end(ch, pattern_iter)
       
   128                     elif ch == ':':
       
   129                         # Non-capturing group
       
   130                         non_capturing_groups.append(len(result))
       
   131                     elif ch != 'P':
       
   132                         # Anything else, other than a named group, is something
       
   133                         # we cannot reverse.
       
   134                         raise ValueError("Non-reversible reg-exp portion: '(?%s'" % ch)
       
   135                     else:
       
   136                         ch, escaped = pattern_iter.next()
       
   137                         if ch != '<':
       
   138                             raise ValueError("Non-reversible reg-exp portion: '(?P%s'" % ch)
       
   139                         # We are in a named capturing group. Extra the name and
       
   140                         # then skip to the end.
       
   141                         name = []
       
   142                         ch, escaped = pattern_iter.next()
       
   143                         while ch != '>':
       
   144                             name.append(ch)
       
   145                             ch, escaped = pattern_iter.next()
       
   146                         param = ''.join(name)
       
   147                         result.append(Group(((u"%%(%s)s" % param), param)))
       
   148                         walk_to_end(ch, pattern_iter)
       
   149             elif ch in "*?+{":
       
   150                 # Quanitifers affect the previous item in the result list.
       
   151                 count, ch = get_quantifier(ch, pattern_iter)
       
   152                 if ch:
       
   153                     # We had to look ahead, but it wasn't need to compute the
       
   154                     # quanitifer, so use this character next time around the
       
   155                     # main loop.
       
   156                     consume_next = False
       
   157 
       
   158                 if count == 0:
       
   159                     if contains(result[-1], Group):
       
   160                         # If we are quantifying a capturing group (or
       
   161                         # something containing such a group) and the minimum is
       
   162                         # zero, we must also handle the case of one occurrence
       
   163                         # being present. All the quantifiers (except {0,0},
       
   164                         # which we conveniently ignore) that have a 0 minimum
       
   165                         # also allow a single occurrence.
       
   166                         result[-1] = Choice([None, result[-1]])
       
   167                     else:
       
   168                         result.pop()
       
   169                 elif count > 1:
       
   170                     result.extend([result[-1]] * (count - 1))
       
   171             else:
       
   172                 # Anything else is a literal.
       
   173                 result.append(ch)
       
   174 
       
   175             if consume_next:
       
   176                 ch, escaped = pattern_iter.next()
       
   177             else:
       
   178                 consume_next = True
       
   179     except StopIteration:
       
   180         pass
       
   181     except NotImplementedError:
       
   182         # A case of using the disjunctive form. No results for you!
       
   183         return zip([u''],  [[]])
       
   184 
       
   185     return zip(*flatten_result(result))
       
   186 
       
   187 def next_char(input_iter):
       
   188     """
       
   189     An iterator that yields the next character from "pattern_iter", respecting
       
   190     escape sequences. An escaped character is replaced by a representative of
       
   191     its class (e.g. \w -> "x"). If the escaped character is one that is
       
   192     skipped, it is not returned (the next character is returned instead).
       
   193 
       
   194     Yields the next character, along with a boolean indicating whether it is a
       
   195     raw (unescaped) character or not.
       
   196     """
       
   197     for ch in input_iter:
       
   198         if ch != '\\':
       
   199             yield ch, False
       
   200             continue
       
   201         ch = input_iter.next()
       
   202         representative = ESCAPE_MAPPINGS.get(ch, ch)
       
   203         if representative is None:
       
   204             continue
       
   205         yield representative, True
       
   206 
       
   207 def walk_to_end(ch, input_iter):
       
   208     """
       
   209     The iterator is currently inside a capturing group. We want to walk to the
       
   210     close of this group, skipping over any nested groups and handling escaped
       
   211     parentheses correctly.
       
   212     """
       
   213     if ch == '(':
       
   214         nesting = 1
       
   215     else:
       
   216         nesting = 0
       
   217     for ch, escaped in input_iter:
       
   218         if escaped:
       
   219             continue
       
   220         elif ch == '(':
       
   221             nesting += 1
       
   222         elif ch == ')':
       
   223             if not nesting:
       
   224                 return
       
   225             nesting -= 1
       
   226 
       
   227 def get_quantifier(ch, input_iter):
       
   228     """
       
   229     Parse a quantifier from the input, where "ch" is the first character in the
       
   230     quantifier.
       
   231 
       
   232     Returns the minimum number of occurences permitted by the quantifier and
       
   233     either None or the next character from the input_iter if the next character
       
   234     is not part of the quantifier.
       
   235     """
       
   236     if ch in '*?+':
       
   237         try:
       
   238             ch2, escaped = input_iter.next()
       
   239         except StopIteration:
       
   240             ch2 = None
       
   241         if ch2 == '?':
       
   242             ch2 = None
       
   243         if ch == '+':
       
   244             return 1, ch2
       
   245         return 0, ch2
       
   246 
       
   247     quant = []
       
   248     while ch != '}':
       
   249         ch, escaped = input_iter.next()
       
   250         quant.append(ch)
       
   251     quant = quant[:-1]
       
   252     values = ''.join(quant).split(',')
       
   253 
       
   254     # Consume the trailing '?', if necessary.
       
   255     try:
       
   256         ch, escaped = input_iter.next()
       
   257     except StopIteration:
       
   258         ch = None
       
   259     if ch == '?':
       
   260         ch = None
       
   261     return int(values[0]), ch
       
   262 
       
   263 def contains(source, inst):
       
   264     """
       
   265     Returns True if the "source" contains an instance of "inst". False,
       
   266     otherwise.
       
   267     """
       
   268     if isinstance(source, inst):
       
   269         return True
       
   270     if isinstance(source, NonCapture):
       
   271         for elt in source:
       
   272             if contains(elt, inst):
       
   273                 return True
       
   274     return False
       
   275 
       
   276 def flatten_result(source):
       
   277     """
       
   278     Turns the given source sequence into a list of reg-exp possibilities and
       
   279     their arguments. Returns a list of strings and a list of argument lists.
       
   280     Each of the two lists will be of the same length.
       
   281     """
       
   282     if source is None:
       
   283         return [u''], [[]]
       
   284     if isinstance(source, Group):
       
   285         if source[1] is None:
       
   286             params = []
       
   287         else:
       
   288             params = [source[1]]
       
   289         return [source[0]], [params]
       
   290     result = [u'']
       
   291     result_args = [[]]
       
   292     pos = last = 0
       
   293     for pos, elt in enumerate(source):
       
   294         if isinstance(elt, basestring):
       
   295             continue
       
   296         piece = u''.join(source[last:pos])
       
   297         if isinstance(elt, Group):
       
   298             piece += elt[0]
       
   299             param = elt[1]
       
   300         else:
       
   301             param = None
       
   302         last = pos + 1
       
   303         for i in range(len(result)):
       
   304             result[i] += piece
       
   305             if param:
       
   306                 result_args[i].append(param)
       
   307         if isinstance(elt, (Choice, NonCapture)):
       
   308             if isinstance(elt, NonCapture):
       
   309                 elt = [elt]
       
   310             inner_result, inner_args = [], []
       
   311             for item in elt:
       
   312                 res, args = flatten_result(item)
       
   313                 inner_result.extend(res)
       
   314                 inner_args.extend(args)
       
   315             new_result = []
       
   316             new_args = []
       
   317             for item, args in zip(result, result_args):
       
   318                 for i_item, i_args in zip(inner_result, inner_args):
       
   319                     new_result.append(item + i_item)
       
   320                     new_args.append(args[:] + i_args)
       
   321             result = new_result
       
   322             result_args = new_args
       
   323     if pos >= last:
       
   324         piece = u''.join(source[last:])
       
   325         for i in range(len(result)):
       
   326             result[i] += piece
       
   327     return result, result_args
       
   328