|
1 """ |
|
2 Functions for reversing a regular expression (used in reverse URL resolving). |
|
3 Used internally by Django and not intended for external use. |
|
4 |
|
5 This is not, and is not intended to be, a complete reg-exp decompiler. It |
|
6 should be good enough for a large class of URLS, however. |
|
7 """ |
|
8 |
|
9 # Mapping of an escape character to a representative of that class. So, e.g., |
|
10 # "\w" is replaced by "x" in a reverse URL. A value of None means to ignore |
|
11 # this sequence. Any missing key is mapped to itself. |
|
12 ESCAPE_MAPPINGS = { |
|
13 "A": None, |
|
14 "b": None, |
|
15 "B": None, |
|
16 "d": u"0", |
|
17 "D": u"x", |
|
18 "s": u" ", |
|
19 "S": u"x", |
|
20 "w": u"x", |
|
21 "W": u"!", |
|
22 "Z": None, |
|
23 } |
|
24 |
|
25 class Choice(list): |
|
26 """ |
|
27 Used to represent multiple possibilities at this point in a pattern string. |
|
28 We use a distinguished type, rather than a list, so that the usage in the |
|
29 code is clear. |
|
30 """ |
|
31 |
|
32 class Group(list): |
|
33 """ |
|
34 Used to represent a capturing group in the pattern string. |
|
35 """ |
|
36 |
|
37 class NonCapture(list): |
|
38 """ |
|
39 Used to represent a non-capturing group in the pattern string. |
|
40 """ |
|
41 |
|
42 def normalize(pattern): |
|
43 """ |
|
44 Given a reg-exp pattern, normalizes it to a list of forms that suffice for |
|
45 reverse matching. This does the following: |
|
46 |
|
47 (1) For any repeating sections, keeps the minimum number of occurrences |
|
48 permitted (this means zero for optional groups). |
|
49 (2) If an optional group includes parameters, include one occurrence of |
|
50 that group (along with the zero occurrence case from step (1)). |
|
51 (3) Select the first (essentially an arbitrary) element from any character |
|
52 class. Select an arbitrary character for any unordered class (e.g. '.' |
|
53 or '\w') in the pattern. |
|
54 (5) Ignore comments and any of the reg-exp flags that won't change |
|
55 what we construct ("iLmsu"). "(?x)" is an error, however. |
|
56 (6) Raise an error on all other non-capturing (?...) forms (e.g. |
|
57 look-ahead and look-behind matches) and any disjunctive ('|') |
|
58 constructs. |
|
59 |
|
60 Django's URLs for forward resolving are either all positional arguments or |
|
61 all keyword arguments. That is assumed here, as well. Although reverse |
|
62 resolving can be done using positional args when keyword args are |
|
63 specified, the two cannot be mixed in the same reverse() call. |
|
64 """ |
|
65 # Do a linear scan to work out the special features of this pattern. The |
|
66 # idea is that we scan once here and collect all the information we need to |
|
67 # make future decisions. |
|
68 result = [] |
|
69 non_capturing_groups = [] |
|
70 consume_next = True |
|
71 pattern_iter = next_char(iter(pattern)) |
|
72 num_args = 0 |
|
73 |
|
74 # A "while" loop is used here because later on we need to be able to peek |
|
75 # at the next character and possibly go around without consuming another |
|
76 # one at the top of the loop. |
|
77 try: |
|
78 ch, escaped = pattern_iter.next() |
|
79 except StopIteration: |
|
80 return zip([u''], [[]]) |
|
81 |
|
82 try: |
|
83 while True: |
|
84 if escaped: |
|
85 result.append(ch) |
|
86 elif ch == '.': |
|
87 # Replace "any character" with an arbitrary representative. |
|
88 result.append(u".") |
|
89 elif ch == '|': |
|
90 # FIXME: One day we'll should do this, but not in 1.0. |
|
91 raise NotImplementedError |
|
92 elif ch == "^": |
|
93 pass |
|
94 elif ch == '$': |
|
95 break |
|
96 elif ch == ')': |
|
97 # This can only be the end of a non-capturing group, since all |
|
98 # other unescaped parentheses are handled by the grouping |
|
99 # section later (and the full group is handled there). |
|
100 # |
|
101 # We regroup everything inside the capturing group so that it |
|
102 # can be quantified, if necessary. |
|
103 start = non_capturing_groups.pop() |
|
104 inner = NonCapture(result[start:]) |
|
105 result = result[:start] + [inner] |
|
106 elif ch == '[': |
|
107 # Replace ranges with the first character in the range. |
|
108 ch, escaped = pattern_iter.next() |
|
109 result.append(ch) |
|
110 ch, escaped = pattern_iter.next() |
|
111 while escaped or ch != ']': |
|
112 ch, escaped = pattern_iter.next() |
|
113 elif ch == '(': |
|
114 # Some kind of group. |
|
115 ch, escaped = pattern_iter.next() |
|
116 if ch != '?' or escaped: |
|
117 # A positional group |
|
118 name = "_%d" % num_args |
|
119 num_args += 1 |
|
120 result.append(Group(((u"%%(%s)s" % name), name))) |
|
121 walk_to_end(ch, pattern_iter) |
|
122 else: |
|
123 ch, escaped = pattern_iter.next() |
|
124 if ch in "iLmsu#": |
|
125 # All of these are ignorable. Walk to the end of the |
|
126 # group. |
|
127 walk_to_end(ch, pattern_iter) |
|
128 elif ch == ':': |
|
129 # Non-capturing group |
|
130 non_capturing_groups.append(len(result)) |
|
131 elif ch != 'P': |
|
132 # Anything else, other than a named group, is something |
|
133 # we cannot reverse. |
|
134 raise ValueError("Non-reversible reg-exp portion: '(?%s'" % ch) |
|
135 else: |
|
136 ch, escaped = pattern_iter.next() |
|
137 if ch != '<': |
|
138 raise ValueError("Non-reversible reg-exp portion: '(?P%s'" % ch) |
|
139 # We are in a named capturing group. Extra the name and |
|
140 # then skip to the end. |
|
141 name = [] |
|
142 ch, escaped = pattern_iter.next() |
|
143 while ch != '>': |
|
144 name.append(ch) |
|
145 ch, escaped = pattern_iter.next() |
|
146 param = ''.join(name) |
|
147 result.append(Group(((u"%%(%s)s" % param), param))) |
|
148 walk_to_end(ch, pattern_iter) |
|
149 elif ch in "*?+{": |
|
150 # Quanitifers affect the previous item in the result list. |
|
151 count, ch = get_quantifier(ch, pattern_iter) |
|
152 if ch: |
|
153 # We had to look ahead, but it wasn't need to compute the |
|
154 # quanitifer, so use this character next time around the |
|
155 # main loop. |
|
156 consume_next = False |
|
157 |
|
158 if count == 0: |
|
159 if contains(result[-1], Group): |
|
160 # If we are quantifying a capturing group (or |
|
161 # something containing such a group) and the minimum is |
|
162 # zero, we must also handle the case of one occurrence |
|
163 # being present. All the quantifiers (except {0,0}, |
|
164 # which we conveniently ignore) that have a 0 minimum |
|
165 # also allow a single occurrence. |
|
166 result[-1] = Choice([None, result[-1]]) |
|
167 else: |
|
168 result.pop() |
|
169 elif count > 1: |
|
170 result.extend([result[-1]] * (count - 1)) |
|
171 else: |
|
172 # Anything else is a literal. |
|
173 result.append(ch) |
|
174 |
|
175 if consume_next: |
|
176 ch, escaped = pattern_iter.next() |
|
177 else: |
|
178 consume_next = True |
|
179 except StopIteration: |
|
180 pass |
|
181 except NotImplementedError: |
|
182 # A case of using the disjunctive form. No results for you! |
|
183 return zip([u''], [[]]) |
|
184 |
|
185 return zip(*flatten_result(result)) |
|
186 |
|
187 def next_char(input_iter): |
|
188 """ |
|
189 An iterator that yields the next character from "pattern_iter", respecting |
|
190 escape sequences. An escaped character is replaced by a representative of |
|
191 its class (e.g. \w -> "x"). If the escaped character is one that is |
|
192 skipped, it is not returned (the next character is returned instead). |
|
193 |
|
194 Yields the next character, along with a boolean indicating whether it is a |
|
195 raw (unescaped) character or not. |
|
196 """ |
|
197 for ch in input_iter: |
|
198 if ch != '\\': |
|
199 yield ch, False |
|
200 continue |
|
201 ch = input_iter.next() |
|
202 representative = ESCAPE_MAPPINGS.get(ch, ch) |
|
203 if representative is None: |
|
204 continue |
|
205 yield representative, True |
|
206 |
|
207 def walk_to_end(ch, input_iter): |
|
208 """ |
|
209 The iterator is currently inside a capturing group. We want to walk to the |
|
210 close of this group, skipping over any nested groups and handling escaped |
|
211 parentheses correctly. |
|
212 """ |
|
213 if ch == '(': |
|
214 nesting = 1 |
|
215 else: |
|
216 nesting = 0 |
|
217 for ch, escaped in input_iter: |
|
218 if escaped: |
|
219 continue |
|
220 elif ch == '(': |
|
221 nesting += 1 |
|
222 elif ch == ')': |
|
223 if not nesting: |
|
224 return |
|
225 nesting -= 1 |
|
226 |
|
227 def get_quantifier(ch, input_iter): |
|
228 """ |
|
229 Parse a quantifier from the input, where "ch" is the first character in the |
|
230 quantifier. |
|
231 |
|
232 Returns the minimum number of occurences permitted by the quantifier and |
|
233 either None or the next character from the input_iter if the next character |
|
234 is not part of the quantifier. |
|
235 """ |
|
236 if ch in '*?+': |
|
237 try: |
|
238 ch2, escaped = input_iter.next() |
|
239 except StopIteration: |
|
240 ch2 = None |
|
241 if ch2 == '?': |
|
242 ch2 = None |
|
243 if ch == '+': |
|
244 return 1, ch2 |
|
245 return 0, ch2 |
|
246 |
|
247 quant = [] |
|
248 while ch != '}': |
|
249 ch, escaped = input_iter.next() |
|
250 quant.append(ch) |
|
251 quant = quant[:-1] |
|
252 values = ''.join(quant).split(',') |
|
253 |
|
254 # Consume the trailing '?', if necessary. |
|
255 try: |
|
256 ch, escaped = input_iter.next() |
|
257 except StopIteration: |
|
258 ch = None |
|
259 if ch == '?': |
|
260 ch = None |
|
261 return int(values[0]), ch |
|
262 |
|
263 def contains(source, inst): |
|
264 """ |
|
265 Returns True if the "source" contains an instance of "inst". False, |
|
266 otherwise. |
|
267 """ |
|
268 if isinstance(source, inst): |
|
269 return True |
|
270 if isinstance(source, NonCapture): |
|
271 for elt in source: |
|
272 if contains(elt, inst): |
|
273 return True |
|
274 return False |
|
275 |
|
276 def flatten_result(source): |
|
277 """ |
|
278 Turns the given source sequence into a list of reg-exp possibilities and |
|
279 their arguments. Returns a list of strings and a list of argument lists. |
|
280 Each of the two lists will be of the same length. |
|
281 """ |
|
282 if source is None: |
|
283 return [u''], [[]] |
|
284 if isinstance(source, Group): |
|
285 if source[1] is None: |
|
286 params = [] |
|
287 else: |
|
288 params = [source[1]] |
|
289 return [source[0]], [params] |
|
290 result = [u''] |
|
291 result_args = [[]] |
|
292 pos = last = 0 |
|
293 for pos, elt in enumerate(source): |
|
294 if isinstance(elt, basestring): |
|
295 continue |
|
296 piece = u''.join(source[last:pos]) |
|
297 if isinstance(elt, Group): |
|
298 piece += elt[0] |
|
299 param = elt[1] |
|
300 else: |
|
301 param = None |
|
302 last = pos + 1 |
|
303 for i in range(len(result)): |
|
304 result[i] += piece |
|
305 if param: |
|
306 result_args[i].append(param) |
|
307 if isinstance(elt, (Choice, NonCapture)): |
|
308 if isinstance(elt, NonCapture): |
|
309 elt = [elt] |
|
310 inner_result, inner_args = [], [] |
|
311 for item in elt: |
|
312 res, args = flatten_result(item) |
|
313 inner_result.extend(res) |
|
314 inner_args.extend(args) |
|
315 new_result = [] |
|
316 new_args = [] |
|
317 for item, args in zip(result, result_args): |
|
318 for i_item, i_args in zip(inner_result, inner_args): |
|
319 new_result.append(item + i_item) |
|
320 new_args.append(args[:] + i_args) |
|
321 result = new_result |
|
322 result_args = new_args |
|
323 if pos >= last: |
|
324 piece = u''.join(source[last:]) |
|
325 for i in range(len(result)): |
|
326 result[i] += piece |
|
327 return result, result_args |
|
328 |