|
1 """Implementation of JSONDecoder |
|
2 """ |
|
3 import re |
|
4 import sys |
|
5 import struct |
|
6 |
|
7 from simplejson.scanner import make_scanner |
|
8 try: |
|
9 from simplejson._speedups import scanstring as c_scanstring |
|
10 except ImportError: |
|
11 c_scanstring = None |
|
12 |
|
# Names exported by ``from simplejson.decoder import *``.
__all__ = ["JSONDecoder"]
|
14 |
|
# Flags applied to every regular expression compiled in this module.
FLAGS = re.DOTALL | re.MULTILINE | re.VERBOSE
|
16 |
|
def _floatconstants():
    """Build the IEEE 754 special float values used by the decoder.

    The values are unpacked from their bit patterns rather than parsed
    from text.

    Returns:
        A ``(nan, inf, -inf)`` tuple of floats.
    """
    # Local import: nothing else in this module needs binascii.
    from binascii import unhexlify

    # Big-endian bit patterns of a quiet NaN followed by +Infinity.
    packed = unhexlify('7FF80000000000007FF0000000000000')
    # The explicit '>' byte order makes struct do any swapping required on
    # little-endian hosts, so no manual byte reversal is needed.
    nan, inf = struct.unpack('>dd', packed)
    return nan, inf, -inf


NaN, PosInf, NegInf = _floatconstants()
|
25 |
|
26 |
|
def linecol(doc, pos):
    """Translate a character offset into a line and column number.

    Args:
        doc: The document text.
        pos: Zero-based character offset into ``doc``.

    Returns:
        A ``(lineno, colno)`` tuple.  Both numbers are one-based, on the
        first line as well as on every later line.
    """
    lineno = doc.count('\n', 0, pos) + 1
    # rfind() yields -1 when no newline precedes pos, which makes the first
    # line's column pos + 1 -- the same one-based convention that the
    # distance from the previous newline gives on subsequent lines.
    colno = pos - doc.rfind('\n', 0, pos)
    return lineno, colno
|
34 |
|
35 |
|
def errmsg(msg, doc, pos, end=None):
    """Format a decoding error message that includes its location.

    Args:
        msg: Description of the problem; used as the message prefix.
        doc: The document being decoded.
        pos: Character offset in ``doc`` where the problem starts.
        end: Optional character offset where the problem ends.  When given,
            the message describes the range from ``pos`` to ``end``.

    Returns:
        The formatted message, with line and column numbers obtained from
        ``linecol`` followed by the raw character offset(s).
    """
    # Note that this function is called from _speedups
    start_line, start_col = linecol(doc, pos)
    if end is not None:
        stop_line, stop_col = linecol(doc, end)
        template = '%s: line %d column %d - line %d column %d (char %d - %d)'
        return template % (
            msg, start_line, start_col, stop_line, stop_col, pos, end)
    template = '%s: line %d column %d (char %d)'
    return template % (msg, start_line, start_col, pos)
|
44 |
|
45 |
|
# Literal names outside the JSON spec, keyed by their exact text and mapped
# to the float each one stands for.
_CONSTANTS = {
    'NaN': NaN,
    'Infinity': PosInf,
    '-Infinity': NegInf,
}
|
51 |
|
# Matches a (possibly empty) run of ordinary characters as group 1, followed
# by the first character needing special treatment as group 2: a double
# quote, a backslash, or a control character in the range 0x00-0x1f.
STRINGCHUNK = re.compile(r'(.*?)(["\\\x00-\x1f])', FLAGS)

# Single-character escapes: the character following a backslash mapped to
# the text it decodes to.
BACKSLASH = {
    '"': u'"',
    '\\': u'\\',
    '/': u'/',
    'b': u'\b',
    'f': u'\f',
    'n': u'\n',
    'r': u'\r',
    't': u'\t',
}

# Encoding used by py_scanstring when it is called without one.
DEFAULT_ENCODING = "utf-8"
|
59 |
|
def py_scanstring(s, end, encoding=None, strict=True, _b=BACKSLASH, _m=STRINGCHUNK.match):
    """Scan the string s for a JSON string. End is the index of the
    character in s after the quote that started the JSON string.
    Unescapes all valid JSON string escape sequences and raises ValueError
    on attempt to decode an invalid string. If strict is False then literal
    control characters are allowed in the string.

    ``encoding`` is used to decode content that is not already unicode and
    defaults to DEFAULT_ENCODING.  ``_b`` and ``_m`` are default-argument
    bindings of the escape table and the chunk matcher; callers are not
    expected to pass them.

    Returns a tuple of the decoded string and the index of the character in s
    after the end quote.

    Every ValueError raised here carries a message built by errmsg(), so it
    always includes the line, column and character offset of the problem."""
    if encoding is None:
        encoding = DEFAULT_ENCODING
    chunks = []
    _append = chunks.append
    # Index of the opening quote, reported when the string never terminates
    begin = end - 1
    while True:
        chunk = _m(s, end)
        if chunk is None:
            raise ValueError(
                errmsg("Unterminated string starting at", s, begin))
        end = chunk.end()
        content, terminator = chunk.groups()
        # Content contains zero or more unescaped string characters
        if content:
            if not isinstance(content, unicode):
                content = unicode(content, encoding)
            _append(content)
        # Terminator is the end of string, a literal control character,
        # or a backslash denoting that an escape sequence follows
        if terminator == '"':
            break
        elif terminator != '\\':
            if strict:
                msg = "Invalid control character %r at" % (terminator,)
                # Format through errmsg() like every other error in this
                # function, instead of passing the raw (msg, s, end)
                # arguments to ValueError.
                raise ValueError(errmsg(msg, s, end))
            else:
                _append(terminator)
                continue
        try:
            esc = s[end]
        except IndexError:
            raise ValueError(
                errmsg("Unterminated string starting at", s, begin))
        # If not a unicode escape sequence, must be in the lookup table
        if esc != 'u':
            try:
                char = _b[esc]
            except KeyError:
                raise ValueError(
                    errmsg("Invalid \\escape: %r" % (esc,), s, end))
            end += 1
        else:
            # Unicode escape sequence
            esc = s[end + 1:end + 5]
            next_end = end + 5
            msg = "Invalid \\uXXXX escape"
            if len(esc) != 4:
                raise ValueError(errmsg(msg, s, end))
            try:
                uni = int(esc, 16)
            except ValueError:
                # Non-hex digits: report where, not just int()'s complaint
                raise ValueError(errmsg(msg, s, end))
            # Check for surrogate pair on UCS-4 systems
            if 0xd800 <= uni <= 0xdbff and sys.maxunicode > 65535:
                msg = "Invalid \\uXXXX\\uXXXX surrogate pair"
                if not s[end + 5:end + 7] == '\\u':
                    raise ValueError(errmsg(msg, s, end))
                esc2 = s[end + 7:end + 11]
                if len(esc2) != 4:
                    raise ValueError(errmsg(msg, s, end))
                try:
                    uni2 = int(esc2, 16)
                except ValueError:
                    raise ValueError(errmsg(msg, s, end))
                # Combine the high and low halves into one code point
                uni = 0x10000 + (((uni - 0xd800) << 10) | (uni2 - 0xdc00))
                # The second \uXXXX escape consumed six more characters
                next_end += 6
            char = unichr(uni)
            end = next_end
        # Append the unescaped character
        _append(char)
    return u''.join(chunks), end
|
134 |
|
135 |
|
# Prefer the C implementation when the _speedups import succeeded; otherwise
# fall back to the pure-Python scanner.
scanstring = c_scanstring if c_scanstring else py_scanstring

# Whitespace allowed between JSON tokens: as a plain string for cheap
# membership tests, and as a compiled pattern for skipping whole runs.
WHITESPACE_STR = ' \t\n\r'
WHITESPACE = re.compile(r'[ \t\n\r]*', FLAGS)
|
141 |
|
def JSONObject(s_and_end, encoding, strict, scan_once, object_hook,
               _w=WHITESPACE.match, _ws=WHITESPACE_STR):
    """Parse a JSON object whose opening brace has already been consumed.

    Args:
        s_and_end: A ``(s, end)`` pair: the document and the index of the
            character just after the opening ``{``.
        encoding: Passed through to ``scanstring`` for decoding keys.
        strict: Passed through to ``scanstring``.
        scan_once: Callable ``scan_once(s, idx)`` returning
            ``(value, end)``; it signals "no value here" by raising
            StopIteration.
        object_hook: Optional callable applied to the finished dict; its
            return value replaces the dict.  It is applied to every object,
            including an empty one.
        _w, _ws: Default-argument bindings of the whitespace matcher and
            whitespace characters; callers are not expected to pass them.

    Returns:
        A tuple of the decoded object (or the ``object_hook`` result) and
        the index of the character after the closing ``}``.

    Raises:
        ValueError: If a property name, ``:`` delimiter, value or ``,``
            delimiter is missing where one is required.
    """
    s, end = s_and_end
    pairs = {}
    # Use a slice to prevent IndexError from being raised, the following
    # check will raise a more specific ValueError if the string is empty
    nextchar = s[end:end + 1]
    # Normally we expect nextchar == '"'
    if nextchar != '"':
        if nextchar in _ws:
            end = _w(s, end).end()
            nextchar = s[end:end + 1]
        # Trivial empty object
        if nextchar == '}':
            # An empty object is still an object: give the hook its chance
            if object_hook is not None:
                pairs = object_hook(pairs)
            return pairs, end + 1
        elif nextchar != '"':
            raise ValueError(errmsg("Expecting property name", s, end))
    end += 1
    while True:
        key, end = scanstring(s, end, encoding, strict)

        # To skip some function call overhead we optimize the fast paths where
        # the JSON key separator is ": " or just ":".
        if s[end:end + 1] != ':':
            end = _w(s, end).end()
            if s[end:end + 1] != ':':
                raise ValueError(errmsg("Expecting : delimiter", s, end))

        end += 1

        # Skip whitespace after the colon; running off the end of the
        # document is left for scan_once below to report.
        try:
            if s[end] in _ws:
                end += 1
                if s[end] in _ws:
                    end = _w(s, end + 1).end()
        except IndexError:
            pass

        try:
            value, end = scan_once(s, end)
        except StopIteration:
            raise ValueError(errmsg("Expecting object", s, end))
        pairs[key] = value

        try:
            nextchar = s[end]
            if nextchar in _ws:
                end = _w(s, end + 1).end()
                nextchar = s[end]
        except IndexError:
            # End of document: the empty string fails both checks below
            nextchar = ''
        end += 1

        if nextchar == '}':
            break
        elif nextchar != ',':
            raise ValueError(errmsg("Expecting , delimiter", s, end - 1))

        try:
            nextchar = s[end]
            if nextchar in _ws:
                end += 1
                nextchar = s[end]
                if nextchar in _ws:
                    end = _w(s, end + 1).end()
                    nextchar = s[end]
        except IndexError:
            nextchar = ''

        end += 1
        if nextchar != '"':
            raise ValueError(errmsg("Expecting property name", s, end - 1))

    if object_hook is not None:
        pairs = object_hook(pairs)
    return pairs, end
|
216 |
|
def JSONArray(s_and_end, scan_once, _w=WHITESPACE.match, _ws=WHITESPACE_STR):
    """Parse a JSON array whose opening bracket has already been consumed.

    Args:
        s_and_end: A ``(s, end)`` pair: the document and the index of the
            character just after the opening ``[``.
        scan_once: Callable ``scan_once(s, idx)`` returning
            ``(value, end)``; it signals "no value here" by raising
            StopIteration.
        _w, _ws: Default-argument bindings of the whitespace matcher and
            whitespace characters; callers are not expected to pass them.

    Returns:
        A tuple of the decoded list and the index of the character after
        the closing ``]``.

    Raises:
        ValueError: If a value or a ``,`` delimiter is missing where one is
            required.
    """
    s, end = s_and_end
    values = []
    # Slicing (rather than indexing) yields '' at the end of the document
    nextchar = s[end:end + 1]
    if nextchar in _ws:
        end = _w(s, end + 1).end()
        nextchar = s[end:end + 1]
    # Look-ahead for trivial empty array
    if nextchar == ']':
        return values, end + 1
    _append = values.append
    while True:
        try:
            value, end = scan_once(s, end)
        except StopIteration:
            raise ValueError(errmsg("Expecting object", s, end))
        _append(value)
        nextchar = s[end:end + 1]
        if nextchar in _ws:
            end = _w(s, end + 1).end()
            nextchar = s[end:end + 1]
        end += 1
        if nextchar == ']':
            break
        elif nextchar != ',':
            # end was already advanced past nextchar, so step back one to
            # point the message at the offending character itself.
            raise ValueError(errmsg("Expecting , delimiter", s, end - 1))

        # Skip whitespace after the comma; running off the end of the
        # document is left for scan_once above to report.
        try:
            if s[end] in _ws:
                end += 1
                if s[end] in _ws:
                    end = _w(s, end + 1).end()
        except IndexError:
            pass

    return values, end
|
252 |
|
class JSONDecoder(object):
    """Simple JSON <http://json.org> decoder

    By default, decoding performs these translations:

    +---------------+-------------------+
    | JSON          | Python            |
    +===============+===================+
    | object        | dict              |
    +---------------+-------------------+
    | array         | list              |
    +---------------+-------------------+
    | string        | unicode           |
    +---------------+-------------------+
    | number (int)  | int, long         |
    +---------------+-------------------+
    | number (real) | float             |
    +---------------+-------------------+
    | true          | True              |
    +---------------+-------------------+
    | false         | False             |
    +---------------+-------------------+
    | null          | None              |
    +---------------+-------------------+

    ``NaN``, ``Infinity``, and ``-Infinity`` are also understood and become
    the corresponding ``float`` values, although they are outside the JSON
    spec.

    """

    def __init__(self, encoding=None, object_hook=None, parse_float=None,
            parse_int=None, parse_constant=None, strict=True):
        """Configure the decoder.

        ``encoding`` is the encoding used to interpret any ``str`` objects
        decoded by this instance (utf-8 by default).  It has no effect when
        decoding ``unicode`` objects.  Currently only encodings that are a
        superset of ASCII work; strings in other encodings should be passed
        in as ``unicode``.

        ``object_hook``, if given, is called with the result of every JSON
        object decoded, and its return value is used in place of the
        ``dict``.  This allows custom deserializations (e.g. to support
        JSON-RPC class hinting).

        ``parse_float``, if given, is called with the string of every JSON
        float to be decoded.  The default is equivalent to float(num_str).
        Use it to get another datatype or parser for JSON floats
        (e.g. decimal.Decimal).

        ``parse_int``, if given, is called with the string of every JSON
        int to be decoded.  The default is equivalent to int(num_str).
        Use it to get another datatype or parser for JSON integers
        (e.g. float).

        ``parse_constant``, if given, is called with one of the strings
        -Infinity, Infinity, NaN.  Use it to raise an exception when
        invalid JSON numbers are encountered.

        ``strict`` is stored on the instance for the string scanner; when
        it is false, the scanner in this module allows literal control
        characters inside strings.

        """
        self.encoding = encoding
        self.object_hook = object_hook
        # A falsy callback (including the default None) selects the
        # built-in conversion.
        self.parse_float = parse_float or float
        self.parse_int = parse_int or int
        self.parse_constant = parse_constant or _CONSTANTS.__getitem__
        self.strict = strict
        # Parsing primitives, exposed as attributes of the instance.
        self.parse_object = JSONObject
        self.parse_array = JSONArray
        self.parse_string = scanstring
        # make_scanner is handed this instance, so it is called only after
        # every attribute above has been set.
        self.scan_once = make_scanner(self)

    def decode(self, s, _w=WHITESPACE.match):
        """Return the Python representation of ``s`` (a ``str`` or
        ``unicode`` instance containing a JSON document).

        Whitespace before and after the document is skipped.  Raises
        ValueError if anything other than whitespace follows the document.

        """
        start = _w(s, 0).end()
        obj, end = self.raw_decode(s, idx=start)
        # Only trailing whitespace may follow the document.
        end = _w(s, end).end()
        if end == len(s):
            return obj
        raise ValueError(errmsg("Extra data", s, end, len(s)))

    def raw_decode(self, s, idx=0):
        """Decode a JSON document from ``s`` (a ``str`` or ``unicode``
        beginning with a JSON document), starting at index ``idx``.

        Returns a 2-tuple of the Python representation and the index in
        ``s`` where the document ended, which makes it usable on a string
        that may have extraneous data at the end.  Raises ValueError if no
        document can be decoded.

        """
        try:
            obj, end = self.scan_once(s, idx)
        except StopIteration:
            raise ValueError("No JSON object could be decoded")
        else:
            return obj, end