app/simplejson/decoder.py
changeset 975 295d67509412
equal deleted inserted replaced
974:2f86cbc90b65 975:295d67509412
       
     1 """Implementation of JSONDecoder
       
     2 """
       
     3 import re
       
     4 import sys
       
     5 import struct
       
     6 
       
     7 from simplejson.scanner import make_scanner
       
     8 try:
       
     9     from simplejson._speedups import scanstring as c_scanstring
       
    10 except ImportError:
       
    11     c_scanstring = None
       
    12 
       
# Names exported by ``from ... import *``.
__all__ = ['JSONDecoder']

# Flags passed to every regular expression compiled in this module.
FLAGS = re.VERBOSE | re.MULTILINE | re.DOTALL
       
    16 
       
    17 def _floatconstants():
       
    18     _BYTES = '7FF80000000000007FF0000000000000'.decode('hex')
       
    19     if sys.byteorder != 'big':
       
    20         _BYTES = _BYTES[:8][::-1] + _BYTES[8:][::-1]
       
    21     nan, inf = struct.unpack('dd', _BYTES)
       
    22     return nan, inf, -inf
       
    23 
       
    24 NaN, PosInf, NegInf = _floatconstants()
       
    25 
       
    26 
       
    27 def linecol(doc, pos):
       
    28     lineno = doc.count('\n', 0, pos) + 1
       
    29     if lineno == 1:
       
    30         colno = pos
       
    31     else:
       
    32         colno = pos - doc.rindex('\n', 0, pos)
       
    33     return lineno, colno
       
    34 
       
    35 
       
    36 def errmsg(msg, doc, pos, end=None):
       
    37     # Note that this function is called from _speedups
       
    38     lineno, colno = linecol(doc, pos)
       
    39     if end is None:
       
    40         return '%s: line %d column %d (char %d)' % (msg, lineno, colno, pos)
       
    41     endlineno, endcolno = linecol(doc, end)
       
    42     return '%s: line %d column %d - line %d column %d (char %d - %d)' % (
       
    43         msg, lineno, colno, endlineno, endcolno, pos, end)
       
    44 
       
    45 
       
# Maps the non-standard JSON literals to the module-level float constants.
# JSONDecoder uses this mapping's lookup as the default ``parse_constant``.
_CONSTANTS = {
    '-Infinity': NegInf,
    'Infinity': PosInf,
    'NaN': NaN,
}

# Matches a (possibly empty, non-greedy) run of ordinary characters as
# group 1, followed by the first character that needs special handling as
# group 2: a double quote, a backslash, or a control character \x00-\x1f.
STRINGCHUNK = re.compile(r'(.*?)(["\\\x00-\x1f])', FLAGS)
# Maps the character following a backslash in a single-character escape
# to the unicode character it stands for.
BACKSLASH = {
    '"': u'"', '\\': u'\\', '/': u'/',
    'b': u'\b', 'f': u'\f', 'n': u'\n', 'r': u'\r', 't': u'\t',
}

# Encoding py_scanstring falls back to when it is called with encoding=None.
DEFAULT_ENCODING = "utf-8"
       
    59 
       
    60 def py_scanstring(s, end, encoding=None, strict=True, _b=BACKSLASH, _m=STRINGCHUNK.match):
       
    61     """Scan the string s for a JSON string. End is the index of the
       
    62     character in s after the quote that started the JSON string.
       
    63     Unescapes all valid JSON string escape sequences and raises ValueError
       
    64     on attempt to decode an invalid string. If strict is False then literal
       
    65     control characters are allowed in the string.
       
    66     
       
    67     Returns a tuple of the decoded string and the index of the character in s
       
    68     after the end quote."""
       
    69     if encoding is None:
       
    70         encoding = DEFAULT_ENCODING
       
    71     chunks = []
       
    72     _append = chunks.append
       
    73     begin = end - 1
       
    74     while 1:
       
    75         chunk = _m(s, end)
       
    76         if chunk is None:
       
    77             raise ValueError(
       
    78                 errmsg("Unterminated string starting at", s, begin))
       
    79         end = chunk.end()
       
    80         content, terminator = chunk.groups()
       
    81         # Content is contains zero or more unescaped string characters
       
    82         if content:
       
    83             if not isinstance(content, unicode):
       
    84                 content = unicode(content, encoding)
       
    85             _append(content)
       
    86         # Terminator is the end of string, a literal control character,
       
    87         # or a backslash denoting that an escape sequence follows
       
    88         if terminator == '"':
       
    89             break
       
    90         elif terminator != '\\':
       
    91             if strict:
       
    92                 msg = "Invalid control character %r at" % (terminator,)
       
    93                 raise ValueError(msg, s, end)
       
    94             else:
       
    95                 _append(terminator)
       
    96                 continue
       
    97         try:
       
    98             esc = s[end]
       
    99         except IndexError:
       
   100             raise ValueError(
       
   101                 errmsg("Unterminated string starting at", s, begin))
       
   102         # If not a unicode escape sequence, must be in the lookup table
       
   103         if esc != 'u':
       
   104             try:
       
   105                 char = _b[esc]
       
   106             except KeyError:
       
   107                 raise ValueError(
       
   108                     errmsg("Invalid \\escape: %r" % (esc,), s, end))
       
   109             end += 1
       
   110         else:
       
   111             # Unicode escape sequence
       
   112             esc = s[end + 1:end + 5]
       
   113             next_end = end + 5
       
   114             if len(esc) != 4:
       
   115                 msg = "Invalid \\uXXXX escape"
       
   116                 raise ValueError(errmsg(msg, s, end))
       
   117             uni = int(esc, 16)
       
   118             # Check for surrogate pair on UCS-4 systems
       
   119             if 0xd800 <= uni <= 0xdbff and sys.maxunicode > 65535:
       
   120                 msg = "Invalid \\uXXXX\\uXXXX surrogate pair"
       
   121                 if not s[end + 5:end + 7] == '\\u':
       
   122                     raise ValueError(errmsg(msg, s, end))
       
   123                 esc2 = s[end + 7:end + 11]
       
   124                 if len(esc2) != 4:
       
   125                     raise ValueError(errmsg(msg, s, end))
       
   126                 uni2 = int(esc2, 16)
       
   127                 uni = 0x10000 + (((uni - 0xd800) << 10) | (uni2 - 0xdc00))
       
   128                 next_end += 6
       
   129             char = unichr(uni)
       
   130             end = next_end
       
   131         # Append the unescaped character
       
   132         _append(char)
       
   133     return u''.join(chunks), end
       
   134 
       
   135 
       
# Use speedup if available: c_scanstring is None when the _speedups
# extension could not be imported, in which case the pure-Python
# implementation is used.
scanstring = c_scanstring or py_scanstring

# Matches a (possibly empty) run of JSON whitespace characters.
WHITESPACE = re.compile(r'[ \t\n\r]*', FLAGS)
# The same whitespace characters as a plain string, for ``in`` tests.
WHITESPACE_STR = ' \t\n\r'
       
   141 
       
   142 def JSONObject((s, end), encoding, strict, scan_once, object_hook, _w=WHITESPACE.match, _ws=WHITESPACE_STR):
       
   143     pairs = {}
       
   144     # Use a slice to prevent IndexError from being raised, the following
       
   145     # check will raise a more specific ValueError if the string is empty
       
   146     nextchar = s[end:end + 1]
       
   147     # Normally we expect nextchar == '"'
       
   148     if nextchar != '"':
       
   149         if nextchar in _ws:
       
   150             end = _w(s, end).end()
       
   151             nextchar = s[end:end + 1]
       
   152         # Trivial empty object
       
   153         if nextchar == '}':
       
   154             return pairs, end + 1
       
   155         elif nextchar != '"':
       
   156             raise ValueError(errmsg("Expecting property name", s, end))
       
   157     end += 1
       
   158     while True:
       
   159         key, end = scanstring(s, end, encoding, strict)
       
   160 
       
   161         # To skip some function call overhead we optimize the fast paths where
       
   162         # the JSON key separator is ": " or just ":".
       
   163         if s[end:end + 1] != ':':
       
   164             end = _w(s, end).end()
       
   165             if s[end:end + 1] != ':':
       
   166                 raise ValueError(errmsg("Expecting : delimiter", s, end))
       
   167 
       
   168         end += 1
       
   169 
       
   170         try:
       
   171             if s[end] in _ws:
       
   172                 end += 1
       
   173                 if s[end] in _ws:
       
   174                     end = _w(s, end + 1).end()
       
   175         except IndexError:
       
   176             pass
       
   177 
       
   178         try:
       
   179             value, end = scan_once(s, end)
       
   180         except StopIteration:
       
   181             raise ValueError(errmsg("Expecting object", s, end))
       
   182         pairs[key] = value
       
   183 
       
   184         try:
       
   185             nextchar = s[end]
       
   186             if nextchar in _ws:
       
   187                 end = _w(s, end + 1).end()
       
   188                 nextchar = s[end]
       
   189         except IndexError:
       
   190             nextchar = ''
       
   191         end += 1
       
   192 
       
   193         if nextchar == '}':
       
   194             break
       
   195         elif nextchar != ',':
       
   196             raise ValueError(errmsg("Expecting , delimiter", s, end - 1))
       
   197 
       
   198         try:
       
   199             nextchar = s[end]
       
   200             if nextchar in _ws:
       
   201                 end += 1
       
   202                 nextchar = s[end]
       
   203                 if nextchar in _ws:
       
   204                     end = _w(s, end + 1).end()
       
   205                     nextchar = s[end]
       
   206         except IndexError:
       
   207             nextchar = ''
       
   208 
       
   209         end += 1
       
   210         if nextchar != '"':
       
   211             raise ValueError(errmsg("Expecting property name", s, end - 1))
       
   212 
       
   213     if object_hook is not None:
       
   214         pairs = object_hook(pairs)
       
   215     return pairs, end
       
   216 
       
   217 def JSONArray((s, end), scan_once, _w=WHITESPACE.match, _ws=WHITESPACE_STR):
       
   218     values = []
       
   219     nextchar = s[end:end + 1]
       
   220     if nextchar in _ws:
       
   221         end = _w(s, end + 1).end()
       
   222         nextchar = s[end:end + 1]
       
   223     # Look-ahead for trivial empty array
       
   224     if nextchar == ']':
       
   225         return values, end + 1
       
   226     _append = values.append
       
   227     while True:
       
   228         try:
       
   229             value, end = scan_once(s, end)
       
   230         except StopIteration:
       
   231             raise ValueError(errmsg("Expecting object", s, end))
       
   232         _append(value)
       
   233         nextchar = s[end:end + 1]
       
   234         if nextchar in _ws:
       
   235             end = _w(s, end + 1).end()
       
   236             nextchar = s[end:end + 1]
       
   237         end += 1
       
   238         if nextchar == ']':
       
   239             break
       
   240         elif nextchar != ',':
       
   241             raise ValueError(errmsg("Expecting , delimiter", s, end))
       
   242 
       
   243         try:
       
   244             if s[end] in _ws:
       
   245                 end += 1
       
   246                 if s[end] in _ws:
       
   247                     end = _w(s, end + 1).end()
       
   248         except IndexError:
       
   249             pass
       
   250 
       
   251     return values, end
       
   252 
       
   253 class JSONDecoder(object):
       
   254     """Simple JSON <http://json.org> decoder
       
   255 
       
   256     Performs the following translations in decoding by default:
       
   257 
       
   258     +---------------+-------------------+
       
   259     | JSON          | Python            |
       
   260     +===============+===================+
       
   261     | object        | dict              |
       
   262     +---------------+-------------------+
       
   263     | array         | list              |
       
   264     +---------------+-------------------+
       
   265     | string        | unicode           |
       
   266     +---------------+-------------------+
       
   267     | number (int)  | int, long         |
       
   268     +---------------+-------------------+
       
   269     | number (real) | float             |
       
   270     +---------------+-------------------+
       
   271     | true          | True              |
       
   272     +---------------+-------------------+
       
   273     | false         | False             |
       
   274     +---------------+-------------------+
       
   275     | null          | None              |
       
   276     +---------------+-------------------+
       
   277 
       
   278     It also understands ``NaN``, ``Infinity``, and ``-Infinity`` as
       
   279     their corresponding ``float`` values, which is outside the JSON spec.
       
   280 
       
   281     """
       
   282 
       
   283     def __init__(self, encoding=None, object_hook=None, parse_float=None,
       
   284             parse_int=None, parse_constant=None, strict=True):
       
   285         """``encoding`` determines the encoding used to interpret any ``str``
       
   286         objects decoded by this instance (utf-8 by default).  It has no
       
   287         effect when decoding ``unicode`` objects.
       
   288 
       
   289         Note that currently only encodings that are a superset of ASCII work,
       
   290         strings of other encodings should be passed in as ``unicode``.
       
   291 
       
   292         ``object_hook``, if specified, will be called with the result
       
   293         of every JSON object decoded and its return value will be used in
       
   294         place of the given ``dict``.  This can be used to provide custom
       
   295         deserializations (e.g. to support JSON-RPC class hinting).
       
   296 
       
   297         ``parse_float``, if specified, will be called with the string
       
   298         of every JSON float to be decoded. By default this is equivalent to
       
   299         float(num_str). This can be used to use another datatype or parser
       
   300         for JSON floats (e.g. decimal.Decimal).
       
   301 
       
   302         ``parse_int``, if specified, will be called with the string
       
   303         of every JSON int to be decoded. By default this is equivalent to
       
   304         int(num_str). This can be used to use another datatype or parser
       
   305         for JSON integers (e.g. float).
       
   306 
       
   307         ``parse_constant``, if specified, will be called with one of the
       
   308         following strings: -Infinity, Infinity, NaN.
       
   309         This can be used to raise an exception if invalid JSON numbers
       
   310         are encountered.
       
   311 
       
   312         """
       
   313         self.encoding = encoding
       
   314         self.object_hook = object_hook
       
   315         self.parse_float = parse_float or float
       
   316         self.parse_int = parse_int or int
       
   317         self.parse_constant = parse_constant or _CONSTANTS.__getitem__
       
   318         self.strict = strict
       
   319         self.parse_object = JSONObject
       
   320         self.parse_array = JSONArray
       
   321         self.parse_string = scanstring
       
   322         self.scan_once = make_scanner(self)
       
   323 
       
   324     def decode(self, s, _w=WHITESPACE.match):
       
   325         """Return the Python representation of ``s`` (a ``str`` or ``unicode``
       
   326         instance containing a JSON document)
       
   327 
       
   328         """
       
   329         obj, end = self.raw_decode(s, idx=_w(s, 0).end())
       
   330         end = _w(s, end).end()
       
   331         if end != len(s):
       
   332             raise ValueError(errmsg("Extra data", s, end, len(s)))
       
   333         return obj
       
   334 
       
   335     def raw_decode(self, s, idx=0):
       
   336         """Decode a JSON document from ``s`` (a ``str`` or ``unicode`` beginning
       
   337         with a JSON document) and return a 2-tuple of the Python
       
   338         representation and the index in ``s`` where the document ended.
       
   339 
       
   340         This can be used to decode a JSON document from a string that may
       
   341         have extraneous data at the end.
       
   342 
       
   343         """
       
   344         try:
       
   345             obj, end = self.scan_once(s, idx)
       
   346         except StopIteration:
       
   347             raise ValueError("No JSON object could be decoded")
       
   348         return obj, end