app/django/http/multipartparser.py
changeset 323:ff1a9aa48cfd (parent 322:6641e941ef1e)
       
     1 """
       
     2 Multi-part parsing for file uploads.
       
     3 
       
     4 Exposes one class, ``MultiPartParser``, which feeds chunks of uploaded data to
       
     5 file upload handlers for processing.
       
     6 """
       
     7 
       
     8 import cgi
       
     9 from django.conf import settings
       
    10 from django.core.exceptions import SuspiciousOperation
       
    11 from django.utils.datastructures import MultiValueDict
       
    12 from django.utils.encoding import force_unicode
       
    13 from django.utils.text import unescape_entities
       
    14 from django.core.files.uploadhandler import StopUpload, SkipFile, StopFutureHandlers
       
    15 
       
    16 __all__ = ('MultiPartParser', 'MultiPartParserError', 'InputStreamExhausted')
       
class MultiPartParserError(Exception):
    pass

class InputStreamExhausted(Exception):
    """
    No more reads are allowed from this device.
    """
    pass

RAW = "raw"
FILE = "file"
FIELD = "field"
       
class MultiPartParser(object):
    """
    An RFC 2388 multipart/form-data parser.

    ``MultiPartParser.parse()`` reads the input stream in ``chunk_size`` chunks
    and returns a tuple of ``(MultiValueDict(POST), MultiValueDict(FILES))``.
    Uploaded files are passed through the supplied upload handlers, which
    decide how (and where) each file's data is stored.
    """
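    # Illustrative usage -- a sketch only, since in practice Django's
    # HttpRequest drives this parser itself (``request.upload_handlers`` and
    # the WSGI input name below are assumptions about the caller's setup):
    #
    #   parser = MultiPartParser(request.META, request.environ['wsgi.input'],
    #                            request.upload_handlers)
    #   POST, FILES = parser.parse()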
       
    def __init__(self, META, input_data, upload_handlers, encoding=None):
        """
        Initialize the MultiPartParser object.

        :META:
            The standard ``META`` dictionary in Django request objects.
        :input_data:
            The raw post data, as a bytestring.
        :upload_handlers:
            A list of upload handler instances that perform operations on the
            uploaded data.
        :encoding:
            The encoding with which to treat the incoming data.
        """
       
        #
        # Content-Type should contain multipart and the boundary information.
        #

        content_type = META.get('HTTP_CONTENT_TYPE', META.get('CONTENT_TYPE', ''))
        if not content_type.startswith('multipart/'):
            raise MultiPartParserError('Invalid Content-Type: %s' % content_type)

        # Parse the header to get the boundary to split the parts.
        ctypes, opts = parse_header(content_type)
        boundary = opts.get('boundary')
        if not boundary or not cgi.valid_boundary(boundary):
            raise MultiPartParserError('Invalid boundary in multipart: %s' % boundary)

        #
        # Content-Length should contain the length of the body we are about
        # to receive.
        #
        try:
            content_length = int(META.get('HTTP_CONTENT_LENGTH', META.get('CONTENT_LENGTH', 0)))
        except (ValueError, TypeError):
            # Treat a missing or malformed length as zero; it is rejected just below.
            content_length = 0

        if content_length <= 0:
            # Without a valid length we cannot parse the body at all.
            raise MultiPartParserError("Invalid content length: %r" % content_length)

        self._boundary = boundary
        self._input_data = input_data

        # For compatibility with low-level network APIs (with 32-bit integers),
        # the chunk size should be < 2^31, but still divisible by 4.
        self._chunk_size = min(2 ** 31 - 4, *[x.chunk_size for x in upload_handlers if x.chunk_size])
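        # (A sketch of the arithmetic: with 64 KiB handler chunk sizes -- the
        # usual default -- the min() above yields 65536, well under 2**31.)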
       
        self._meta = META
        self._encoding = encoding or settings.DEFAULT_CHARSET
        self._content_length = content_length
        self._upload_handlers = upload_handlers
       
    def parse(self):
        """
        Parse the POST data and break it into a FILES MultiValueDict and a POST
        MultiValueDict.

        Returns a tuple containing the POST and FILES dictionaries, respectively.
        """
        # We have to import QueryDict down here to avoid a circular import.
        from django.http import QueryDict

        encoding = self._encoding
        handlers = self._upload_handlers

        limited_input_data = LimitBytes(self._input_data, self._content_length)

        # See if any handler wants to take care of the parsing itself.
        # This allows overriding everything if somebody wants it.
        for handler in handlers:
            result = handler.handle_raw_input(limited_input_data,
                                              self._meta,
                                              self._content_length,
                                              self._boundary,
                                              encoding)
            if result is not None:
                return result[0], result[1]

        # Create the data structures to be used later.
        self._post = QueryDict('', mutable=True)
        self._files = MultiValueDict()

        # Instantiate the parser and stream:
        stream = LazyStream(ChunkIter(limited_input_data, self._chunk_size))

        # Whether or not to signal a file-completion at the beginning of the loop.
        old_field_name = None
        counters = [0] * len(handlers)
       
        try:
            for item_type, meta_data, field_stream in Parser(stream, self._boundary):
                if old_field_name:
                    # We run this at the beginning of the next loop
                    # since we cannot be sure a file is complete until
                    # we hit the next boundary/part of the multipart content.
                    self.handle_file_complete(old_field_name, counters)
                    old_field_name = None

                try:
                    disposition = meta_data['content-disposition'][1]
                    field_name = disposition['name'].strip()
                except (KeyError, IndexError, AttributeError):
                    continue

                transfer_encoding = meta_data.get('content-transfer-encoding')
                field_name = force_unicode(field_name, encoding, errors='replace')

                if item_type == FIELD:
                    # This is a post field; we can just set it in the POST data.
                    if transfer_encoding == 'base64':
                        raw_data = field_stream.read()
                        try:
                            data = str(raw_data).decode('base64')
                        except Exception:
                            data = raw_data
                    else:
                        data = field_stream.read()

                    self._post.appendlist(field_name,
                                          force_unicode(data, encoding, errors='replace'))
                elif item_type == FILE:
                    # This is a file, use the handler...
                    file_name = disposition.get('filename')
                    if not file_name:
                        continue
                    file_name = force_unicode(file_name, encoding, errors='replace')
                    file_name = self.IE_sanitize(unescape_entities(file_name))

                    content_type = meta_data.get('content-type', ('',))[0].strip()
                    try:
                        charset = meta_data.get('content-type', (0, {}))[1].get('charset', None)
                    except Exception:
                        charset = None

                    try:
                        content_length = int(meta_data.get('content-length')[0])
                    except (IndexError, TypeError, ValueError):
                        content_length = None
       
                    counters = [0] * len(handlers)
                    try:
                        for handler in handlers:
                            try:
                                handler.new_file(field_name, file_name,
                                                 content_type, content_length,
                                                 charset)
                            except StopFutureHandlers:
                                break

                        for chunk in field_stream:
                            if transfer_encoding == 'base64':
                                # We only special-case base64 transfer encoding.
                                try:
                                    chunk = str(chunk).decode('base64')
                                except Exception, e:
                                    # Since this is only a chunk, any error here is unfixable.
                                    raise MultiPartParserError("Could not decode base64 data: %r" % e)

                            for i, handler in enumerate(handlers):
                                chunk_length = len(chunk)
                                chunk = handler.receive_data_chunk(chunk,
                                                                   counters[i])
                                counters[i] += chunk_length
                                if chunk is None:
                                    # If the handler returns None, don't pass the
                                    # chunk on to any later handlers.
                                    break

                    except SkipFile:
                        # Just use up the rest of this file...
                        exhaust(field_stream)
                    else:
                        # Handle file upload completions on next iteration.
                        old_field_name = field_name
                else:
                    # If this is neither a FIELD nor a FILE, just exhaust the stream.
                    exhaust(stream)
        except StopUpload, e:
            if not e.connection_reset:
                exhaust(limited_input_data)
        else:
            # Make sure that the request data is all fed.
            exhaust(limited_input_data)

        # Signal that the upload has completed.
        for handler in handlers:
            retval = handler.upload_complete()
            if retval:
                break

        return self._post, self._files
       
    def handle_file_complete(self, old_field_name, counters):
        """
        Handle all the signalling that takes place when a file is complete.
        """
        for i, handler in enumerate(self._upload_handlers):
            file_obj = handler.file_complete(counters[i])
            if file_obj:
                # If it returns a file object, then set the files dict.
                self._files.appendlist(force_unicode(old_field_name,
                                                     self._encoding,
                                                     errors='replace'),
                                       file_obj)
                break

    def IE_sanitize(self, filename):
        """Strip the full Windows path that Internet Explorer submits with filenames."""
        return filename and filename[filename.rfind("\\") + 1:].strip()
       
class LazyStream(object):
    """
    The LazyStream wrapper allows one to get and "unget" bytes from a stream.

    Given a producer object (an iterator that yields bytestrings), the
    LazyStream object will support iteration, reading, and keeping a "look-back"
    variable in case you need to "unget" some bytes.
    """
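    # A minimal sketch of the read/unget contract; ``iter([...])`` stands in
    # for any producer of bytestrings:
    #
    #   stream = LazyStream(iter(['abcdef']))
    #   head = stream.read(3)    # 'abc'
    #   stream.unget(head)       # push the bytes back onto the front
    #   stream.read()            # 'abcdef' -- the pushed-back bytes come first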
       
    def __init__(self, producer, length=None):
        """
        Every LazyStream must have a producer when instantiated.

        A producer is an iterator that yields a bytestring each time it
        is advanced.
        """
        self._producer = producer
        self._empty = False
        self._leftover = ''
        self.length = length
        self.position = 0
        self._remaining = length
        self._unget_history = []

    def tell(self):
        return self.position
       
    def read(self, size=None):
        def parts():
            if size is not None:
                remaining = size
            else:
                remaining = self._remaining
            # Do the whole thing in one shot if no limit was provided.
            if remaining is None:
                yield ''.join(self)
                return

            # Otherwise do some bookkeeping to return exactly enough
            # of the stream, stashing any extra content we get from
            # the producer.
            while remaining != 0:
                assert remaining > 0, 'remaining bytes to read should never go negative'

                chunk = self.next()

                emitting = chunk[:remaining]
                self.unget(chunk[remaining:])
                remaining -= len(emitting)
                yield emitting

        out = ''.join(parts())
        return out
       
    def next(self):
        """
        Used when the exact number of bytes to read is unimportant.

        This procedure just returns whatever chunk the iterator happens to
        yield next. Useful to avoid unnecessary bookkeeping if performance
        is an issue.
        """
        if self._leftover:
            output = self._leftover
            self._leftover = ''
        else:
            output = self._producer.next()
            self._unget_history = []
        self.position += len(output)
        return output
       
    def close(self):
        """
        Used to invalidate/disable this lazy stream.

        Replaces the producer with an empty list. Any leftover bytes that have
        already been read will still be reported upon read() and/or next().
        """
        self._producer = []

    def __iter__(self):
        return self

    def unget(self, bytes):
        """
        Places bytes back onto the front of the lazy stream.

        Future calls to read() will return those bytes first. The
        stream position and thus tell() will be rewound.
        """
        if not bytes:
            return
        self._update_unget_history(len(bytes))
        self.position -= len(bytes)
        self._leftover = ''.join([bytes, self._leftover])
       
    def _update_unget_history(self, num_bytes):
        """
        Updates the unget history as a sanity check on repeated pushbacks.
        If we keep ungetting the same number of bytes many times (here, more
        than 40 of the last 50 ungets), we're most likely in an infinite loop
        of some sort. This is usually caused by a maliciously-malformed MIME
        request.
        """
        self._unget_history = [num_bytes] + self._unget_history[:49]
        number_equal = len([current_number for current_number in self._unget_history
                            if current_number == num_bytes])

        if number_equal > 40:
            raise SuspiciousOperation(
                "The multipart parser got stuck, which shouldn't happen with"
                " normal uploaded files. Check for malicious upload activity;"
                " if there is none, report this to the Django developers."
            )
       
class ChunkIter(object):
    """
    An iterable that will yield chunks of data. Given a file-like object as
    its constructor argument, this object will yield chunks of read operations
    from that object.
    """
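    # For example (a sketch; StringIO stands in for any file-like object):
    #
    #   from StringIO import StringIO
    #   list(ChunkIter(StringIO('aaaabbbb'), chunk_size=4))
    #   # -> ['aaaa', 'bbbb']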
       
    def __init__(self, flo, chunk_size=64 * 1024):
        self.flo = flo
        self.chunk_size = chunk_size

    def next(self):
        try:
            data = self.flo.read(self.chunk_size)
        except InputStreamExhausted:
            raise StopIteration()
        if data:
            return data
        else:
            raise StopIteration()

    def __iter__(self):
        return self
       
class LimitBytes(object):
    """Limit bytes for a file object."""
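    # Sketch of the contract (StringIO again stands in for the raw input):
    #
    #   limited = LimitBytes(StringIO('0123456789'), 4)
    #   limited.read()    # '0123' -- only the first four bytes are exposed
    #   limited.read()    # raises InputStreamExhausted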
       
    def __init__(self, fileobject, length):
        self._file = fileobject
        self.remaining = length

    def read(self, num_bytes=None):
        """
        Read data from the underlying file.

        Raises InputStreamExhausted if nothing is left to read; requests for
        more than the remaining byte count are clamped to what is left.
        """
        if self.remaining <= 0:
            raise InputStreamExhausted()
        if num_bytes is None:
            num_bytes = self.remaining
        else:
            num_bytes = min(num_bytes, self.remaining)
        self.remaining -= num_bytes
        return self._file.read(num_bytes)
       
class InterBoundaryIter(object):
    """
    A Producer that will iterate over boundaries.
    """
    def __init__(self, stream, boundary):
        self._stream = stream
        self._boundary = boundary

    def __iter__(self):
        return self

    def next(self):
        try:
            return LazyStream(BoundaryIter(self._stream, self._boundary))
        except InputStreamExhausted:
            raise StopIteration()
       
class BoundaryIter(object):
    """
    A Producer that is sensitive to boundaries.

    Will happily yield bytes until a boundary is found. Will yield the bytes
    before the boundary, throw away the boundary bytes themselves, and push the
    post-boundary bytes back on the stream.

    Subsequent calls to .next() after the boundary has been located will raise
    a StopIteration exception.
    """
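    # Sketch: with boundary '--sep', a producer yielding
    # 'some data\r\n--sep\r\nrest' emits 'some data', discards the preceding
    # CRLF and the boundary itself, and ungets '\r\nrest' for the next consumer.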
       
    def __init__(self, stream, boundary):
        self._stream = stream
        self._boundary = boundary
        self._done = False
        # Rollback an additional six bytes because the format is like
        # this: CRLF<boundary>[--CRLF]
        self._rollback = len(boundary) + 6

        # Probe the stream: if not even one byte can be read, there is
        # nothing to iterate over.
        unused_char = self._stream.read(1)
        if not unused_char:
            raise InputStreamExhausted()
        self._stream.unget(unused_char)

        # Try to use mx fast string search if available. Otherwise
        # use Python find. Wrap the latter for consistency.
        try:
            from mx.TextTools import FS
            self._fs = FS(boundary).find
        except ImportError:
            self._fs = lambda data: data.find(boundary)
       
    def __iter__(self):
        return self

    def next(self):
        if self._done:
            raise StopIteration()

        stream = self._stream
        rollback = self._rollback

        bytes_read = 0
        chunks = []
        for bytes in stream:
            bytes_read += len(bytes)
            chunks.append(bytes)
            if bytes_read > rollback:
                break
            if not bytes:
                break
        else:
            self._done = True

        if not chunks:
            raise StopIteration()

        chunk = ''.join(chunks)
        boundary = self._find_boundary(chunk, len(chunk) < self._rollback)

        if boundary:
            end, next = boundary
            stream.unget(chunk[next:])
            self._done = True
            return chunk[:end]
        else:
            # Make sure we don't treat a partial boundary (and
            # its separators) as data.
            if not chunk[:-rollback]:
                # There's nothing left, we should just return and mark as done.
                self._done = True
                return chunk
            else:
                stream.unget(chunk[-rollback:])
                return chunk[:-rollback]
       
    def _find_boundary(self, data, eof=False):
        """
        Finds a multipart boundary in data.

        Should no boundary exist in the data, None is returned. Otherwise a
        tuple is returned containing the indices of:

         * the end of the current encapsulation
         * the start of the next encapsulation
        """
        index = self._fs(data)
        if index < 0:
            return None
        else:
            end = index
            next = index + len(self._boundary)
            # Backup over CRLF.
            if data[max(0, end - 1)] == '\n':
                end -= 1
            if data[max(0, end - 1)] == '\r':
                end -= 1
            return end, next
       
def exhaust(stream_or_iterable):
    """
    Completely exhausts an iterator or stream.

    Raise a MultiPartParserError if the argument is not a stream or an iterable.
    """
    iterator = None
    try:
        iterator = iter(stream_or_iterable)
    except TypeError:
        iterator = ChunkIter(stream_or_iterable, 16384)

    if iterator is None:
        raise MultiPartParserError('multipartparser.exhaust() was passed a non-iterable or stream parameter')

    for __ in iterator:
        pass
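# For example, exhaust(iter(['a', 'b'])) simply drains the iterator, while
# exhaust(some_file_like_object) wraps the object in a ChunkIter first
# (names here are illustrative).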
       
def parse_boundary_stream(stream, max_header_size):
    """
    Parses one and exactly one stream that encapsulates a boundary.
    """
    # Stream at beginning of header, look for end of header
    # and parse it if found. The header must fit within one
    # chunk.
    chunk = stream.read(max_header_size)

    # 'find' returns the start of these four bytes, so we'll
    # need to munch them later to prevent them from polluting
    # the payload.
    header_end = chunk.find('\r\n\r\n')

    def _parse_header(line):
        main_value_pair, params = parse_header(line)
        try:
            name, value = main_value_pair.split(':', 1)
        except ValueError:
            raise ValueError("Invalid header: %r" % line)
        return name, (value, params)

    if header_end == -1:
        # We find no header, so we just mark this fact and pass on
        # the stream verbatim.
        stream.unget(chunk)
        return (RAW, {}, stream)

    header = chunk[:header_end]

    # Here we place any excess chunk back onto the stream, as
    # well as throwing away the CRLFCRLF bytes from above.
    stream.unget(chunk[header_end + 4:])

    TYPE = RAW
    outdict = {}

    # Parse each header line; blank or malformed lines are skipped.
    for line in header.split('\r\n'):
        # This terminology ("main value" and "dictionary of
        # parameters") is from the Python docs.
        try:
            name, (value, params) = _parse_header(line)
        except ValueError:
            continue

        if name == 'content-disposition':
            TYPE = FIELD
            if params.get('filename'):
                TYPE = FILE

        outdict[name] = value, params

    if TYPE == RAW:
        stream.unget(chunk)

    return (TYPE, outdict, stream)
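# For example, a stream positioned at
# 'Content-Disposition: form-data; name="f"\r\n\r\nvalue' comes back as item
# type FIELD with a meta dict keyed by 'content-disposition', leaving only
# 'value' to be read from the stream.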
       
class Parser(object):
    def __init__(self, stream, boundary):
        self._stream = stream
        self._separator = '--' + boundary

    def __iter__(self):
        boundarystream = InterBoundaryIter(self._stream, self._separator)
        for sub_stream in boundarystream:
            # Iterate over each part.
            yield parse_boundary_stream(sub_stream, 1024)
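# End-to-end sketch of the pipeline above (boundary 'b' kept short for
# brevity; a real boundary comes from the Content-Type header):
#
#   from StringIO import StringIO
#   body = ('--b\r\nContent-Disposition: form-data; name="f"\r\n\r\n'
#           'value\r\n--b--\r\n')
#   stream = LazyStream(ChunkIter(StringIO(body)))
#   for item_type, meta_data, field_stream in Parser(stream, 'b'):
#       pass  # yields a RAW preamble, the FIELD part carrying 'value',
#             # then a RAW epilogue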
       
def parse_header(line):
    """Parse the header into a key and a dictionary of parameters."""
    plist = _parse_header_params(';' + line)
    key = plist.pop(0).lower()
    pdict = {}
    for p in plist:
        i = p.find('=')
        if i >= 0:
            name = p[:i].strip().lower()
            value = p[i + 1:].strip()
            if len(value) >= 2 and value[0] == value[-1] == '"':
                value = value[1:-1]
                value = value.replace('\\\\', '\\').replace('\\"', '"')
            pdict[name] = value
    return key, pdict
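# For example:
#
#   parse_header('form-data; name="upload"; filename="photo.jpg"')
#   # -> ('form-data', {'name': 'upload', 'filename': 'photo.jpg'})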
       
def _parse_header_params(s):
    plist = []
    while s[:1] == ';':
        s = s[1:]
        end = s.find(';')
        while end > 0 and s.count('"', 0, end) % 2:
            end = s.find(';', end + 1)
        if end < 0:
            end = len(s)
        f = s[:end]
        plist.append(f.strip())
        s = s[end:]
    return plist
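# For example, _parse_header_params('; a=1; b="x;y"') returns
# ['a=1', 'b="x;y"'] -- the quoted ';' does not split the parameter.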