"""
Multi-part parsing for file uploads.

Exposes one class, ``MultiPartParser``, which feeds chunks of uploaded data to
file upload handlers for processing.
"""
import cgi
from django.conf import settings
from django.core.exceptions import SuspiciousOperation
from django.utils.datastructures import MultiValueDict
from django.utils.encoding import force_unicode
from django.utils.text import unescape_entities
from django.core.files.uploadhandler import StopUpload, SkipFile, StopFutureHandlers

__all__ = ('MultiPartParser', 'MultiPartParserError', 'InputStreamExhausted')

class MultiPartParserError(Exception):
    pass

class InputStreamExhausted(Exception):
    """
    No more reads are allowed from this device.
    """
    pass

RAW = "raw"
FILE = "file"
FIELD = "field"

class MultiPartParser(object):
    """
    An RFC 2388 multipart/form-data parser.

    ``MultiPartParser.parse()`` reads the input stream in ``chunk_size``
    chunks and returns a tuple of ``(MultiValueDict(POST),
    MultiValueDict(FILES))``. Uploaded file data is fed to the supplied
    upload handlers as it is read.
    """
    def __init__(self, META, input_data, upload_handlers, encoding=None):
        """
        Initialize the MultiPartParser object.

        :META:
            The standard ``META`` dictionary in Django request objects.
        :input_data:
            The raw post data, as a bytestring.
        :upload_handlers:
            A list of upload handler instances that perform operations on
            the uploaded data.
        :encoding:
            The encoding with which to treat the incoming data.
        """
        #
        # Content-Type should contain multipart and the boundary information.
        #
        content_type = META.get('HTTP_CONTENT_TYPE', META.get('CONTENT_TYPE', ''))
        if not content_type.startswith('multipart/'):
            raise MultiPartParserError('Invalid Content-Type: %s' % content_type)

        # Parse the header to get the boundary to split the parts.
        ctypes, opts = parse_header(content_type)
        boundary = opts.get('boundary')
        if not boundary or not cgi.valid_boundary(boundary):
            raise MultiPartParserError('Invalid boundary in multipart: %s' % boundary)

        #
        # Content-Length should contain the length of the body we are about
        # to receive.
        #
        try:
            content_length = int(META.get('HTTP_CONTENT_LENGTH', META.get('CONTENT_LENGTH', 0)))
        except (ValueError, TypeError):
            # Set it to 0 for now; the check below will reject it.
            content_length = 0

        if content_length <= 0:
            # This means we shouldn't continue...raise an error.
            raise MultiPartParserError("Invalid content length: %r" % content_length)

        self._boundary = boundary
        self._input_data = input_data

        # For compatibility with low-level network APIs (with 32-bit integers),
        # the chunk size should be < 2^31, but still divisible by 4.
        self._chunk_size = min(2**31 - 4, *[x.chunk_size for x in upload_handlers if x.chunk_size])

        self._meta = META
        self._encoding = encoding or settings.DEFAULT_CHARSET
        self._content_length = content_length
        self._upload_handlers = upload_handlers

    def parse(self):
        """
        Parse the POST data and break it into a FILES MultiValueDict and a
        POST MultiValueDict.

        Returns a tuple containing the POST and FILES dictionary,
        respectively.
        """
        # We have to import QueryDict down here to avoid a circular import.
        from django.http import QueryDict

        encoding = self._encoding
        handlers = self._upload_handlers

        limited_input_data = LimitBytes(self._input_data, self._content_length)

        # See if the handler will want to take care of the parsing.
        # This allows overriding everything if somebody wants it.
        for handler in handlers:
            result = handler.handle_raw_input(limited_input_data,
                                              self._meta,
                                              self._content_length,
                                              self._boundary,
                                              encoding)
            if result is not None:
                return result[0], result[1]

        # Create the data structures to be used later.
        self._post = QueryDict('', mutable=True)
        self._files = MultiValueDict()

        # Instantiate the parser and stream:
        stream = LazyStream(ChunkIter(limited_input_data, self._chunk_size))

        # Whether or not to signal a file-completion at the beginning of the loop.
        old_field_name = None
        counters = [0] * len(handlers)

        try:
            for item_type, meta_data, field_stream in Parser(stream, self._boundary):
                if old_field_name:
                    # We run this at the beginning of the next loop
                    # since we cannot be sure a file is complete until
                    # we hit the next boundary/part of the multipart content.
                    self.handle_file_complete(old_field_name, counters)
                    old_field_name = None

                try:
                    disposition = meta_data['content-disposition'][1]
                    field_name = disposition['name'].strip()
                except (KeyError, IndexError, AttributeError):
                    continue

                transfer_encoding = meta_data.get('content-transfer-encoding')
                field_name = force_unicode(field_name, encoding, errors='replace')

                if item_type == FIELD:
                    # This is a post field; we can just set it in the POST data.
                    if transfer_encoding == 'base64':
                        raw_data = field_stream.read()
                        try:
                            data = str(raw_data).decode('base64')
                        except Exception:
                            data = raw_data
                    else:
                        data = field_stream.read()

                    self._post.appendlist(field_name,
                                          force_unicode(data, encoding, errors='replace'))
                elif item_type == FILE:
                    # This is a file, use the handler...
                    file_name = disposition.get('filename')
                    if not file_name:
                        continue
                    file_name = force_unicode(file_name, encoding, errors='replace')
                    file_name = self.IE_sanitize(unescape_entities(file_name))

                    content_type = meta_data.get('content-type', ('',))[0].strip()
                    try:
                        charset = meta_data.get('content-type', (0, {}))[1].get('charset', None)
                    except Exception:
                        charset = None

                    try:
                        content_length = int(meta_data.get('content-length')[0])
                    except (IndexError, TypeError, ValueError):
                        content_length = None

                    counters = [0] * len(handlers)
                    try:
                        for handler in handlers:
                            try:
                                handler.new_file(field_name, file_name,
                                                 content_type, content_length,
                                                 charset)
                            except StopFutureHandlers:
                                break

                        for chunk in field_stream:
                            if transfer_encoding == 'base64':
                                # We only special-case base64 transfer encoding.
                                try:
                                    chunk = str(chunk).decode('base64')
                                except Exception, e:
                                    # Since this is only a chunk, any error is an unfixable error.
                                    raise MultiPartParserError("Could not decode base64 data: %r" % e)

                            for i, handler in enumerate(handlers):
                                chunk_length = len(chunk)
                                chunk = handler.receive_data_chunk(chunk, counters[i])
                                counters[i] += chunk_length
                                if chunk is None:
                                    # If the chunk received by the handler is None, then don't continue.
                                    break

                    except SkipFile, e:
                        # Just use up the rest of this file...
                        exhaust(field_stream)
                    else:
                        # Handle file upload completions on next iteration.
                        old_field_name = field_name
                else:
                    # If this is neither a FIELD nor a FILE, just exhaust the stream.
                    exhaust(stream)
        except StopUpload, e:
            if not e.connection_reset:
                exhaust(limited_input_data)
        else:
            # Make sure that the request data is all fed.
            exhaust(limited_input_data)

        # Signal that the upload has completed.
        for handler in handlers:
            retval = handler.upload_complete()
            if retval:
                break

        return self._post, self._files

    def handle_file_complete(self, old_field_name, counters):
        """
        Handle all the signalling that takes place when a file is complete.
        """
        for i, handler in enumerate(self._upload_handlers):
            file_obj = handler.file_complete(counters[i])
            if file_obj:
                # If it returns a file object, then set the files dict.
                self._files.appendlist(force_unicode(old_field_name,
                                                     self._encoding,
                                                     errors='replace'),
                                       file_obj)
                break

    def IE_sanitize(self, filename):
        """Clean up filenames that Internet Explorer sends as full paths."""
        return filename and filename[filename.rfind("\\")+1:].strip()
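    # Illustrative sketch: IE_sanitize(u'C:\\temp\\foo.txt') returns u'foo.txt'.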

class LazyStream(object):
    """
    The LazyStream wrapper allows one to get and "unget" bytes from a stream.

    Given a producer object (an iterator that yields bytestrings), the
    LazyStream object will support iteration, reading, and keeping a
    "look-back" variable in case you need to "unget" some bytes.
    """
    def __init__(self, producer, length=None):
        """
        Every LazyStream must have a producer when instantiated.

        A producer is an iterator that returns a bytestring on each call to
        its ``next()`` method.
        """
        self._producer = producer
        self._empty = False
        self._leftover = ''
        self.length = length
        self.position = 0
        self._remaining = length
        self._unget_history = []

    def tell(self):
        return self.position

    def read(self, size=None):
        def parts():
            remaining = (size is not None and [size] or [self._remaining])[0]
            # do the whole thing in one shot if no limit was provided.
            if remaining is None:
                yield ''.join(self)
                return

            # otherwise do some bookkeeping to return exactly enough
            # of the stream, stashing any extra content we get from
            # the producer.
            while remaining != 0:
                assert remaining > 0, 'remaining bytes to read should never go negative'

                chunk = self.next()

                emitting = chunk[:remaining]
                self.unget(chunk[remaining:])
                remaining -= len(emitting)
                yield emitting

        out = ''.join(parts())
        return out

    def next(self):
        """
        Used when the exact number of bytes to read is unimportant.

        This procedure just returns whatever chunk is conveniently returned
        from the iterator. Useful to avoid unnecessary bookkeeping if
        performance is an issue.
        """
        if self._leftover:
            output = self._leftover
            self._leftover = ''
        else:
            output = self._producer.next()
            self._unget_history = []
        self.position += len(output)
        return output

    def close(self):
        """
        Used to invalidate/disable this lazy stream.

        Replaces the producer with an empty list. Any leftover bytes that have
        already been read will still be reported upon read() and/or next().
        """
        self._producer = []

    def __iter__(self):
        return self

    def unget(self, bytes):
        """
        Places bytes back onto the front of the lazy stream.

        Future calls to read() will return those bytes first. The
        stream position and thus tell() will be rewound.
        """
        if not bytes:
            return
        self._update_unget_history(len(bytes))
        self.position -= len(bytes)
        self._leftover = ''.join([bytes, self._leftover])

    def _update_unget_history(self, num_bytes):
        """
        Updates the unget history as a sanity check to see if we've pushed
        back the same number of bytes in one chunk. If we keep ungetting the
        same number of bytes many times (here, 50), we're most likely in an
        infinite loop of some sort. This is usually caused by a
        maliciously-malformed MIME request.
        """
        self._unget_history = [num_bytes] + self._unget_history[:49]
        number_equal = len([current_number for current_number in self._unget_history
                            if current_number == num_bytes])

        if number_equal > 40:
            raise SuspiciousOperation(
                "The multipart parser got stuck, which shouldn't happen with"
                " normal uploaded files. Check for malicious upload activity;"
                " if there is none, report this to the Django developers."
            )

class ChunkIter(object):
    """
    An iterable that will yield chunks of data. Given a file-like object as
    the constructor argument, this object will yield chunks of read
    operations from that object.
    """
    def __init__(self, flo, chunk_size=64 * 1024):
        self.flo = flo
        self.chunk_size = chunk_size

    def next(self):
        try:
            data = self.flo.read(self.chunk_size)
        except InputStreamExhausted:
            raise StopIteration()
        if data:
            return data
        else:
            raise StopIteration()

    def __iter__(self):
        return self

class LimitBytes(object):
    """Limit bytes for a file object."""
    def __init__(self, fileobject, length):
        self._file = fileobject
        self.remaining = length

    def read(self, num_bytes=None):
        """
        Read data from the underlying file.

        If there is nothing left to read, raise ``InputStreamExhausted``;
        reads that would go past the limit are truncated to it.
        """
        if self.remaining <= 0:
            raise InputStreamExhausted()
        if num_bytes is None:
            num_bytes = self.remaining
        else:
            num_bytes = min(num_bytes, self.remaining)
        self.remaining -= num_bytes
        return self._file.read(num_bytes)

class InterBoundaryIter(object):
    """
    A Producer that will iterate over boundaries.
    """
    def __init__(self, stream, boundary):
        self._stream = stream
        self._boundary = boundary

    def __iter__(self):
        return self

    def next(self):
        try:
            return LazyStream(BoundaryIter(self._stream, self._boundary))
        except InputStreamExhausted:
            raise StopIteration()

class BoundaryIter(object):
    """
    A Producer that is sensitive to boundaries.

    Will happily yield bytes until a boundary is found. Will yield the bytes
    before the boundary, throw away the boundary bytes themselves, and push
    the post-boundary bytes back on the stream.

    Further calls to .next() after the boundary has been located will raise
    StopIteration.
    """

    def __init__(self, stream, boundary):
        self._stream = stream
        self._boundary = boundary
        self._done = False
        # rollback an additional six bytes because the format is like
        # this: CRLF<boundary>[--CRLF]
        self._rollback = len(boundary) + 6

        # Probe the stream; if nothing can be read, the input is exhausted.
        unused_char = self._stream.read(1)
        if not unused_char:
            raise InputStreamExhausted()
        self._stream.unget(unused_char)

        # Try to use mx fast string search if available. Otherwise
        # use Python find. Wrap the latter for consistency.
        try:
            from mx.TextTools import FS
            self._fs = FS(boundary).find
        except ImportError:
            self._fs = lambda data: data.find(boundary)

    def __iter__(self):
        return self

    def next(self):
        if self._done:
            raise StopIteration()

        stream = self._stream
        rollback = self._rollback

        bytes_read = 0
        chunks = []
        for bytes in stream:
            bytes_read += len(bytes)
            chunks.append(bytes)
            if bytes_read > rollback:
                break
            if not bytes:
                break
        else:
            self._done = True

        if not chunks:
            raise StopIteration()

        chunk = ''.join(chunks)
        boundary = self._find_boundary(chunk, len(chunk) < self._rollback)

        if boundary:
            end, next = boundary
            stream.unget(chunk[next:])
            self._done = True
            return chunk[:end]
        else:
            # make sure we don't treat a partial boundary (and
            # its separators) as data
            if not chunk[:-rollback]:
                # There's nothing left, we should just return and mark as done.
                self._done = True
                return chunk
            else:
                stream.unget(chunk[-rollback:])
                return chunk[:-rollback]

    def _find_boundary(self, data, eof=False):
        """
        Finds a multipart boundary in data.

        Should no boundary exist in the data, None is returned. Otherwise a
        tuple containing the following indices is returned:

         * the end of current encapsulation
         * the start of the next encapsulation
        """
        index = self._fs(data)
        if index < 0:
            return None
        else:
            end = index
            next = index + len(self._boundary)
            # backup over CRLF
            if data[max(0, end-1)] == '\n':
                end -= 1
            if data[max(0, end-1)] == '\r':
                end -= 1
            return end, next

def exhaust(stream_or_iterable):
    """
    Completely exhausts an iterator or stream.

    Raise a MultiPartParserError if the argument is not a stream or an
    iterable.
    """
    iterator = None
    try:
        iterator = iter(stream_or_iterable)
    except TypeError:
        iterator = ChunkIter(stream_or_iterable, 16384)

    if iterator is None:
        raise MultiPartParserError('multipartparser.exhaust() was passed a non-iterable or stream parameter')

    for __ in iterator:
        pass
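
# Illustrative sketch: exhaust(iter(['a', 'b'])) consumes and discards both
# chunks; an argument that is not iterable is wrapped in a ChunkIter so its
# read() method can be drained instead.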

def parse_boundary_stream(stream, max_header_size):
    """
    Parses one and exactly one stream that encapsulates a boundary.
    """
    # Stream at beginning of header, look for end of header
    # and parse it if found. The header must fit within one
    # chunk.
    chunk = stream.read(max_header_size)

    # 'find' returns the top of these four bytes, so we'll
    # need to munch them later to prevent them from polluting
    # the payload.
    header_end = chunk.find('\r\n\r\n')

    def _parse_header(line):
        main_value_pair, params = parse_header(line)
        try:
            name, value = main_value_pair.split(':', 1)
        except ValueError:
            raise ValueError("Invalid header: %r" % line)
        return name, (value, params)

    if header_end == -1:
        # we find no header, so we just mark this fact and pass on
        # the stream verbatim
        stream.unget(chunk)
        return (RAW, {}, stream)

    header = chunk[:header_end]

    # here we place any excess chunk back onto the stream, as
    # well as throwing away the CRLFCRLF bytes from above.
    stream.unget(chunk[header_end + 4:])

    TYPE = RAW
    outdict = {}

    # Blank lines and unparsable headers are simply skipped.
    for line in header.split('\r\n'):
        # This terminology ("main value" and "dictionary of
        # parameters") is from the Python docs.
        try:
            name, (value, params) = _parse_header(line)
        except ValueError:
            continue

        if name == 'content-disposition':
            TYPE = FIELD
            if params.get('filename'):
                TYPE = FILE

        outdict[name] = value, params

    if TYPE == RAW:
        stream.unget(chunk)

    return (TYPE, outdict, stream)
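
# Illustrative sketch: a stream whose header section parses as a form-data
# disposition without a filename comes back as a FIELD item.
#
#     stream = LazyStream(iter(['content-disposition: form-data; name="x"\r\n\r\nvalue']))
#     item_type, meta, rest = parse_boundary_stream(stream, 1024)
#     item_type      # FIELD
#     rest.read()    # 'value'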

class Parser(object):
    def __init__(self, stream, boundary):
        self._stream = stream
        self._separator = '--' + boundary

    def __iter__(self):
        boundarystream = InterBoundaryIter(self._stream, self._separator)
        for sub_stream in boundarystream:
            # Iterate over each part
            yield parse_boundary_stream(sub_stream, 1024)
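
# Illustrative sketch of the full pipeline on a tiny multipart body; the
# preamble and epilogue around the boundaries surface as RAW items, which
# MultiPartParser.parse() simply exhausts.
#
#     body = ('--sep\r\n'
#             'content-disposition: form-data; name="x"\r\n\r\n'
#             'hi\r\n'
#             '--sep--\r\n')
#     for item_type, meta_data, field_stream in Parser(LazyStream(iter([body])), 'sep'):
#         print item_type   # RAW, FIELD, RAW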

def parse_header(line):
    """Parse the header into a key-value pair."""
    plist = _parse_header_params(';' + line)
    key = plist.pop(0).lower()
    pdict = {}
    for p in plist:
        i = p.find('=')
        if i >= 0:
            name = p[:i].strip().lower()
            value = p[i+1:].strip()
            if len(value) >= 2 and value[0] == value[-1] == '"':
                value = value[1:-1]
                value = value.replace('\\\\', '\\').replace('\\"', '"')
            pdict[name] = value
    return key, pdict
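
# Illustrative sketch:
#
#     parse_header('form-data; name="file"; filename="test.txt"')
#     # ('form-data', {'name': 'file', 'filename': 'test.txt'})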

def _parse_header_params(s):
    plist = []
    while s[:1] == ';':
        s = s[1:]
        end = s.find(';')
        while end > 0 and s.count('"', 0, end) % 2:
            end = s.find(';', end + 1)
        if end < 0:
            end = len(s)
        f = s[:end]
        plist.append(f.strip())
        s = s[end:]
    return plist