author | Sverre Rabbelier <srabbelier@gmail.com> |
Thu, 12 Mar 2009 22:29:14 +0000 | |
changeset 1814 | 0c4fd663704b |
parent 280 | ce9b10bbdd42 |
permissions | -rw-r--r-- |
"""Open an arbitrary URL. See the following document for more info on URLs: "Names and Addresses, URIs, URLs, URNs, URCs", at http://www.w3.org/pub/WWW/Addressing/Overview.html See also the HTTP spec (from which the error codes are derived): "HTTP - Hypertext Transfer Protocol", at http://www.w3.org/pub/WWW/Protocols/ Related standards and specs: - RFC1808: the "relative URL" spec. (authoritative status) - RFC1738 - the "URL standard". (authoritative status) - RFC1630 - the "URI spec". (informational status) All code but that related to URL parsing has been removed (since it is not compatible with Google App Engine)from this fork of the original file, obtained from: http://svn.python.org/view/*checkout*/python/tags/r252/Lib/urllib.py?content-type=text%2Fplain&rev=60915 """ import string import sys from urlparse import urljoin as basejoin __all__ = ["quote", "quote_plus", "unquote", "unquote_plus", "urlencode", "splittag", "basejoin", "unwrap", "splittype", "splithost", "splituser", "splitpasswd", "splitport", "splitnport", "splitquery", "splitattr", "splitvalue", "splitgophertype",] __version__ = '1.17' # XXX This version is not always updated :-( # Utilities to parse URLs (most of these return None for missing parts): # unwrap('<URL:type://host/path>') --> 'type://host/path' # splittype('type:opaquestring') --> 'type', 'opaquestring' # splithost('//host[:port]/path') --> 'host[:port]', '/path' # splituser('user[:passwd]@host[:port]') --> 'user[:passwd]', 'host[:port]' # splitpasswd('user:passwd') -> 'user', 'passwd' # splitport('host:port') --> 'host', 'port' # splitquery('/path?query') --> '/path', 'query' # splittag('/path#tag') --> '/path', 'tag' # splitattr('/path;attr1=value1;attr2=value2;...') -> # '/path', ['attr1=value1', 'attr2=value2', ...] # splitvalue('attr=value') --> 'attr', 'value' # splitgophertype('/Xselector') --> 'X', 'selector' # unquote('abc%20def') -> 'abc def' # quote('abc def') -> 'abc%20def') try: unicode except NameError: def _is_unicode(x): return 0 else: def _is_unicode(x): return isinstance(x, unicode) def toBytes(url): """toBytes(u"URL") --> 'URL'.""" # Most URL schemes require ASCII. If that changes, the conversion # can be relaxed if _is_unicode(url): try: url = url.encode("ASCII") except UnicodeError: raise UnicodeError("URL " + repr(url) + " contains non-ASCII characters") return url def unwrap(url): """unwrap('<URL:type://host/path>') --> 'type://host/path'.""" url = url.strip() if url[:1] == '<' and url[-1:] == '>': url = url[1:-1].strip() if url[:4] == 'URL:': url = url[4:].strip() return url _typeprog = None def splittype(url): """splittype('type:opaquestring') --> 'type', 'opaquestring'.""" global _typeprog if _typeprog is None: import re _typeprog = re.compile('^([^/:]+):') match = _typeprog.match(url) if match: scheme = match.group(1) return scheme.lower(), url[len(scheme) + 1:] return None, url _hostprog = None def splithost(url): """splithost('//host[:port]/path') --> 'host[:port]', '/path'.""" global _hostprog if _hostprog is None: import re _hostprog = re.compile('^//([^/?]*)(.*)$') match = _hostprog.match(url) if match: return match.group(1, 2) return None, url _userprog = None def splituser(host): """splituser('user[:passwd]@host[:port]') --> 'user[:passwd]', 'host[:port]'.""" global _userprog if _userprog is None: import re _userprog = re.compile('^(.*)@(.*)$') match = _userprog.match(host) if match: return map(unquote, match.group(1, 2)) return None, host _passwdprog = None def splitpasswd(user): """splitpasswd('user:passwd') -> 'user', 'passwd'.""" global _passwdprog if _passwdprog is None: import re _passwdprog = re.compile('^([^:]*):(.*)$') match = _passwdprog.match(user) if match: return match.group(1, 2) return user, None # splittag('/path#tag') --> '/path', 'tag' _portprog = None def splitport(host): """splitport('host:port') --> 'host', 'port'.""" global _portprog if _portprog is None: import re _portprog = re.compile('^(.*):([0-9]+)$') match = _portprog.match(host) if match: return match.group(1, 2) return host, None _nportprog = None def splitnport(host, defport=-1): """Split host and port, returning numeric port. Return given default port if no ':' found; defaults to -1. Return numerical port if a valid number are found after ':'. Return None if ':' but not a valid number.""" global _nportprog if _nportprog is None: import re _nportprog = re.compile('^(.*):(.*)$') match = _nportprog.match(host) if match: host, port = match.group(1, 2) try: if not port: raise ValueError, "no digits" nport = int(port) except ValueError: nport = None return host, nport return host, defport _queryprog = None def splitquery(url): """splitquery('/path?query') --> '/path', 'query'.""" global _queryprog if _queryprog is None: import re _queryprog = re.compile('^(.*)\?([^?]*)$') match = _queryprog.match(url) if match: return match.group(1, 2) return url, None _tagprog = None def splittag(url): """splittag('/path#tag') --> '/path', 'tag'.""" global _tagprog if _tagprog is None: import re _tagprog = re.compile('^(.*)#([^#]*)$') match = _tagprog.match(url) if match: return match.group(1, 2) return url, None def splitattr(url): """splitattr('/path;attr1=value1;attr2=value2;...') -> '/path', ['attr1=value1', 'attr2=value2', ...].""" words = url.split(';') return words[0], words[1:] _valueprog = None def splitvalue(attr): """splitvalue('attr=value') --> 'attr', 'value'.""" global _valueprog if _valueprog is None: import re _valueprog = re.compile('^([^=]*)=(.*)$') match = _valueprog.match(attr) if match: return match.group(1, 2) return attr, None def splitgophertype(selector): """splitgophertype('/Xselector') --> 'X', 'selector'.""" if selector[:1] == '/' and selector[1:2]: return selector[1], selector[2:] return None, selector _hextochr = dict(('%02x' % i, chr(i)) for i in range(256)) _hextochr.update(('%02X' % i, chr(i)) for i in range(256)) def unquote(s): """unquote('abc%20def') -> 'abc def'.""" res = s.split('%') for i in xrange(1, len(res)): item = res[i] try: res[i] = _hextochr[item[:2]] + item[2:] except KeyError: res[i] = '%' + item except UnicodeDecodeError: res[i] = unichr(int(item[:2], 16)) + item[2:] return "".join(res) def unquote_plus(s): """unquote('%7e/abc+def') -> '~/abc def'""" s = s.replace('+', ' ') return unquote(s) always_safe = ('ABCDEFGHIJKLMNOPQRSTUVWXYZ' 'abcdefghijklmnopqrstuvwxyz' '0123456789' '_.-') _safemaps = {} def quote(s, safe = '/'): """quote('abc def') -> 'abc%20def' Each part of a URL, e.g. the path info, the query, etc., has a different set of reserved characters that must be quoted. RFC 2396 Uniform Resource Identifiers (URI): Generic Syntax lists the following reserved characters. reserved = ";" | "/" | "?" | ":" | "@" | "&" | "=" | "+" | "$" | "," Each of these characters is reserved in some component of a URL, but not necessarily in all of them. By default, the quote function is intended for quoting the path section of a URL. Thus, it will not encode '/'. This character is reserved, but in typical usage the quote function is being called on a path where the existing slash characters are used as reserved characters. """ cachekey = (safe, always_safe) try: safe_map = _safemaps[cachekey] except KeyError: safe += always_safe safe_map = {} for i in range(256): c = chr(i) safe_map[c] = (c in safe) and c or ('%%%02X' % i) _safemaps[cachekey] = safe_map res = map(safe_map.__getitem__, s) return ''.join(res) def quote_plus(s, safe = ''): """Quote the query fragment of a URL; replacing ' ' with '+'""" if ' ' in s: s = quote(s, safe + ' ') return s.replace(' ', '+') return quote(s, safe) def urlencode(query,doseq=0): """Encode a sequence of two-element tuples or dictionary into a URL query string. If any values in the query arg are sequences and doseq is true, each sequence element is converted to a separate parameter. If the query arg is a sequence of two-element tuples, the order of the parameters in the output will match the order of parameters in the input. """ if hasattr(query,"items"): # mapping objects query = query.items() else: # it's a bother at times that strings and string-like objects are # sequences... try: # non-sequence items should not work with len() # non-empty strings will fail this if len(query) and not isinstance(query[0], tuple): raise TypeError # zero-length sequences of all types will get here and succeed, # but that's a minor nit - since the original implementation # allowed empty dicts that type of behavior probably should be # preserved for consistency except TypeError: ty,va,tb = sys.exc_info() raise TypeError, "not a valid non-string sequence or mapping object", tb l = [] if not doseq: # preserve old behavior for k, v in query: k = quote_plus(str(k)) v = quote_plus(str(v)) l.append(k + '=' + v) else: for k, v in query: k = quote_plus(str(k)) if isinstance(v, str): v = quote_plus(v) l.append(k + '=' + v) elif _is_unicode(v): # is there a reasonable way to convert to ASCII? # encode generates a string, but "replace" or "ignore" # lose information and "strict" can raise UnicodeError v = quote_plus(v.encode("ASCII","replace")) l.append(k + '=' + v) else: try: # is this a sufficient test for sequence-ness? x = len(v) except TypeError: # not a sequence v = quote_plus(str(v)) l.append(k + '=' + v) else: # loop over the sequence for elt in v: l.append(k + '=' + quote_plus(str(elt))) return '&'.join(l)