app/python25src/urllib.py
changeset 280 ce9b10bbdd42
equal deleted inserted replaced
279:6314fce617c5 280:ce9b10bbdd42
       
     1 """Open an arbitrary URL.
       
     2 
       
     3 See the following document for more info on URLs:
       
     4 "Names and Addresses, URIs, URLs, URNs, URCs", at
       
     5 http://www.w3.org/pub/WWW/Addressing/Overview.html
       
     6 
       
     7 See also the HTTP spec (from which the error codes are derived):
       
     8 "HTTP - Hypertext Transfer Protocol", at
       
     9 http://www.w3.org/pub/WWW/Protocols/
       
    10 
       
    11 Related standards and specs:
       
    12 - RFC1808: the "relative URL" spec. (authoritative status)
       
    13 - RFC1738 - the "URL standard". (authoritative status)
       
    14 - RFC1630 - the "URI spec". (informational status)
       
    15 
       
    16 All code but that related to URL parsing has been removed (since it is not
       
    17 compatible with Google App Engine) from this fork of the original file,
       
    18 obtained from:
       
    19 http://svn.python.org/view/*checkout*/python/tags/r252/Lib/urllib.py?content-type=text%2Fplain&rev=60915
       
    20 """
       
    21 
       
    22 import string
       
    23 import sys
       
    24 from urlparse import urljoin as basejoin
       
    25 
       
    # Names exported by "from urllib import *".  Only the URL quoting and
    # splitting helpers survive in this fork.
    __all__ = [
        "quote",
        "quote_plus",
        "unquote",
        "unquote_plus",
        "urlencode",
        "splittag",
        "basejoin",
        "unwrap",
        "splittype",
        "splithost",
        "splituser",
        "splitpasswd",
        "splitport",
        "splitnport",
        "splitquery",
        "splitattr",
        "splitvalue",
        "splitgophertype",
    ]

    # XXX This version is not always updated :-(
    __version__ = '1.17'
       
    34 
       
    35 
       
    36 # Utilities to parse URLs (most of these return None for missing parts):
       
    37 # unwrap('<URL:type://host/path>') --> 'type://host/path'
       
    38 # splittype('type:opaquestring') --> 'type', 'opaquestring'
       
    39 # splithost('//host[:port]/path') --> 'host[:port]', '/path'
       
    40 # splituser('user[:passwd]@host[:port]') --> 'user[:passwd]', 'host[:port]'
       
    41 # splitpasswd('user:passwd') -> 'user', 'passwd'
       
    42 # splitport('host:port') --> 'host', 'port'
       
    43 # splitquery('/path?query') --> '/path', 'query'
       
    44 # splittag('/path#tag') --> '/path', 'tag'
       
    45 # splitattr('/path;attr1=value1;attr2=value2;...') ->
       
    46 #   '/path', ['attr1=value1', 'attr2=value2', ...]
       
    47 # splitvalue('attr=value') --> 'attr', 'value'
       
    48 # splitgophertype('/Xselector') --> 'X', 'selector'
       
    49 # unquote('abc%20def') -> 'abc def'
       
    50 # quote('abc def') -> 'abc%20def'
       
    51 
       
    52 try:
       
    53     unicode
       
    54 except NameError:
       
    55     def _is_unicode(x):
       
    56         return 0
       
    57 else:
       
    58     def _is_unicode(x):
       
    59         return isinstance(x, unicode)
       
    60 
       
    def toBytes(url):
        """toBytes(u"URL") --> 'URL'.

        Values that are not unicode objects are returned unchanged.  Unicode
        values are encoded as ASCII.

        Raises UnicodeError when the URL contains non-ASCII characters.
        """
        if not _is_unicode(url):
            return url
        # Most URL schemes require ASCII. If that changes, the conversion
        # can be relaxed
        try:
            return url.encode("ASCII")
        except UnicodeError:
            raise UnicodeError("URL " + repr(url) +
                               " contains non-ASCII characters")
       
    72 
       
    def unwrap(url):
        """unwrap('<URL:type://host/path>') --> 'type://host/path'.

        Strips surrounding whitespace, then one enclosing '<...>' pair if
        present, then a leading 'URL:' prefix if present.
        """
        text = url.strip()
        if text.startswith('<') and text.endswith('>'):
            # Drop the angle brackets and any padding just inside them.
            text = text[1:-1].strip()
        if text.startswith('URL:'):
            text = text[len('URL:'):].strip()
        return text
       
    80 
       
    # Compiled lazily on the first splittype() call.
    _typeprog = None
    def splittype(url):
        """splittype('type:opaquestring') --> 'type', 'opaquestring'.

        The scheme is returned in lower case.  When the URL does not start
        with a scheme (one or more characters other than '/' and ':'
        followed by ':'), returns (None, url).
        """
        global _typeprog
        if _typeprog is None:
            import re
            _typeprog = re.compile('^([^/:]+):')

        found = _typeprog.match(url)
        if found is None:
            return None, url
        # The match is anchored at 0 and ends just after the ':' separator.
        remainder = url[found.end():]
        return found.group(1).lower(), remainder
       
    94 
       
    # Compiled lazily on the first splithost() call.
    _hostprog = None
    def splithost(url):
        """splithost('//host[:port]/path') --> 'host[:port]', '/path'.

        The host part runs from the leading '//' up to the first '/' or '?'.
        When the URL does not start with '//', returns (None, url).
        """
        global _hostprog
        if _hostprog is None:
            import re
            _hostprog = re.compile('^//([^/?]*)(.*)$')

        found = _hostprog.match(url)
        if found is None:
            return None, url
        return found.group(1), found.group(2)
       
   106 
       
   # Compiled lazily on the first splituser() call.
   _userprog = None
   def splituser(host):
       """splituser('user[:passwd]@host[:port]') --> 'user[:passwd]', 'host[:port]'.

       The split happens at the last '@'.  Both parts are passed through
       unquote() and returned as a two-element list.  When there is no '@',
       returns the tuple (None, host) with host left untouched.
       """
       global _userprog
       if _userprog is None:
           import re
           _userprog = re.compile('^(.*)@(.*)$')

       found = _userprog.match(host)
       if found is None:
           return None, host
       return [unquote(part) for part in found.group(1, 2)]
       
   118 
       
   # Compiled lazily on the first splitpasswd() call.
   _passwdprog = None
   def splitpasswd(user):
       """splitpasswd('user:passwd') -> 'user', 'passwd'.

       The split happens at the first ':'.  When there is no ':', returns
       (user, None).
       """
       global _passwdprog
       if _passwdprog is None:
           import re
           _passwdprog = re.compile('^([^:]*):(.*)$')

       found = _passwdprog.match(user)
       if found is None:
           return user, None
       return found.group(1), found.group(2)
       
   130 
       
   131 # splitport('host:port') --> 'host', 'port'
       
   # Compiled lazily on the first splitport() call.
   _portprog = None
   def splitport(host):
       """splitport('host:port') --> 'host', 'port'.

       The port is returned as a string and must consist of one or more
       decimal digits following the last ':'.  Otherwise returns
       (host, None).
       """
       global _portprog
       if _portprog is None:
           import re
           _portprog = re.compile('^(.*):([0-9]+)$')

       found = _portprog.match(host)
       if found is None:
           return host, None
       return found.group(1), found.group(2)
       
   143 
       
   # Compiled lazily on the first splitnport() call.
   _nportprog = None
   def splitnport(host, defport=-1):
       """Split host and port, returning numeric port.
       Return given default port if no ':' found; defaults to -1.
       Return numerical port if a valid number is found after ':'.
       Return None if ':' but not a valid number."""
       global _nportprog
       if _nportprog is None:
           import re
           _nportprog = re.compile('^(.*):(.*)$')

       found = _nportprog.match(host)
       if found is None:
           return host, defport

       hostname, port_text = found.group(1, 2)
       number = None
       if port_text:
           # An empty port stays None; so does anything int() rejects.
           try:
               number = int(port_text)
           except ValueError:
               number = None
       return hostname, number
       
   165 
       
   # Compiled lazily on the first splitquery() call.
   _queryprog = None
   def splitquery(url):
       """splitquery('/path?query') --> '/path', 'query'.

       The split happens at the last '?'.  When there is no '?', returns
       (url, None).
       """
       global _queryprog
       if _queryprog is None:
           import re
           # Raw string: the backslash belongs to the regex (a literal '?'),
           # not to Python's string-escape processing.
           _queryprog = re.compile(r'^(.*)\?([^?]*)$')

       match = _queryprog.match(url)
       if match is None:
           return url, None
       return match.group(1, 2)
       
   177 
       
   # Compiled lazily on the first splittag() call.
   _tagprog = None
   def splittag(url):
       """splittag('/path#tag') --> '/path', 'tag'.

       The split happens at the last '#'.  When there is no '#', returns
       (url, None).
       """
       global _tagprog
       if _tagprog is None:
           import re
           _tagprog = re.compile('^(.*)#([^#]*)$')

       found = _tagprog.match(url)
       if found is None:
           return url, None
       return found.group(1), found.group(2)
       
   189 
       
   def splitattr(url):
       """splitattr('/path;attr1=value1;attr2=value2;...') ->
           '/path', ['attr1=value1', 'attr2=value2', ...].

       The attribute list is empty when the URL contains no ';'.
       """
       pieces = url.split(';')
       # split() always yields at least one element: the path itself.
       path = pieces.pop(0)
       return path, pieces
       
   195 
       
   # Compiled lazily on the first splitvalue() call.
   _valueprog = None
   def splitvalue(attr):
       """splitvalue('attr=value') --> 'attr', 'value'.

       The split happens at the first '='.  When there is no '=', returns
       (attr, None).
       """
       global _valueprog
       if _valueprog is None:
           import re
           _valueprog = re.compile('^([^=]*)=(.*)$')

       found = _valueprog.match(attr)
       if found is None:
           return attr, None
       return found.group(1), found.group(2)
       
   207 
       
   def splitgophertype(selector):
       """splitgophertype('/Xselector') --> 'X', 'selector'.

       The type is the single character following a leading '/'.  When the
       selector does not start with '/' or has nothing after it, returns
       (None, selector).
       """
       has_type = len(selector) >= 2 and selector[0] == '/'
       if not has_type:
           return None, selector
       return selector[1], selector[2:]
       
   213 
       
   # Lookup table from every two-hex-digit spelling ('7e', '7E', 'aB', ...)
   # to the character it encodes.  All case combinations are included so
   # that mixed-case escapes decode too.
   _hexdig = '0123456789ABCDEFabcdef'
   _hextochr = dict((a + b, chr(int(a + b, 16)))
                    for a in _hexdig for b in _hexdig)

   def unquote(s):
       """unquote('abc%20def') -> 'abc def'.

       Every '%XX' escape, where XX is two hexadecimal digits in any mix of
       upper and lower case, is replaced by the character it encodes.  A '%'
       that is not followed by two hex digits is left in place.
       """
       pieces = s.split('%')
       # Text before the first '%' needs no decoding.
       decoded = [pieces[0]]
       for item in pieces[1:]:
           try:
               decoded.append(_hextochr[item[:2]] + item[2:])
           except KeyError:
               # Not a valid escape: restore the '%' removed by split().
               decoded.append('%' + item)
           except UnicodeDecodeError:
               # Concatenating a non-ASCII byte string with unicode text
               # failed; use the equivalent unicode character instead.
               decoded.append(unichr(int(item[:2], 16)) + item[2:])
       return "".join(decoded)
       
   229 
       
   def unquote_plus(s):
       """unquote_plus('%7e/abc+def') -> '~/abc def'

       Like unquote(), but every '+' is turned into a space first.
       """
       return unquote(s.replace('+', ' '))
       
   234 
       
   # Characters that quote() never escapes, whatever 'safe' is.
   always_safe = ('ABCDEFGHIJKLMNOPQRSTUVWXYZ'
                  'abcdefghijklmnopqrstuvwxyz'
                  '0123456789' '_.-')
   # Cache of character -> replacement tables, keyed by (safe, always_safe).
   _safemaps = {}

   def quote(s, safe = '/'):
       """quote('abc def') -> 'abc%20def'

       Each part of a URL, e.g. the path info, the query, etc., has a
       different set of reserved characters that must be quoted.

       RFC 2396 Uniform Resource Identifiers (URI): Generic Syntax lists
       the following reserved characters.

       reserved    = ";" | "/" | "?" | ":" | "@" | "&" | "=" | "+" |
                     "$" | ","

       Each of these characters is reserved in some component of a URL,
       but not necessarily in all of them.

       By default, the quote function is intended for quoting the path
       section of a URL.  Thus, it will not encode '/'.  This character
       is reserved, but in typical usage the quote function is being
       called on a path where the existing slash characters are used as
       reserved characters.

       Characters in 'safe' and in always_safe are copied unchanged; every
       other character with an ordinal below 256 becomes '%XX' with
       upper-case hex digits.
       """
       cachekey = (safe, always_safe)
       table = _safemaps.get(cachekey)
       if table is None:
           # First call with this 'safe' value: build and remember the table.
           unescaped = safe + always_safe
           table = {}
           for code in range(256):
               ch = chr(code)
               if ch in unescaped:
                   table[ch] = ch
               else:
                   table[ch] = '%%%02X' % code
           _safemaps[cachekey] = table
       return ''.join([table[ch] for ch in s])
       
   273 
       
   def quote_plus(s, safe = ''):
       """Quote the query fragment of a URL; replacing ' ' with '+'

       Works like quote(), with an empty default 'safe' set, except that
       spaces come out as '+' instead of '%20'.
       """
       if ' ' not in s:
           return quote(s, safe)
       # Keep spaces unescaped during quoting, then swap them for '+'.
       return quote(s, safe + ' ').replace(' ', '+')
       
   280 
       
   def urlencode(query,doseq=0):
       """Encode a sequence of two-element tuples or dictionary into a URL query string.

       If any values in the query arg are sequences and doseq is true, each
       sequence element is converted to a separate parameter.

       If the query arg is a sequence of two-element tuples, the order of the
       parameters in the output will match the order of parameters in the
       input.

       Keys and values are converted with str() and escaped with
       quote_plus(); the resulting 'key=value' pairs are joined with '&'.

       Raises TypeError when query is neither a mapping nor a sequence whose
       first element is a tuple.
       """

       if hasattr(query,"items"):
           # mapping objects
           query = query.items()
       else:
           # it's a bother at times that strings and string-like objects are
           # sequences...
           try:
               # non-sequence items should not work with len()
               # non-empty strings will fail the tuple check
               invalid = len(query) and not isinstance(query[0], tuple)
               # zero-length sequences of all types will get here and succeed,
               # but that's a minor nit - since the original implementation
               # allowed empty dicts that type of behavior probably should be
               # preserved for consistency
           except TypeError:
               invalid = True
           if invalid:
               raise TypeError("not a valid non-string sequence or mapping object")

       pairs = []
       if not doseq:
           # preserve old behavior: every value is stringified as a whole
           for k, v in query:
               pairs.append(quote_plus(str(k)) + '=' + quote_plus(str(v)))
           return '&'.join(pairs)

       for k, v in query:
           k = quote_plus(str(k))
           if isinstance(v, str):
               pairs.append(k + '=' + quote_plus(v))
           elif _is_unicode(v):
               # is there a reasonable way to convert to ASCII?
               # encode generates a string, but "replace" or "ignore"
               # lose information and "strict" can raise UnicodeError
               pairs.append(k + '=' + quote_plus(v.encode("ASCII","replace")))
           else:
               try:
                   # is this a sufficient test for sequence-ness?
                   len(v)
               except TypeError:
                   # not a sequence
                   pairs.append(k + '=' + quote_plus(str(v)))
               else:
                   # loop over the sequence, one parameter per element
                   for elt in v:
                       pairs.append(k + '=' + quote_plus(str(elt)))
       return '&'.join(pairs)
       
   342     return '&'.join(l)