|
1 """Open an arbitrary URL. |
|
2 |
|
3 See the following document for more info on URLs: |
|
4 "Names and Addresses, URIs, URLs, URNs, URCs", at |
|
5 http://www.w3.org/pub/WWW/Addressing/Overview.html |
|
6 |
|
7 See also the HTTP spec (from which the error codes are derived): |
|
8 "HTTP - Hypertext Transfer Protocol", at |
|
9 http://www.w3.org/pub/WWW/Protocols/ |
|
10 |
|
11 Related standards and specs: |
|
12 - RFC1808: the "relative URL" spec. (authoritative status) |
|
13 - RFC1738 - the "URL standard". (authoritative status) |
|
14 - RFC1630 - the "URI spec". (informational status) |
|
15 |
|
16 All code but that related to URL parsing has been removed (since it is not |
|
17 compatible with Google App Engine) from this fork of the original file, |
|
18 obtained from: |
|
19 http://svn.python.org/view/*checkout*/python/tags/r252/Lib/urllib.py?content-type=text%2Fplain&rev=60915 |
|
20 """ |
|
21 |
|
22 import string |
|
23 import sys |
|
24 from urlparse import urljoin as basejoin |
|
25 |
|
# Public names exported by a star-import of this module.
__all__ = [
    "quote", "quote_plus", "unquote", "unquote_plus", "urlencode",
    "splittag",
    "basejoin", "unwrap",
    "splittype", "splithost", "splituser", "splitpasswd",
    "splitport", "splitnport", "splitquery", "splitattr", "splitvalue",
    "splitgophertype",
]

__version__ = '1.17'    # XXX This version is not always updated :-(
|
34 |
|
35 |
|
36 # Utilities to parse URLs (most of these return None for missing parts): |
|
37 # unwrap('<URL:type://host/path>') --> 'type://host/path' |
|
38 # splittype('type:opaquestring') --> 'type', 'opaquestring' |
|
39 # splithost('//host[:port]/path') --> 'host[:port]', '/path' |
|
40 # splituser('user[:passwd]@host[:port]') --> 'user[:passwd]', 'host[:port]' |
|
41 # splitpasswd('user:passwd') -> 'user', 'passwd' |
|
42 # splitport('host:port') --> 'host', 'port' |
|
43 # splitquery('/path?query') --> '/path', 'query' |
|
44 # splittag('/path#tag') --> '/path', 'tag' |
|
45 # splitattr('/path;attr1=value1;attr2=value2;...') -> |
|
46 # '/path', ['attr1=value1', 'attr2=value2', ...] |
|
47 # splitvalue('attr=value') --> 'attr', 'value' |
|
48 # splitgophertype('/Xselector') --> 'X', 'selector' |
|
49 # unquote('abc%20def') -> 'abc def' |
|
50 # quote('abc def') -> 'abc%20def' |
|
51 |
|
# Pick an implementation of _is_unicode() depending on whether this
# interpreter defines the ``unicode`` builtin.
try:
    unicode
except NameError:
    def _is_unicode(x):
        """Always return 0: there is no ``unicode`` type to test against."""
        return 0
else:
    def _is_unicode(x):
        """Return whether *x* is an instance of ``unicode``."""
        return isinstance(x, unicode)

def toBytes(url):
    """toBytes(u"URL") --> 'URL'.

    Values that are not unicode are returned unchanged.  Unicode values
    are encoded as ASCII; a UnicodeError naming the offending URL is
    raised when that is not possible.
    """
    # Most URL schemes require ASCII. If that changes, the conversion
    # can be relaxed
    if not _is_unicode(url):
        return url
    try:
        return url.encode("ASCII")
    except UnicodeError:
        raise UnicodeError("URL " + repr(url) +
                           " contains non-ASCII characters")
|
72 |
|
def unwrap(url):
    """unwrap('<URL:type://host/path>') --> 'type://host/path'.

    Surrounding whitespace, an enclosing '<...>' pair and a leading
    'URL:' marker are each removed when present.
    """
    url = url.strip()
    # Only strip the angle brackets when both of them are there.
    if (url[:1], url[-1:]) == ('<', '>'):
        url = url[1:-1].strip()
    if url[:4] == 'URL:':
        url = url[4:].strip()
    return url
|
80 |
|
_typeprog = None
def splittype(url):
    """splittype('type:opaquestring') --> 'type', 'opaquestring'.

    The scheme is returned lower-cased.  When *url* does not start with
    a 'scheme:' prefix the result is (None, url).
    """
    global _typeprog
    if _typeprog is None:
        # Compiled on first use and cached in the module-level global.
        import re
        _typeprog = re.compile('^([^/:]+):')

    found = _typeprog.match(url)
    if found is None:
        return None, url
    scheme = found.group(1)
    # The pattern is anchored at the start and ends just past the colon,
    # so the end of the match is where the remainder begins.
    return scheme.lower(), url[found.end():]
|
94 |
|
_hostprog = None
def splithost(url):
    """splithost('//host[:port]/path') --> 'host[:port]', '/path'.

    The host part ends at the first '/', '?' or '#'.  Stopping at '#'
    keeps a fragment from being absorbed into the host, e.g.
    '//example.net#@evil.com/x' yields the host 'example.net'.

    When *url* does not start with '//' the result is (None, url).
    """
    global _hostprog
    if _hostprog is None:
        # Compiled on first use and cached in the module-level global.
        import re
        _hostprog = re.compile('^//([^/#?]*)(.*)$')

    match = _hostprog.match(url)
    if match:
        return match.group(1, 2)
    return None, url
|
106 |
|
_userprog = None
def splituser(host):
    """splituser('user[:passwd]@host[:port]') --> 'user[:passwd]', 'host[:port]'.

    The split happens at the last '@'.  Both parts are passed through
    unquote() and returned as a 2-tuple, like the other split*()
    helpers.  When there is no '@' the result is (None, host), with
    *host* left untouched.
    """
    global _userprog
    if _userprog is None:
        # Compiled on first use and cached in the module-level global.
        import re
        _userprog = re.compile('^(.*)@(.*)$')

    match = _userprog.match(host)
    if match:
        user, hostport = match.group(1, 2)
        # Return a real tuple rather than the result of map(), which is
        # a list on Python 2 and a one-shot iterator on Python 3.
        return unquote(user), unquote(hostport)
    return None, host
|
118 |
|
_passwdprog = None
def splitpasswd(user):
    """splitpasswd('user:passwd') -> 'user', 'passwd'.

    The split happens at the first ':'.  Without a ':' the result is
    (user, None).
    """
    global _passwdprog
    if _passwdprog is None:
        # Compiled on first use and cached in the module-level global.
        import re
        _passwdprog = re.compile('^([^:]*):(.*)$')

    found = _passwdprog.match(user)
    if found is None:
        return user, None
    return found.group(1, 2)
|
130 |
|
# splitport('host:port') --> 'host', 'port'
_portprog = None
def splitport(host):
    """splitport('host:port') --> 'host', 'port'.

    The port is returned as a string of digits.  When *host* does not
    end in ':' followed by at least one digit, the result is
    (host, None).
    """
    global _portprog
    if _portprog is None:
        # Compiled on first use and cached in the module-level global.
        import re
        _portprog = re.compile('^(.*):([0-9]+)$')

    found = _portprog.match(host)
    if found is None:
        return host, None
    return found.group(1, 2)
|
143 |
|
_nportprog = None
def splitnport(host, defport=-1):
    """Split host and port, returning a numeric port.

    Return the given default port if no ':' is found; it defaults to -1.
    Return the numerical port if a valid number is found after the
    last ':'.
    Return None as the port if there is a ':' but no valid number
    after it (this includes an empty port, as in 'host:').
    """
    global _nportprog
    if _nportprog is None:
        # Compiled on first use and cached in the module-level global.
        import re
        _nportprog = re.compile('^(.*):(.*)$')

    match = _nportprog.match(host)
    if match is None:
        return host, defport

    host, port = match.group(1, 2)
    try:
        # int('') raises ValueError as well, so an empty port needs no
        # separate check.
        nport = int(port)
    except ValueError:
        nport = None
    return host, nport
|
165 |
|
_queryprog = None
def splitquery(url):
    """splitquery('/path?query') --> '/path', 'query'.

    The split happens at the last '?'.  Without a '?' the result is
    (url, None).
    """
    global _queryprog
    if _queryprog is None:
        # Compiled on first use and cached in the module-level global.
        import re
        # Raw string: '\?' is not a valid escape in an ordinary string
        # literal and triggers a warning on newer interpreters.
        _queryprog = re.compile(r'^(.*)\?([^?]*)$')

    match = _queryprog.match(url)
    if match:
        return match.group(1, 2)
    return url, None
|
177 |
|
_tagprog = None
def splittag(url):
    """splittag('/path#tag') --> '/path', 'tag'.

    The split happens at the last '#'.  Without a '#' the result is
    (url, None).
    """
    global _tagprog
    if _tagprog is None:
        # Compiled on first use and cached in the module-level global.
        import re
        _tagprog = re.compile('^(.*)#([^#]*)$')

    found = _tagprog.match(url)
    if found is None:
        return url, None
    return found.group(1, 2)
|
189 |
|
def splitattr(url):
    """splitattr('/path;attr1=value1;attr2=value2;...') ->
        '/path', ['attr1=value1', 'attr2=value2', ...].

    The attribute list is empty when *url* contains no ';'.
    """
    attrs = url.split(';')
    # split() always yields at least one element: the path itself.
    path = attrs.pop(0)
    return path, attrs
|
195 |
|
_valueprog = None
def splitvalue(attr):
    """splitvalue('attr=value') --> 'attr', 'value'.

    The split happens at the first '='.  Without an '=' the result is
    (attr, None).
    """
    global _valueprog
    if _valueprog is None:
        # Compiled on first use and cached in the module-level global.
        import re
        _valueprog = re.compile('^([^=]*)=(.*)$')

    found = _valueprog.match(attr)
    if found is None:
        return attr, None
    return found.group(1, 2)
|
207 |
|
def splitgophertype(selector):
    """splitgophertype('/Xselector') --> 'X', 'selector'.

    When *selector* does not start with '/' followed by at least one
    more character, the result is (None, selector).
    """
    if selector[:1] != '/' or not selector[1:2]:
        return None, selector
    gophertype = selector[1]
    remainder = selector[2:]
    return gophertype, remainder
|
213 |
|
# Lookup table from every two-character hex escape to its character.
# All case combinations are included ('ab', 'AB', 'aB', 'Ab'), because
# hex digits in percent-escapes are case-insensitive.
_hexdig = '0123456789ABCDEFabcdef'
_hextochr = dict((a + b, chr(int(a + b, 16)))
                 for a in _hexdig for b in _hexdig)

def unquote(s):
    """unquote('abc%20def') -> 'abc def'.

    Every '%XX' escape, where XX are two hex digits in any case, is
    replaced by the character with that code.  A '%' that is not
    followed by two hex digits is left in the output unchanged.
    """
    pieces = s.split('%')
    # Everything before the first '%' is literal text.
    decoded = [pieces[0]]
    for item in pieces[1:]:
        try:
            decoded.append(_hextochr[item[:2]] + item[2:])
        except KeyError:
            # Not a valid escape: put the '%' back untouched.
            decoded.append('%' + item)
        except UnicodeDecodeError:
            # Python 2 only: concatenating a non-ASCII byte with a
            # unicode remainder fails, so build a unicode character.
            decoded.append(unichr(int(item[:2], 16)) + item[2:])
    return "".join(decoded)
|
229 |
|
def unquote_plus(s):
    """unquote_plus('%7e/abc+def') -> '~/abc def'

    Like unquote(), but every '+' is first turned into a space.
    """
    return unquote(s.replace('+', ' '))
|
234 |
|
# Characters that quote() never escapes, whatever *safe* is.
always_safe = ('ABCDEFGHIJKLMNOPQRSTUVWXYZ'
               'abcdefghijklmnopqrstuvwxyz'
               '0123456789' '_.-')
# Cache of per-character translation tables, keyed by (safe, always_safe).
_safemaps = {}

def quote(s, safe = '/'):
    """quote('abc def') -> 'abc%20def'

    Each part of a URL, e.g. the path info, the query, etc., has a
    different set of reserved characters that must be quoted.

    RFC 2396 Uniform Resource Identifiers (URI): Generic Syntax lists
    the following reserved characters.

    reserved    = ";" | "/" | "?" | ":" | "@" | "&" | "=" | "+" |
                  "$" | ","

    Each of these characters is reserved in some component of a URL,
    but not necessarily in all of them.

    By default, the quote function is intended for quoting the path
    section of a URL.  Thus, it will not encode '/'.  This character
    is reserved, but in typical usage the quote function is being
    called on a path where the existing slash characters are used as
    reserved characters.

    Characters in *safe* or in always_safe are copied through; every
    other character with a code below 256 becomes '%XX' with upper-case
    hex digits.
    """
    cachekey = (safe, always_safe)
    try:
        safe_map = _safemaps[cachekey]
    except KeyError:
        # First call with this *safe* value: build and cache the table.
        unescaped = safe + always_safe
        safe_map = {}
        for code in range(256):
            char = chr(code)
            if char in unescaped:
                safe_map[char] = char
            else:
                safe_map[char] = '%%%02X' % code
        _safemaps[cachekey] = safe_map
    return ''.join([safe_map[char] for char in s])
|
273 |
|
def quote_plus(s, safe = ''):
    """Quote the query fragment of a URL; replacing ' ' with '+'"""
    if ' ' not in s:
        return quote(s, safe)
    # Treat the space as safe so quote() leaves it alone, then turn
    # each surviving space into '+'.
    quoted = quote(s, safe + ' ')
    return quoted.replace(' ', '+')
|
280 |
|
def urlencode(query,doseq=0):
    """Encode a sequence of two-element tuples or dictionary into a URL query string.

    If any values in the query arg are sequences and doseq is true, each
    sequence element is converted to a separate parameter.

    If the query arg is a sequence of two-element tuples, the order of the
    parameters in the output will match the order of parameters in the
    input.

    Keys and values are converted with str() and escaped with
    quote_plus(); the 'key=value' pairs are joined with '&'.

    Raises TypeError when *query* has no items() method and is not a
    sequence whose first element is a tuple (for example, a non-empty
    string).
    """

    if hasattr(query,"items"):
        # mapping objects
        query = query.items()
    else:
        # it's a bother at times that strings and string-like objects are
        # sequences...
        try:
            # non-sequence items should not work with len()
            # non-empty strings will fail this
            if len(query) and not isinstance(query[0], tuple):
                raise TypeError
            # zero-length sequences of all types will get here and succeed,
            # but that's a minor nit - since the original implementation
            # allowed empty dicts that type of behavior probably should be
            # preserved for consistency
        except TypeError:
            # Call form of raise: valid on Python 2 and Python 3 alike.
            raise TypeError("not a valid non-string sequence or mapping object")

    pairs = []
    for k, v in query:
        k = quote_plus(str(k))
        # Work out the raw string value(s) for this key; each one is
        # quoted and appended below.
        if not doseq:
            # preserve old behavior
            values = [str(v)]
        elif isinstance(v, str):
            values = [v]
        elif _is_unicode(v):
            # is there a reasonable way to convert to ASCII?
            # encode generates a string, but "replace" or "ignore"
            # lose information and "strict" can raise UnicodeError
            values = [v.encode("ASCII","replace")]
        else:
            try:
                # is this a sufficient test for sequence-ness?
                len(v)
            except TypeError:
                # not a sequence
                values = [str(v)]
            else:
                # one parameter per element of the sequence
                values = [str(elt) for elt in v]
        for value in values:
            pairs.append(k + '=' + quote_plus(value))
    return '&'.join(pairs)