|
1 # -*- coding: utf-8 -*- |
|
2 """Unit tests for Beautiful Soup. |
|
3 |
|
These tests make sure Beautiful Soup works as it should. If you
|
5 find a bug in Beautiful Soup, the best way to express it is as a test |
|
6 case like this that fails.""" |
|
7 |
|
import re
import unittest

from BeautifulSoup import *
|
10 |
|
class SoupTest(unittest.TestCase):
    """Base class for the suite: provides a parse/render round-trip helper."""

    def assertSoupEquals(self, toParse, rep=None, c=BeautifulSoup,
                         encoding=None):
        """Parse `toParse` with parser class `c` and assert that its
        rendering equals `rep`.

        If `rep` is None the markup is expected to survive the round
        trip unchanged.  If `encoding` is given, the soup is rendered
        as an encoded byte string via encode(); otherwise as Unicode
        via decode().
        """
        # Use an identity test, not ==, when checking for None.
        if rep is None:
            rep = toParse
        obj = c(toParse)
        if encoding is None:
            rep2 = obj.decode()
        else:
            rep2 = obj.encode(encoding)
        self.assertEqual(rep2, rep)
|
25 |
|
class FollowThatTag(SoupTest):

    "Tests the various ways of fetching tags from a soup."

    def setUp(self):
        # Deliberately messy XML-ish markup: mixed-case tag names,
        # mis-closed <b> tags, duplicate ids, and an unquoted
        # attribute value.
        ml = """
        <a id="x">1</a>
        <A id="a">2</a>
        <b id="b">3</a>
        <b href="foo" id="x">4</a>
        <ac width=100>4</ac>"""
        self.soup = BeautifulStoneSoup(ml)

    def testFindAllByName(self):
        # Calling the soup object itself is shorthand for findAll().
        matching = self.soup('a')
        self.assertEqual(len(matching), 2)
        self.assertEqual(matching[0].name, 'a')
        self.assertEqual(matching, self.soup.findAll('a'))
        # A SoupStrainer can be passed wherever a tag name can.
        self.assertEqual(matching, self.soup.findAll(SoupStrainer('a')))

    def testFindAllByAttribute(self):
        matching = self.soup.findAll(id='x')
        self.assertEqual(len(matching), 2)
        self.assertEqual(matching[0].name, 'a')
        self.assertEqual(matching[1].name, 'b')

        # The attrs-dict form must be equivalent to the keyword form.
        matching2 = self.soup.findAll(attrs={'id' : 'x'})
        self.assertEqual(matching, matching2)

        # ...and so must an explicit SoupStrainer.
        strainer = SoupStrainer(attrs={'id' : 'x'})
        self.assertEqual(matching, self.soup.findAll(strainer))

        # attr=None matches tags that *lack* the attribute.
        self.assertEqual(len(self.soup.findAll(id=None)), 1)

        self.assertEqual(len(self.soup.findAll(width=100)), 1)
        # No tag has a 'junk' attribute, so junk=None matches all five tags.
        self.assertEqual(len(self.soup.findAll(junk=None)), 5)
        self.assertEqual(len(self.soup.findAll(junk=[1, None])), 5)

        # A regex or True, by contrast, requires the attribute to exist.
        self.assertEqual(len(self.soup.findAll(junk=re.compile('.*'))), 0)
        self.assertEqual(len(self.soup.findAll(junk=True)), 0)

        self.assertEqual(len(self.soup.findAll(junk=True)), 0)
        self.assertEqual(len(self.soup.findAll(href=True)), 1)

    def testFindallByClass(self):
        soup = BeautifulSoup('<a>Foo</a><a class="1">Bar</a>')
        # The second positional argument of find() matches the CSS class.
        self.assertEqual(soup.find('a', '1').string, "Bar")

    def testFindAllByList(self):
        # A list of names matches a tag with any of those names.
        matching = self.soup(['a', 'ac'])
        self.assertEqual(len(matching), 3)

    def testFindAllByHash(self):
        # A dict of names behaves like a list of its keys.
        matching = self.soup({'a' : True, 'b' : True})
        self.assertEqual(len(matching), 4)

    def testFindAllText(self):
        soup = BeautifulSoup("<html>\xbb</html>")
        # text= searches NavigableStrings; results come back as Unicode.
        self.assertEqual(soup.findAll(text=re.compile('.*')),
                         [u'\xbb'])

    def testFindAllByRE(self):
        import re
        r = re.compile('a.*')
        # The regex matches tag *names*: <a>, <A> and <ac>.
        self.assertEqual(len(self.soup(r)), 3)

    def testFindAllByMethod(self):
        def matchTagWhereIDMatchesName(tag):
            # Predicate: keep tags whose id attribute equals their name.
            return tag.name == tag.get('id')

        matching = self.soup.findAll(matchTagWhereIDMatchesName)
        self.assertEqual(len(matching), 2)
        self.assertEqual(matching[0].name, 'a')

    def testParents(self):
        soup = BeautifulSoup('<ul id="foo"></ul><ul id="foo"><ul><ul id="foo" a="b"><b>Blah')
        b = soup.b
        # Only the two *enclosing* <ul id="foo"> tags count as parents.
        self.assertEquals(len(b.findParents('ul', {'id' : 'foo'})), 2)
        self.assertEquals(b.findParent('ul')['a'], 'b')

    # Shared fixture for the next/previous tests: four <b> tags in
    # document order, ids "1" through "4".
    PROXIMITY_TEST = BeautifulSoup('<b id="1"><b id="2"><b id="3"><b id="4">')

    def testNext(self):
        soup = self.PROXIMITY_TEST
        b = soup.find('b', {'id' : 2})
        self.assertEquals(b.findNext('b')['id'], '3')
        # Repeated call -- presumably confirming the lookup is repeatable.
        self.assertEquals(b.findNext('b')['id'], '3')
        self.assertEquals(len(b.findAllNext('b')), 2)
        self.assertEquals(len(b.findAllNext('b', {'id' : 4})), 1)

    def testPrevious(self):
        soup = self.PROXIMITY_TEST
        b = soup.find('b', {'id' : 3})
        self.assertEquals(b.findPrevious('b')['id'], '2')
        # Repeated call -- presumably confirming the lookup is repeatable.
        self.assertEquals(b.findPrevious('b')['id'], '2')
        self.assertEquals(len(b.findAllPrevious('b')), 2)
        self.assertEquals(len(b.findAllPrevious('b', {'id' : 2})), 1)


    # Shared fixture for the sibling tests: four sibling blockquotes
    # (ids 1-4); the first three each contain one nested blockquote
    # (ids 1.1, 2.1, 3.1).
    SIBLING_TEST = BeautifulSoup('<blockquote id="1"><blockquote id="1.1"></blockquote></blockquote><blockquote id="2"><blockquote id="2.1"></blockquote></blockquote><blockquote id="3"><blockquote id="3.1"></blockquote></blockquote><blockquote id="4">')

    def testNextSibling(self):
        soup = self.SIBLING_TEST
        tag = 'blockquote'
        b = soup.find(tag, {'id' : 2})
        # findNext descends into children (finds 2.1); findNextSibling
        # stays at the same nesting level (finds 3).
        self.assertEquals(b.findNext(tag)['id'], '2.1')
        self.assertEquals(b.findNextSibling(tag)['id'], '3')
        self.assertEquals(b.findNextSibling(tag)['id'], '3')
        self.assertEquals(len(b.findNextSiblings(tag)), 2)
        self.assertEquals(len(b.findNextSiblings(tag, {'id' : 4})), 1)

    def testPreviousSibling(self):
        soup = self.SIBLING_TEST
        tag = 'blockquote'
        b = soup.find(tag, {'id' : 3})
        # findPrevious walks the whole document backwards (finds 2.1);
        # findPreviousSibling stays at the same level (finds 2).
        self.assertEquals(b.findPrevious(tag)['id'], '2.1')
        self.assertEquals(b.findPreviousSibling(tag)['id'], '2')
        self.assertEquals(b.findPreviousSibling(tag)['id'], '2')
        self.assertEquals(len(b.findPreviousSiblings(tag)), 2)
        self.assertEquals(len(b.findPreviousSiblings(tag, id=1)), 1)

    def testTextNavigation(self):
        soup = BeautifulSoup('Foo<b>Bar</b><i id="1"><b>Baz<br />Blee<hr id="1"/></b></i>Blargh')
        baz = soup.find(text='Baz')
        # NavigableStrings support the same navigation methods as Tags.
        self.assertEquals(baz.findParent("i")['id'], '1')
        self.assertEquals(baz.findNext(text='Blee'), 'Blee')
        self.assertEquals(baz.findNextSibling(text='Blee'), 'Blee')
        # 'Blargh' lives outside <b>, so it is not a sibling of 'Baz'.
        self.assertEquals(baz.findNextSibling(text='Blargh'), None)
        self.assertEquals(baz.findNextSibling('hr')['id'], '1')
|
155 |
|
class SiblingRivalry(SoupTest):
    """Exercises the nextSibling and previousSibling navigation members."""

    def testSiblings(self):
        soup = BeautifulSoup("<ul><li>1<p>A</p>B<li>2<li>3</ul>")
        # The parser closes the first <li> implicitly, so its next
        # sibling is the second <li>.
        following = soup.find('li').nextSibling
        self.assert_(following.name == 'li' and following.string == '2')
        # Navigation also works from and to text nodes.
        self.assertEquals(soup.find(text='1').nextSibling.name, 'p')
        paragraph = soup.find('p')
        self.assertEquals(paragraph.nextSibling, 'B')
        self.assertEquals(
            paragraph.nextSibling.previousSibling.nextSibling, 'B')
|
166 |
|
class TagsAreObjectsToo(SoupTest):
    """Checks built-in protocol support (here, len()) on Tag objects."""

    def testLen(self):
        document = BeautifulSoup("<top>1<b>2</b>3</top>")
        # <top> holds three children: "1", the <b> tag, and "3".
        self.assertEquals(len(document.top), 3)
|
173 |
|
class StringEmUp(SoupTest):
    """Covers 'string', the alias for a tag's single text child."""

    def testString(self):
        soup = BeautifulSoup("<b>foo</b>")
        self.assertEquals(soup.b.string, 'foo')

    def testLackOfString(self):
        # A tag with more than one child has no .string at all.
        soup = BeautifulSoup("<b>f<i>e</i>o</b>")
        self.assert_(not soup.b.string)
|
184 |
|
class ThatsMyLimit(SoupTest):
    """Covers the limit argument to findAll and friends."""

    def testBasicLimits(self):
        markup = '<br id="1" /><br id="1" /><br id="1" /><br id="1" />'
        soup = BeautifulSoup(markup)
        # Without a limit, every <br> comes back.
        self.assertEquals(len(soup.findAll('br')), 4)
        # With a limit, the result list is capped -- both through
        # findAll and through the callable shorthand.
        self.assertEquals(len(soup.findAll('br', limit=2)), 2)
        self.assertEquals(len(soup('br', limit=2)), 2)
|
193 |
|
class OnlyTheLonely(SoupTest):
    """Tests the parseOnlyThese (SoupStrainer) argument to the constructor."""

    def setUp(self):
        # Build markup: five <a> tags (ids 1-5), each wrapping three
        # <b> tags with ids like "1.100" and text like "Content 1.100".
        x = []
        for i in range(1,6):
            x.append('<a id="%s">' % i)
            for j in range(100,103):
                x.append('<b id="%s.%s">Content %s.%s</b>' % (i,j, i,j))
            x.append('</a>')
        self.x = ''.join(x)

    def testOnly(self):
        # Strain by tag name: all fifteen <b> tags survive, nothing else.
        strainer = SoupStrainer("b")
        soup = BeautifulSoup(self.x, parseOnlyThese=strainer)
        self.assertEquals(len(soup), 15)

        # Strain by attribute: only ids containing "100" match -- one
        # <b> per <a>.
        strainer = SoupStrainer(id=re.compile("100.*"))
        soup = BeautifulSoup(self.x, parseOnlyThese=strainer)
        self.assertEquals(len(soup), 5)

        # Strain by text regex: the ".100" and ".101" texts match --
        # two per <a>.
        strainer = SoupStrainer(text=re.compile("10[01].*"))
        soup = BeautifulSoup(self.x, parseOnlyThese=strainer)
        self.assertEquals(len(soup), 10)

        # Strain by predicate: character 8 of "Content i.jjj" is the
        # <a> number, so this keeps the three texts belonging to id 3.
        # (Fixed from "lambda(x):" -- parenthesized/tuple parameter
        # syntax, which was removed in Python 3 by PEP 3113.)
        strainer = SoupStrainer(text=lambda x: x[8] == '3')
        soup = BeautifulSoup(self.x, parseOnlyThese=strainer)
        self.assertEquals(len(soup), 3)
|
221 |
|
class PickleMeThis(SoupTest):
    "Testing features like pickle and deepcopy."

    def setUp(self):
        # A realistic full HTML page to copy and pickle.
        self.page = """<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.0 Transitional//EN"
"http://www.w3.org/TR/REC-html40/transitional.dtd">
<html>
<head>
<meta http-equiv="Content-Type" content="text/html; charset=utf-8">
<title>Beautiful Soup: We called him Tortoise because he taught us.</title>
<link rev="made" href="mailto:leonardr@segfault.org">
<meta name="Description" content="Beautiful Soup: an HTML parser optimized for screen-scraping.">
<meta name="generator" content="Markov Approximation 1.4 (module: leonardr)">
<meta name="author" content="Leonard Richardson">
</head>
<body>
<a href="foo">foo</a>
<a href="foo"><b>bar</b></a>
</body>
</html>"""

        self.soup = BeautifulSoup(self.page)

    def testPickle(self):
        # A pickle round trip must preserve both the class and the
        # rendered output.
        import pickle
        dumped = pickle.dumps(self.soup, 2)
        loaded = pickle.loads(dumped)
        self.assertEqual(loaded.__class__, BeautifulSoup)
        self.assertEqual(loaded.decode(), self.soup.decode())

    def testDeepcopy(self):
        from copy import deepcopy
        # Regression check: deepcopy of a trivial soup must not raise.
        deepcopy(BeautifulSoup("<a></a>"))
        copied = deepcopy(self.soup)
        self.assertEqual(copied.decode(), self.soup.decode())

    def testUnicodePickle(self):
        # Same round trip through the C pickler, with a non-ASCII byte
        # in the markup.
        import cPickle as pickle
        html = "<b>" + chr(0xc3) + "</b>"
        soup = BeautifulSoup(html)
        dumped = pickle.dumps(soup, pickle.HIGHEST_PROTOCOL)
        loaded = pickle.loads(dumped)
        self.assertEqual(loaded.decode(), soup.decode())
|
265 |
|
266 |
|
class WriteOnlyCode(SoupTest):
    "Testing the modification of the tree."

    def testModifyAttributes(self):
        soup = BeautifulSoup('<a id="1"></a>')
        # A value assigned as an int is rendered as a string.
        soup.a['id'] = 2
        self.assertEqual(soup.decode(), '<a id="2"></a>')
        # Attributes can be deleted and added dictionary-style.
        del(soup.a['id'])
        self.assertEqual(soup.decode(), '<a></a>')
        soup.a['id2'] = 'foo'
        self.assertEqual(soup.decode(), '<a id2="foo"></a>')

    def testNewTagCreation(self):
        "Makes sure tags don't step on each others' toes."
        soup = BeautifulSoup()
        a = Tag(soup, 'a')
        ol = Tag(soup, 'ol')
        # Setting an attribute on one new tag must not leak into another.
        a['href'] = 'http://foo.com/'
        self.assertRaises(KeyError, lambda : ol['href'])

    def testTagReplacement(self):
        # Make sure you can replace an element with itself.
        text = "<a><b></b><c>Foo<d></d></c></a><a><e></e></a>"
        soup = BeautifulSoup(text)
        c = soup.c
        soup.c.replaceWith(c)
        self.assertEquals(soup.decode(), text)

        # A very simple case
        soup = BeautifulSoup("<b>Argh!</b>")
        soup.find(text="Argh!").replaceWith("Hooray!")
        newText = soup.find(text="Hooray!")
        b = soup.b
        # The replacement string must be spliced into the navigation links.
        self.assertEqual(newText.previous, b)
        self.assertEqual(newText.parent, b)
        self.assertEqual(newText.previous.next, newText)
        self.assertEqual(newText.next, None)

        # A more complex case
        soup = BeautifulSoup("<a><b>Argh!</b><c></c><d></d></a>")
        soup.b.insert(1, "Hooray!")
        newText = soup.find(text="Hooray!")
        self.assertEqual(newText.previous, "Argh!")
        self.assertEqual(newText.previous.next, newText)

        self.assertEqual(newText.previousSibling, "Argh!")
        self.assertEqual(newText.previousSibling.nextSibling, newText)

        self.assertEqual(newText.nextSibling, None)
        # "Hooray!" is the last child of <b>, so its .next leaves the tag.
        self.assertEqual(newText.next, soup.c)

        text = "<html>There's <b>no</b> business like <b>show</b> business</html>"
        soup = BeautifulSoup(text)
        no, show = soup.findAll('b')
        # Replacing one tag with another *moves* (not copies) it.
        show.replaceWith(no)
        self.assertEquals(soup.decode(), "<html>There's business like <b>no</b> business</html>")

        # Even more complex
        soup = BeautifulSoup("<a><b>Find</b><c>lady!</c><d></d></a>")
        tag = Tag(soup, 'magictag')
        tag.insert(0, "the")
        soup.a.insert(1, tag)

        b = soup.b
        c = soup.c
        theText = tag.find(text=True)
        findText = b.find(text="Find")

        # The inserted tag must be linked in both document order
        # (next/previous) and sibling order.
        self.assertEqual(findText.next, tag)
        self.assertEqual(tag.previous, findText)
        self.assertEqual(b.nextSibling, tag)
        self.assertEqual(tag.previousSibling, b)
        self.assertEqual(tag.nextSibling, c)
        self.assertEqual(c.previousSibling, tag)

        self.assertEqual(theText.next, c)
        self.assertEqual(c.previous, theText)

        # Aand... incredibly complex.
        soup = BeautifulSoup("""<a>We<b>reserve<c>the</c><d>right</d></b></a><e>to<f>refuse</f><g>service</g></e>""")
        f = soup.f
        a = soup.a
        c = soup.c
        e = soup.e
        weText = a.find(text="We")
        # Replace <b> (and its whole subtree) with <f> pulled from
        # elsewhere in the same document.
        soup.b.replaceWith(soup.f)
        self.assertEqual(soup.decode(), "<a>We<f>refuse</f></a><e>to<g>service</g></e>")

        self.assertEqual(f.previous, weText)
        self.assertEqual(weText.next, f)
        self.assertEqual(f.previousSibling, weText)
        self.assertEqual(f.nextSibling, None)
        self.assertEqual(weText.nextSibling, f)

    def testAppend(self):
        doc = "<p>Don't leave me <b>here</b>.</p> <p>Don't leave me.</p>"
        soup = BeautifulSoup(doc)
        second_para = soup('p')[1]
        bold = soup.find('b')
        # append() moves the tag from its old parent to the new one.
        soup('p')[1].append(soup.find('b'))
        self.assertEqual(bold.parent, second_para)
        self.assertEqual(soup.decode(),
                         "<p>Don't leave me .</p> "
                         "<p>Don't leave me.<b>here</b></p>")

    def testTagExtraction(self):
        # A very simple case
        text = '<html><div id="nav">Nav crap</div>Real content here.</html>'
        soup = BeautifulSoup(text)
        # extract() removes the subtree and returns it intact.
        extracted = soup.find("div", id="nav").extract()
        self.assertEqual(soup.decode(), "<html>Real content here.</html>")
        self.assertEqual(extracted.decode(), '<div id="nav">Nav crap</div>')

        # A simple case, a more complex test.
        text = "<doc><a>1<b>2</b></a><a>i<b>ii</b></a><a>A<b>B</b></a></doc>"
        soup = BeautifulStoneSoup(text)
        doc = soup.doc
        numbers, roman, letters = soup("a")

        self.assertEqual(roman.parent, doc)
        oldPrevious = roman.previous
        endOfThisTag = roman.nextSibling.previous
        self.assertEqual(oldPrevious, "2")
        self.assertEqual(roman.next, "i")
        self.assertEqual(endOfThisTag, "ii")
        self.assertEqual(roman.previousSibling, numbers)
        self.assertEqual(roman.nextSibling, letters)

        roman.extract()
        # After extraction the subtree is fully detached...
        self.assertEqual(roman.parent, None)
        self.assertEqual(roman.previous, None)
        self.assertEqual(roman.next, "i")
        self.assertEqual(letters.previous, '2')
        self.assertEqual(roman.previousSibling, None)
        self.assertEqual(roman.nextSibling, None)
        self.assertEqual(endOfThisTag.next, None)
        self.assertEqual(roman.b.contents[0].next, None)
        # ...and the remaining siblings are stitched back together.
        self.assertEqual(numbers.nextSibling, letters)
        self.assertEqual(letters.previousSibling, numbers)
        self.assertEqual(len(doc.contents), 2)
        self.assertEqual(doc.contents[0], numbers)
        self.assertEqual(doc.contents[1], letters)

        # A more complex case.
        text = "<a>1<b>2<c>Hollywood, baby!</c></b></a>3"
        soup = BeautifulStoneSoup(text)
        one = soup.find(text="1")
        three = soup.find(text="3")
        toExtract = soup.b
        soup.b.extract()
        # Extracting <b> re-links "1" directly to "3".
        self.assertEqual(one.next, three)
        self.assertEqual(three.previous, one)
        self.assertEqual(one.parent.nextSibling, three)
        self.assertEqual(three.previousSibling, soup.a)
|
421 |
|
class TheManWithoutAttributes(SoupTest):
    """Covers dictionary-style attribute access on tags."""

    def testHasKey(self):
        tag = BeautifulSoup("<foo attr='bar'>").foo
        # Tags support dict-like has_key() for their attributes.
        self.assertTrue(tag.has_key('attr'))
|
428 |
|
class QuoteMeOnThat(SoupTest):
    "Test quoting"

    def testQuotedAttributeValues(self):
        # Single-quoted attribute values are normalized to double quotes.
        self.assertSoupEquals("<foo attr='bar'></foo>",
                              '<foo attr="bar"></foo>')

        # ...unless the value itself contains double quotes.
        text = """<foo attr='bar "brawls" happen'>a</foo>"""
        soup = BeautifulSoup(text)
        self.assertEquals(soup.decode(), text)

        # A value containing both quote characters gets its single quote
        # escaped -- note the &squot; entity the expected output uses.
        soup.foo['attr'] = 'Brawls happen at "Bob\'s Bar"'
        newText = """<foo attr='Brawls happen at "Bob&squot;s Bar"'>a</foo>"""
        self.assertSoupEquals(soup.decode(), newText)

        # NOTE(review): the expected string below contains a bare "&";
        # entity references (&amp;) may have been lost when this file was
        # transcribed -- verify against the upstream test suite.
        self.assertSoupEquals('<this is="really messed up & stuff">',
                              '<this is="really messed up & stuff"></this>')
|
445 |
|
446 |
|
447 |
|
class YoureSoLiteral(SoupTest):
    """Covers literal mode: tags whose contents are never parsed."""

    def testLiteralMode(self):
        markup = "<script>if (i<imgs.length)</script><b>Foo</b>"
        soup = BeautifulSoup(markup)
        # The "<" inside <script> is left alone instead of opening a tag,
        # and parsing resumes normally afterwards.
        self.assertEqual(soup.script.contents[0], "if (i<imgs.length)")
        self.assertEqual(soup.b.contents[0], "Foo")

    def testTextArea(self):
        markup = "<textarea><b>This is an example of an HTML tag</b><&<&</textarea>"
        soup = BeautifulSoup(markup)
        # Everything between the <textarea> tags stays one text node.
        expected = "<b>This is an example of an HTML tag</b><&<&"
        self.assertEqual(soup.textarea.contents[0], expected)
|
461 |
|
class OperatorOverload(SoupTest):
    "Our operators do it all! Call now!"

    def testTagNameAsFind(self):
        "Tests that referencing a tag name as a member delegates to find()."
        soup = BeautifulSoup('<b id="1">foo<i>bar</i></b><b>Red herring</b>')
        # soup.b is the *first* <b>; attribute access chains like find().
        self.assertEqual(soup.b.i, soup.find('b').find('i'))
        self.assertEqual(soup.b.i.string, 'bar')
        self.assertEqual(soup.b['id'], '1')
        self.assertEqual(soup.b.contents[0], 'foo')
        # A tag name with no match is falsy rather than raising.
        self.assert_(not soup.a)

        #Test the .fooTag variant of .foo.
        self.assertEqual(soup.bTag.iTag.string, 'bar')
        self.assertEqual(soup.b.iTag.string, 'bar')
        self.assertEqual(soup.find('b').find('i'), soup.bTag.iTag)
|
478 |
|
class NestableEgg(SoupTest):
    """Here we test tag nesting. TEST THE NEST, DUDE! X-TREME!"""

    def testParaInsideBlockquote(self):
        soup = BeautifulSoup('<blockquote><p><b>Foo</blockquote><p>Bar')
        # The unclosed <p> is closed when its parent <blockquote> closes.
        self.assertEqual(soup.blockquote.p.b.string, 'Foo')
        self.assertEqual(soup.blockquote.b.string, 'Foo')
        # recursive=False restricts the search to top-level children.
        self.assertEqual(soup.find('p', recursive=False).string, 'Bar')

    def testNestedTables(self):
        text = """<table id="1"><tr><td>Here's another table:
<table id="2"><tr><td>Juicy text</td></tr></table></td></tr></table>"""
        soup = BeautifulSoup(text)
        # Tables nest instead of implicitly closing each other.
        self.assertEquals(soup.table.table.td.string, 'Juicy text')
        self.assertEquals(len(soup.findAll('table')), 2)
        self.assertEquals(len(soup.table.findAll('table')), 1)
        self.assertEquals(soup.find('table', {'id' : 2}).parent.parent.parent.name,
                          'table')

        text = "<table><tr><td><div><table>Foo</table></div></td></tr></table>"
        soup = BeautifulSoup(text)
        self.assertEquals(soup.table.tr.td.div.table.contents[0], "Foo")

        # <thead>, <tbody> and <tfoot> each keep their own <tr> children.
        text = """<table><thead><tr>Foo</tr></thead><tbody><tr>Bar</tr></tbody>
<tfoot><tr>Baz</tr></tfoot></table>"""
        soup = BeautifulSoup(text)
        self.assertEquals(soup.table.thead.tr.contents[0], "Foo")

    def testBadNestedTables(self):
        soup = BeautifulSoup("<table><tr><table><tr id='nested'>")
        # A <table> opened inside a <tr> still nests beneath it.
        self.assertEquals(soup.table.tr.table.tr['id'], 'nested')
|
510 |
|
class CleanupOnAisleFour(SoupTest):
    """Here we test cleanup of text that breaks HTMLParser or is just
    obnoxious."""

    def testSelfClosingtag(self):
        self.assertEqual(BeautifulSoup("Foo<br/>Bar").find('br').decode(),
                         '<br />')

        self.assertSoupEquals('<p>test1<br/>test2</p>',
                              '<p>test1<br />test2</p>')

        # An unknown tag is treated as a container by default...
        text = '<p>test1<selfclosing>test2'
        soup = BeautifulStoneSoup(text)
        self.assertEqual(soup.decode(),
                         '<p>test1<selfclosing>test2</selfclosing></p>')

        # ...but can be declared self-closing by the caller.
        soup = BeautifulStoneSoup(text, selfClosingTags='selfclosing')
        self.assertEqual(soup.decode(),
                         '<p>test1<selfclosing />test2</p>')

    def testSelfClosingTagOrNot(self):
        text = "<item><link>http://foo.com/</link></item>"
        # The XML parser keeps <link> as a container; the HTML parser
        # treats <link> as self-closing.
        self.assertEqual(BeautifulStoneSoup(text).decode(), text)
        self.assertEqual(BeautifulSoup(text).decode(),
                         '<item><link />http://foo.com/</item>')

    def testBooleanAttributes(self):
        # A valueless (boolean) attribute survives the round trip.
        text = "<td nowrap>foo</td>"
        self.assertSoupEquals(text, text)

    def testCData(self):
        xml = "<root>foo<![CDATA[foobar]]>bar</root>"
        self.assertSoupEquals(xml, xml)
        r = re.compile("foo.*bar")
        soup = BeautifulSoup(xml)
        # CDATA contents are searchable text, but keep their own class.
        self.assertEquals(soup.find(text=r).string, "foobar")
        self.assertEquals(soup.find(text=r).__class__, CData)

    def testComments(self):
        xml = "foo<!--foobar-->baz"
        self.assertSoupEquals(xml)
        r = re.compile("foo.*bar")
        soup = BeautifulSoup(xml)
        # Comment contents are searchable text with their own class.
        self.assertEquals(soup.find(text=r).string, "foobar")
        self.assertEquals(soup.find(text="foobar").__class__, Comment)

    def testDeclaration(self):
        xml = "foo<!DOCTYPE foobar>baz"
        self.assertSoupEquals(xml)
        r = re.compile(".*foo.*bar")
        soup = BeautifulSoup(xml)
        # The declaration's text drops the surrounding "<!" and ">".
        text = "DOCTYPE foobar"
        self.assertEquals(soup.find(text=r).string, text)
        self.assertEquals(soup.find(text=text).__class__, Declaration)

        # A namespaced doctype must not confuse the parser.
        namespaced_doctype = ('<!DOCTYPE xsl:stylesheet SYSTEM "htmlent.dtd">'
                              '<html>foo</html>')
        soup = BeautifulSoup(namespaced_doctype)
        self.assertEquals(soup.contents[0],
                          'DOCTYPE xsl:stylesheet SYSTEM "htmlent.dtd"')
        self.assertEquals(soup.html.contents[0], 'foo')

    def testEntityConversions(self):
        # NOTE(review): the literals in this method look as though they
        # originally contained entity references (&lt;, &eacute;, ...)
        # that were decoded when this file was transcribed -- verify
        # against the upstream test suite.
        text = "<<sacré bleu!>>"
        soup = BeautifulStoneSoup(text)
        self.assertSoupEquals(text)

        xmlEnt = BeautifulStoneSoup.XML_ENTITIES
        htmlEnt = BeautifulStoneSoup.HTML_ENTITIES
        xhtmlEnt = BeautifulStoneSoup.XHTML_ENTITIES

        soup = BeautifulStoneSoup(text, convertEntities=xmlEnt)
        self.assertEquals(soup.decode(), "<<sacré bleu!>>")

        soup = BeautifulStoneSoup(text, convertEntities=xmlEnt)
        self.assertEquals(soup.decode(), "<<sacré bleu!>>")

        soup = BeautifulStoneSoup(text, convertEntities=htmlEnt)
        self.assertEquals(soup.decode(), u"<<sacr\xe9 bleu!>>")

        # Make sure the "XML", "HTML", and "XHTML" settings work.
        text = "<™'"
        soup = BeautifulStoneSoup(text, convertEntities=xmlEnt)
        self.assertEquals(soup.decode(), u"<™'")

        soup = BeautifulStoneSoup(text, convertEntities=htmlEnt)
        self.assertEquals(soup.decode(), u"<\u2122'")

        soup = BeautifulStoneSoup(text, convertEntities=xhtmlEnt)
        self.assertEquals(soup.decode(), u"<\u2122'")

    def testNonBreakingSpaces(self):
        # With HTML entity conversion on, the markup decodes to two
        # U+00A0 non-breaking space characters.
        soup = BeautifulSoup("<a> </a>",
                             convertEntities=BeautifulStoneSoup.HTML_ENTITIES)
        self.assertEquals(soup.decode(), u"<a>\xa0\xa0</a>")

    def testWhitespaceInDeclaration(self):
        self.assertSoupEquals('<! DOCTYPE>', '<!DOCTYPE>')

    def testJunkInDeclaration(self):
        self.assertSoupEquals('<! Foo = -8>a', '<!Foo = -8>a')

    def testIncompleteDeclaration(self):
        self.assertSoupEquals('a<!b <p>c')

    def testEntityReplacement(self):
        self.assertSoupEquals('<b>hello there</b>')

    def testEntitiesInAttributeValues(self):
        # Attribute values are entity-decoded; the expected byte strings
        # here are UTF-8.
        self.assertSoupEquals('<x t="xñ">', '<x t="x\xc3\xb1"></x>',
                              encoding='utf-8')
        self.assertSoupEquals('<x t="xñ">', '<x t="x\xc3\xb1"></x>',
                              encoding='utf-8')

        soup = BeautifulSoup('<x t=">™">',
                             convertEntities=BeautifulStoneSoup.HTML_ENTITIES)
        self.assertEquals(soup.decode(), u'<x t=">\u2122"></x>')

        uri = "http://crummy.com?sacré&bleu"
        link = '<a href="%s"></a>' % uri

        soup = BeautifulSoup(link, convertEntities=BeautifulSoup.HTML_ENTITIES)
        self.assertEquals(soup.decode(),
                          link.replace("é", u"\xe9"))

        uri = "http://crummy.com?sacré&bleu"
        link = '<a href="%s"></a>' % uri
        soup = BeautifulSoup(link, convertEntities=BeautifulSoup.HTML_ENTITIES)
        self.assertEquals(soup.a['href'],
                          uri.replace("é", u"\xe9"))

    def testNakedAmpersands(self):
        # NOTE(review): several replace('&','&') calls below are no-ops;
        # the second argument was presumably '&amp;' before entity
        # decoding mangled this file -- verify against upstream.
        html = {'convertEntities':BeautifulStoneSoup.HTML_ENTITIES}
        soup = BeautifulStoneSoup("AT&T ", **html)
        self.assertEquals(soup.decode(), 'AT&T ')

        nakedAmpersandInASentence = "AT&T was Ma Bell"
        soup = BeautifulStoneSoup(nakedAmpersandInASentence,**html)
        self.assertEquals(soup.decode(), \
            nakedAmpersandInASentence.replace('&','&'))

        invalidURL = '<a href="http://example.org?a=1&b=2;3">foo</a>'
        validURL = invalidURL.replace('&','&')
        soup = BeautifulStoneSoup(invalidURL)
        self.assertEquals(soup.decode(), validURL)

        soup = BeautifulStoneSoup(validURL)
        self.assertEquals(soup.decode(), validURL)
|
659 |
|
660 |
|
class EncodeRed(SoupTest):
    """Tests encoding conversion, Unicode conversion, and Microsoft
    smart quote fixes."""

    def testUnicodeDammitStandalone(self):
        # "\x92" is not valid in the declared-free markup; UnicodeDammit
        # is expected to repair it. NOTE(review): the expected string
        # appears to have had its entity/escape form decoded when this
        # file was transcribed -- verify against upstream.
        markup = "<foo>\x92</foo>"
        dammit = UnicodeDammit(markup)
        self.assertEquals(dammit.unicode, "<foo>’</foo>")

        # Hebrew bytes with an explicit candidate encoding.
        hebrew = "\xed\xe5\xec\xf9"
        dammit = UnicodeDammit(hebrew, ["iso-8859-8"])
        self.assertEquals(dammit.unicode, u'\u05dd\u05d5\u05dc\u05e9')
        self.assertEquals(dammit.originalEncoding, 'iso-8859-8')

    def testGarbageInGarbageOut(self):
        # Pure ASCII passes through untouched.
        ascii = "<foo>a</foo>"
        asciiSoup = BeautifulStoneSoup(ascii)
        self.assertEquals(ascii, asciiSoup.decode())

        unicodeData = u"<foo>\u00FC</foo>"
        utf8 = unicodeData.encode("utf-8")
        self.assertEquals(utf8, '<foo>\xc3\xbc</foo>')

        # Unicode in, Unicode out; no originalEncoding is recorded.
        unicodeSoup = BeautifulStoneSoup(unicodeData)
        self.assertEquals(unicodeData, unicodeSoup.decode())
        self.assertEquals(unicodeSoup.foo.string, u'\u00FC')

        # An explicit fromEncoding is honored and remembered.
        utf8Soup = BeautifulStoneSoup(utf8, fromEncoding='utf-8')
        self.assertEquals(utf8, utf8Soup.encode('utf-8'))
        self.assertEquals(utf8Soup.originalEncoding, "utf-8")

        # Unicode input has no original encoding at all.
        utf8Soup = BeautifulStoneSoup(unicodeData)
        self.assertEquals(utf8, utf8Soup.encode('utf-8'))
        self.assertEquals(utf8Soup.originalEncoding, None)


    def testHandleInvalidCodec(self):
        # Nonsense encoding names must fall back to charset detection.
        for bad_encoding in ['.utf8', '...', 'utF---16.!']:
            soup = BeautifulSoup(u"Räksmörgås".encode("utf-8"),
                                 fromEncoding=bad_encoding)
            self.assertEquals(soup.originalEncoding, 'utf-8')

    def testUnicodeSearch(self):
        # Searching by a non-ASCII Unicode string works.
        html = u'<html><body><h1>Räksmörgås</h1></body></html>'
        soup = BeautifulSoup(html)
        self.assertEqual(soup.find(text=u'Räksmörgås'),u'Räksmörgås')

    def testRewrittenXMLHeader(self):
        # EUC-JP input: the XML declaration must be rewritten to reflect
        # the output encoding.
        euc_jp = '<?xml version="1.0 encoding="euc-jp"?>\n<foo>\n\xa4\xb3\xa4\xec\xa4\xcfEUC-JP\xa4\xc7\xa5\xb3\xa1\xbc\xa5\xc7\xa5\xa3\xa5\xf3\xa5\xb0\xa4\xb5\xa4\xec\xa4\xbf\xc6\xfc\xcb\xdc\xb8\xec\xa4\xce\xa5\xd5\xa5\xa1\xa5\xa4\xa5\xeb\xa4\xc7\xa4\xb9\xa1\xa3\n</foo>\n'
        utf8 = "<?xml version='1.0' encoding='utf-8'?>\n<foo>\n\xe3\x81\x93\xe3\x82\x8c\xe3\x81\xafEUC-JP\xe3\x81\xa7\xe3\x82\xb3\xe3\x83\xbc\xe3\x83\x87\xe3\x82\xa3\xe3\x83\xb3\xe3\x82\xb0\xe3\x81\x95\xe3\x82\x8c\xe3\x81\x9f\xe6\x97\xa5\xe6\x9c\xac\xe8\xaa\x9e\xe3\x81\xae\xe3\x83\x95\xe3\x82\xa1\xe3\x82\xa4\xe3\x83\xab\xe3\x81\xa7\xe3\x81\x99\xe3\x80\x82\n</foo>\n"
        soup = BeautifulStoneSoup(euc_jp)
        # On interpreters without euc-jp codec support this test cannot
        # run meaningfully; say so loudly instead of failing obscurely.
        if soup.originalEncoding != "euc-jp":
            raise Exception("Test failed when parsing euc-jp document. "
                            "If you're running Python >=2.4, or you have "
                            "cjkcodecs installed, this is a real problem. "
                            "Otherwise, ignore it.")

        self.assertEquals(soup.originalEncoding, "euc-jp")
        self.assertEquals(soup.renderContents('utf-8'), utf8)

        old_text = "<?xml encoding='windows-1252'><foo>\x92</foo>"
        new_text = "<?xml version='1.0' encoding='utf-8'?><foo>’</foo>"
        self.assertSoupEquals(old_text, new_text)

    def testRewrittenMetaTag(self):
        no_shift_jis_html = '''<html><head>\n<meta http-equiv="Content-language" content="ja" /></head><body><pre>\n\x82\xb1\x82\xea\x82\xcdShift-JIS\x82\xc5\x83R\x81[\x83f\x83B\x83\x93\x83O\x82\xb3\x82\xea\x82\xbd\x93\xfa\x96{\x8c\xea\x82\xcc\x83t\x83@\x83C\x83\x8b\x82\xc5\x82\xb7\x81B\n</pre></body></html>'''
        soup = BeautifulSoup(no_shift_jis_html)

        # Beautiful Soup used to try to rewrite the meta tag even if the
        # meta tag got filtered out by the strainer. This test makes
        # sure that doesn't happen.
        strainer = SoupStrainer('pre')
        soup = BeautifulSoup(no_shift_jis_html, parseOnlyThese=strainer)
        self.assertEquals(soup.contents[0].name, 'pre')

        meta_tag = ('<meta content="text/html; charset=x-sjis" '
                    'http-equiv="Content-type" />')
        shift_jis_html = (
            '<html><head>\n%s\n'
            '<meta http-equiv="Content-language" content="ja" />'
            '</head><body><pre>\n'
            '\x82\xb1\x82\xea\x82\xcdShift-JIS\x82\xc5\x83R\x81[\x83f'
            '\x83B\x83\x93\x83O\x82\xb3\x82\xea\x82\xbd\x93\xfa\x96{\x8c'
            '\xea\x82\xcc\x83t\x83@\x83C\x83\x8b\x82\xc5\x82\xb7\x81B\n'
            '</pre></body></html>') % meta_tag
        soup = BeautifulSoup(shift_jis_html)
        # Same escape hatch as above for interpreters without the codec.
        if soup.originalEncoding != "shift-jis":
            raise Exception("Test failed when parsing shift-jis document "
                            "with meta tag '%s'."
                            "If you're running Python >=2.4, or you have "
                            "cjkcodecs installed, this is a real problem. "
                            "Otherwise, ignore it." % meta_tag)
        self.assertEquals(soup.originalEncoding, "shift-jis")

        # Internally, the charset in the meta tag is parameterized...
        content_type_tag = soup.meta['content']
        self.assertEquals(content_type_tag[content_type_tag.find('charset='):],
                          'charset=%SOUP-ENCODING%')
        # ...and substituted with the actual encoding on output.
        content_type = str(soup.meta)
        index = content_type.find('charset=')
        self.assertEqual(content_type[index:index+len('charset=utf8')+1],
                         'charset=utf-8')
        content_type = soup.meta.encode('shift-jis')
        index = content_type.find('charset=')
        self.assertEqual(content_type[index:index+len('charset=shift-jis')],
                         'charset=shift-jis'.encode())

        self.assertEquals(soup.encode('utf-8'), (
            '<html><head>\n'
            '<meta content="text/html; charset=utf-8" '
            'http-equiv="Content-type" />\n'
            '<meta http-equiv="Content-language" content="ja" />'
            '</head><body><pre>\n'
            '\xe3\x81\x93\xe3\x82\x8c\xe3\x81\xafShift-JIS\xe3\x81\xa7\xe3'
            '\x82\xb3\xe3\x83\xbc\xe3\x83\x87\xe3\x82\xa3\xe3\x83\xb3\xe3'
            '\x82\xb0\xe3\x81\x95\xe3\x82\x8c\xe3\x81\x9f\xe6\x97\xa5\xe6'
            '\x9c\xac\xe8\xaa\x9e\xe3\x81\xae\xe3\x83\x95\xe3\x82\xa1\xe3'
            '\x82\xa4\xe3\x83\xab\xe3\x81\xa7\xe3\x81\x99\xe3\x80\x82\n'
            '</pre></body></html>'))
        self.assertEquals(soup.encode("shift-jis"),
                          shift_jis_html.replace('x-sjis'.encode(),
                                                 'shift-jis'.encode()))

        isolatin = """<html><meta http-equiv="Content-type" content="text/html; charset=ISO-Latin-1" />Sacr\xe9 bleu!</html>"""
        soup = BeautifulSoup(isolatin)

        utf8 = isolatin.replace("ISO-Latin-1".encode(), "utf-8".encode())
        utf8 = utf8.replace("\xe9", "\xc3\xa9")
        self.assertSoupEquals(soup.encode("utf-8"), utf8, encoding='utf-8')

    def testHebrew(self):
        # Explicit fromEncoding; output also lowercases the tag names.
        iso_8859_8= '<HEAD>\n<TITLE>Hebrew (ISO 8859-8) in Visual Directionality</TITLE>\n\n\n\n</HEAD>\n<BODY>\n<H1>Hebrew (ISO 8859-8) in Visual Directionality</H1>\n\xed\xe5\xec\xf9\n</BODY>\n'
        utf8 = '<head>\n<title>Hebrew (ISO 8859-8) in Visual Directionality</title>\n</head>\n<body>\n<h1>Hebrew (ISO 8859-8) in Visual Directionality</h1>\n\xd7\x9d\xd7\x95\xd7\x9c\xd7\xa9\n</body>\n'
        soup = BeautifulStoneSoup(iso_8859_8, fromEncoding="iso-8859-8")
        self.assertEquals(soup.encode('utf-8'), utf8)

    def testSmartQuotesNotSoSmartAnymore(self):
        # \x91/\x92 are Windows-1252 curly quotes. NOTE(review): the
        # expected string likely contained &lsquo;/&rsquo; entities
        # before transcription -- verify against upstream.
        self.assertSoupEquals("\x91Foo\x92 <!--blah-->",
                              '‘Foo’ <!--blah-->')

    def testDontConvertSmartQuotesWhenAlsoConvertingEntities(self):
        smartQuotes = "Il a dit, \x8BSacré bleu!\x9b"
        soup = BeautifulSoup(smartQuotes)
        self.assertEquals(soup.decode(),
                          'Il a dit, ‹Sacré bleu!›')
        # With entity conversion on, the output is raw UTF-8 instead.
        soup = BeautifulSoup(smartQuotes, convertEntities="html")
        self.assertEquals(soup.encode('utf-8'),
                          'Il a dit, \xe2\x80\xb9Sacr\xc3\xa9 bleu!\xe2\x80\xba')

    def testDontSeeSmartQuotesWhereThereAreNone(self):
        # Valid UTF-8 bytes must not be mistaken for Windows-1252.
        utf_8 = "\343\202\261\343\203\274\343\202\277\343\202\244 Watch"
        self.assertSoupEquals(utf_8, encoding='utf-8')
|
812 |
|
813 |
|
class Whitewash(SoupTest):
    """Covers whitespace handling inside and outside <pre>."""

    def testPreservedWhitespace(self):
        # Whitespace inside <pre> survives a round trip untouched.
        for markup in ("<pre> </pre>", "<pre> woo </pre>"):
            self.assertSoupEquals(markup)

    def testCollapsedWhitespace(self):
        self.assertSoupEquals("<p> </p>", "<p> </p>")
|
823 |
|
824 |
|
# Allow the whole suite to be run directly as a script.
if __name__ == '__main__':
    unittest.main()