app/htmlsanitizer/BeautifulSoupTests.py
changeset 2323 b3daada52dd3
equal deleted inserted replaced
2322:98fe07a5542f 2323:b3daada52dd3
       
     1 # -*- coding: utf-8 -*-
       
     2 """Unit tests for Beautiful Soup.
       
     3 
       
     4 These tests make sure that Beautiful Soup works as it should. If you
       
     5 find a bug in Beautiful Soup, the best way to express it is as a test
       
     6 case like this that fails."""
       
     7 
       
     8 import unittest
       
     9 from BeautifulSoup import *
       
    10 
       
    11 class SoupTest(unittest.TestCase):
       
    12 
       
    13     def assertSoupEquals(self, toParse, rep=None, c=BeautifulSoup,
       
    14                          encoding=None):
       
    15         """Parse the given text and make sure its string rep is the other
       
    16         given text."""
       
    17         if rep == None:
       
    18             rep = toParse
       
    19         obj = c(toParse)
       
    20         if encoding is None:
       
    21             rep2 = obj.decode()
       
    22         else:
       
    23             rep2 = obj.encode(encoding)
       
    24         self.assertEqual(rep2, rep)
       
    25 
       
    26 class FollowThatTag(SoupTest):
       
    27 
       
    28     "Tests the various ways of fetching tags from a soup."
       
    29 
       
    30     def setUp(self):
       
    31         ml = """
       
    32         <a id="x">1</a>
       
    33         <A id="a">2</a>
       
    34         <b id="b">3</a>
       
    35         <b href="foo" id="x">4</a>
       
    36         <ac width=100>4</ac>"""
       
    37         self.soup = BeautifulStoneSoup(ml)
       
    38 
       
    39     def testFindAllByName(self):
       
    40         matching = self.soup('a')
       
    41         self.assertEqual(len(matching), 2)
       
    42         self.assertEqual(matching[0].name, 'a')
       
    43         self.assertEqual(matching, self.soup.findAll('a'))
       
    44         self.assertEqual(matching, self.soup.findAll(SoupStrainer('a')))
       
    45 
       
    46     def testFindAllByAttribute(self):
       
    47         matching = self.soup.findAll(id='x')
       
    48         self.assertEqual(len(matching), 2)
       
    49         self.assertEqual(matching[0].name, 'a')
       
    50         self.assertEqual(matching[1].name, 'b')
       
    51 
       
    52         matching2 = self.soup.findAll(attrs={'id' : 'x'})
       
    53         self.assertEqual(matching, matching2)
       
    54 
       
    55         strainer = SoupStrainer(attrs={'id' : 'x'})
       
    56         self.assertEqual(matching, self.soup.findAll(strainer))
       
    57 
       
    58         self.assertEqual(len(self.soup.findAll(id=None)), 1)
       
    59 
       
    60         self.assertEqual(len(self.soup.findAll(width=100)), 1)
       
    61         self.assertEqual(len(self.soup.findAll(junk=None)), 5)
       
    62         self.assertEqual(len(self.soup.findAll(junk=[1, None])), 5)
       
    63 
       
    64         self.assertEqual(len(self.soup.findAll(junk=re.compile('.*'))), 0)
       
    65         self.assertEqual(len(self.soup.findAll(junk=True)), 0)
       
    66 
       
    67         self.assertEqual(len(self.soup.findAll(junk=True)), 0)
       
    68         self.assertEqual(len(self.soup.findAll(href=True)), 1)
       
    69 
       
    70     def testFindallByClass(self):
       
    71         soup = BeautifulSoup('<a>Foo</a><a class="1">Bar</a>')
       
    72         self.assertEqual(soup.find('a', '1').string, "Bar")
       
    73 
       
    74     def testFindAllByList(self):
       
    75         matching = self.soup(['a', 'ac'])
       
    76         self.assertEqual(len(matching), 3)
       
    77 
       
    78     def testFindAllByHash(self):
       
    79         matching = self.soup({'a' : True, 'b' : True})
       
    80         self.assertEqual(len(matching), 4)
       
    81 
       
    82     def testFindAllText(self):
       
    83         soup = BeautifulSoup("<html>\xbb</html>")
       
    84         self.assertEqual(soup.findAll(text=re.compile('.*')),
       
    85                          [u'\xbb'])
       
    86 
       
    87     def testFindAllByRE(self):
       
    88         import re
       
    89         r = re.compile('a.*')
       
    90         self.assertEqual(len(self.soup(r)), 3)
       
    91 
       
    92     def testFindAllByMethod(self):
       
    93         def matchTagWhereIDMatchesName(tag):
       
    94             return tag.name == tag.get('id')
       
    95 
       
    96         matching = self.soup.findAll(matchTagWhereIDMatchesName)
       
    97         self.assertEqual(len(matching), 2)
       
    98         self.assertEqual(matching[0].name, 'a')
       
    99 
       
   100     def testParents(self):
       
   101         soup = BeautifulSoup('<ul id="foo"></ul><ul id="foo"><ul><ul id="foo" a="b"><b>Blah')
       
   102         b = soup.b
       
   103         self.assertEquals(len(b.findParents('ul', {'id' : 'foo'})), 2)
       
   104         self.assertEquals(b.findParent('ul')['a'], 'b')
       
   105 
       
   106     PROXIMITY_TEST = BeautifulSoup('<b id="1"><b id="2"><b id="3"><b id="4">')
       
   107 
       
   108     def testNext(self):
       
   109         soup = self.PROXIMITY_TEST
       
   110         b = soup.find('b', {'id' : 2})
       
   111         self.assertEquals(b.findNext('b')['id'], '3')
       
   112         self.assertEquals(b.findNext('b')['id'], '3')
       
   113         self.assertEquals(len(b.findAllNext('b')), 2)
       
   114         self.assertEquals(len(b.findAllNext('b', {'id' : 4})), 1)
       
   115 
       
   116     def testPrevious(self):
       
   117         soup = self.PROXIMITY_TEST
       
   118         b = soup.find('b', {'id' : 3})
       
   119         self.assertEquals(b.findPrevious('b')['id'], '2')
       
   120         self.assertEquals(b.findPrevious('b')['id'], '2')
       
   121         self.assertEquals(len(b.findAllPrevious('b')), 2)
       
   122         self.assertEquals(len(b.findAllPrevious('b', {'id' : 2})), 1)
       
   123 
       
   124 
       
   125     SIBLING_TEST = BeautifulSoup('<blockquote id="1"><blockquote id="1.1"></blockquote></blockquote><blockquote id="2"><blockquote id="2.1"></blockquote></blockquote><blockquote id="3"><blockquote id="3.1"></blockquote></blockquote><blockquote id="4">')
       
   126 
       
   127     def testNextSibling(self):
       
   128         soup = self.SIBLING_TEST
       
   129         tag = 'blockquote'
       
   130         b = soup.find(tag, {'id' : 2})
       
   131         self.assertEquals(b.findNext(tag)['id'], '2.1')
       
   132         self.assertEquals(b.findNextSibling(tag)['id'], '3')
       
   133         self.assertEquals(b.findNextSibling(tag)['id'], '3')
       
   134         self.assertEquals(len(b.findNextSiblings(tag)), 2)
       
   135         self.assertEquals(len(b.findNextSiblings(tag, {'id' : 4})), 1)
       
   136 
       
   137     def testPreviousSibling(self):
       
   138         soup = self.SIBLING_TEST
       
   139         tag = 'blockquote'
       
   140         b = soup.find(tag, {'id' : 3})
       
   141         self.assertEquals(b.findPrevious(tag)['id'], '2.1')
       
   142         self.assertEquals(b.findPreviousSibling(tag)['id'], '2')
       
   143         self.assertEquals(b.findPreviousSibling(tag)['id'], '2')
       
   144         self.assertEquals(len(b.findPreviousSiblings(tag)), 2)
       
   145         self.assertEquals(len(b.findPreviousSiblings(tag, id=1)), 1)
       
   146 
       
   147     def testTextNavigation(self):
       
   148         soup = BeautifulSoup('Foo<b>Bar</b><i id="1"><b>Baz<br />Blee<hr id="1"/></b></i>Blargh')
       
   149         baz = soup.find(text='Baz')
       
   150         self.assertEquals(baz.findParent("i")['id'], '1')
       
   151         self.assertEquals(baz.findNext(text='Blee'), 'Blee')
       
   152         self.assertEquals(baz.findNextSibling(text='Blee'), 'Blee')
       
   153         self.assertEquals(baz.findNextSibling(text='Blargh'), None)
       
   154         self.assertEquals(baz.findNextSibling('hr')['id'], '1')
       
   155 
       
   156 class SiblingRivalry(SoupTest):
       
   157     "Tests the nextSibling and previousSibling navigation."
       
   158 
       
   159     def testSiblings(self):
       
   160         soup = BeautifulSoup("<ul><li>1<p>A</p>B<li>2<li>3</ul>")
       
   161         secondLI = soup.find('li').nextSibling
       
   162         self.assert_(secondLI.name == 'li' and secondLI.string == '2')
       
   163         self.assertEquals(soup.find(text='1').nextSibling.name, 'p')
       
   164         self.assertEquals(soup.find('p').nextSibling, 'B')
       
   165         self.assertEquals(soup.find('p').nextSibling.previousSibling.nextSibling, 'B')
       
   166 
       
   167 class TagsAreObjectsToo(SoupTest):
       
   168     "Tests the various built-in functions of Tag objects."
       
   169 
       
   170     def testLen(self):
       
   171         soup = BeautifulSoup("<top>1<b>2</b>3</top>")
       
   172         self.assertEquals(len(soup.top), 3)
       
   173 
       
   174 class StringEmUp(SoupTest):
       
   175     "Tests the use of 'string' as an alias for a tag's only content."
       
   176 
       
   177     def testString(self):
       
   178         s = BeautifulSoup("<b>foo</b>")
       
   179         self.assertEquals(s.b.string, 'foo')
       
   180 
       
   181     def testLackOfString(self):
       
   182         s = BeautifulSoup("<b>f<i>e</i>o</b>")
       
   183         self.assert_(not s.b.string)
       
   184 
       
   185 class ThatsMyLimit(SoupTest):
       
   186     "Tests the limit argument."
       
   187 
       
   188     def testBasicLimits(self):
       
   189         s = BeautifulSoup('<br id="1" /><br id="1" /><br id="1" /><br id="1" />')
       
   190         self.assertEquals(len(s.findAll('br')), 4)
       
   191         self.assertEquals(len(s.findAll('br', limit=2)), 2)
       
   192         self.assertEquals(len(s('br', limit=2)), 2)
       
   193 
       
   194 class OnlyTheLonely(SoupTest):
       
   195     "Tests the parseOnly argument to the constructor."
       
   196     def setUp(self):
       
   197         x = []
       
   198         for i in range(1,6):
       
   199             x.append('<a id="%s">' % i)
       
   200             for j in range(100,103):
       
   201                 x.append('<b id="%s.%s">Content %s.%s</b>' % (i,j, i,j))
       
   202             x.append('</a>')
       
   203         self.x = ''.join(x)
       
   204 
       
   205     def testOnly(self):
       
   206         strainer = SoupStrainer("b")
       
   207         soup = BeautifulSoup(self.x, parseOnlyThese=strainer)
       
   208         self.assertEquals(len(soup), 15)
       
   209 
       
   210         strainer = SoupStrainer(id=re.compile("100.*"))
       
   211         soup = BeautifulSoup(self.x, parseOnlyThese=strainer)
       
   212         self.assertEquals(len(soup), 5)
       
   213 
       
   214         strainer = SoupStrainer(text=re.compile("10[01].*"))
       
   215         soup = BeautifulSoup(self.x, parseOnlyThese=strainer)
       
   216         self.assertEquals(len(soup), 10)
       
   217 
       
   218         strainer = SoupStrainer(text=lambda(x):x[8]=='3')
       
   219         soup = BeautifulSoup(self.x, parseOnlyThese=strainer)
       
   220         self.assertEquals(len(soup), 3)
       
   221 
       
   222 class PickleMeThis(SoupTest):
       
   223     "Testing features like pickle and deepcopy."
       
   224 
       
   225     def setUp(self):
       
   226         self.page = """<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.0 Transitional//EN"
       
   227 "http://www.w3.org/TR/REC-html40/transitional.dtd">
       
   228 <html>
       
   229 <head>
       
   230 <meta http-equiv="Content-Type" content="text/html; charset=utf-8">
       
   231 <title>Beautiful Soup: We called him Tortoise because he taught us.</title>
       
   232 <link rev="made" href="mailto:leonardr@segfault.org">
       
   233 <meta name="Description" content="Beautiful Soup: an HTML parser optimized for screen-scraping.">
       
   234 <meta name="generator" content="Markov Approximation 1.4 (module: leonardr)">
       
   235 <meta name="author" content="Leonard Richardson">
       
   236 </head>
       
   237 <body>
       
   238 <a href="foo">foo</a>
       
   239 <a href="foo"><b>bar</b></a>
       
   240 </body>
       
   241 </html>"""
       
   242 
       
   243         self.soup = BeautifulSoup(self.page)
       
   244 
       
   245     def testPickle(self):
       
   246         import pickle
       
   247         dumped = pickle.dumps(self.soup, 2)
       
   248         loaded = pickle.loads(dumped)
       
   249         self.assertEqual(loaded.__class__, BeautifulSoup)
       
   250         self.assertEqual(loaded.decode(), self.soup.decode())
       
   251 
       
   252     def testDeepcopy(self):
       
   253         from copy import deepcopy
       
   254         deepcopy(BeautifulSoup("<a></a>"))
       
   255         copied = deepcopy(self.soup)
       
   256         self.assertEqual(copied.decode(), self.soup.decode())
       
   257 
       
   258     def testUnicodePickle(self):
       
   259         import cPickle as pickle
       
   260         html = "<b>" + chr(0xc3) + "</b>"
       
   261         soup = BeautifulSoup(html)
       
   262         dumped = pickle.dumps(soup, pickle.HIGHEST_PROTOCOL)
       
   263         loaded = pickle.loads(dumped)
       
   264         self.assertEqual(loaded.decode(), soup.decode())
       
   265 
       
   266 
       
   267 class WriteOnlyCode(SoupTest):
       
   268     "Testing the modification of the tree."
       
   269 
       
   270     def testModifyAttributes(self):
       
   271         soup = BeautifulSoup('<a id="1"></a>')
       
   272         soup.a['id'] = 2
       
   273         self.assertEqual(soup.decode(), '<a id="2"></a>')
       
   274         del(soup.a['id'])
       
   275         self.assertEqual(soup.decode(), '<a></a>')
       
   276         soup.a['id2'] = 'foo'
       
   277         self.assertEqual(soup.decode(), '<a id2="foo"></a>')
       
   278 
       
   279     def testNewTagCreation(self):
       
   280         "Makes sure tags don't step on each others' toes."
       
   281         soup = BeautifulSoup()
       
   282         a = Tag(soup, 'a')
       
   283         ol = Tag(soup, 'ol')
       
   284         a['href'] = 'http://foo.com/'
       
   285         self.assertRaises(KeyError, lambda : ol['href'])
       
   286 
       
   287     def testTagReplacement(self):
       
   288         # Make sure you can replace an element with itself.
       
   289         text = "<a><b></b><c>Foo<d></d></c></a><a><e></e></a>"
       
   290         soup = BeautifulSoup(text)
       
   291         c = soup.c
       
   292         soup.c.replaceWith(c)
       
   293         self.assertEquals(soup.decode(), text)
       
   294 
       
   295         # A very simple case
       
   296         soup = BeautifulSoup("<b>Argh!</b>")
       
   297         soup.find(text="Argh!").replaceWith("Hooray!")
       
   298         newText = soup.find(text="Hooray!")
       
   299         b = soup.b
       
   300         self.assertEqual(newText.previous, b)
       
   301         self.assertEqual(newText.parent, b)
       
   302         self.assertEqual(newText.previous.next, newText)
       
   303         self.assertEqual(newText.next, None)
       
   304 
       
   305         # A more complex case
       
   306         soup = BeautifulSoup("<a><b>Argh!</b><c></c><d></d></a>")
       
   307         soup.b.insert(1, "Hooray!")
       
   308         newText = soup.find(text="Hooray!")
       
   309         self.assertEqual(newText.previous, "Argh!")
       
   310         self.assertEqual(newText.previous.next, newText)
       
   311 
       
   312         self.assertEqual(newText.previousSibling, "Argh!")
       
   313         self.assertEqual(newText.previousSibling.nextSibling, newText)
       
   314 
       
   315         self.assertEqual(newText.nextSibling, None)
       
   316         self.assertEqual(newText.next, soup.c)
       
   317 
       
   318         text = "<html>There's <b>no</b> business like <b>show</b> business</html>"
       
   319         soup = BeautifulSoup(text)
       
   320         no, show = soup.findAll('b')
       
   321         show.replaceWith(no)
       
   322         self.assertEquals(soup.decode(), "<html>There's  business like <b>no</b> business</html>")
       
   323 
       
   324         # Even more complex
       
   325         soup = BeautifulSoup("<a><b>Find</b><c>lady!</c><d></d></a>")
       
   326         tag = Tag(soup, 'magictag')
       
   327         tag.insert(0, "the")
       
   328         soup.a.insert(1, tag)
       
   329 
       
   330         b = soup.b
       
   331         c = soup.c
       
   332         theText = tag.find(text=True)
       
   333         findText = b.find(text="Find")
       
   334 
       
   335         self.assertEqual(findText.next, tag)
       
   336         self.assertEqual(tag.previous, findText)
       
   337         self.assertEqual(b.nextSibling, tag)
       
   338         self.assertEqual(tag.previousSibling, b)
       
   339         self.assertEqual(tag.nextSibling, c)
       
   340         self.assertEqual(c.previousSibling, tag)
       
   341 
       
   342         self.assertEqual(theText.next, c)
       
   343         self.assertEqual(c.previous, theText)
       
   344 
       
   345         # Aand... incredibly complex.
       
   346         soup = BeautifulSoup("""<a>We<b>reserve<c>the</c><d>right</d></b></a><e>to<f>refuse</f><g>service</g></e>""")
       
   347         f = soup.f
       
   348         a = soup.a
       
   349         c = soup.c
       
   350         e = soup.e
       
   351         weText = a.find(text="We")
       
   352         soup.b.replaceWith(soup.f)
       
   353         self.assertEqual(soup.decode(), "<a>We<f>refuse</f></a><e>to<g>service</g></e>")
       
   354 
       
   355         self.assertEqual(f.previous, weText)
       
   356         self.assertEqual(weText.next, f)
       
   357         self.assertEqual(f.previousSibling, weText)
       
   358         self.assertEqual(f.nextSibling, None)
       
   359         self.assertEqual(weText.nextSibling, f)
       
   360 
       
   361     def testAppend(self):
       
   362        doc = "<p>Don't leave me <b>here</b>.</p> <p>Don't leave me.</p>"
       
   363        soup = BeautifulSoup(doc)
       
   364        second_para = soup('p')[1]
       
   365        bold = soup.find('b')
       
   366        soup('p')[1].append(soup.find('b'))
       
   367        self.assertEqual(bold.parent, second_para)
       
   368        self.assertEqual(soup.decode(),
       
   369                         "<p>Don't leave me .</p> "
       
   370                         "<p>Don't leave me.<b>here</b></p>")
       
   371 
       
   372     def testTagExtraction(self):
       
   373         # A very simple case
       
   374         text = '<html><div id="nav">Nav crap</div>Real content here.</html>'
       
   375         soup = BeautifulSoup(text)
       
   376         extracted = soup.find("div", id="nav").extract()
       
   377         self.assertEqual(soup.decode(), "<html>Real content here.</html>")
       
   378         self.assertEqual(extracted.decode(), '<div id="nav">Nav crap</div>')
       
   379 
       
   380         # A simple case, a more complex test.
       
   381         text = "<doc><a>1<b>2</b></a><a>i<b>ii</b></a><a>A<b>B</b></a></doc>"
       
   382         soup = BeautifulStoneSoup(text)
       
   383         doc = soup.doc
       
   384         numbers, roman, letters = soup("a")
       
   385 
       
   386         self.assertEqual(roman.parent, doc)
       
   387         oldPrevious = roman.previous
       
   388         endOfThisTag = roman.nextSibling.previous
       
   389         self.assertEqual(oldPrevious, "2")
       
   390         self.assertEqual(roman.next, "i")
       
   391         self.assertEqual(endOfThisTag, "ii")
       
   392         self.assertEqual(roman.previousSibling, numbers)
       
   393         self.assertEqual(roman.nextSibling, letters)
       
   394 
       
   395         roman.extract()
       
   396         self.assertEqual(roman.parent, None)
       
   397         self.assertEqual(roman.previous, None)
       
   398         self.assertEqual(roman.next, "i")
       
   399         self.assertEqual(letters.previous, '2')
       
   400         self.assertEqual(roman.previousSibling, None)
       
   401         self.assertEqual(roman.nextSibling, None)
       
   402         self.assertEqual(endOfThisTag.next, None)
       
   403         self.assertEqual(roman.b.contents[0].next, None)
       
   404         self.assertEqual(numbers.nextSibling, letters)
       
   405         self.assertEqual(letters.previousSibling, numbers)
       
   406         self.assertEqual(len(doc.contents), 2)
       
   407         self.assertEqual(doc.contents[0], numbers)
       
   408         self.assertEqual(doc.contents[1], letters)
       
   409 
       
   410         # A more complex case.
       
   411         text = "<a>1<b>2<c>Hollywood, baby!</c></b></a>3"
       
   412         soup = BeautifulStoneSoup(text)
       
   413         one = soup.find(text="1")
       
   414         three = soup.find(text="3")
       
   415         toExtract = soup.b
       
   416         soup.b.extract()
       
   417         self.assertEqual(one.next, three)
       
   418         self.assertEqual(three.previous, one)
       
   419         self.assertEqual(one.parent.nextSibling, three)
       
   420         self.assertEqual(three.previousSibling, soup.a)
       
   421 
       
   422 class TheManWithoutAttributes(SoupTest):
       
   423     "Test attribute access"
       
   424 
       
   425     def testHasKey(self):
       
   426         text = "<foo attr='bar'>"
       
   427         self.assertTrue(BeautifulSoup(text).foo.has_key('attr'))
       
   428 
       
   429 class QuoteMeOnThat(SoupTest):
       
   430     "Test quoting"
       
   431     def testQuotedAttributeValues(self):
       
   432         self.assertSoupEquals("<foo attr='bar'></foo>",
       
   433                               '<foo attr="bar"></foo>')
       
   434 
       
   435         text = """<foo attr='bar "brawls" happen'>a</foo>"""
       
   436         soup = BeautifulSoup(text)
       
   437         self.assertEquals(soup.decode(), text)
       
   438 
       
   439         soup.foo['attr'] = 'Brawls happen at "Bob\'s Bar"'
       
   440         newText = """<foo attr='Brawls happen at "Bob&squot;s Bar"'>a</foo>"""
       
   441         self.assertSoupEquals(soup.decode(), newText)
       
   442 
       
   443         self.assertSoupEquals('<this is="really messed up & stuff">',
       
   444                               '<this is="really messed up &amp; stuff"></this>')
       
   445 
       
   446 
       
   447 
       
   448 class YoureSoLiteral(SoupTest):
       
   449     "Test literal mode."
       
   450     def testLiteralMode(self):
       
   451         text = "<script>if (i<imgs.length)</script><b>Foo</b>"
       
   452         soup = BeautifulSoup(text)
       
   453         self.assertEqual(soup.script.contents[0], "if (i<imgs.length)")
       
   454         self.assertEqual(soup.b.contents[0], "Foo")
       
   455 
       
   456     def testTextArea(self):
       
   457         text = "<textarea><b>This is an example of an HTML tag</b><&<&</textarea>"
       
   458         soup = BeautifulSoup(text)
       
   459         self.assertEqual(soup.textarea.contents[0],
       
   460                          "<b>This is an example of an HTML tag</b><&<&")
       
   461 
       
   462 class OperatorOverload(SoupTest):
       
   463     "Our operators do it all! Call now!"
       
   464 
       
   465     def testTagNameAsFind(self):
       
   466         "Tests that referencing a tag name as a member delegates to find()."
       
   467         soup = BeautifulSoup('<b id="1">foo<i>bar</i></b><b>Red herring</b>')
       
   468         self.assertEqual(soup.b.i, soup.find('b').find('i'))
       
   469         self.assertEqual(soup.b.i.string, 'bar')
       
   470         self.assertEqual(soup.b['id'], '1')
       
   471         self.assertEqual(soup.b.contents[0], 'foo')
       
   472         self.assert_(not soup.a)
       
   473 
       
   474         #Test the .fooTag variant of .foo.
       
   475         self.assertEqual(soup.bTag.iTag.string, 'bar')
       
   476         self.assertEqual(soup.b.iTag.string, 'bar')
       
   477         self.assertEqual(soup.find('b').find('i'), soup.bTag.iTag)
       
   478 
       
   479 class NestableEgg(SoupTest):
       
   480     """Here we test tag nesting. TEST THE NEST, DUDE! X-TREME!"""
       
   481 
       
   482     def testParaInsideBlockquote(self):
       
   483         soup = BeautifulSoup('<blockquote><p><b>Foo</blockquote><p>Bar')
       
   484         self.assertEqual(soup.blockquote.p.b.string, 'Foo')
       
   485         self.assertEqual(soup.blockquote.b.string, 'Foo')
       
   486         self.assertEqual(soup.find('p', recursive=False).string, 'Bar')
       
   487 
       
   488     def testNestedTables(self):
       
   489         text = """<table id="1"><tr><td>Here's another table:
       
   490         <table id="2"><tr><td>Juicy text</td></tr></table></td></tr></table>"""
       
   491         soup = BeautifulSoup(text)
       
   492         self.assertEquals(soup.table.table.td.string, 'Juicy text')
       
   493         self.assertEquals(len(soup.findAll('table')), 2)
       
   494         self.assertEquals(len(soup.table.findAll('table')), 1)
       
   495         self.assertEquals(soup.find('table', {'id' : 2}).parent.parent.parent.name,
       
   496                           'table')
       
   497 
       
   498         text = "<table><tr><td><div><table>Foo</table></div></td></tr></table>"
       
   499         soup = BeautifulSoup(text)
       
   500         self.assertEquals(soup.table.tr.td.div.table.contents[0], "Foo")
       
   501 
       
   502         text = """<table><thead><tr>Foo</tr></thead><tbody><tr>Bar</tr></tbody>
       
   503         <tfoot><tr>Baz</tr></tfoot></table>"""
       
   504         soup = BeautifulSoup(text)
       
   505         self.assertEquals(soup.table.thead.tr.contents[0], "Foo")
       
   506 
       
   507     def testBadNestedTables(self):
       
   508         soup = BeautifulSoup("<table><tr><table><tr id='nested'>")
       
   509         self.assertEquals(soup.table.tr.table.tr['id'], 'nested')
       
   510 
       
   511 class CleanupOnAisleFour(SoupTest):
       
   512     """Here we test cleanup of text that breaks HTMLParser or is just
       
   513     obnoxious."""
       
   514 
       
   515     def testSelfClosingtag(self):
       
   516         self.assertEqual(BeautifulSoup("Foo<br/>Bar").find('br').decode(),
       
   517                          '<br />')
       
   518 
       
   519         self.assertSoupEquals('<p>test1<br/>test2</p>',
       
   520                               '<p>test1<br />test2</p>')
       
   521 
       
   522         text = '<p>test1<selfclosing>test2'
       
   523         soup = BeautifulStoneSoup(text)
       
   524         self.assertEqual(soup.decode(),
       
   525                          '<p>test1<selfclosing>test2</selfclosing></p>')
       
   526 
       
   527         soup = BeautifulStoneSoup(text, selfClosingTags='selfclosing')
       
   528         self.assertEqual(soup.decode(),
       
   529                          '<p>test1<selfclosing />test2</p>')
       
   530 
       
   531     def testSelfClosingTagOrNot(self):
       
   532         text = "<item><link>http://foo.com/</link></item>"
       
   533         self.assertEqual(BeautifulStoneSoup(text).decode(), text)
       
   534         self.assertEqual(BeautifulSoup(text).decode(),
       
   535                          '<item><link />http://foo.com/</item>')
       
   536 
       
   537     def testBooleanAttributes(self):
       
   538         text = "<td nowrap>foo</td>"
       
   539         self.assertSoupEquals(text, text)
       
   540 
       
   541     def testCData(self):
       
   542         xml = "<root>foo<![CDATA[foobar]]>bar</root>"
       
   543         self.assertSoupEquals(xml, xml)
       
   544         r = re.compile("foo.*bar")
       
   545         soup = BeautifulSoup(xml)
       
   546         self.assertEquals(soup.find(text=r).string, "foobar")
       
   547         self.assertEquals(soup.find(text=r).__class__, CData)
       
   548 
       
   549     def testComments(self):
       
   550         xml = "foo<!--foobar-->baz"
       
   551         self.assertSoupEquals(xml)
       
   552         r = re.compile("foo.*bar")
       
   553         soup = BeautifulSoup(xml)
       
   554         self.assertEquals(soup.find(text=r).string, "foobar")
       
   555         self.assertEquals(soup.find(text="foobar").__class__, Comment)
       
   556 
       
   557     def testDeclaration(self):
       
   558         xml = "foo<!DOCTYPE foobar>baz"
       
   559         self.assertSoupEquals(xml)
       
   560         r = re.compile(".*foo.*bar")
       
   561         soup = BeautifulSoup(xml)
       
   562         text = "DOCTYPE foobar"
       
   563         self.assertEquals(soup.find(text=r).string, text)
       
   564         self.assertEquals(soup.find(text=text).__class__, Declaration)
       
   565 
       
   566         namespaced_doctype = ('<!DOCTYPE xsl:stylesheet SYSTEM "htmlent.dtd">'
       
   567                               '<html>foo</html>')
       
   568         soup = BeautifulSoup(namespaced_doctype)
       
   569         self.assertEquals(soup.contents[0],
       
   570                           'DOCTYPE xsl:stylesheet SYSTEM "htmlent.dtd"')
       
   571         self.assertEquals(soup.html.contents[0], 'foo')
       
   572 
       
   573     def testEntityConversions(self):
       
   574         text = "&lt;&lt;sacr&eacute;&#32;bleu!&gt;&gt;"
       
   575         soup = BeautifulStoneSoup(text)
       
   576         self.assertSoupEquals(text)
       
   577 
       
   578         xmlEnt = BeautifulStoneSoup.XML_ENTITIES
       
   579         htmlEnt = BeautifulStoneSoup.HTML_ENTITIES
       
   580         xhtmlEnt = BeautifulStoneSoup.XHTML_ENTITIES
       
   581 
       
   582         soup = BeautifulStoneSoup(text, convertEntities=xmlEnt)
       
   583         self.assertEquals(soup.decode(), "<<sacr&eacute; bleu!>>")
       
   584 
       
   585         soup = BeautifulStoneSoup(text, convertEntities=xmlEnt)
       
   586         self.assertEquals(soup.decode(), "<<sacr&eacute; bleu!>>")
       
   587 
       
   588         soup = BeautifulStoneSoup(text, convertEntities=htmlEnt)
       
   589         self.assertEquals(soup.decode(), u"<<sacr\xe9 bleu!>>")
       
   590 
       
   591         # Make sure the "XML", "HTML", and "XHTML" settings work.
       
   592         text = "&lt;&trade;&apos;"
       
   593         soup = BeautifulStoneSoup(text, convertEntities=xmlEnt)
       
   594         self.assertEquals(soup.decode(), u"<&trade;'")
       
   595 
       
   596         soup = BeautifulStoneSoup(text, convertEntities=htmlEnt)
       
   597         self.assertEquals(soup.decode(), u"<\u2122&apos;")
       
   598 
       
   599         soup = BeautifulStoneSoup(text, convertEntities=xhtmlEnt)
       
   600         self.assertEquals(soup.decode(), u"<\u2122'")
       
   601 
       
   602     def testNonBreakingSpaces(self):
       
   603         soup = BeautifulSoup("<a>&nbsp;&nbsp;</a>",
       
   604                              convertEntities=BeautifulStoneSoup.HTML_ENTITIES)
       
   605         self.assertEquals(soup.decode(), u"<a>\xa0\xa0</a>")
       
   606 
       
   607     def testWhitespaceInDeclaration(self):
       
   608         self.assertSoupEquals('<! DOCTYPE>', '<!DOCTYPE>')
       
   609 
       
   610     def testJunkInDeclaration(self):
       
   611         self.assertSoupEquals('<! Foo = -8>a', '<!Foo = -8>a')
       
   612 
       
   613     def testIncompleteDeclaration(self):
       
   614         self.assertSoupEquals('a<!b <p>c')
       
   615 
       
   616     def testEntityReplacement(self):
       
   617         self.assertSoupEquals('<b>hello&nbsp;there</b>')
       
   618 
       
   619     def testEntitiesInAttributeValues(self):
       
   620         self.assertSoupEquals('<x t="x&#241;">', '<x t="x\xc3\xb1"></x>',
       
   621                               encoding='utf-8')
       
   622         self.assertSoupEquals('<x t="x&#xf1;">', '<x t="x\xc3\xb1"></x>',
       
   623                               encoding='utf-8')
       
   624 
       
   625         soup = BeautifulSoup('<x t="&gt;&trade;">',
       
   626                              convertEntities=BeautifulStoneSoup.HTML_ENTITIES)
       
   627         self.assertEquals(soup.decode(), u'<x t="&gt;\u2122"></x>')
       
   628 
       
   629         uri = "http://crummy.com?sacr&eacute;&amp;bleu"
       
   630         link = '<a href="%s"></a>' % uri
       
   631 
       
   632         soup = BeautifulSoup(link, convertEntities=BeautifulSoup.HTML_ENTITIES)
       
   633         self.assertEquals(soup.decode(),
       
   634                           link.replace("&eacute;", u"\xe9"))
       
   635 
       
   636         uri = "http://crummy.com?sacr&eacute;&bleu"
       
   637         link = '<a href="%s"></a>' % uri
       
   638         soup = BeautifulSoup(link, convertEntities=BeautifulSoup.HTML_ENTITIES)
       
   639         self.assertEquals(soup.a['href'],
       
   640                           uri.replace("&eacute;", u"\xe9"))
       
   641 
       
   642     def testNakedAmpersands(self):
       
   643         html = {'convertEntities':BeautifulStoneSoup.HTML_ENTITIES}
       
   644         soup = BeautifulStoneSoup("AT&T ", **html)
       
   645         self.assertEquals(soup.decode(), 'AT&amp;T ')
       
   646 
       
   647         nakedAmpersandInASentence = "AT&T was Ma Bell"
       
   648         soup = BeautifulStoneSoup(nakedAmpersandInASentence,**html)
       
   649         self.assertEquals(soup.decode(), \
       
   650                nakedAmpersandInASentence.replace('&','&amp;'))
       
   651 
       
   652         invalidURL = '<a href="http://example.org?a=1&b=2;3">foo</a>'
       
   653         validURL = invalidURL.replace('&','&amp;')
       
   654         soup = BeautifulStoneSoup(invalidURL)
       
   655         self.assertEquals(soup.decode(), validURL)
       
   656 
       
   657         soup = BeautifulStoneSoup(validURL)
       
   658         self.assertEquals(soup.decode(), validURL)
       
   659 
       
   660 
       
   661 class EncodeRed(SoupTest):
       
   662     """Tests encoding conversion, Unicode conversion, and Microsoft
       
   663     smart quote fixes."""
       
   664 
       
   665     def testUnicodeDammitStandalone(self):
       
   666         markup = "<foo>\x92</foo>"
       
   667         dammit = UnicodeDammit(markup)
       
   668         self.assertEquals(dammit.unicode, "<foo>&#x2019;</foo>")
       
   669 
       
   670         hebrew = "\xed\xe5\xec\xf9"
       
   671         dammit = UnicodeDammit(hebrew, ["iso-8859-8"])
       
   672         self.assertEquals(dammit.unicode, u'\u05dd\u05d5\u05dc\u05e9')
       
   673         self.assertEquals(dammit.originalEncoding, 'iso-8859-8')
       
   674 
       
   675     def testGarbageInGarbageOut(self):
       
   676         ascii = "<foo>a</foo>"
       
   677         asciiSoup = BeautifulStoneSoup(ascii)
       
   678         self.assertEquals(ascii, asciiSoup.decode())
       
   679 
       
   680         unicodeData = u"<foo>\u00FC</foo>"
       
   681         utf8 = unicodeData.encode("utf-8")
       
   682         self.assertEquals(utf8, '<foo>\xc3\xbc</foo>')
       
   683 
       
   684         unicodeSoup = BeautifulStoneSoup(unicodeData)
       
   685         self.assertEquals(unicodeData, unicodeSoup.decode())
       
   686         self.assertEquals(unicodeSoup.foo.string, u'\u00FC')
       
   687 
       
   688         utf8Soup = BeautifulStoneSoup(utf8, fromEncoding='utf-8')
       
   689         self.assertEquals(utf8, utf8Soup.encode('utf-8'))
       
   690         self.assertEquals(utf8Soup.originalEncoding, "utf-8")
       
   691 
       
   692         utf8Soup = BeautifulStoneSoup(unicodeData)
       
   693         self.assertEquals(utf8, utf8Soup.encode('utf-8'))
       
   694         self.assertEquals(utf8Soup.originalEncoding, None)
       
   695 
       
   696 
       
   697     def testHandleInvalidCodec(self):
       
   698         for bad_encoding in ['.utf8', '...', 'utF---16.!']:
       
   699             soup = BeautifulSoup(u"Räksmörgås".encode("utf-8"),
       
   700                                  fromEncoding=bad_encoding)
       
   701             self.assertEquals(soup.originalEncoding, 'utf-8')
       
   702 
       
   703     def testUnicodeSearch(self):
       
   704         html = u'<html><body><h1>Räksmörgås</h1></body></html>'
       
   705         soup = BeautifulSoup(html)
       
   706         self.assertEqual(soup.find(text=u'Räksmörgås'),u'Räksmörgås')
       
   707 
       
   708     def testRewrittenXMLHeader(self):
       
   709         euc_jp = '<?xml version="1.0 encoding="euc-jp"?>\n<foo>\n\xa4\xb3\xa4\xec\xa4\xcfEUC-JP\xa4\xc7\xa5\xb3\xa1\xbc\xa5\xc7\xa5\xa3\xa5\xf3\xa5\xb0\xa4\xb5\xa4\xec\xa4\xbf\xc6\xfc\xcb\xdc\xb8\xec\xa4\xce\xa5\xd5\xa5\xa1\xa5\xa4\xa5\xeb\xa4\xc7\xa4\xb9\xa1\xa3\n</foo>\n'
       
   710         utf8 = "<?xml version='1.0' encoding='utf-8'?>\n<foo>\n\xe3\x81\x93\xe3\x82\x8c\xe3\x81\xafEUC-JP\xe3\x81\xa7\xe3\x82\xb3\xe3\x83\xbc\xe3\x83\x87\xe3\x82\xa3\xe3\x83\xb3\xe3\x82\xb0\xe3\x81\x95\xe3\x82\x8c\xe3\x81\x9f\xe6\x97\xa5\xe6\x9c\xac\xe8\xaa\x9e\xe3\x81\xae\xe3\x83\x95\xe3\x82\xa1\xe3\x82\xa4\xe3\x83\xab\xe3\x81\xa7\xe3\x81\x99\xe3\x80\x82\n</foo>\n"
       
   711         soup = BeautifulStoneSoup(euc_jp)
       
   712         if soup.originalEncoding != "euc-jp":
       
   713             raise Exception("Test failed when parsing euc-jp document. "
       
   714                             "If you're running Python >=2.4, or you have "
       
   715                             "cjkcodecs installed, this is a real problem. "
       
   716                             "Otherwise, ignore it.")
       
   717 
       
   718         self.assertEquals(soup.originalEncoding, "euc-jp")
       
   719         self.assertEquals(soup.renderContents('utf-8'), utf8)
       
   720 
       
   721         old_text = "<?xml encoding='windows-1252'><foo>\x92</foo>"
       
   722         new_text = "<?xml version='1.0' encoding='utf-8'?><foo>&rsquo;</foo>"
       
   723         self.assertSoupEquals(old_text, new_text)
       
   724 
       
   725     def testRewrittenMetaTag(self):
       
   726         no_shift_jis_html = '''<html><head>\n<meta http-equiv="Content-language" content="ja" /></head><body><pre>\n\x82\xb1\x82\xea\x82\xcdShift-JIS\x82\xc5\x83R\x81[\x83f\x83B\x83\x93\x83O\x82\xb3\x82\xea\x82\xbd\x93\xfa\x96{\x8c\xea\x82\xcc\x83t\x83@\x83C\x83\x8b\x82\xc5\x82\xb7\x81B\n</pre></body></html>'''
       
   727         soup = BeautifulSoup(no_shift_jis_html)
       
   728 
       
   729         # Beautiful Soup used to try to rewrite the meta tag even if the
       
   730         # meta tag got filtered out by the strainer. This test makes
       
   731         # sure that doesn't happen.
       
   732         strainer = SoupStrainer('pre')
       
   733         soup = BeautifulSoup(no_shift_jis_html, parseOnlyThese=strainer)
       
   734         self.assertEquals(soup.contents[0].name, 'pre')
       
   735 
       
   736         meta_tag = ('<meta content="text/html; charset=x-sjis" '
       
   737                     'http-equiv="Content-type" />')
       
   738         shift_jis_html = (
       
   739             '<html><head>\n%s\n'
       
   740             '<meta http-equiv="Content-language" content="ja" />'
       
   741             '</head><body><pre>\n'
       
   742             '\x82\xb1\x82\xea\x82\xcdShift-JIS\x82\xc5\x83R\x81[\x83f'
       
   743             '\x83B\x83\x93\x83O\x82\xb3\x82\xea\x82\xbd\x93\xfa\x96{\x8c'
       
   744             '\xea\x82\xcc\x83t\x83@\x83C\x83\x8b\x82\xc5\x82\xb7\x81B\n'
       
   745             '</pre></body></html>') % meta_tag
       
   746         soup = BeautifulSoup(shift_jis_html)
       
   747         if soup.originalEncoding != "shift-jis":
       
   748             raise Exception("Test failed when parsing shift-jis document "
       
   749                             "with meta tag '%s'."
       
   750                             "If you're running Python >=2.4, or you have "
       
   751                             "cjkcodecs installed, this is a real problem. "
       
   752                             "Otherwise, ignore it." % meta_tag)
       
   753         self.assertEquals(soup.originalEncoding, "shift-jis")
       
   754 
       
   755         content_type_tag = soup.meta['content']
       
   756         self.assertEquals(content_type_tag[content_type_tag.find('charset='):],
       
   757                           'charset=%SOUP-ENCODING%')
       
   758         content_type = str(soup.meta)
       
   759         index = content_type.find('charset=')
       
   760         self.assertEqual(content_type[index:index+len('charset=utf8')+1],
       
   761                          'charset=utf-8')
       
   762         content_type = soup.meta.encode('shift-jis')
       
   763         index = content_type.find('charset=')
       
   764         self.assertEqual(content_type[index:index+len('charset=shift-jis')],
       
   765                          'charset=shift-jis'.encode())
       
   766 
       
   767         self.assertEquals(soup.encode('utf-8'), (
       
   768                 '<html><head>\n'
       
   769                 '<meta content="text/html; charset=utf-8" '
       
   770                 'http-equiv="Content-type" />\n'
       
   771                 '<meta http-equiv="Content-language" content="ja" />'
       
   772                 '</head><body><pre>\n'
       
   773                 '\xe3\x81\x93\xe3\x82\x8c\xe3\x81\xafShift-JIS\xe3\x81\xa7\xe3'
       
   774                 '\x82\xb3\xe3\x83\xbc\xe3\x83\x87\xe3\x82\xa3\xe3\x83\xb3\xe3'
       
   775                 '\x82\xb0\xe3\x81\x95\xe3\x82\x8c\xe3\x81\x9f\xe6\x97\xa5\xe6'
       
   776                 '\x9c\xac\xe8\xaa\x9e\xe3\x81\xae\xe3\x83\x95\xe3\x82\xa1\xe3'
       
   777                 '\x82\xa4\xe3\x83\xab\xe3\x81\xa7\xe3\x81\x99\xe3\x80\x82\n'
       
   778                 '</pre></body></html>'))
       
   779         self.assertEquals(soup.encode("shift-jis"),
       
   780                           shift_jis_html.replace('x-sjis'.encode(),
       
   781                                                  'shift-jis'.encode()))
       
   782 
       
   783         isolatin = """<html><meta http-equiv="Content-type" content="text/html; charset=ISO-Latin-1" />Sacr\xe9 bleu!</html>"""
       
   784         soup = BeautifulSoup(isolatin)
       
   785 
       
   786         utf8 = isolatin.replace("ISO-Latin-1".encode(), "utf-8".encode())
       
   787         utf8 = utf8.replace("\xe9", "\xc3\xa9")
       
   788         self.assertSoupEquals(soup.encode("utf-8"), utf8, encoding='utf-8')
       
   789 
       
   790     def testHebrew(self):
       
   791         iso_8859_8= '<HEAD>\n<TITLE>Hebrew (ISO 8859-8) in Visual Directionality</TITLE>\n\n\n\n</HEAD>\n<BODY>\n<H1>Hebrew (ISO 8859-8) in Visual Directionality</H1>\n\xed\xe5\xec\xf9\n</BODY>\n'
       
   792         utf8 = '<head>\n<title>Hebrew (ISO 8859-8) in Visual Directionality</title>\n</head>\n<body>\n<h1>Hebrew (ISO 8859-8) in Visual Directionality</h1>\n\xd7\x9d\xd7\x95\xd7\x9c\xd7\xa9\n</body>\n'
       
   793         soup = BeautifulStoneSoup(iso_8859_8, fromEncoding="iso-8859-8")
       
   794         self.assertEquals(soup.encode('utf-8'), utf8)
       
   795 
       
   796     def testSmartQuotesNotSoSmartAnymore(self):
       
   797         self.assertSoupEquals("\x91Foo\x92 <!--blah-->",
       
   798                               '&lsquo;Foo&rsquo; <!--blah-->')
       
   799 
       
   800     def testDontConvertSmartQuotesWhenAlsoConvertingEntities(self):
       
   801         smartQuotes = "Il a dit, \x8BSacr&eacute; bl&#101;u!\x9b"
       
   802         soup = BeautifulSoup(smartQuotes)
       
   803         self.assertEquals(soup.decode(),
       
   804                           'Il a dit, &lsaquo;Sacr&eacute; bl&#101;u!&rsaquo;')
       
   805         soup = BeautifulSoup(smartQuotes, convertEntities="html")
       
   806         self.assertEquals(soup.encode('utf-8'),
       
   807                           'Il a dit, \xe2\x80\xb9Sacr\xc3\xa9 bleu!\xe2\x80\xba')
       
   808 
       
   809     def testDontSeeSmartQuotesWhereThereAreNone(self):
       
   810         utf_8 = "\343\202\261\343\203\274\343\202\277\343\202\244 Watch"
       
   811         self.assertSoupEquals(utf_8, encoding='utf-8')
       
   812 
       
   813 
       
   814 class Whitewash(SoupTest):
       
   815     """Test whitespace preservation."""
       
   816 
       
   817     def testPreservedWhitespace(self):
       
   818         self.assertSoupEquals("<pre>   </pre>")
       
   819         self.assertSoupEquals("<pre> woo  </pre>")
       
   820 
       
   821     def testCollapsedWhitespace(self):
       
   822         self.assertSoupEquals("<p>   </p>", "<p> </p>")
       
   823 
       
   824 
       
# Run every test case in this module when the file is executed as a script.
if __name__ == '__main__':
    unittest.main()