Moved the GHOP module into the modules package.
This also includes moving the templates and content into their respective places inside the Soc folder. This avoids having to add every folder to the app.yaml file.
Patch by: Madhusudan C.S. and Lennard de Rijk
Reviewed by: Lennard de Rijk
# -*- coding: utf-8 -*-
"""Unit tests for Beautiful Soup.
These tests make sure the Beautiful Soup works as it should. If you
find a bug in Beautiful Soup, the best way to express it is as a test
case like this that fails."""
import unittest
from BeautifulSoup import *
class SoupTest(unittest.TestCase):
    # Shared base class: supplies the parse-and-compare helper used by the
    # concrete test cases below.

    def assertSoupEquals(self, toParse, rep=None, c=BeautifulSoup,
                         encoding=None):
        """Parse the given text and make sure its string rep is the other
        given text."""
        # When no expected representation is supplied, the markup is
        # expected to round-trip unchanged. Identity comparison is the
        # correct check for the None sentinel.
        if rep is None:
            rep = toParse
        obj = c(toParse)
        # Without an encoding, compare against obj.decode(); with one,
        # compare against obj.encode(encoding).
        if encoding is None:
            rep2 = obj.decode()
        else:
            rep2 = obj.encode(encoding)
        self.assertEqual(rep2, rep)
class FollowThatTag(SoupTest):

    "Tests the various ways of fetching tags from a soup."

    def setUp(self):
        # Deliberately sloppy markup: mixed-case tag names, mismatched
        # closing tags and an unquoted attribute value.
        ml = """
<a id="x">1</a>
<A id="a">2</a>
<b id="b">3</a>
<b href="foo" id="x">4</a>
<ac width=100>4</ac>"""
        self.soup = BeautifulStoneSoup(ml)

    def testFindAllByName(self):
        # Calling the soup, findAll() with a name and findAll() with a
        # SoupStrainer must all yield the same result.
        matching = self.soup('a')
        self.assertEqual(len(matching), 2)
        self.assertEqual(matching[0].name, 'a')
        self.assertEqual(matching, self.soup.findAll('a'))
        self.assertEqual(matching, self.soup.findAll(SoupStrainer('a')))

    def testFindAllByAttribute(self):
        # Keyword argument, attrs dict and SoupStrainer are equivalent.
        matching = self.soup.findAll(id='x')
        self.assertEqual(len(matching), 2)
        self.assertEqual(matching[0].name, 'a')
        self.assertEqual(matching[1].name, 'b')

        matching2 = self.soup.findAll(attrs={'id' : 'x'})
        self.assertEqual(matching, matching2)

        strainer = SoupStrainer(attrs={'id' : 'x'})
        self.assertEqual(matching, self.soup.findAll(strainer))

        # Matching on None, on a value list, on a regex and on True.
        self.assertEqual(len(self.soup.findAll(id=None)), 1)

        self.assertEqual(len(self.soup.findAll(width=100)), 1)

        self.assertEqual(len(self.soup.findAll(junk=None)), 5)
        self.assertEqual(len(self.soup.findAll(junk=[1, None])), 5)

        self.assertEqual(len(self.soup.findAll(junk=re.compile('.*'))), 0)
        self.assertEqual(len(self.soup.findAll(junk=True)), 0)

        self.assertEqual(len(self.soup.findAll(href=True)), 1)

    def testFindallByClass(self):
        # A bare string as second argument to find() is matched against
        # the class attribute here.
        soup = BeautifulSoup('<a>Foo</a><a class="1">Bar</a>')
        self.assertEqual(soup.find('a', '1').string, "Bar")

    def testFindAllByList(self):
        matching = self.soup(['a', 'ac'])
        self.assertEqual(len(matching), 3)

    def testFindAllByHash(self):
        matching = self.soup({'a' : True, 'b' : True})
        self.assertEqual(len(matching), 4)

    def testFindAllText(self):
        soup = BeautifulSoup("<html>\xbb</html>")
        self.assertEqual(soup.findAll(text=re.compile('.*')),
                         [u'\xbb'])

    def testFindAllByRE(self):
        import re
        r = re.compile('a.*')
        self.assertEqual(len(self.soup(r)), 3)

    def testFindAllByMethod(self):
        # A callable is used as the match predicate for each tag.
        def matchTagWhereIDMatchesName(tag):
            return tag.name == tag.get('id')

        matching = self.soup.findAll(matchTagWhereIDMatchesName)
        self.assertEqual(len(matching), 2)
        self.assertEqual(matching[0].name, 'a')

    def testParents(self):
        soup = BeautifulSoup('<ul id="foo"></ul><ul id="foo"><ul><ul id="foo" a="b"><b>Blah')
        b = soup.b
        self.assertEqual(len(b.findParents('ul', {'id' : 'foo'})), 2)
        self.assertEqual(b.findParent('ul')['a'], 'b')

    # Shared fixture for the next/previous navigation tests.
    PROXIMITY_TEST = BeautifulSoup('<b id="1"><b id="2"><b id="3"><b id="4">')

    def testNext(self):
        soup = self.PROXIMITY_TEST
        b = soup.find('b', {'id' : 2})
        self.assertEqual(b.findNext('b')['id'], '3')
        self.assertEqual(len(b.findAllNext('b')), 2)
        self.assertEqual(len(b.findAllNext('b', {'id' : 4})), 1)

    def testPrevious(self):
        soup = self.PROXIMITY_TEST
        b = soup.find('b', {'id' : 3})
        self.assertEqual(b.findPrevious('b')['id'], '2')
        self.assertEqual(len(b.findAllPrevious('b')), 2)
        self.assertEqual(len(b.findAllPrevious('b', {'id' : 2})), 1)

    # Shared fixture for the sibling navigation tests.
    SIBLING_TEST = BeautifulSoup('<blockquote id="1"><blockquote id="1.1"></blockquote></blockquote><blockquote id="2"><blockquote id="2.1"></blockquote></blockquote><blockquote id="3"><blockquote id="3.1"></blockquote></blockquote><blockquote id="4">')

    def testNextSibling(self):
        soup = self.SIBLING_TEST
        tag = 'blockquote'
        b = soup.find(tag, {'id' : 2})
        # findNext descends into children; findNextSibling does not.
        self.assertEqual(b.findNext(tag)['id'], '2.1')
        self.assertEqual(b.findNextSibling(tag)['id'], '3')
        self.assertEqual(len(b.findNextSiblings(tag)), 2)
        self.assertEqual(len(b.findNextSiblings(tag, {'id' : 4})), 1)

    def testPreviousSibling(self):
        soup = self.SIBLING_TEST
        tag = 'blockquote'
        b = soup.find(tag, {'id' : 3})
        self.assertEqual(b.findPrevious(tag)['id'], '2.1')
        self.assertEqual(b.findPreviousSibling(tag)['id'], '2')
        self.assertEqual(len(b.findPreviousSiblings(tag)), 2)
        self.assertEqual(len(b.findPreviousSiblings(tag, id=1)), 1)

    def testTextNavigation(self):
        # Navigation methods must also work when starting from a text node.
        soup = BeautifulSoup('Foo<b>Bar</b><i id="1"><b>Baz<br />Blee<hr id="1"/></b></i>Blargh')
        baz = soup.find(text='Baz')
        self.assertEqual(baz.findParent("i")['id'], '1')
        self.assertEqual(baz.findNext(text='Blee'), 'Blee')
        self.assertEqual(baz.findNextSibling(text='Blee'), 'Blee')
        self.assertEqual(baz.findNextSibling(text='Blargh'), None)
        self.assertEqual(baz.findNextSibling('hr')['id'], '1')
class SiblingRivalry(SoupTest):
    "Tests the nextSibling and previousSibling navigation."

    def testSiblings(self):
        soup = BeautifulSoup("<ul><li>1<p>A</p>B<li>2<li>3</ul>")
        secondLI = soup.find('li').nextSibling
        # Two separate equality checks, so a failure reports which
        # property was wrong.
        self.assertEqual(secondLI.name, 'li')
        self.assertEqual(secondLI.string, '2')
        self.assertEqual(soup.find(text='1').nextSibling.name, 'p')
        self.assertEqual(soup.find('p').nextSibling, 'B')
        # Stepping back and then forward again must land on the same node.
        self.assertEqual(soup.find('p').nextSibling.previousSibling.nextSibling, 'B')
class TagsAreObjectsToo(SoupTest):
    "Tests the various built-in functions of Tag objects."

    def testLen(self):
        # <top> holds three children: text "1", the <b> tag and text "3".
        soup = BeautifulSoup("<top>1<b>2</b>3</top>")
        self.assertEqual(len(soup.top), 3)
class StringEmUp(SoupTest):
    "Tests the use of 'string' as an alias for a tag's only content."

    def testString(self):
        s = BeautifulSoup("<b>foo</b>")
        self.assertEqual(s.b.string, 'foo')

    def testLackOfString(self):
        # <b> has several children here, so .string is expected to be falsy.
        s = BeautifulSoup("<b>f<i>e</i>o</b>")
        self.assertFalse(s.b.string)
class ThatsMyLimit(SoupTest):
    "Tests the limit argument."

    def testBasicLimits(self):
        s = BeautifulSoup('<br id="1" /><br id="1" /><br id="1" /><br id="1" />')
        self.assertEqual(len(s.findAll('br')), 4)
        # limit is honoured both by findAll() and by calling the soup.
        self.assertEqual(len(s.findAll('br', limit=2)), 2)
        self.assertEqual(len(s('br', limit=2)), 2)
class OnlyTheLonely(SoupTest):
    "Tests the parseOnly argument to the constructor."

    def setUp(self):
        # Build five <a> tags (ids 1..5), each wrapping three <b> tags
        # whose ids and text are "<i>.100" .. "<i>.102".
        x = []
        for i in range(1,6):
            x.append('<a id="%s">' % i)
            for j in range(100,103):
                x.append('<b id="%s.%s">Content %s.%s</b>' % (i,j, i,j))
            x.append('</a>')
        self.x = ''.join(x)

    def testOnly(self):
        # Strain by tag name: all 15 <b> tags survive.
        strainer = SoupStrainer("b")
        soup = BeautifulSoup(self.x, parseOnlyThese=strainer)
        self.assertEqual(len(soup), 15)

        # Strain by attribute regex.
        strainer = SoupStrainer(id=re.compile("100.*"))
        soup = BeautifulSoup(self.x, parseOnlyThese=strainer)
        self.assertEqual(len(soup), 5)

        # Strain by text regex.
        strainer = SoupStrainer(text=re.compile("10[01].*"))
        soup = BeautifulSoup(self.x, parseOnlyThese=strainer)
        self.assertEqual(len(soup), 10)

        # Strain by text predicate. Index 8 of "Content <i>.<j>" is the
        # digit <i>. Written as a plain one-argument lambda: the
        # parenthesised form `lambda(x):` is Python-2-only syntax.
        strainer = SoupStrainer(text=lambda x: x[8] == '3')
        soup = BeautifulSoup(self.x, parseOnlyThese=strainer)
        self.assertEqual(len(soup), 3)
class PickleMeThis(SoupTest):
    "Testing features like pickle and deepcopy."

    def setUp(self):
        # A small but complete HTML document, parsed once per test.
        self.page = """<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.0 Transitional//EN"
"http://www.w3.org/TR/REC-html40/transitional.dtd">
<html>
<head>
<meta http-equiv="Content-Type" content="text/html; charset=utf-8">
<title>Beautiful Soup: We called him Tortoise because he taught us.</title>
<link rev="made" href="mailto:leonardr@segfault.org">
<meta name="Description" content="Beautiful Soup: an HTML parser optimized for screen-scraping.">
<meta name="generator" content="Markov Approximation 1.4 (module: leonardr)">
<meta name="author" content="Leonard Richardson">
</head>
<body>
<a href="foo">foo</a>
<a href="foo"><b>bar</b></a>
</body>
</html>"""

        self.soup = BeautifulSoup(self.page)

    def testPickle(self):
        # Round-trip through pickle protocol 2; class and rendering must
        # both survive.
        import pickle
        restored = pickle.loads(pickle.dumps(self.soup, 2))
        self.assertEqual(restored.__class__, BeautifulSoup)
        self.assertEqual(restored.decode(), self.soup.decode())

    def testDeepcopy(self):
        import copy
        # Copying a trivial soup must not raise.
        copy.deepcopy(BeautifulSoup("<a></a>"))
        duplicate = copy.deepcopy(self.soup)
        self.assertEqual(duplicate.decode(), self.soup.decode())

    def testUnicodePickle(self):
        # Same round-trip, but with a high-bit character in the markup and
        # the highest protocol cPickle offers.
        import cPickle as pickle
        markup = "<b>" + chr(0xc3) + "</b>"
        original = BeautifulSoup(markup)
        serialized = pickle.dumps(original, pickle.HIGHEST_PROTOCOL)
        restored = pickle.loads(serialized)
        self.assertEqual(restored.decode(), original.decode())
class WriteOnlyCode(SoupTest):
    "Testing the modification of the tree."

    def testModifyAttributes(self):
        # Set, delete and add an attribute, checking the rendering each time.
        soup = BeautifulSoup('<a id="1"></a>')
        soup.a['id'] = 2
        self.assertEqual(soup.decode(), '<a id="2"></a>')
        del(soup.a['id'])
        self.assertEqual(soup.decode(), '<a></a>')
        soup.a['id2'] = 'foo'
        self.assertEqual(soup.decode(), '<a id2="foo"></a>')

    def testNewTagCreation(self):
        "Makes sure tags don't step on each others' toes."
        soup = BeautifulSoup()
        a = Tag(soup, 'a')
        ol = Tag(soup, 'ol')
        a['href'] = 'http://foo.com/'
        # Setting an attribute on one new tag must not leak into another.
        self.assertRaises(KeyError, lambda : ol['href'])

    def testTagReplacement(self):
        # Make sure you can replace an element with itself.
        text = "<a><b></b><c>Foo<d></d></c></a><a><e></e></a>"
        soup = BeautifulSoup(text)
        c = soup.c
        soup.c.replaceWith(c)
        self.assertEqual(soup.decode(), text)

        # A very simple case
        soup = BeautifulSoup("<b>Argh!</b>")
        soup.find(text="Argh!").replaceWith("Hooray!")
        newText = soup.find(text="Hooray!")
        b = soup.b
        self.assertEqual(newText.previous, b)
        self.assertEqual(newText.parent, b)
        self.assertEqual(newText.previous.next, newText)
        self.assertEqual(newText.next, None)

        # A more complex case
        soup = BeautifulSoup("<a><b>Argh!</b><c></c><d></d></a>")
        soup.b.insert(1, "Hooray!")
        newText = soup.find(text="Hooray!")
        self.assertEqual(newText.previous, "Argh!")
        self.assertEqual(newText.previous.next, newText)

        self.assertEqual(newText.previousSibling, "Argh!")
        self.assertEqual(newText.previousSibling.nextSibling, newText)

        self.assertEqual(newText.nextSibling, None)
        self.assertEqual(newText.next, soup.c)

        # Replacing one tag with another tag from the same document moves it.
        text = "<html>There's <b>no</b> business like <b>show</b> business</html>"
        soup = BeautifulSoup(text)
        no, show = soup.findAll('b')
        show.replaceWith(no)
        self.assertEqual(soup.decode(), "<html>There's business like <b>no</b> business</html>")

        # Even more complex
        soup = BeautifulSoup("<a><b>Find</b><c>lady!</c><d></d></a>")
        tag = Tag(soup, 'magictag')
        tag.insert(0, "the")
        soup.a.insert(1, tag)

        b = soup.b
        c = soup.c
        theText = tag.find(text=True)
        findText = b.find(text="Find")

        self.assertEqual(findText.next, tag)
        self.assertEqual(tag.previous, findText)
        self.assertEqual(b.nextSibling, tag)
        self.assertEqual(tag.previousSibling, b)
        self.assertEqual(tag.nextSibling, c)
        self.assertEqual(c.previousSibling, tag)

        self.assertEqual(theText.next, c)
        self.assertEqual(c.previous, theText)

        # Aand... incredibly complex.
        soup = BeautifulSoup("""<a>We<b>reserve<c>the</c><d>right</d></b></a><e>to<f>refuse</f><g>service</g></e>""")
        f = soup.f
        a = soup.a
        weText = a.find(text="We")
        soup.b.replaceWith(soup.f)
        self.assertEqual(soup.decode(), "<a>We<f>refuse</f></a><e>to<g>service</g></e>")

        self.assertEqual(f.previous, weText)
        self.assertEqual(weText.next, f)
        self.assertEqual(f.previousSibling, weText)
        self.assertEqual(f.nextSibling, None)
        self.assertEqual(weText.nextSibling, f)

    def testAppend(self):
        # Appending a tag that already lives elsewhere in the tree moves it.
        doc = "<p>Don't leave me <b>here</b>.</p> <p>Don't leave me.</p>"
        soup = BeautifulSoup(doc)
        second_para = soup('p')[1]
        bold = soup.find('b')
        soup('p')[1].append(soup.find('b'))
        self.assertEqual(bold.parent, second_para)
        self.assertEqual(soup.decode(),
                         "<p>Don't leave me .</p> "
                         "<p>Don't leave me.<b>here</b></p>")

    def testTagExtraction(self):
        # A very simple case
        text = '<html><div id="nav">Nav crap</div>Real content here.</html>'
        soup = BeautifulSoup(text)
        extracted = soup.find("div", id="nav").extract()
        self.assertEqual(soup.decode(), "<html>Real content here.</html>")
        self.assertEqual(extracted.decode(), '<div id="nav">Nav crap</div>')

        # A simple case, a more complex test.
        text = "<doc><a>1<b>2</b></a><a>i<b>ii</b></a><a>A<b>B</b></a></doc>"
        soup = BeautifulStoneSoup(text)
        doc = soup.doc
        numbers, roman, letters = soup("a")

        self.assertEqual(roman.parent, doc)
        # Record the neighbours before extraction so the links can be
        # checked afterwards.
        oldPrevious = roman.previous
        endOfThisTag = roman.nextSibling.previous
        self.assertEqual(oldPrevious, "2")
        self.assertEqual(roman.next, "i")
        self.assertEqual(endOfThisTag, "ii")
        self.assertEqual(roman.previousSibling, numbers)
        self.assertEqual(roman.nextSibling, letters)

        roman.extract()
        # The extracted subtree is detached from the document...
        self.assertEqual(roman.parent, None)
        self.assertEqual(roman.previous, None)
        self.assertEqual(roman.next, "i")
        self.assertEqual(letters.previous, '2')
        self.assertEqual(roman.previousSibling, None)
        self.assertEqual(roman.nextSibling, None)
        self.assertEqual(endOfThisTag.next, None)
        self.assertEqual(roman.b.contents[0].next, None)
        # ...and the remaining siblings are stitched together.
        self.assertEqual(numbers.nextSibling, letters)
        self.assertEqual(letters.previousSibling, numbers)
        self.assertEqual(len(doc.contents), 2)
        self.assertEqual(doc.contents[0], numbers)
        self.assertEqual(doc.contents[1], letters)

        # A more complex case.
        text = "<a>1<b>2<c>Hollywood, baby!</c></b></a>3"
        soup = BeautifulStoneSoup(text)
        one = soup.find(text="1")
        three = soup.find(text="3")
        soup.b.extract()
        self.assertEqual(one.next, three)
        self.assertEqual(three.previous, one)
        self.assertEqual(one.parent.nextSibling, three)
        self.assertEqual(three.previousSibling, soup.a)
class TheManWithoutAttributes(SoupTest):
    "Test attribute access"

    def testHasKey(self):
        # has_key() on a tag reports whether the attribute is present.
        markup = "<foo attr='bar'>"
        fooTag = BeautifulSoup(markup).foo
        self.assertTrue(fooTag.has_key('attr'))
class QuoteMeOnThat(SoupTest):
    "Test quoting"

    def testQuotedAttributeValues(self):
        # Single-quoted values are re-rendered with double quotes...
        self.assertSoupEquals("<foo attr='bar'></foo>",
                              '<foo attr="bar"></foo>')

        # ...unless the value itself contains double quotes.
        text = """<foo attr='bar "brawls" happen'>a</foo>"""
        soup = BeautifulSoup(text)
        self.assertEqual(soup.decode(), text)

        # A value containing both quote styles.
        soup.foo['attr'] = 'Brawls happen at "Bob\'s Bar"'
        newText = """<foo attr='Brawls happen at "Bob&squot;s Bar"'>a</foo>"""
        self.assertSoupEquals(soup.decode(), newText)

        # NOTE(review): the expected string keeps the bare ampersand; if
        # the parser escapes naked ampersands in attribute values, this
        # literal may have been entity-decoded at some point -- confirm
        # against upstream.
        self.assertSoupEquals('<this is="really messed up & stuff">',
                              '<this is="really messed up & stuff"></this>')
class YoureSoLiteral(SoupTest):
    "Test literal mode."

    def testLiteralMode(self):
        # The "<" inside <script> must stay part of the script's text.
        markup = "<script>if (i<imgs.length)</script><b>Foo</b>"
        parsed = BeautifulSoup(markup)
        scriptBody = parsed.script.contents[0]
        boldBody = parsed.b.contents[0]
        self.assertEqual(scriptBody, "if (i<imgs.length)")
        self.assertEqual(boldBody, "Foo")

    def testTextArea(self):
        # Markup-looking content inside <textarea> is kept as one text node.
        markup = "<textarea><b>This is an example of an HTML tag</b><&<&</textarea>"
        parsed = BeautifulSoup(markup)
        areaBody = parsed.textarea.contents[0]
        self.assertEqual(areaBody,
                         "<b>This is an example of an HTML tag</b><&<&")
class OperatorOverload(SoupTest):
    "Our operators do it all! Call now!"

    def testTagNameAsFind(self):
        "Tests that referencing a tag name as a member delegates to find()."
        soup = BeautifulSoup('<b id="1">foo<i>bar</i></b><b>Red herring</b>')
        self.assertEqual(soup.b.i, soup.find('b').find('i'))
        self.assertEqual(soup.b.i.string, 'bar')
        self.assertEqual(soup.b['id'], '1')
        self.assertEqual(soup.b.contents[0], 'foo')
        # There is no <a> in the document, so the lookup must be falsy.
        self.assertFalse(soup.a)

        #Test the .fooTag variant of .foo.
        self.assertEqual(soup.bTag.iTag.string, 'bar')
        self.assertEqual(soup.b.iTag.string, 'bar')
        self.assertEqual(soup.find('b').find('i'), soup.bTag.iTag)
class NestableEgg(SoupTest):
    """Here we test tag nesting. TEST THE NEST, DUDE! X-TREME!"""

    def testParaInsideBlockquote(self):
        # The </blockquote> closes the open <p> and <b>, so the second <p>
        # ends up at the top level.
        soup = BeautifulSoup('<blockquote><p><b>Foo</blockquote><p>Bar')
        self.assertEqual(soup.blockquote.p.b.string, 'Foo')
        self.assertEqual(soup.blockquote.b.string, 'Foo')
        self.assertEqual(soup.find('p', recursive=False).string, 'Bar')

    def testNestedTables(self):
        text = """<table id="1"><tr><td>Here's another table:
<table id="2"><tr><td>Juicy text</td></tr></table></td></tr></table>"""
        soup = BeautifulSoup(text)
        self.assertEqual(soup.table.table.td.string, 'Juicy text')
        self.assertEqual(len(soup.findAll('table')), 2)
        self.assertEqual(len(soup.table.findAll('table')), 1)
        # Inner table -> td -> tr -> outer table.
        self.assertEqual(soup.find('table', {'id' : 2}).parent.parent.parent.name,
                         'table')

        text = "<table><tr><td><div><table>Foo</table></div></td></tr></table>"
        soup = BeautifulSoup(text)
        self.assertEqual(soup.table.tr.td.div.table.contents[0], "Foo")

        text = """<table><thead><tr>Foo</tr></thead><tbody><tr>Bar</tr></tbody>
<tfoot><tr>Baz</tr></tfoot></table>"""
        soup = BeautifulSoup(text)
        self.assertEqual(soup.table.thead.tr.contents[0], "Foo")

    def testBadNestedTables(self):
        # A <table> opened directly inside a <tr> must still nest.
        soup = BeautifulSoup("<table><tr><table><tr id='nested'>")
        self.assertEqual(soup.table.tr.table.tr['id'], 'nested')
class CleanupOnAisleFour(SoupTest):
    """Here we test cleanup of text that breaks HTMLParser or is just
    obnoxious."""

    # NOTE(review): several literals in this class contain raw characters
    # where the test names suggest entity references were intended (for
    # example testEntityConversions has no entities in its input). They
    # look HTML-unescaped; except where noted they are left exactly as
    # found -- confirm against the upstream test suite.

    def testSelfClosingtag(self):
        self.assertEqual(BeautifulSoup("Foo<br/>Bar").find('br').decode(),
                         '<br />')

        self.assertSoupEquals('<p>test1<br/>test2</p>',
                              '<p>test1<br />test2</p>')

        # An unknown tag is treated as a container unless it is declared
        # self-closing.
        text = '<p>test1<selfclosing>test2'
        soup = BeautifulStoneSoup(text)
        self.assertEqual(soup.decode(),
                         '<p>test1<selfclosing>test2</selfclosing></p>')

        soup = BeautifulStoneSoup(text, selfClosingTags='selfclosing')
        self.assertEqual(soup.decode(),
                         '<p>test1<selfclosing />test2</p>')

    def testSelfClosingTagOrNot(self):
        # <link> is a container for BeautifulStoneSoup but self-closing
        # for BeautifulSoup.
        text = "<item><link>http://foo.com/</link></item>"
        self.assertEqual(BeautifulStoneSoup(text).decode(), text)
        self.assertEqual(BeautifulSoup(text).decode(),
                         '<item><link />http://foo.com/</item>')

    def testBooleanAttributes(self):
        text = "<td nowrap>foo</td>"
        self.assertSoupEquals(text, text)

    def testCData(self):
        xml = "<root>foo<![CDATA[foobar]]>bar</root>"
        self.assertSoupEquals(xml, xml)
        r = re.compile("foo.*bar")
        soup = BeautifulSoup(xml)
        self.assertEqual(soup.find(text=r).string, "foobar")
        self.assertEqual(soup.find(text=r).__class__, CData)

    def testComments(self):
        xml = "foo<!--foobar-->baz"
        self.assertSoupEquals(xml)
        r = re.compile("foo.*bar")
        soup = BeautifulSoup(xml)
        self.assertEqual(soup.find(text=r).string, "foobar")
        self.assertEqual(soup.find(text="foobar").__class__, Comment)

    def testDeclaration(self):
        xml = "foo<!DOCTYPE foobar>baz"
        self.assertSoupEquals(xml)
        r = re.compile(".*foo.*bar")
        soup = BeautifulSoup(xml)
        text = "DOCTYPE foobar"
        self.assertEqual(soup.find(text=r).string, text)
        self.assertEqual(soup.find(text=text).__class__, Declaration)

        namespaced_doctype = ('<!DOCTYPE xsl:stylesheet SYSTEM "htmlent.dtd">'
                              '<html>foo</html>')
        soup = BeautifulSoup(namespaced_doctype)
        self.assertEqual(soup.contents[0],
                         'DOCTYPE xsl:stylesheet SYSTEM "htmlent.dtd"')
        self.assertEqual(soup.html.contents[0], 'foo')

    def testEntityConversions(self):
        # NOTE(review): no entity references appear in these literals; see
        # the class-level note.
        text = "<<sacré bleu!>>"
        soup = BeautifulStoneSoup(text)
        self.assertSoupEquals(text)

        xmlEnt = BeautifulStoneSoup.XML_ENTITIES
        htmlEnt = BeautifulStoneSoup.HTML_ENTITIES
        xhtmlEnt = BeautifulStoneSoup.XHTML_ENTITIES

        soup = BeautifulStoneSoup(text, convertEntities=xmlEnt)
        self.assertEqual(soup.decode(), "<<sacré bleu!>>")

        soup = BeautifulStoneSoup(text, convertEntities=xmlEnt)
        self.assertEqual(soup.decode(), "<<sacré bleu!>>")

        soup = BeautifulStoneSoup(text, convertEntities=htmlEnt)
        self.assertEqual(soup.decode(), u"<<sacr\xe9 bleu!>>")

        # Make sure the "XML", "HTML", and "XHTML" settings work.
        text = "<™'"
        soup = BeautifulStoneSoup(text, convertEntities=xmlEnt)
        self.assertEqual(soup.decode(), u"<™'")

        soup = BeautifulStoneSoup(text, convertEntities=htmlEnt)
        self.assertEqual(soup.decode(), u"<\u2122'")

        soup = BeautifulStoneSoup(text, convertEntities=xhtmlEnt)
        self.assertEqual(soup.decode(), u"<\u2122'")

    def testNonBreakingSpaces(self):
        soup = BeautifulSoup("<a> </a>",
                             convertEntities=BeautifulStoneSoup.HTML_ENTITIES)
        self.assertEqual(soup.decode(), u"<a>\xa0\xa0</a>")

    def testWhitespaceInDeclaration(self):
        self.assertSoupEquals('<! DOCTYPE>', '<!DOCTYPE>')

    def testJunkInDeclaration(self):
        self.assertSoupEquals('<! Foo = -8>a', '<!Foo = -8>a')

    def testIncompleteDeclaration(self):
        self.assertSoupEquals('a<!b <p>c')

    def testEntityReplacement(self):
        self.assertSoupEquals('<b>hello there</b>')

    def testEntitiesInAttributeValues(self):
        # NOTE(review): the two assertions below are identical as written;
        # presumably they once used different entity spellings -- confirm.
        self.assertSoupEquals('<x t="xñ">', '<x t="x\xc3\xb1"></x>',
                              encoding='utf-8')
        self.assertSoupEquals('<x t="xñ">', '<x t="x\xc3\xb1"></x>',
                              encoding='utf-8')

        soup = BeautifulSoup('<x t=">™">',
                             convertEntities=BeautifulStoneSoup.HTML_ENTITIES)
        self.assertEqual(soup.decode(), u'<x t=">\u2122"></x>')

        # First check the rendered document, then the raw attribute value.
        uri = "http://crummy.com?sacré&bleu"
        link = '<a href="%s"></a>' % uri

        soup = BeautifulSoup(link, convertEntities=BeautifulSoup.HTML_ENTITIES)
        self.assertEqual(soup.decode(),
                         link.replace("é", u"\xe9"))

        uri = "http://crummy.com?sacré&bleu"
        link = '<a href="%s"></a>' % uri
        soup = BeautifulSoup(link, convertEntities=BeautifulSoup.HTML_ENTITIES)
        self.assertEqual(soup.a['href'],
                         uri.replace("é", u"\xe9"))

    def testNakedAmpersands(self):
        # A bare "&" that does not start an entity must be rendered as
        # "&amp;". The replace() calls previously mapped '&' to '&', a
        # no-op that made invalidURL and validURL identical and the test
        # vacuous; the escaped form is restored here.
        html = {'convertEntities':BeautifulStoneSoup.HTML_ENTITIES}
        soup = BeautifulStoneSoup("AT&T ", **html)
        self.assertEqual(soup.decode(), 'AT&amp;T ')

        nakedAmpersandInASentence = "AT&T was Ma Bell"
        soup = BeautifulStoneSoup(nakedAmpersandInASentence,**html)
        self.assertEqual(soup.decode(),
                         nakedAmpersandInASentence.replace('&','&amp;'))

        invalidURL = '<a href="http://example.org?a=1&b=2;3">foo</a>'
        validURL = invalidURL.replace('&','&amp;')
        soup = BeautifulStoneSoup(invalidURL)
        self.assertEqual(soup.decode(), validURL)

        # Already-escaped input must pass through unchanged.
        soup = BeautifulStoneSoup(validURL)
        self.assertEqual(soup.decode(), validURL)
class EncodeRed(SoupTest):
    """Tests encoding conversion, Unicode conversion, and Microsoft
    smart quote fixes."""

    # NOTE(review): some expected strings below contain raw typographic
    # characters where entity references may have been intended; they are
    # reproduced as found -- confirm against the upstream test suite.

    def testUnicodeDammitStandalone(self):
        markup = "<foo>\x92</foo>"
        dammit = UnicodeDammit(markup)
        self.assertEqual(dammit.unicode, "<foo>’</foo>")

        hebrew = "\xed\xe5\xec\xf9"
        dammit = UnicodeDammit(hebrew, ["iso-8859-8"])
        self.assertEqual(dammit.unicode, u'\u05dd\u05d5\u05dc\u05e9')
        self.assertEqual(dammit.originalEncoding, 'iso-8859-8')

    def testGarbageInGarbageOut(self):
        # Plain ASCII round-trips unchanged.
        asciiData = "<foo>a</foo>"
        asciiSoup = BeautifulStoneSoup(asciiData)
        self.assertEqual(asciiData, asciiSoup.decode())

        unicodeData = u"<foo>\u00FC</foo>"
        utf8 = unicodeData.encode("utf-8")
        self.assertEqual(utf8, '<foo>\xc3\xbc</foo>')

        # Unicode in, Unicode out.
        unicodeSoup = BeautifulStoneSoup(unicodeData)
        self.assertEqual(unicodeData, unicodeSoup.decode())
        self.assertEqual(unicodeSoup.foo.string, u'\u00FC')

        # UTF-8 bytes in with a declared encoding.
        utf8Soup = BeautifulStoneSoup(utf8, fromEncoding='utf-8')
        self.assertEqual(utf8, utf8Soup.encode('utf-8'))
        self.assertEqual(utf8Soup.originalEncoding, "utf-8")

        # Unicode input has no original encoding to report.
        utf8Soup = BeautifulStoneSoup(unicodeData)
        self.assertEqual(utf8, utf8Soup.encode('utf-8'))
        self.assertEqual(utf8Soup.originalEncoding, None)

    def testHandleInvalidCodec(self):
        # A bogus fromEncoding must be ignored rather than raise.
        for bad_encoding in ['.utf8', '...', 'utF---16.!']:
            soup = BeautifulSoup(u"Räksmörgås".encode("utf-8"),
                                 fromEncoding=bad_encoding)
            self.assertEqual(soup.originalEncoding, 'utf-8')

    def testUnicodeSearch(self):
        html = u'<html><body><h1>Räksmörgås</h1></body></html>'
        soup = BeautifulSoup(html)
        self.assertEqual(soup.find(text=u'Räksmörgås'),u'Räksmörgås')

    def testRewrittenXMLHeader(self):
        euc_jp = '<?xml version="1.0 encoding="euc-jp"?>\n<foo>\n\xa4\xb3\xa4\xec\xa4\xcfEUC-JP\xa4\xc7\xa5\xb3\xa1\xbc\xa5\xc7\xa5\xa3\xa5\xf3\xa5\xb0\xa4\xb5\xa4\xec\xa4\xbf\xc6\xfc\xcb\xdc\xb8\xec\xa4\xce\xa5\xd5\xa5\xa1\xa5\xa4\xa5\xeb\xa4\xc7\xa4\xb9\xa1\xa3\n</foo>\n'
        utf8 = "<?xml version='1.0' encoding='utf-8'?>\n<foo>\n\xe3\x81\x93\xe3\x82\x8c\xe3\x81\xafEUC-JP\xe3\x81\xa7\xe3\x82\xb3\xe3\x83\xbc\xe3\x83\x87\xe3\x82\xa3\xe3\x83\xb3\xe3\x82\xb0\xe3\x81\x95\xe3\x82\x8c\xe3\x81\x9f\xe6\x97\xa5\xe6\x9c\xac\xe8\xaa\x9e\xe3\x81\xae\xe3\x83\x95\xe3\x82\xa1\xe3\x82\xa4\xe3\x83\xab\xe3\x81\xa7\xe3\x81\x99\xe3\x80\x82\n</foo>\n"
        soup = BeautifulStoneSoup(euc_jp)
        # Raise with an explanatory message before the plain assertion, so
        # a missing codec is distinguishable from a parser regression.
        if soup.originalEncoding != "euc-jp":
            raise Exception("Test failed when parsing euc-jp document. "
                            "If you're running Python >=2.4, or you have "
                            "cjkcodecs installed, this is a real problem. "
                            "Otherwise, ignore it.")

        self.assertEqual(soup.originalEncoding, "euc-jp")
        self.assertEqual(soup.renderContents('utf-8'), utf8)

        old_text = "<?xml encoding='windows-1252'><foo>\x92</foo>"
        new_text = "<?xml version='1.0' encoding='utf-8'?><foo>’</foo>"
        self.assertSoupEquals(old_text, new_text)

    def testRewrittenMetaTag(self):
        no_shift_jis_html = '''<html><head>\n<meta http-equiv="Content-language" content="ja" /></head><body><pre>\n\x82\xb1\x82\xea\x82\xcdShift-JIS\x82\xc5\x83R\x81[\x83f\x83B\x83\x93\x83O\x82\xb3\x82\xea\x82\xbd\x93\xfa\x96{\x8c\xea\x82\xcc\x83t\x83@\x83C\x83\x8b\x82\xc5\x82\xb7\x81B\n</pre></body></html>'''
        soup = BeautifulSoup(no_shift_jis_html)

        # Beautiful Soup used to try to rewrite the meta tag even if the
        # meta tag got filtered out by the strainer. This test makes
        # sure that doesn't happen.
        strainer = SoupStrainer('pre')
        soup = BeautifulSoup(no_shift_jis_html, parseOnlyThese=strainer)
        self.assertEqual(soup.contents[0].name, 'pre')

        meta_tag = ('<meta content="text/html; charset=x-sjis" '
                    'http-equiv="Content-type" />')
        shift_jis_html = (
            '<html><head>\n%s\n'
            '<meta http-equiv="Content-language" content="ja" />'
            '</head><body><pre>\n'
            '\x82\xb1\x82\xea\x82\xcdShift-JIS\x82\xc5\x83R\x81[\x83f'
            '\x83B\x83\x93\x83O\x82\xb3\x82\xea\x82\xbd\x93\xfa\x96{\x8c'
            '\xea\x82\xcc\x83t\x83@\x83C\x83\x8b\x82\xc5\x82\xb7\x81B\n'
            '</pre></body></html>') % meta_tag
        soup = BeautifulSoup(shift_jis_html)
        if soup.originalEncoding != "shift-jis":
            raise Exception("Test failed when parsing shift-jis document "
                            "with meta tag '%s'."
                            "If you're running Python >=2.4, or you have "
                            "cjkcodecs installed, this is a real problem. "
                            "Otherwise, ignore it." % meta_tag)
        self.assertEqual(soup.originalEncoding, "shift-jis")

        # The parsed attribute holds a placeholder in place of the charset.
        content_type_tag = soup.meta['content']
        self.assertEqual(content_type_tag[content_type_tag.find('charset='):],
                         'charset=%SOUP-ENCODING%')
        # Rendering via str() is expected to show utf-8 as the charset.
        content_type = str(soup.meta)
        index = content_type.find('charset=')
        self.assertEqual(content_type[index:index+len('charset=utf-8')],
                         'charset=utf-8')
        # Rendering in another encoding substitutes that encoding's name.
        content_type = soup.meta.encode('shift-jis')
        index = content_type.find('charset=')
        self.assertEqual(content_type[index:index+len('charset=shift-jis')],
                         'charset=shift-jis'.encode())

        self.assertEqual(soup.encode('utf-8'), (
                '<html><head>\n'
                '<meta content="text/html; charset=utf-8" '
                'http-equiv="Content-type" />\n'
                '<meta http-equiv="Content-language" content="ja" />'
                '</head><body><pre>\n'
                '\xe3\x81\x93\xe3\x82\x8c\xe3\x81\xafShift-JIS\xe3\x81\xa7\xe3'
                '\x82\xb3\xe3\x83\xbc\xe3\x83\x87\xe3\x82\xa3\xe3\x83\xb3\xe3'
                '\x82\xb0\xe3\x81\x95\xe3\x82\x8c\xe3\x81\x9f\xe6\x97\xa5\xe6'
                '\x9c\xac\xe8\xaa\x9e\xe3\x81\xae\xe3\x83\x95\xe3\x82\xa1\xe3'
                '\x82\xa4\xe3\x83\xab\xe3\x81\xa7\xe3\x81\x99\xe3\x80\x82\n'
                '</pre></body></html>'))
        self.assertEqual(soup.encode("shift-jis"),
                         shift_jis_html.replace('x-sjis'.encode(),
                                                'shift-jis'.encode()))

        isolatin = """<html><meta http-equiv="Content-type" content="text/html; charset=ISO-Latin-1" />Sacr\xe9 bleu!</html>"""
        soup = BeautifulSoup(isolatin)

        utf8 = isolatin.replace("ISO-Latin-1".encode(), "utf-8".encode())
        utf8 = utf8.replace("\xe9", "\xc3\xa9")
        self.assertSoupEquals(soup.encode("utf-8"), utf8, encoding='utf-8')

    def testHebrew(self):
        iso_8859_8= '<HEAD>\n<TITLE>Hebrew (ISO 8859-8) in Visual Directionality</TITLE>\n\n\n\n</HEAD>\n<BODY>\n<H1>Hebrew (ISO 8859-8) in Visual Directionality</H1>\n\xed\xe5\xec\xf9\n</BODY>\n'
        utf8 = '<head>\n<title>Hebrew (ISO 8859-8) in Visual Directionality</title>\n</head>\n<body>\n<h1>Hebrew (ISO 8859-8) in Visual Directionality</h1>\n\xd7\x9d\xd7\x95\xd7\x9c\xd7\xa9\n</body>\n'
        soup = BeautifulStoneSoup(iso_8859_8, fromEncoding="iso-8859-8")
        self.assertEqual(soup.encode('utf-8'), utf8)

    def testSmartQuotesNotSoSmartAnymore(self):
        self.assertSoupEquals("\x91Foo\x92 <!--blah-->",
                              '‘Foo’ <!--blah-->')

    def testDontConvertSmartQuotesWhenAlsoConvertingEntities(self):
        smartQuotes = "Il a dit, \x8BSacré bleu!\x9b"
        soup = BeautifulSoup(smartQuotes)
        self.assertEqual(soup.decode(),
                         'Il a dit, ‹Sacré bleu!›')
        soup = BeautifulSoup(smartQuotes, convertEntities="html")
        self.assertEqual(soup.encode('utf-8'),
                         'Il a dit, \xe2\x80\xb9Sacr\xc3\xa9 bleu!\xe2\x80\xba')

    def testDontSeeSmartQuotesWhereThereAreNone(self):
        # UTF-8 bytes written as octal escapes; they must round-trip
        # unchanged when encoded back to UTF-8.
        utf_8 = "\343\202\261\343\203\274\343\202\277\343\202\244 Watch"
        self.assertSoupEquals(utf_8, encoding='utf-8')
class Whitewash(SoupTest):
    """Test whitespace preservation."""

    def testPreservedWhitespace(self):
        # Each <pre> snippet must round-trip unchanged.
        for markup in ("<pre> </pre>", "<pre> woo </pre>"):
            self.assertSoupEquals(markup)

    def testCollapsedWhitespace(self):
        # NOTE(review): input and expected output are identical as
        # written, so no collapsing is exercised -- confirm the intended
        # input.
        source = "<p> </p>"
        expected = "<p> </p>"
        self.assertSoupEquals(source, expected)
# Run every test case in this module when the file is executed as a script.
if __name__ == '__main__':
    unittest.main()