app/feedparser/__init__.py
changeset 151:6f8eb27752dc (parent 140:c3d098d6fafa)
#!/usr/bin/env python
"""Universal feed parser

Handles RSS 0.9x, RSS 1.0, RSS 2.0, CDF, Atom 0.3, and Atom 1.0 feeds

Visit http://feedparser.org/ for the latest version
Visit http://feedparser.org/docs/ for the latest documentation

Required: Python 2.1 or later
Recommended: Python 2.3 or later
Recommended: CJKCodecs and iconv_codec <http://cjkpython.i18n.org/>
"""
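# Typical usage, as a minimal sketch ('some_feed_url' is a placeholder):
#
#   import feedparser
#   d = feedparser.parse(some_feed_url)
#   print d.feed.title
#   for entry in d.entries:
#       print entry.title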

__version__ = "4.1"  # + "$Revision: 1.92 $"[11:15] + "-cvs"
__license__ = """Copyright (c) 2002-2006, Mark Pilgrim, All rights reserved.

Redistribution and use in source and binary forms, with or without modification,
are permitted provided that the following conditions are met:

* Redistributions of source code must retain the above copyright notice,
  this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright notice,
  this list of conditions and the following disclaimer in the documentation
  and/or other materials provided with the distribution.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 'AS IS'
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
POSSIBILITY OF SUCH DAMAGE."""
__author__ = "Mark Pilgrim <http://diveintomark.org/>"
__contributors__ = ["Jason Diamond <http://injektilo.org/>",
                    "John Beimler <http://john.beimler.org/>",
                    "Fazal Majid <http://www.majid.info/mylos/weblog/>",
                    "Aaron Swartz <http://aaronsw.com/>",
                    "Kevin Marks <http://epeus.blogspot.com/>"]
_debug = 0

# HTTP "User-Agent" header to send to servers when downloading feeds.
# If you are embedding feedparser in a larger application, you should
# change this to your application name and URL.
USER_AGENT = "UniversalFeedParser/%s +http://feedparser.org/" % __version__
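# For example, an embedding application might set (hypothetical values):
#   feedparser.USER_AGENT = "MyAggregator/1.0 +http://example.com/aggregator/"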

# HTTP "Accept" header to send to servers when downloading feeds.  If you don't
# want to send an Accept header, set this to None.
ACCEPT_HEADER = "application/atom+xml,application/rdf+xml,application/rss+xml,application/x-netcdf,application/xml;q=0.9,text/xml;q=0.2,*/*;q=0.1"

# List of preferred XML parsers, by SAX driver name.  These will be tried first,
# but if they're not installed, Python will keep searching through its own list
# of pre-installed parsers until it finds one that supports everything we need.
PREFERRED_XML_PARSERS = ["drv_libxml2"]

# If you want feedparser to automatically run HTML markup through HTML Tidy, set
# this to 1.  Requires mxTidy <http://www.egenix.com/files/python/mxTidy.html>
# or utidylib <http://utidylib.berlios.de/>.
TIDY_MARKUP = 0

# List of Python interfaces for HTML Tidy, in order of preference.  Only useful
# if TIDY_MARKUP = 1
PREFERRED_TIDY_INTERFACES = ["uTidy", "mxTidy"]
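# A sketch of enabling Tidy post-processing (assumes one of the above
# interfaces is importable):
#   feedparser.TIDY_MARKUP = 1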

# ---------- required modules (should come with any Python distribution) ----------
import sgmllib, re, sys, copy, urlparse, time, rfc822, types, cgi, urllib, urllib2
try:
    from cStringIO import StringIO as _StringIO
except ImportError:
    from StringIO import StringIO as _StringIO

# ---------- optional modules (feedparser will work without these, but with reduced functionality) ----------

# gzip is included with most Python distributions, but may not be available if you compiled your own
try:
    import gzip
except ImportError:
    gzip = None
try:
    import zlib
except ImportError:
    zlib = None

# If a real XML parser is available, feedparser will attempt to use it.  feedparser has
# been tested with the built-in SAX parser, PyXML, and libxml2.  On platforms where the
# Python distribution does not come with an XML parser (such as Mac OS X 10.2 and some
# versions of FreeBSD), feedparser will quietly fall back on regex-based parsing.
try:
    import xml.sax
    xml.sax.make_parser(PREFERRED_XML_PARSERS) # test for valid parsers
    from xml.sax.saxutils import escape as _xmlescape
    _XML_AVAILABLE = 1
except:
    _XML_AVAILABLE = 0
    def _xmlescape(data):
        data = data.replace('&', '&amp;')
        data = data.replace('>', '&gt;')
        data = data.replace('<', '&lt;')
        return data

# base64 support for Atom feeds that contain embedded binary data
try:
    import base64, binascii
except ImportError:
    base64 = binascii = None

# cjkcodecs and iconv_codec provide support for more character encodings.
# Both are available from http://cjkpython.i18n.org/
try:
    import cjkcodecs.aliases
except ImportError:
    pass
try:
    import iconv_codec
except ImportError:
    pass

# chardet library auto-detects character encodings
# Download from http://chardet.feedparser.org/
try:
    import chardet
    if _debug:
        import chardet.constants
        chardet.constants._debug = 1
except ImportError:
    chardet = None

# ---------- don't touch these ----------
class ThingsNobodyCaresAboutButMe(Exception): pass
class CharacterEncodingOverride(ThingsNobodyCaresAboutButMe): pass
class CharacterEncodingUnknown(ThingsNobodyCaresAboutButMe): pass
class NonXMLContentType(ThingsNobodyCaresAboutButMe): pass
class UndeclaredNamespace(Exception): pass

# Loosen sgmllib's patterns: allow ':' and '_' in tag names (for namespaced
# elements) and recognize hexadecimal character references.
sgmllib.tagfind = re.compile('[a-zA-Z][-_.:a-zA-Z0-9]*')
sgmllib.special = re.compile('<!')
sgmllib.charref = re.compile('&#(x?[0-9A-Fa-f]+)[^0-9A-Fa-f]')

SUPPORTED_VERSIONS = {'': 'unknown',
                      'rss090': 'RSS 0.90',
                      'rss091n': 'RSS 0.91 (Netscape)',
                      'rss091u': 'RSS 0.91 (Userland)',
                      'rss092': 'RSS 0.92',
                      'rss093': 'RSS 0.93',
                      'rss094': 'RSS 0.94',
                      'rss20': 'RSS 2.0',
                      'rss10': 'RSS 1.0',
                      'rss': 'RSS (unknown version)',
                      'atom01': 'Atom 0.1',
                      'atom02': 'Atom 0.2',
                      'atom03': 'Atom 0.3',
                      'atom10': 'Atom 1.0',
                      'atom': 'Atom (unknown version)',
                      'cdf': 'CDF',
                      'hotrss': 'Hot RSS'
                      }
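# After a parse, the detected format is exposed under these short codes as
# the result's 'version' key, e.g. (sketch):
#   d = feedparser.parse(some_feed_url)
#   if d.version == 'atom10':
#       ...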

try:
    UserDict = dict
except NameError:
    # Python 2.1 does not have dict
    from UserDict import UserDict
    def dict(aList):
        rc = {}
        for k, v in aList:
            rc[k] = v
        return rc

class FeedParserDict(UserDict):
    keymap = {'channel': 'feed',
              'items': 'entries',
              'guid': 'id',
              'date': 'updated',
              'date_parsed': 'updated_parsed',
              'description': ['subtitle', 'summary'],
              'url': ['href'],
              'modified': 'updated',
              'modified_parsed': 'updated_parsed',
              'issued': 'published',
              'issued_parsed': 'published_parsed',
              'copyright': 'rights',
              'copyright_detail': 'rights_detail',
              'tagline': 'subtitle',
              'tagline_detail': 'subtitle_detail'}
    def __getitem__(self, key):
        if key == 'category':
            return UserDict.__getitem__(self, 'tags')[0]['term']
        if key == 'categories':
            return [(tag['scheme'], tag['term']) for tag in UserDict.__getitem__(self, 'tags')]
        realkey = self.keymap.get(key, key)
        if type(realkey) == types.ListType:
            for k in realkey:
                if UserDict.has_key(self, k):
                    return UserDict.__getitem__(self, k)
        if UserDict.has_key(self, key):
            return UserDict.__getitem__(self, key)
        return UserDict.__getitem__(self, realkey)

    def __setitem__(self, key, value):
        for k in self.keymap.keys():
            if key == k:
                key = self.keymap[k]
                if type(key) == types.ListType:
                    key = key[0]
        return UserDict.__setitem__(self, key, value)

    def get(self, key, default=None):
        if self.has_key(key):
            return self[key]
        else:
            return default

    def setdefault(self, key, value):
        if not self.has_key(key):
            self[key] = value
        return self[key]

    def has_key(self, key):
        try:
            return hasattr(self, key) or UserDict.has_key(self, key)
        except AttributeError:
            return False

    def __getattr__(self, key):
        try:
            return self.__dict__[key]
        except KeyError:
            pass
        try:
            assert not key.startswith('_')
            return self.__getitem__(key)
        except:
            raise AttributeError, "object has no attribute '%s'" % key

    def __setattr__(self, key, value):
        if key.startswith('_') or key == 'data':
            self.__dict__[key] = value
        else:
            return self.__setitem__(key, value)

    def __contains__(self, key):
        return self.has_key(key)

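# The keymap aliasing above in action (a minimal sketch):
#   fpd = FeedParserDict()
#   fpd['tagline'] = 'witty slogan'   # keymap stores this under 'subtitle'
#   fpd['subtitle']                   # -> 'witty slogan'
#   fpd.subtitle                      # attribute access goes through the same path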
def zopeCompatibilityHack():
    global FeedParserDict
    del FeedParserDict
    def FeedParserDict(aDict=None):
        rc = {}
        if aDict:
            rc.update(aDict)
        return rc

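# Call this once, before parse(), if the consuming environment cannot handle
# FeedParserDict instances (historically Zope); results then contain plain
# dictionaries instead:
#   feedparser.zopeCompatibilityHack()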
_ebcdic_to_ascii_map = None
def _ebcdic_to_ascii(s):
    global _ebcdic_to_ascii_map
    if not _ebcdic_to_ascii_map:
        emap = (
            0,1,2,3,156,9,134,127,151,141,142,11,12,13,14,15,
            16,17,18,19,157,133,8,135,24,25,146,143,28,29,30,31,
            128,129,130,131,132,10,23,27,136,137,138,139,140,5,6,7,
            144,145,22,147,148,149,150,4,152,153,154,155,20,21,158,26,
            32,160,161,162,163,164,165,166,167,168,91,46,60,40,43,33,
            38,169,170,171,172,173,174,175,176,177,93,36,42,41,59,94,
            45,47,178,179,180,181,182,183,184,185,124,44,37,95,62,63,
            186,187,188,189,190,191,192,193,194,96,58,35,64,39,61,34,
            195,97,98,99,100,101,102,103,104,105,196,197,198,199,200,201,
            202,106,107,108,109,110,111,112,113,114,203,204,205,206,207,208,
            209,126,115,116,117,118,119,120,121,122,210,211,212,213,214,215,
            216,217,218,219,220,221,222,223,224,225,226,227,228,229,230,231,
            123,65,66,67,68,69,70,71,72,73,232,233,234,235,236,237,
            125,74,75,76,77,78,79,80,81,82,238,239,240,241,242,243,
            92,159,83,84,85,86,87,88,89,90,244,245,246,247,248,249,
            48,49,50,51,52,53,54,55,56,57,250,251,252,253,254,255
            )
        import string
        _ebcdic_to_ascii_map = string.maketrans(
            ''.join(map(chr, range(256))), ''.join(map(chr, emap)))
    return s.translate(_ebcdic_to_ascii_map)

_urifixer = re.compile('^([A-Za-z][A-Za-z0-9+.-]*://)(/*)(.*?)')
def _urljoin(base, uri):
    uri = _urifixer.sub(r'\1\3', uri)
    return urlparse.urljoin(base, uri)

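# e.g. _urljoin('http://example.com/feed/', '../logo.png') returns
# 'http://example.com/logo.png'; the _urifixer pass first strips spurious
# extra slashes immediately after the scheme before joining.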
class _FeedParserMixin:
    namespaces = {'': '',
                  'http://backend.userland.com/rss': '',
                  'http://blogs.law.harvard.edu/tech/rss': '',
                  'http://purl.org/rss/1.0/': '',
                  'http://my.netscape.com/rdf/simple/0.9/': '',
                  'http://example.com/newformat#': '',
                  'http://example.com/necho': '',
                  'http://purl.org/echo/': '',
                  'uri/of/echo/namespace#': '',
                  'http://purl.org/pie/': '',
                  'http://purl.org/atom/ns#': '',
                  'http://www.w3.org/2005/Atom': '',
                  'http://purl.org/rss/1.0/modules/rss091#': '',

                  'http://webns.net/mvcb/':                               'admin',
                  'http://purl.org/rss/1.0/modules/aggregation/':         'ag',
                  'http://purl.org/rss/1.0/modules/annotate/':            'annotate',
                  'http://media.tangent.org/rss/1.0/':                    'audio',
                  'http://backend.userland.com/blogChannelModule':        'blogChannel',
                  'http://web.resource.org/cc/':                          'cc',
                  'http://backend.userland.com/creativeCommonsRssModule': 'creativeCommons',
                  'http://purl.org/rss/1.0/modules/company':              'co',
                  'http://purl.org/rss/1.0/modules/content/':             'content',
                  'http://my.theinfo.org/changed/1.0/rss/':               'cp',
                  'http://purl.org/dc/elements/1.1/':                     'dc',
                  'http://purl.org/dc/terms/':                            'dcterms',
                  'http://purl.org/rss/1.0/modules/email/':               'email',
                  'http://purl.org/rss/1.0/modules/event/':               'ev',
                  'http://rssnamespace.org/feedburner/ext/1.0':           'feedburner',
                  'http://freshmeat.net/rss/fm/':                         'fm',
                  'http://xmlns.com/foaf/0.1/':                           'foaf',
                  'http://www.w3.org/2003/01/geo/wgs84_pos#':             'geo',
                  'http://postneo.com/icbm/':                             'icbm',
                  'http://purl.org/rss/1.0/modules/image/':               'image',
                  'http://www.itunes.com/DTDs/PodCast-1.0.dtd':           'itunes',
                  'http://example.com/DTDs/PodCast-1.0.dtd':              'itunes',
                  'http://purl.org/rss/1.0/modules/link/':                'l',
                  'http://search.yahoo.com/mrss':                         'media',
                  'http://madskills.com/public/xml/rss/module/pingback/': 'pingback',
                  'http://prismstandard.org/namespaces/1.2/basic/':       'prism',
                  'http://www.w3.org/1999/02/22-rdf-syntax-ns#':          'rdf',
                  'http://www.w3.org/2000/01/rdf-schema#':                'rdfs',
                  'http://purl.org/rss/1.0/modules/reference/':           'ref',
                  'http://purl.org/rss/1.0/modules/richequiv/':           'reqv',
                  'http://purl.org/rss/1.0/modules/search/':              'search',
                  'http://purl.org/rss/1.0/modules/slash/':               'slash',
                  'http://schemas.xmlsoap.org/soap/envelope/':            'soap',
                  'http://purl.org/rss/1.0/modules/servicestatus/':       'ss',
                  'http://hacks.benhammersley.com/rss/streaming/':        'str',
                  'http://purl.org/rss/1.0/modules/subscription/':        'sub',
                  'http://purl.org/rss/1.0/modules/syndication/':         'sy',
                  'http://purl.org/rss/1.0/modules/taxonomy/':            'taxo',
                  'http://purl.org/rss/1.0/modules/threading/':           'thr',
                  'http://purl.org/rss/1.0/modules/textinput/':           'ti',
                  'http://madskills.com/public/xml/rss/module/trackback/':'trackback',
                  'http://wellformedweb.org/commentAPI/':                 'wfw',
                  'http://purl.org/rss/1.0/modules/wiki/':                'wiki',
                  'http://www.w3.org/1999/xhtml':                         'xhtml',
                  'http://www.w3.org/XML/1998/namespace':                 'xml',
                  'http://schemas.pocketsoap.com/rss/myDescModule/':      'szf'
                  }
    _matchnamespaces = {}

    can_be_relative_uri = ['link', 'id', 'wfw_comment', 'wfw_commentrss', 'docs', 'url', 'href', 'comments', 'license', 'icon', 'logo']
    can_contain_relative_uris = ['content', 'title', 'summary', 'info', 'tagline', 'subtitle', 'copyright', 'rights', 'description']
    can_contain_dangerous_markup = ['content', 'title', 'summary', 'info', 'tagline', 'subtitle', 'copyright', 'rights', 'description']
    html_types = ['text/html', 'application/xhtml+xml']

    def __init__(self, baseuri=None, baselang=None, encoding='utf-8'):
        if _debug: sys.stderr.write('initializing FeedParser\n')
        if not self._matchnamespaces:
            for k, v in self.namespaces.items():
                self._matchnamespaces[k.lower()] = v
        self.feeddata = FeedParserDict() # feed-level data
        self.encoding = encoding # character encoding
        self.entries = [] # list of entry-level data
        self.version = '' # feed type/version, see SUPPORTED_VERSIONS
        self.namespacesInUse = {} # dictionary of namespaces defined by the feed

        # the following are used internally to track state;
        # this is really out of control and should be refactored
        self.infeed = 0
        self.inentry = 0
        self.incontent = 0
        self.intextinput = 0
        self.inimage = 0
        self.inauthor = 0
        self.incontributor = 0
        self.inpublisher = 0
        self.insource = 0
        self.sourcedata = FeedParserDict()
        self.contentparams = FeedParserDict()
        self._summaryKey = None
        self.namespacemap = {}
        self.elementstack = []
        self.basestack = []
        self.langstack = []
        self.baseuri = baseuri or ''
        self.lang = baselang or None
        if baselang:
            self.feeddata['language'] = baselang

    def unknown_starttag(self, tag, attrs):
        if _debug: sys.stderr.write('start %s with %s\n' % (tag, attrs))
        # normalize attrs
        attrs = [(k.lower(), v) for k, v in attrs]
        attrs = [(k, k in ('rel', 'type') and v.lower() or v) for k, v in attrs]

        # track xml:base and xml:lang
        attrsD = dict(attrs)
        baseuri = attrsD.get('xml:base', attrsD.get('base')) or self.baseuri
        self.baseuri = _urljoin(self.baseuri, baseuri)
        lang = attrsD.get('xml:lang', attrsD.get('lang'))
        if lang == '':
            # xml:lang could be explicitly set to '', we need to capture that
            lang = None
        elif lang is None:
            # if no xml:lang is specified, use parent lang
            lang = self.lang
        if lang:
            if tag in ('feed', 'rss', 'rdf:RDF'):
                self.feeddata['language'] = lang
        self.lang = lang
        self.basestack.append(self.baseuri)
        self.langstack.append(lang)

        # track namespaces
        for prefix, uri in attrs:
            if prefix.startswith('xmlns:'):
                self.trackNamespace(prefix[6:], uri)
            elif prefix == 'xmlns':
                self.trackNamespace(None, uri)

        # track inline content
        if self.incontent and self.contentparams.has_key('type') and not self.contentparams.get('type', 'xml').endswith('xml'):
            # element declared itself as escaped markup, but it isn't really
            self.contentparams['type'] = 'application/xhtml+xml'
        if self.incontent and self.contentparams.get('type') == 'application/xhtml+xml':
            # Note: probably shouldn't simply recreate localname here, but
            # our namespace handling isn't actually 100% correct in cases where
            # the feed redefines the default namespace (which is actually
            # the usual case for inline content, thanks Sam), so here we
            # cheat and just reconstruct the element based on localname
            # because that compensates for the bugs in our namespace handling.
            # This will horribly munge inline content with non-empty qnames,
            # but nobody actually does that, so I'm not fixing it.
            tag = tag.split(':')[-1]
            return self.handle_data('<%s%s>' % (tag, ''.join([' %s="%s"' % t for t in attrs])), escape=0)

        # match namespaces
        if tag.find(':') != -1:
            prefix, suffix = tag.split(':', 1)
        else:
            prefix, suffix = '', tag
        prefix = self.namespacemap.get(prefix, prefix)
        if prefix:
            prefix = prefix + '_'

        # special hack for better tracking of empty textinput/image elements in illformed feeds
        if (not prefix) and tag not in ('title', 'link', 'description', 'name'):
            self.intextinput = 0
        if (not prefix) and tag not in ('title', 'link', 'description', 'url', 'href', 'width', 'height'):
            self.inimage = 0

        # call special handler (if defined) or default handler
        methodname = '_start_' + prefix + suffix
        try:
            method = getattr(self, methodname)
            return method(attrsD)
        except AttributeError:
            return self.push(prefix + suffix, 1)

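    # Dispatch sketch: a '<dc:creator>' start tag resolves its prefix through
    # self.namespacemap and is routed to self._start_dc_creator (defined
    # below); any element without a matching _start_* handler falls through
    # to self.push() with expectingText=1.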
    def unknown_endtag(self, tag):
        if _debug: sys.stderr.write('end %s\n' % tag)
        # match namespaces
        if tag.find(':') != -1:
            prefix, suffix = tag.split(':', 1)
        else:
            prefix, suffix = '', tag
        prefix = self.namespacemap.get(prefix, prefix)
        if prefix:
            prefix = prefix + '_'

        # call special handler (if defined) or default handler
        methodname = '_end_' + prefix + suffix
        try:
            method = getattr(self, methodname)
            method()
        except AttributeError:
            self.pop(prefix + suffix)

        # track inline content
        if self.incontent and self.contentparams.has_key('type') and not self.contentparams.get('type', 'xml').endswith('xml'):
            # element declared itself as escaped markup, but it isn't really
            self.contentparams['type'] = 'application/xhtml+xml'
        if self.incontent and self.contentparams.get('type') == 'application/xhtml+xml':
            tag = tag.split(':')[-1]
            self.handle_data('</%s>' % tag, escape=0)

        # track xml:base and xml:lang going out of scope
        if self.basestack:
            self.basestack.pop()
            if self.basestack and self.basestack[-1]:
                self.baseuri = self.basestack[-1]
        if self.langstack:
            self.langstack.pop()
            if self.langstack: # and (self.langstack[-1] is not None):
                self.lang = self.langstack[-1]

    def handle_charref(self, ref):
        # called for each character reference, e.g. for '&#160;', ref will be '160'
        if not self.elementstack: return
        ref = ref.lower()
        if ref in ('34', '38', '39', '60', '62', 'x22', 'x26', 'x27', 'x3c', 'x3e'):
            text = '&#%s;' % ref
        else:
            if ref[0] == 'x':
                c = int(ref[1:], 16)
            else:
                c = int(ref)
            text = unichr(c).encode('utf-8')
        self.elementstack[-1][2].append(text)

    def handle_entityref(self, ref):
        # called for each entity reference, e.g. for '&copy;', ref will be 'copy'
        if not self.elementstack: return
        if _debug: sys.stderr.write('entering handle_entityref with %s\n' % ref)
        if ref in ('lt', 'gt', 'quot', 'amp', 'apos'):
            text = '&%s;' % ref
        else:
            # entity resolution graciously donated by Aaron Swartz
            def name2cp(k):
                import htmlentitydefs
                if hasattr(htmlentitydefs, 'name2codepoint'): # requires Python 2.3
                    return htmlentitydefs.name2codepoint[k]
                k = htmlentitydefs.entitydefs[k]
                if k.startswith('&#') and k.endswith(';'):
                    return int(k[2:-1]) # not in latin-1
                return ord(k)
            try: name2cp(ref)
            except KeyError: text = '&%s;' % ref
            else: text = unichr(name2cp(ref)).encode('utf-8')
        self.elementstack[-1][2].append(text)

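    # e.g. ref 'copy' resolves through name2cp to codepoint 169, so the
    # UTF-8 encoding of u'\xa9' is appended; an unrecognized entity is
    # passed through untouched as '&whatever;'.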
    def handle_data(self, text, escape=1):
        # called for each block of plain text, i.e. outside of any tag and
        # not containing any character or entity references
        if not self.elementstack: return
        if escape and self.contentparams.get('type') == 'application/xhtml+xml':
            text = _xmlescape(text)
        self.elementstack[-1][2].append(text)

    def handle_comment(self, text):
        # called for each comment, e.g. <!-- insert message here -->
        pass

    def handle_pi(self, text):
        # called for each processing instruction, e.g. <?instruction>
        pass

    def handle_decl(self, text):
        pass

    def parse_declaration(self, i):
        # override internal declaration handler to handle CDATA blocks
        if _debug: sys.stderr.write('entering parse_declaration\n')
        if self.rawdata[i:i+9] == '<![CDATA[':
            k = self.rawdata.find(']]>', i)
            if k == -1: k = len(self.rawdata)
            self.handle_data(_xmlescape(self.rawdata[i+9:k]), 0)
            return k+3
        else:
            k = self.rawdata.find('>', i)
            return k+1

    def mapContentType(self, contentType):
        contentType = contentType.lower()
        if contentType == 'text':
            contentType = 'text/plain'
        elif contentType == 'html':
            contentType = 'text/html'
        elif contentType == 'xhtml':
            contentType = 'application/xhtml+xml'
        return contentType

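    # e.g. mapContentType('xhtml') -> 'application/xhtml+xml', while a full
    # MIME type such as 'text/html' passes through unchanged (lowercased).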
    def trackNamespace(self, prefix, uri):
        loweruri = uri.lower()
        if (prefix, loweruri) == (None, 'http://my.netscape.com/rdf/simple/0.9/') and not self.version:
            self.version = 'rss090'
        if loweruri == 'http://purl.org/rss/1.0/' and not self.version:
            self.version = 'rss10'
        if loweruri == 'http://www.w3.org/2005/atom' and not self.version:
            self.version = 'atom10'
        if loweruri.find('backend.userland.com/rss') != -1:
            # match any backend.userland.com namespace
            uri = 'http://backend.userland.com/rss'
            loweruri = uri
        if self._matchnamespaces.has_key(loweruri):
            self.namespacemap[prefix] = self._matchnamespaces[loweruri]
            self.namespacesInUse[self._matchnamespaces[loweruri]] = uri
        else:
            self.namespacesInUse[prefix or ''] = uri

    def resolveURI(self, uri):
        return _urljoin(self.baseuri or '', uri)

    def decodeEntities(self, element, data):
        return data

    def push(self, element, expectingText):
        self.elementstack.append([element, expectingText, []])

    def pop(self, element, stripWhitespace=1):
        if not self.elementstack: return
        if self.elementstack[-1][0] != element: return

        element, expectingText, pieces = self.elementstack.pop()
        output = ''.join(pieces)
        if stripWhitespace:
            output = output.strip()
        if not expectingText: return output

        # decode base64 content
        if base64 and self.contentparams.get('base64', 0):
            try:
                output = base64.decodestring(output)
            except binascii.Error:
                pass
            except binascii.Incomplete:
                pass

        # resolve relative URIs
        if (element in self.can_be_relative_uri) and output:
            output = self.resolveURI(output)

        # decode entities within embedded markup
        if not self.contentparams.get('base64', 0):
            output = self.decodeEntities(element, output)

        # remove temporary cruft from contentparams
        try:
            del self.contentparams['mode']
        except KeyError:
            pass
        try:
            del self.contentparams['base64']
        except KeyError:
            pass

        # resolve relative URIs within embedded markup
        if self.mapContentType(self.contentparams.get('type', 'text/html')) in self.html_types:
            if element in self.can_contain_relative_uris:
                output = _resolveRelativeURIs(output, self.baseuri, self.encoding)

        # sanitize embedded markup
        if self.mapContentType(self.contentparams.get('type', 'text/html')) in self.html_types:
            if element in self.can_contain_dangerous_markup:
                output = _sanitizeHTML(output, self.encoding)

        if self.encoding and type(output) != type(u''):
            try:
                output = unicode(output, self.encoding)
            except:
                pass

        # categories/tags/keywords/whatever are handled in _end_category
        if element == 'category':
            return output

        # store output in appropriate place(s)
        if self.inentry and not self.insource:
            if element == 'content':
                self.entries[-1].setdefault(element, [])
                contentparams = copy.deepcopy(self.contentparams)
                contentparams['value'] = output
                self.entries[-1][element].append(contentparams)
            elif element == 'link':
                self.entries[-1][element] = output
                if output:
                    self.entries[-1]['links'][-1]['href'] = output
            else:
                if element == 'description':
                    element = 'summary'
                self.entries[-1][element] = output
                if self.incontent:
                    contentparams = copy.deepcopy(self.contentparams)
                    contentparams['value'] = output
                    self.entries[-1][element + '_detail'] = contentparams
        elif (self.infeed or self.insource) and (not self.intextinput) and (not self.inimage):
            context = self._getContext()
            if element == 'description':
                element = 'subtitle'
            context[element] = output
            if element == 'link':
                context['links'][-1]['href'] = output
            elif self.incontent:
                contentparams = copy.deepcopy(self.contentparams)
                contentparams['value'] = output
                context[element + '_detail'] = contentparams
        return output

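    # Each elementstack frame is [element, expectingText, pieces]: push()
    # opens a frame, the handle_* methods append text to pieces, and pop()
    # joins, post-processes (base64, relative URIs, sanitizing), and files
    # the result; e.g. a <link> inside an entry lands both in
    # entries[-1]['link'] and in entries[-1]['links'][-1]['href'].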
    def pushContent(self, tag, attrsD, defaultContentType, expectingText):
        self.incontent += 1
        self.contentparams = FeedParserDict({
            'type': self.mapContentType(attrsD.get('type', defaultContentType)),
            'language': self.lang,
            'base': self.baseuri})
        self.contentparams['base64'] = self._isBase64(attrsD, self.contentparams)
        self.push(tag, expectingText)

    def popContent(self, tag):
        value = self.pop(tag)
        self.incontent -= 1
        self.contentparams.clear()
        return value

    def _mapToStandardPrefix(self, name):
        colonpos = name.find(':')
        if colonpos != -1:
            prefix = name[:colonpos]
            suffix = name[colonpos+1:]
            prefix = self.namespacemap.get(prefix, prefix)
            name = prefix + ':' + suffix
        return name

    def _getAttribute(self, attrsD, name):
        return attrsD.get(self._mapToStandardPrefix(name))

    def _isBase64(self, attrsD, contentparams):
        if attrsD.get('mode', '') == 'base64':
            return 1
        if self.contentparams['type'].startswith('text/'):
            return 0
        if self.contentparams['type'].endswith('+xml'):
            return 0
        if self.contentparams['type'].endswith('/xml'):
            return 0
        return 1

    def _itsAnHrefDamnIt(self, attrsD):
        href = attrsD.get('url', attrsD.get('uri', attrsD.get('href', None)))
        if href:
            try:
                del attrsD['url']
            except KeyError:
                pass
            try:
                del attrsD['uri']
            except KeyError:
                pass
            attrsD['href'] = href
        return attrsD

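    # e.g. _itsAnHrefDamnIt({'url': 'http://example.com/'}) returns
    # {'href': 'http://example.com/'}, folding the url/uri/href variants
    # into a single 'href' key.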
    def _save(self, key, value):
        context = self._getContext()
        context.setdefault(key, value)

    def _start_rss(self, attrsD):
        versionmap = {'0.91': 'rss091u',
                      '0.92': 'rss092',
                      '0.93': 'rss093',
                      '0.94': 'rss094'}
        if not self.version:
            attr_version = attrsD.get('version', '')
            version = versionmap.get(attr_version)
            if version:
                self.version = version
            elif attr_version.startswith('2.'):
                self.version = 'rss20'
            else:
                self.version = 'rss'

    def _start_dlhottitles(self, attrsD):
        self.version = 'hotrss'

    def _start_channel(self, attrsD):
        self.infeed = 1
        self._cdf_common(attrsD)
    _start_feedinfo = _start_channel

    def _cdf_common(self, attrsD):
        if attrsD.has_key('lastmod'):
            self._start_modified({})
            self.elementstack[-1][-1] = attrsD['lastmod']
            self._end_modified()
        if attrsD.has_key('href'):
            self._start_link({})
            self.elementstack[-1][-1] = attrsD['href']
            self._end_link()

    def _start_feed(self, attrsD):
        self.infeed = 1
        versionmap = {'0.1': 'atom01',
                      '0.2': 'atom02',
                      '0.3': 'atom03'}
        if not self.version:
            attr_version = attrsD.get('version')
            version = versionmap.get(attr_version)
            if version:
                self.version = version
            else:
                self.version = 'atom'

    def _end_channel(self):
        self.infeed = 0
    _end_feed = _end_channel

    def _start_image(self, attrsD):
        self.inimage = 1
        self.push('image', 0)
        context = self._getContext()
        context.setdefault('image', FeedParserDict())

    def _end_image(self):
        self.pop('image')
        self.inimage = 0

    def _start_textinput(self, attrsD):
        self.intextinput = 1
        self.push('textinput', 0)
        context = self._getContext()
        context.setdefault('textinput', FeedParserDict())
    _start_textInput = _start_textinput

    def _end_textinput(self):
        self.pop('textinput')
        self.intextinput = 0
    _end_textInput = _end_textinput

    def _start_author(self, attrsD):
        self.inauthor = 1
        self.push('author', 1)
    _start_managingeditor = _start_author
    _start_dc_author = _start_author
    _start_dc_creator = _start_author
    _start_itunes_author = _start_author

    def _end_author(self):
        self.pop('author')
        self.inauthor = 0
        self._sync_author_detail()
    _end_managingeditor = _end_author
    _end_dc_author = _end_author
    _end_dc_creator = _end_author
    _end_itunes_author = _end_author

    def _start_itunes_owner(self, attrsD):
        self.inpublisher = 1
        self.push('publisher', 0)

    def _end_itunes_owner(self):
        self.pop('publisher')
        self.inpublisher = 0
        self._sync_author_detail('publisher')

    def _start_contributor(self, attrsD):
        self.incontributor = 1
        context = self._getContext()
        context.setdefault('contributors', [])
        context['contributors'].append(FeedParserDict())
        self.push('contributor', 0)

    def _end_contributor(self):
        self.pop('contributor')
        self.incontributor = 0

    def _start_dc_contributor(self, attrsD):
        self.incontributor = 1
        context = self._getContext()
        context.setdefault('contributors', [])
        context['contributors'].append(FeedParserDict())
        self.push('name', 0)

    def _end_dc_contributor(self):
        self._end_name()
        self.incontributor = 0

    def _start_name(self, attrsD):
        self.push('name', 0)
    _start_itunes_name = _start_name

    def _end_name(self):
        value = self.pop('name')
        if self.inpublisher:
            self._save_author('name', value, 'publisher')
        elif self.inauthor:
            self._save_author('name', value)
        elif self.incontributor:
            self._save_contributor('name', value)
        elif self.intextinput:
            context = self._getContext()
            context['textinput']['name'] = value
    _end_itunes_name = _end_name

    def _start_width(self, attrsD):
        self.push('width', 0)

    def _end_width(self):
        value = self.pop('width')
        try:
            value = int(value)
        except:
            value = 0
        if self.inimage:
            context = self._getContext()
            context['image']['width'] = value

    def _start_height(self, attrsD):
        self.push('height', 0)

    def _end_height(self):
        value = self.pop('height')
        try:
            value = int(value)
        except:
            value = 0
        if self.inimage:
            context = self._getContext()
            context['image']['height'] = value

    def _start_url(self, attrsD):
        self.push('href', 1)
    _start_homepage = _start_url
    _start_uri = _start_url

    def _end_url(self):
        value = self.pop('href')
        if self.inauthor:
            self._save_author('href', value)
        elif self.incontributor:
            self._save_contributor('href', value)
        elif self.inimage:
            context = self._getContext()
            context['image']['href'] = value
        elif self.intextinput:
            context = self._getContext()
            context['textinput']['link'] = value
    _end_homepage = _end_url
    _end_uri = _end_url

    def _start_email(self, attrsD):
        self.push('email', 0)
    _start_itunes_email = _start_email

    def _end_email(self):
        value = self.pop('email')
        if self.inpublisher:
            self._save_author('email', value, 'publisher')
        elif self.inauthor:
            self._save_author('email', value)
        elif self.incontributor:
            self._save_contributor('email', value)
    _end_itunes_email = _end_email

    def _getContext(self):
        if self.insource:
            context = self.sourcedata
        elif self.inentry:
            context = self.entries[-1]
        else:
            context = self.feeddata
        return context

    def _save_author(self, key, value, prefix='author'):
        context = self._getContext()
        context.setdefault(prefix + '_detail', FeedParserDict())
        context[prefix + '_detail'][key] = value
        self._sync_author_detail()

    def _save_contributor(self, key, value):
        context = self._getContext()
        context.setdefault('contributors', [FeedParserDict()])
        context['contributors'][-1][key] = value

    def _sync_author_detail(self, key='author'):
        context = self._getContext()
        detail = context.get('%s_detail' % key)
        if detail:
            name = detail.get('name')
            email = detail.get('email')
            if name and email:
                context[key] = '%s (%s)' % (name, email)
            elif name:
                context[key] = name
            elif email:
                context[key] = email
        else:
            author = context.get(key)
            if not author: return
            emailmatch = re.search(r'''(([a-zA-Z0-9\_\-\.\+]+)@((\[[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}\.)|(([a-zA-Z0-9\-]+\.)+))([a-zA-Z]{2,4}|[0-9]{1,3})(\]?))''', author)
            if not emailmatch: return
            email = emailmatch.group(0)
            # probably a better way to do the following, but it passes all the tests
            author = author.replace(email, '')
            author = author.replace('()', '')
            author = author.strip()
            if author and (author[0] == '('):
                author = author[1:]
            if author and (author[-1] == ')'):
                author = author[:-1]
            author = author.strip()
            context.setdefault('%s_detail' % key, FeedParserDict())
            context['%s_detail' % key]['name'] = author
            context['%s_detail' % key]['email'] = email

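    # e.g. an RSS author of 'John Doe (jdoe@example.com)' is split into
    # author_detail = {'name': 'John Doe', 'email': 'jdoe@example.com'},
    # while Atom-style separate <name> and <email> values are joined the
    # other way, into author = 'John Doe (jdoe@example.com)'.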
    def _start_subtitle(self, attrsD):
        self.pushContent('subtitle', attrsD, 'text/plain', 1)
    _start_tagline = _start_subtitle
    _start_itunes_subtitle = _start_subtitle

    def _end_subtitle(self):
        self.popContent('subtitle')
    _end_tagline = _end_subtitle
    _end_itunes_subtitle = _end_subtitle

    def _start_rights(self, attrsD):
        self.pushContent('rights', attrsD, 'text/plain', 1)
    _start_dc_rights = _start_rights
    _start_copyright = _start_rights

    def _end_rights(self):
        self.popContent('rights')
    _end_dc_rights = _end_rights
    _end_copyright = _end_rights

    def _start_item(self, attrsD):
        self.entries.append(FeedParserDict())
        self.push('item', 0)
        self.inentry = 1
        self.guidislink = 0
        id = self._getAttribute(attrsD, 'rdf:about')
        if id:
            context = self._getContext()
            context['id'] = id
        self._cdf_common(attrsD)
    _start_entry = _start_item
    _start_product = _start_item

    def _end_item(self):
        self.pop('item')
        self.inentry = 0
    _end_entry = _end_item

    def _start_dc_language(self, attrsD):
        self.push('language', 1)
    _start_language = _start_dc_language

    def _end_dc_language(self):
        self.lang = self.pop('language')
    _end_language = _end_dc_language

    def _start_dc_publisher(self, attrsD):
        self.push('publisher', 1)
    _start_webmaster = _start_dc_publisher

    def _end_dc_publisher(self):
        self.pop('publisher')
        self._sync_author_detail('publisher')
    _end_webmaster = _end_dc_publisher

    def _start_published(self, attrsD):
        self.push('published', 1)
    _start_dcterms_issued = _start_published
    _start_issued = _start_published

    def _end_published(self):
        value = self.pop('published')
        self._save('published_parsed', _parse_date(value))
    _end_dcterms_issued = _end_published
    _end_issued = _end_published

  1060     def _start_updated(self, attrsD):
       
  1061         self.push('updated', 1)
       
  1062     _start_modified = _start_updated
       
  1063     _start_dcterms_modified = _start_updated
       
  1064     _start_pubdate = _start_updated
       
  1065     _start_dc_date = _start_updated
       
  1066 
       
  1067     def _end_updated(self):
       
  1068         value = self.pop('updated')
       
  1069         parsed_value = _parse_date(value)
       
  1070         self._save('updated_parsed', parsed_value)
       
  1071     _end_modified = _end_updated
       
  1072     _end_dcterms_modified = _end_updated
       
  1073     _end_pubdate = _end_updated
       
  1074     _end_dc_date = _end_updated
       
  1075 
       
  1076     def _start_created(self, attrsD):
       
  1077         self.push('created', 1)
       
  1078     _start_dcterms_created = _start_created
       
  1079 
       
  1080     def _end_created(self):
       
  1081         value = self.pop('created')
       
  1082         self._save('created_parsed', _parse_date(value))
       
  1083     _end_dcterms_created = _end_created
       
  1084 
       
  1085     def _start_expirationdate(self, attrsD):
       
  1086         self.push('expired', 1)
       
  1087 
       
  1088     def _end_expirationdate(self):
       
  1089         self._save('expired_parsed', _parse_date(self.pop('expired')))
       
  1090 
       
  1091     def _start_cc_license(self, attrsD):
       
  1092         self.push('license', 1)
       
  1093         value = self._getAttribute(attrsD, 'rdf:resource')
       
  1094         if value:
       
  1095             self.elementstack[-1][2].append(value)
       
  1096         self.pop('license')
       
  1097         
       
  1098     def _start_creativecommons_license(self, attrsD):
       
  1099         self.push('license', 1)
       
  1100 
       
  1101     def _end_creativecommons_license(self):
       
  1102         self.pop('license')
       
  1103 
       
  1104     def _addTag(self, term, scheme, label):
       
  1105         context = self._getContext()
       
  1106         tags = context.setdefault('tags', [])
       
  1107         if (not term) and (not scheme) and (not label): return
       
  1108         value = FeedParserDict({'term': term, 'scheme': scheme, 'label': label})
       
  1109         if value not in tags:
       
   1110             tags.append(value)
       
  1111 
       
  1112     def _start_category(self, attrsD):
       
  1113         if _debug: sys.stderr.write('entering _start_category with %s\n' % repr(attrsD))
       
  1114         term = attrsD.get('term')
       
  1115         scheme = attrsD.get('scheme', attrsD.get('domain'))
       
  1116         label = attrsD.get('label')
       
  1117         self._addTag(term, scheme, label)
       
  1118         self.push('category', 1)
       
  1119     _start_dc_subject = _start_category
       
  1120     _start_keywords = _start_category
       
  1121         
       
  1122     def _end_itunes_keywords(self):
       
  1123         for term in self.pop('itunes_keywords').split():
       
  1124             self._addTag(term, 'http://www.itunes.com/', None)
       
  1125         
       
  1126     def _start_itunes_category(self, attrsD):
       
  1127         self._addTag(attrsD.get('text'), 'http://www.itunes.com/', None)
       
  1128         self.push('category', 1)
       
  1129         
       
  1130     def _end_category(self):
       
  1131         value = self.pop('category')
       
  1132         if not value: return
       
  1133         context = self._getContext()
       
  1134         tags = context['tags']
       
  1135         if value and len(tags) and not tags[-1]['term']:
       
  1136             tags[-1]['term'] = value
       
  1137         else:
       
  1138             self._addTag(value, None, None)
       
  1139     _end_dc_subject = _end_category
       
  1140     _end_keywords = _end_category
       
  1141     _end_itunes_category = _end_category
       
  1142 
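    # Illustrative sketch (hypothetical values): an RSS category such as
    #
    #     <category domain="http://example.org/cats">news</category>
    #
    # first passes through _start_category, which records a tag with
    # term=None and scheme taken from the 'domain' attribute; _end_category
    # then fills in the pending term from the element's text, yielding
    #
    #     {'term': 'news', 'scheme': 'http://example.org/cats', 'label': None}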
       
  1143     def _start_cloud(self, attrsD):
       
  1144         self._getContext()['cloud'] = FeedParserDict(attrsD)
       
  1145         
       
  1146     def _start_link(self, attrsD):
       
  1147         attrsD.setdefault('rel', 'alternate')
       
  1148         attrsD.setdefault('type', 'text/html')
       
  1149         attrsD = self._itsAnHrefDamnIt(attrsD)
       
  1150         if attrsD.has_key('href'):
       
  1151             attrsD['href'] = self.resolveURI(attrsD['href'])
       
  1152         expectingText = self.infeed or self.inentry or self.insource
       
  1153         context = self._getContext()
       
  1154         context.setdefault('links', [])
       
  1155         context['links'].append(FeedParserDict(attrsD))
       
  1156         if attrsD['rel'] == 'enclosure':
       
  1157             self._start_enclosure(attrsD)
       
  1158         if attrsD.has_key('href'):
       
  1159             expectingText = 0
       
  1160             if (attrsD.get('rel') == 'alternate') and (self.mapContentType(attrsD.get('type')) in self.html_types):
       
  1161                 context['link'] = attrsD['href']
       
  1162         else:
       
  1163             self.push('link', expectingText)
       
  1164     _start_producturl = _start_link
       
  1165 
       
  1166     def _end_link(self):
       
  1167         value = self.pop('link')
       
  1168         context = self._getContext()
       
  1169         if self.intextinput:
       
  1170             context['textinput']['link'] = value
       
  1171         if self.inimage:
       
  1172             context['image']['link'] = value
       
  1173     _end_producturl = _end_link
       
  1174 
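    # Illustrative sketch (hypothetical URL): for an Atom link such as
    #
    #     <link rel="alternate" type="text/html" href="http://example.org/"/>
    #
    # _start_link appends the attributes to context['links'] and, because
    # rel is 'alternate' and the type maps to an HTML type, also exposes
    # the resolved href as context['link'].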
       
  1175     def _start_guid(self, attrsD):
       
  1176         self.guidislink = (attrsD.get('ispermalink', 'true') == 'true')
       
  1177         self.push('id', 1)
       
  1178 
       
  1179     def _end_guid(self):
       
  1180         value = self.pop('id')
       
  1181         self._save('guidislink', self.guidislink and not self._getContext().has_key('link'))
       
  1182         if self.guidislink:
       
  1183             # guid acts as link, but only if 'ispermalink' is not present or is 'true',
       
  1184             # and only if the item doesn't already have a link element
       
  1185             self._save('link', value)
       
  1186 
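    # Illustrative sketch (hypothetical values): given
    #
    #     <guid isPermaLink="true">http://example.org/post/1</guid>
    #
    # and no sibling <link>, the guid value is saved both as the entry's
    # 'id' and as its 'link'; with isPermaLink="false", or when a link
    # already exists, it is saved only as the 'id'.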
       
  1187     def _start_title(self, attrsD):
       
  1188         self.pushContent('title', attrsD, 'text/plain', self.infeed or self.inentry or self.insource)
       
  1189     _start_dc_title = _start_title
       
  1190     _start_media_title = _start_title
       
  1191 
       
  1192     def _end_title(self):
       
  1193         value = self.popContent('title')
       
  1194         context = self._getContext()
       
  1195         if self.intextinput:
       
  1196             context['textinput']['title'] = value
       
  1197         elif self.inimage:
       
  1198             context['image']['title'] = value
       
  1199     _end_dc_title = _end_title
       
  1200     _end_media_title = _end_title
       
  1201 
       
  1202     def _start_description(self, attrsD):
       
  1203         context = self._getContext()
       
  1204         if context.has_key('summary'):
       
  1205             self._summaryKey = 'content'
       
  1206             self._start_content(attrsD)
       
  1207         else:
       
  1208             self.pushContent('description', attrsD, 'text/html', self.infeed or self.inentry or self.insource)
       
  1209 
       
  1210     def _start_abstract(self, attrsD):
       
  1211         self.pushContent('description', attrsD, 'text/plain', self.infeed or self.inentry or self.insource)
       
  1212 
       
  1213     def _end_description(self):
       
  1214         if self._summaryKey == 'content':
       
  1215             self._end_content()
       
  1216         else:
       
  1217             value = self.popContent('description')
       
  1218             context = self._getContext()
       
  1219             if self.intextinput:
       
  1220                 context['textinput']['description'] = value
       
  1221             elif self.inimage:
       
  1222                 context['image']['description'] = value
       
  1223         self._summaryKey = None
       
  1224     _end_abstract = _end_description
       
  1225 
       
  1226     def _start_info(self, attrsD):
       
  1227         self.pushContent('info', attrsD, 'text/plain', 1)
       
  1228     _start_feedburner_browserfriendly = _start_info
       
  1229 
       
  1230     def _end_info(self):
       
  1231         self.popContent('info')
       
  1232     _end_feedburner_browserfriendly = _end_info
       
  1233 
       
  1234     def _start_generator(self, attrsD):
       
  1235         if attrsD:
       
  1236             attrsD = self._itsAnHrefDamnIt(attrsD)
       
  1237             if attrsD.has_key('href'):
       
  1238                 attrsD['href'] = self.resolveURI(attrsD['href'])
       
  1239         self._getContext()['generator_detail'] = FeedParserDict(attrsD)
       
  1240         self.push('generator', 1)
       
  1241 
       
  1242     def _end_generator(self):
       
  1243         value = self.pop('generator')
       
  1244         context = self._getContext()
       
  1245         if context.has_key('generator_detail'):
       
  1246             context['generator_detail']['name'] = value
       
  1247             
       
  1248     def _start_admin_generatoragent(self, attrsD):
       
  1249         self.push('generator', 1)
       
  1250         value = self._getAttribute(attrsD, 'rdf:resource')
       
  1251         if value:
       
  1252             self.elementstack[-1][2].append(value)
       
  1253         self.pop('generator')
       
  1254         self._getContext()['generator_detail'] = FeedParserDict({'href': value})
       
  1255 
       
  1256     def _start_admin_errorreportsto(self, attrsD):
       
  1257         self.push('errorreportsto', 1)
       
  1258         value = self._getAttribute(attrsD, 'rdf:resource')
       
  1259         if value:
       
  1260             self.elementstack[-1][2].append(value)
       
  1261         self.pop('errorreportsto')
       
  1262         
       
  1263     def _start_summary(self, attrsD):
       
  1264         context = self._getContext()
       
  1265         if context.has_key('summary'):
       
  1266             self._summaryKey = 'content'
       
  1267             self._start_content(attrsD)
       
  1268         else:
       
  1269             self._summaryKey = 'summary'
       
  1270             self.pushContent(self._summaryKey, attrsD, 'text/plain', 1)
       
  1271     _start_itunes_summary = _start_summary
       
  1272 
       
  1273     def _end_summary(self):
       
  1274         if self._summaryKey == 'content':
       
  1275             self._end_content()
       
  1276         else:
       
  1277             self.popContent(self._summaryKey or 'summary')
       
  1278         self._summaryKey = None
       
  1279     _end_itunes_summary = _end_summary
       
  1280         
       
  1281     def _start_enclosure(self, attrsD):
       
  1282         attrsD = self._itsAnHrefDamnIt(attrsD)
       
  1283         self._getContext().setdefault('enclosures', []).append(FeedParserDict(attrsD))
       
  1284         href = attrsD.get('href')
       
  1285         if href:
       
  1286             context = self._getContext()
       
  1287             if not context.get('id'):
       
  1288                 context['id'] = href
       
  1289             
       
  1290     def _start_source(self, attrsD):
       
  1291         self.insource = 1
       
  1292 
       
  1293     def _end_source(self):
       
  1294         self.insource = 0
       
  1295         self._getContext()['source'] = copy.deepcopy(self.sourcedata)
       
  1296         self.sourcedata.clear()
       
  1297 
       
  1298     def _start_content(self, attrsD):
       
  1299         self.pushContent('content', attrsD, 'text/plain', 1)
       
  1300         src = attrsD.get('src')
       
  1301         if src:
       
  1302             self.contentparams['src'] = src
       
  1303         self.push('content', 1)
       
  1304 
       
  1305     def _start_prodlink(self, attrsD):
       
  1306         self.pushContent('content', attrsD, 'text/html', 1)
       
  1307 
       
  1308     def _start_body(self, attrsD):
       
  1309         self.pushContent('content', attrsD, 'application/xhtml+xml', 1)
       
  1310     _start_xhtml_body = _start_body
       
  1311 
       
  1312     def _start_content_encoded(self, attrsD):
       
  1313         self.pushContent('content', attrsD, 'text/html', 1)
       
  1314     _start_fullitem = _start_content_encoded
       
  1315 
       
  1316     def _end_content(self):
       
  1317         copyToDescription = self.mapContentType(self.contentparams.get('type')) in (['text/plain'] + self.html_types)
       
  1318         value = self.popContent('content')
       
  1319         if copyToDescription:
       
  1320             self._save('description', value)
       
  1321     _end_body = _end_content
       
  1322     _end_xhtml_body = _end_content
       
  1323     _end_content_encoded = _end_content
       
  1324     _end_fullitem = _end_content
       
  1325     _end_prodlink = _end_content
       
  1326 
       
  1327     def _start_itunes_image(self, attrsD):
       
  1328         self.push('itunes_image', 0)
       
  1329         self._getContext()['image'] = FeedParserDict({'href': attrsD.get('href')})
       
  1330     _start_itunes_link = _start_itunes_image
       
  1331         
       
  1332     def _end_itunes_block(self):
       
  1333         value = self.pop('itunes_block', 0)
       
  1334         self._getContext()['itunes_block'] = (value == 'yes') and 1 or 0
       
  1335 
       
  1336     def _end_itunes_explicit(self):
       
  1337         value = self.pop('itunes_explicit', 0)
       
  1338         self._getContext()['itunes_explicit'] = (value == 'yes') and 1 or 0
       
  1339 
       
  1340 if _XML_AVAILABLE:
       
  1341     class _StrictFeedParser(_FeedParserMixin, xml.sax.handler.ContentHandler):
       
  1342         def __init__(self, baseuri, baselang, encoding):
       
  1343             if _debug: sys.stderr.write('trying StrictFeedParser\n')
       
  1344             xml.sax.handler.ContentHandler.__init__(self)
       
  1345             _FeedParserMixin.__init__(self, baseuri, baselang, encoding)
       
  1346             self.bozo = 0
       
  1347             self.exc = None
       
  1348         
       
  1349         def startPrefixMapping(self, prefix, uri):
       
  1350             self.trackNamespace(prefix, uri)
       
  1351         
       
  1352         def startElementNS(self, name, qname, attrs):
       
  1353             namespace, localname = name
       
  1354             lowernamespace = str(namespace or '').lower()
       
   1355             if lowernamespace.find('backend.userland.com/rss') != -1:
       
  1356                 # match any backend.userland.com namespace
       
  1357                 namespace = 'http://backend.userland.com/rss'
       
  1358                 lowernamespace = namespace
       
  1359             if qname and qname.find(':') > 0:
       
  1360                 givenprefix = qname.split(':')[0]
       
  1361             else:
       
  1362                 givenprefix = None
       
  1363             prefix = self._matchnamespaces.get(lowernamespace, givenprefix)
       
   1364             if givenprefix and (prefix is None or (prefix == '' and lowernamespace == '')) and not self.namespacesInUse.has_key(givenprefix):

   1365                 raise UndeclaredNamespace, "'%s' is not associated with a namespace" % givenprefix
       
  1366             if prefix:
       
  1367                 localname = prefix + ':' + localname
       
  1368             localname = str(localname).lower()
       
  1369             if _debug: sys.stderr.write('startElementNS: qname = %s, namespace = %s, givenprefix = %s, prefix = %s, attrs = %s, localname = %s\n' % (qname, namespace, givenprefix, prefix, attrs.items(), localname))
       
  1370 
       
  1371             # qname implementation is horribly broken in Python 2.1 (it
       
  1372             # doesn't report any), and slightly broken in Python 2.2 (it
       
  1373             # doesn't report the xml: namespace). So we match up namespaces
       
  1374             # with a known list first, and then possibly override them with
       
  1375             # the qnames the SAX parser gives us (if indeed it gives us any
       
  1376             # at all).  Thanks to MatejC for helping me test this and
       
  1377             # tirelessly telling me that it didn't work yet.
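            # For example (assuming the usual Dublin Core entry in
            # _matchnamespaces): an element in namespace
            # 'http://purl.org/dc/elements/1.1/' maps to prefix 'dc', so a
            # 'creator' element is dispatched below as 'dc:creator'.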
       
  1378             attrsD = {}
       
  1379             for (namespace, attrlocalname), attrvalue in attrs._attrs.items():
       
  1380                 lowernamespace = (namespace or '').lower()
       
  1381                 prefix = self._matchnamespaces.get(lowernamespace, '')
       
  1382                 if prefix:
       
  1383                     attrlocalname = prefix + ':' + attrlocalname
       
  1384                 attrsD[str(attrlocalname).lower()] = attrvalue
       
  1385             for qname in attrs.getQNames():
       
  1386                 attrsD[str(qname).lower()] = attrs.getValueByQName(qname)
       
  1387             self.unknown_starttag(localname, attrsD.items())
       
  1388 
       
  1389         def characters(self, text):
       
  1390             self.handle_data(text)
       
  1391 
       
  1392         def endElementNS(self, name, qname):
       
  1393             namespace, localname = name
       
  1394             lowernamespace = str(namespace or '').lower()
       
  1395             if qname and qname.find(':') > 0:
       
  1396                 givenprefix = qname.split(':')[0]
       
  1397             else:
       
  1398                 givenprefix = ''
       
  1399             prefix = self._matchnamespaces.get(lowernamespace, givenprefix)
       
  1400             if prefix:
       
  1401                 localname = prefix + ':' + localname
       
  1402             localname = str(localname).lower()
       
  1403             self.unknown_endtag(localname)
       
  1404 
       
  1405         def error(self, exc):
       
  1406             self.bozo = 1
       
  1407             self.exc = exc
       
  1408             
       
  1409         def fatalError(self, exc):
       
  1410             self.error(exc)
       
  1411             raise exc
       
  1412 
       
  1413 class _BaseHTMLProcessor(sgmllib.SGMLParser):
       
  1414     elements_no_end_tag = ['area', 'base', 'basefont', 'br', 'col', 'frame', 'hr',
       
  1415       'img', 'input', 'isindex', 'link', 'meta', 'param']
       
  1416     
       
  1417     def __init__(self, encoding):
       
  1418         self.encoding = encoding
       
  1419         if _debug: sys.stderr.write('entering BaseHTMLProcessor, encoding=%s\n' % self.encoding)
       
  1420         sgmllib.SGMLParser.__init__(self)
       
  1421         
       
  1422     def reset(self):
       
  1423         self.pieces = []
       
  1424         sgmllib.SGMLParser.reset(self)
       
  1425 
       
  1426     def _shorttag_replace(self, match):
       
  1427         tag = match.group(1)
       
  1428         if tag in self.elements_no_end_tag:
       
  1429             return '<' + tag + ' />'
       
  1430         else:
       
  1431             return '<' + tag + '></' + tag + '>'
       
  1432         
       
  1433     def feed(self, data):
       
  1434         data = re.compile(r'<!((?!DOCTYPE|--|\[))', re.IGNORECASE).sub(r'&lt;!\1', data)
       
  1435         #data = re.sub(r'<(\S+?)\s*?/>', self._shorttag_replace, data) # bug [ 1399464 ] Bad regexp for _shorttag_replace
       
  1436         data = re.sub(r'<([^<\s]+?)\s*/>', self._shorttag_replace, data) 
       
  1437         data = data.replace('&#39;', "'")
       
  1438         data = data.replace('&#34;', '"')
       
  1439         if self.encoding and type(data) == type(u''):
       
  1440             data = data.encode(self.encoding)
       
  1441         sgmllib.SGMLParser.feed(self, data)
       
  1442 
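    # Illustrative sketch: the short-tag rewrite above means that
    #
    #     <br/>    becomes  <br />          (no end tag allowed)
    #     <span/>  becomes  <span></span>   (end tag required)
    #
    # before sgmllib ever sees the markup, since SGMLParser does not
    # understand XML-style empty-element tags on its own.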
       
  1443     def normalize_attrs(self, attrs):
       
  1444         # utility method to be called by descendants
       
  1445         attrs = [(k.lower(), v) for k, v in attrs]
       
  1446         attrs = [(k, k in ('rel', 'type') and v.lower() or v) for k, v in attrs]
       
  1447         return attrs
       
  1448 
       
  1449     def unknown_starttag(self, tag, attrs):
       
  1450         # called for each start tag
       
  1451         # attrs is a list of (attr, value) tuples
       
  1452         # e.g. for <pre class='screen'>, tag='pre', attrs=[('class', 'screen')]
       
  1453         if _debug: sys.stderr.write('_BaseHTMLProcessor, unknown_starttag, tag=%s\n' % tag)
       
  1454         uattrs = []
       
  1455         # thanks to Kevin Marks for this breathtaking hack to deal with (valid) high-bit attribute values in UTF-8 feeds
       
  1456         for key, value in attrs:
       
  1457             if type(value) != type(u''):
       
  1458                 value = unicode(value, self.encoding)
       
  1459             uattrs.append((unicode(key, self.encoding), value))
       
  1460         strattrs = u''.join([u' %s="%s"' % (key, value) for key, value in uattrs]).encode(self.encoding)
       
  1461         if tag in self.elements_no_end_tag:
       
  1462             self.pieces.append('<%(tag)s%(strattrs)s />' % locals())
       
  1463         else:
       
  1464             self.pieces.append('<%(tag)s%(strattrs)s>' % locals())
       
  1465 
       
  1466     def unknown_endtag(self, tag):
       
  1467         # called for each end tag, e.g. for </pre>, tag will be 'pre'
       
  1468         # Reconstruct the original end tag.
       
  1469         if tag not in self.elements_no_end_tag:
       
  1470             self.pieces.append("</%(tag)s>" % locals())
       
  1471 
       
  1472     def handle_charref(self, ref):
       
  1473         # called for each character reference, e.g. for '&#160;', ref will be '160'
       
  1474         # Reconstruct the original character reference.
       
  1475         self.pieces.append('&#%(ref)s;' % locals())
       
  1476         
       
  1477     def handle_entityref(self, ref):
       
  1478         # called for each entity reference, e.g. for '&copy;', ref will be 'copy'
       
  1479         # Reconstruct the original entity reference.
       
  1480         self.pieces.append('&%(ref)s;' % locals())
       
  1481 
       
  1482     def handle_data(self, text):
       
  1483         # called for each block of plain text, i.e. outside of any tag and
       
  1484         # not containing any character or entity references
       
  1485         # Store the original text verbatim.
       
  1486         if _debug: sys.stderr.write('_BaseHTMLProcessor, handle_text, text=%s\n' % text)
       
  1487         self.pieces.append(text)
       
  1488         
       
  1489     def handle_comment(self, text):
       
  1490         # called for each HTML comment, e.g. <!-- insert Javascript code here -->
       
  1491         # Reconstruct the original comment.
       
  1492         self.pieces.append('<!--%(text)s-->' % locals())
       
  1493         
       
  1494     def handle_pi(self, text):
       
  1495         # called for each processing instruction, e.g. <?instruction>
       
  1496         # Reconstruct original processing instruction.
       
  1497         self.pieces.append('<?%(text)s>' % locals())
       
  1498 
       
  1499     def handle_decl(self, text):
       
  1500         # called for the DOCTYPE, if present, e.g.
       
  1501         # <!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN"
       
  1502         #     "http://www.w3.org/TR/html4/loose.dtd">
       
  1503         # Reconstruct original DOCTYPE
       
  1504         self.pieces.append('<!%(text)s>' % locals())
       
  1505         
       
  1506     _new_declname_match = re.compile(r'[a-zA-Z][-_.a-zA-Z0-9:]*\s*').match
       
  1507     def _scan_name(self, i, declstartpos):
       
  1508         rawdata = self.rawdata
       
  1509         n = len(rawdata)
       
  1510         if i == n:
       
  1511             return None, -1
       
  1512         m = self._new_declname_match(rawdata, i)
       
  1513         if m:
       
  1514             s = m.group()
       
  1515             name = s.strip()
       
  1516             if (i + len(s)) == n:
       
  1517                 return None, -1  # end of buffer
       
  1518             return name.lower(), m.end()
       
  1519         else:
       
  1520             self.handle_data(rawdata)
       
  1521 #            self.updatepos(declstartpos, i)
       
  1522             return None, -1
       
  1523 
       
  1524     def output(self):
       
  1525         '''Return processed HTML as a single string'''
       
  1526         return ''.join([str(p) for p in self.pieces])
       
  1527 
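# A minimal usage sketch for _BaseHTMLProcessor (illustrative, not part of
# the original module); note how the empty 'img' element is normalized:
#
#     >>> p = _BaseHTMLProcessor('utf-8')
#     >>> p.feed('<img src="x.gif"><p>hello</p>')
#     >>> p.output()
#     '<img src="x.gif" /><p>hello</p>'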
       
  1528 class _LooseFeedParser(_FeedParserMixin, _BaseHTMLProcessor):
       
  1529     def __init__(self, baseuri, baselang, encoding):
       
  1530         sgmllib.SGMLParser.__init__(self)
       
  1531         _FeedParserMixin.__init__(self, baseuri, baselang, encoding)
       
  1532 
       
  1533     def decodeEntities(self, element, data):
       
  1534         data = data.replace('&#60;', '&lt;')
       
  1535         data = data.replace('&#x3c;', '&lt;')
       
  1536         data = data.replace('&#62;', '&gt;')
       
  1537         data = data.replace('&#x3e;', '&gt;')
       
  1538         data = data.replace('&#38;', '&amp;')
       
  1539         data = data.replace('&#x26;', '&amp;')
       
  1540         data = data.replace('&#34;', '&quot;')
       
  1541         data = data.replace('&#x22;', '&quot;')
       
  1542         data = data.replace('&#39;', '&apos;')
       
  1543         data = data.replace('&#x27;', '&apos;')
       
  1544         if self.contentparams.has_key('type') and not self.contentparams.get('type', 'xml').endswith('xml'):
       
  1545             data = data.replace('&lt;', '<')
       
  1546             data = data.replace('&gt;', '>')
       
  1547             data = data.replace('&amp;', '&')
       
  1548             data = data.replace('&quot;', '"')
       
  1549             data = data.replace('&apos;', "'")
       
  1550         return data
       
  1551         
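# Illustrative sketch of the rule above: inside non-XML content (e.g. when
# contentparams['type'] is 'text/html'), numeric entities are first
# canonicalized and then unescaped all the way to literal characters, so
# '&#60;b&#62;' comes back as '<b>'; inside an XML content type the same
# input stays escaped as '&lt;b&gt;'.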
       
  1552 class _RelativeURIResolver(_BaseHTMLProcessor):
       
  1553     relative_uris = [('a', 'href'),
       
  1554                      ('applet', 'codebase'),
       
  1555                      ('area', 'href'),
       
  1556                      ('blockquote', 'cite'),
       
  1557                      ('body', 'background'),
       
  1558                      ('del', 'cite'),
       
  1559                      ('form', 'action'),
       
  1560                      ('frame', 'longdesc'),
       
  1561                      ('frame', 'src'),
       
  1562                      ('iframe', 'longdesc'),
       
  1563                      ('iframe', 'src'),
       
  1564                      ('head', 'profile'),
       
  1565                      ('img', 'longdesc'),
       
  1566                      ('img', 'src'),
       
  1567                      ('img', 'usemap'),
       
  1568                      ('input', 'src'),
       
  1569                      ('input', 'usemap'),
       
  1570                      ('ins', 'cite'),
       
  1571                      ('link', 'href'),
       
  1572                      ('object', 'classid'),
       
  1573                      ('object', 'codebase'),
       
  1574                      ('object', 'data'),
       
  1575                      ('object', 'usemap'),
       
  1576                      ('q', 'cite'),
       
  1577                      ('script', 'src')]
       
  1578 
       
  1579     def __init__(self, baseuri, encoding):
       
  1580         _BaseHTMLProcessor.__init__(self, encoding)
       
  1581         self.baseuri = baseuri
       
  1582 
       
  1583     def resolveURI(self, uri):
       
  1584         return _urljoin(self.baseuri, uri)
       
  1585     
       
  1586     def unknown_starttag(self, tag, attrs):
       
  1587         attrs = self.normalize_attrs(attrs)
       
  1588         attrs = [(key, ((tag, key) in self.relative_uris) and self.resolveURI(value) or value) for key, value in attrs]
       
  1589         _BaseHTMLProcessor.unknown_starttag(self, tag, attrs)
       
  1590         
       
  1591 def _resolveRelativeURIs(htmlSource, baseURI, encoding):
       
  1592     if _debug: sys.stderr.write('entering _resolveRelativeURIs\n')
       
  1593     p = _RelativeURIResolver(baseURI, encoding)
       
  1594     p.feed(htmlSource)
       
  1595     return p.output()
       
  1596 
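# Usage sketch (hypothetical URL):
#
#     >>> _resolveRelativeURIs('<a href="/about">x</a>',
#     ...                      'http://example.org/blog/', 'utf-8')
#     '<a href="http://example.org/about">x</a>'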
       
  1597 class _HTMLSanitizer(_BaseHTMLProcessor):
       
  1598     acceptable_elements = ['a', 'abbr', 'acronym', 'address', 'area', 'b', 'big',
       
  1599       'blockquote', 'br', 'button', 'caption', 'center', 'cite', 'code', 'col',
       
  1600       'colgroup', 'dd', 'del', 'dfn', 'dir', 'div', 'dl', 'dt', 'em', 'fieldset',
       
  1601       'font', 'form', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'hr', 'i', 'img', 'input',
       
  1602       'ins', 'kbd', 'label', 'legend', 'li', 'map', 'menu', 'ol', 'optgroup',
       
  1603       'option', 'p', 'pre', 'q', 's', 'samp', 'select', 'small', 'span', 'strike',
       
  1604       'strong', 'sub', 'sup', 'table', 'tbody', 'td', 'textarea', 'tfoot', 'th',
       
  1605       'thead', 'tr', 'tt', 'u', 'ul', 'var']
       
  1606 
       
  1607     acceptable_attributes = ['abbr', 'accept', 'accept-charset', 'accesskey',
       
  1608       'action', 'align', 'alt', 'axis', 'border', 'cellpadding', 'cellspacing',
       
  1609       'char', 'charoff', 'charset', 'checked', 'cite', 'class', 'clear', 'cols',
       
  1610       'colspan', 'color', 'compact', 'coords', 'datetime', 'dir', 'disabled',
       
  1611       'enctype', 'for', 'frame', 'headers', 'height', 'href', 'hreflang', 'hspace',
       
  1612       'id', 'ismap', 'label', 'lang', 'longdesc', 'maxlength', 'media', 'method',
       
  1613       'multiple', 'name', 'nohref', 'noshade', 'nowrap', 'prompt', 'readonly',
       
  1614       'rel', 'rev', 'rows', 'rowspan', 'rules', 'scope', 'selected', 'shape', 'size',
       
  1615       'span', 'src', 'start', 'summary', 'tabindex', 'target', 'title', 'type',
       
  1616       'usemap', 'valign', 'value', 'vspace', 'width']
       
  1617 
       
  1618     unacceptable_elements_with_end_tag = ['script', 'applet']
       
  1619 
       
  1620     def reset(self):
       
  1621         _BaseHTMLProcessor.reset(self)
       
  1622         self.unacceptablestack = 0
       
  1623         
       
  1624     def unknown_starttag(self, tag, attrs):
       
  1625         if not tag in self.acceptable_elements:
       
  1626             if tag in self.unacceptable_elements_with_end_tag:
       
  1627                 self.unacceptablestack += 1
       
  1628             return
       
  1629         attrs = self.normalize_attrs(attrs)
       
  1630         attrs = [(key, value) for key, value in attrs if key in self.acceptable_attributes]
       
  1631         _BaseHTMLProcessor.unknown_starttag(self, tag, attrs)
       
  1632         
       
  1633     def unknown_endtag(self, tag):
       
  1634         if not tag in self.acceptable_elements:
       
  1635             if tag in self.unacceptable_elements_with_end_tag:
       
  1636                 self.unacceptablestack -= 1
       
  1637             return
       
  1638         _BaseHTMLProcessor.unknown_endtag(self, tag)
       
  1639 
       
  1640     def handle_pi(self, text):
       
  1641         pass
       
  1642 
       
  1643     def handle_decl(self, text):
       
  1644         pass
       
  1645 
       
  1646     def handle_data(self, text):
       
  1647         if not self.unacceptablestack:
       
  1648             _BaseHTMLProcessor.handle_data(self, text)
       
  1649 
       
  1650 def _sanitizeHTML(htmlSource, encoding):
       
  1651     p = _HTMLSanitizer(encoding)
       
  1652     p.feed(htmlSource)
       
  1653     data = p.output()
       
  1654     if TIDY_MARKUP:
       
  1655         # loop through list of preferred Tidy interfaces looking for one that's installed,
       
  1656         # then set up a common _tidy function to wrap the interface-specific API.
       
  1657         _tidy = None
       
  1658         for tidy_interface in PREFERRED_TIDY_INTERFACES:
       
  1659             try:
       
  1660                 if tidy_interface == "uTidy":
       
  1661                     from tidy import parseString as _utidy
       
  1662                     def _tidy(data, **kwargs):
       
  1663                         return str(_utidy(data, **kwargs))
       
  1664                     break
       
  1665                 elif tidy_interface == "mxTidy":
       
  1666                     from mx.Tidy import Tidy as _mxtidy
       
  1667                     def _tidy(data, **kwargs):
       
  1668                         nerrors, nwarnings, data, errordata = _mxtidy.tidy(data, **kwargs)
       
  1669                         return data
       
  1670                     break
       
  1671             except:
       
  1672                 pass
       
  1673         if _tidy:
       
   1674             utf8 = type(data) == type(u'') # remember whether the caller gave us a unicode string
       
  1675             if utf8:
       
  1676                 data = data.encode('utf-8')
       
  1677             data = _tidy(data, output_xhtml=1, numeric_entities=1, wrap=0, char_encoding="utf8")
       
  1678             if utf8:
       
  1679                 data = unicode(data, 'utf-8')
       
  1680             if data.count('<body'):
       
  1681                 data = data.split('<body', 1)[1]
       
  1682                 if data.count('>'):
       
  1683                     data = data.split('>', 1)[1]
       
  1684             if data.count('</body'):
       
  1685                 data = data.split('</body', 1)[0]
       
  1686     data = data.strip().replace('\r\n', '\n')
       
  1687     return data
       
  1688 
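# Usage sketch (assuming TIDY_MARKUP is off or no Tidy interface is
# installed): disallowed elements are dropped along with their content,
# and disallowed attributes are stripped from the elements that remain:
#
#     >>> _sanitizeHTML('<p onclick="evil()">hi<script>alert(1)</script></p>', 'utf-8')
#     '<p>hi</p>'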
       
  1689 class _FeedURLHandler(urllib2.HTTPDigestAuthHandler, urllib2.HTTPRedirectHandler, urllib2.HTTPDefaultErrorHandler):
       
  1690     def http_error_default(self, req, fp, code, msg, headers):
       
  1691         if ((code / 100) == 3) and (code != 304):
       
  1692             return self.http_error_302(req, fp, code, msg, headers)
       
  1693         infourl = urllib.addinfourl(fp, headers, req.get_full_url())
       
  1694         infourl.status = code
       
  1695         return infourl
       
  1696 
       
  1697     def http_error_302(self, req, fp, code, msg, headers):
       
  1698         if headers.dict.has_key('location'):
       
  1699             infourl = urllib2.HTTPRedirectHandler.http_error_302(self, req, fp, code, msg, headers)
       
  1700         else:
       
  1701             infourl = urllib.addinfourl(fp, headers, req.get_full_url())
       
  1702         if not hasattr(infourl, 'status'):
       
  1703             infourl.status = code
       
  1704         return infourl
       
  1705 
       
  1706     def http_error_301(self, req, fp, code, msg, headers):
       
  1707         if headers.dict.has_key('location'):
       
  1708             infourl = urllib2.HTTPRedirectHandler.http_error_301(self, req, fp, code, msg, headers)
       
  1709         else:
       
  1710             infourl = urllib.addinfourl(fp, headers, req.get_full_url())
       
  1711         if not hasattr(infourl, 'status'):
       
  1712             infourl.status = code
       
  1713         return infourl
       
  1714 
       
  1715     http_error_300 = http_error_302
       
  1716     http_error_303 = http_error_302
       
  1717     http_error_307 = http_error_302
       
  1718         
       
  1719     def http_error_401(self, req, fp, code, msg, headers):
       
  1720         # Check if
       
  1721         # - server requires digest auth, AND
       
  1722         # - we tried (unsuccessfully) with basic auth, AND
       
  1723         # - we're using Python 2.3.3 or later (digest auth is irreparably broken in earlier versions)
       
  1724         # If all conditions hold, parse authentication information
       
  1725         # out of the Authorization header we sent the first time
       
  1726         # (for the username and password) and the WWW-Authenticate
       
  1727         # header the server sent back (for the realm) and retry
       
  1728         # the request with the appropriate digest auth headers instead.
       
  1729         # This evil genius hack has been brought to you by Aaron Swartz.
       
  1730         host = urlparse.urlparse(req.get_full_url())[1]
       
  1731         try:
       
  1732             assert sys.version.split()[0] >= '2.3.3'
       
  1733             assert base64 != None
       
  1734             user, passw = base64.decodestring(req.headers['Authorization'].split(' ')[1]).split(':')
       
  1735             realm = re.findall('realm="([^"]*)"', headers['WWW-Authenticate'])[0]
       
  1736             self.add_password(realm, host, user, passw)
       
  1737             retry = self.http_error_auth_reqed('www-authenticate', host, req, headers)
       
  1738             self.reset_retry_count()
       
  1739             return retry
       
  1740         except:
       
  1741             return self.http_error_default(req, fp, code, msg, headers)
       
  1742 
       
  1743 def _open_resource(url_file_stream_or_string, etag, modified, agent, referrer, handlers):
       
  1744     """URL, filename, or string --> stream
       
  1745 
       
  1746     This function lets you define parsers that take any input source
       
  1747     (URL, pathname to local or network file, or actual data as a string)
       
  1748     and deal with it in a uniform manner.  Returned object is guaranteed
       
  1749     to have all the basic stdio read methods (read, readline, readlines).
       
  1750     Just .close() the object when you're done with it.
       
  1751 
       
  1752     If the etag argument is supplied, it will be used as the value of an
       
  1753     If-None-Match request header.
       
  1754 
       
  1755     If the modified argument is supplied, it must be a tuple of 9 integers
       
  1756     as returned by gmtime() in the standard Python time module. This MUST
       
  1757     be in GMT (Greenwich Mean Time). The formatted date/time will be used
       
  1758     as the value of an If-Modified-Since request header.
       
  1759 
       
  1760     If the agent argument is supplied, it will be used as the value of a
       
  1761     User-Agent request header.
       
  1762 
       
  1763     If the referrer argument is supplied, it will be used as the value of a
       
  1764     Referer[sic] request header.
       
  1765 
       
  1766     If handlers is supplied, it is a list of handlers used to build a
       
  1767     urllib2 opener.
       
  1768     """
       
  1769 
       
  1770     if hasattr(url_file_stream_or_string, 'read'):
       
  1771         return url_file_stream_or_string
       
  1772 
       
  1773     if url_file_stream_or_string == '-':
       
  1774         return sys.stdin
       
  1775 
       
  1776     if urlparse.urlparse(url_file_stream_or_string)[0] in ('http', 'https', 'ftp'):
       
  1777         if not agent:
       
  1778             agent = USER_AGENT
       
  1779         # test for inline user:password for basic auth
       
  1780         auth = None
       
  1781         if base64:
       
  1782             urltype, rest = urllib.splittype(url_file_stream_or_string)
       
  1783             realhost, rest = urllib.splithost(rest)
       
  1784             if realhost:
       
  1785                 user_passwd, realhost = urllib.splituser(realhost)
       
  1786                 if user_passwd:
       
  1787                     url_file_stream_or_string = '%s://%s%s' % (urltype, realhost, rest)
       
  1788                     auth = base64.encodestring(user_passwd).strip()
       
  1789         # try to open with urllib2 (to use optional headers)
       
  1790         request = urllib2.Request(url_file_stream_or_string)
       
  1791         request.add_header('User-Agent', agent)
       
  1792         if etag:
       
  1793             request.add_header('If-None-Match', etag)
       
  1794         if modified:
       
  1795             # format into an RFC 1123-compliant timestamp. We can't use
       
  1796             # time.strftime() since the %a and %b directives can be affected
       
  1797             # by the current locale, but RFC 2616 states that dates must be
       
  1798             # in English.
       
  1799             short_weekdays = ['Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun']
       
  1800             months = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']
       
  1801             request.add_header('If-Modified-Since', '%s, %02d %s %04d %02d:%02d:%02d GMT' % (short_weekdays[modified[6]], modified[2], months[modified[1] - 1], modified[0], modified[3], modified[4], modified[5]))
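            # e.g. modified = time.gmtime(0) is formatted as
            # 'Thu, 01 Jan 1970 00:00:00 GMT'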
       
  1802         if referrer:
       
  1803             request.add_header('Referer', referrer)
       
  1804         if gzip and zlib:
       
  1805             request.add_header('Accept-encoding', 'gzip, deflate')
       
  1806         elif gzip:
       
  1807             request.add_header('Accept-encoding', 'gzip')
       
  1808         elif zlib:
       
  1809             request.add_header('Accept-encoding', 'deflate')
       
  1810         else:
       
  1811             request.add_header('Accept-encoding', '')
       
  1812         if auth:
       
  1813             request.add_header('Authorization', 'Basic %s' % auth)
       
  1814         if ACCEPT_HEADER:
       
  1815             request.add_header('Accept', ACCEPT_HEADER)
       
  1816         request.add_header('A-IM', 'feed') # RFC 3229 support
       
   1817         opener = urllib2.build_opener(*([_FeedURLHandler()] + handlers))
       
  1818         opener.addheaders = [] # RMK - must clear so we only send our custom User-Agent
       
  1819         try:
       
  1820             return opener.open(request)
       
  1821         finally:
       
  1822             opener.close() # JohnD
       
  1823     
       
  1824     # try to open with native open function (if url_file_stream_or_string is a filename)
       
  1825     try:
       
  1826         return open(url_file_stream_or_string)
       
  1827     except:
       
  1828         pass
       
  1829 
       
  1830     # treat url_file_stream_or_string as string
       
  1831     return _StringIO(str(url_file_stream_or_string))
       
  1832 
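# A minimal usage sketch (hypothetical URL): fetch a feed without
# conditional GET and with no extra urllib2 handlers, then read and close:
#
#     stream = _open_resource('http://example.org/atom.xml',
#                             None, None, USER_AGENT, None, [])
#     data = stream.read()
#     stream.close()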
       
  1833 _date_handlers = []
       
  1834 def registerDateHandler(func):
       
  1835     '''Register a date handler function (takes string, returns 9-tuple date in GMT)'''
       
  1836     _date_handlers.insert(0, func)
       
  1837     
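# A hypothetical sketch of the handler contract (not part of the original
# module): a handler takes the raw date string and returns a 9-tuple in
# GMT, or None to fall through to the next handler. Handlers are tried
# most-recently-registered first (note the insert at index 0 above).
#
#     def _parse_date_epoch(dateString):
#         '''Hypothetical handler: seconds-since-the-epoch strings'''
#         if not dateString.isdigit(): return
#         return time.gmtime(int(dateString))
#     registerDateHandler(_parse_date_epoch)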
       
  1838 # ISO-8601 date parsing routines written by Fazal Majid.
       
  1839 # The ISO 8601 standard is very convoluted and irregular - a full ISO 8601
       
  1840 # parser is beyond the scope of feedparser and would be a worthwhile addition
       
  1841 # to the Python library.
       
  1842 # A single regular expression cannot parse ISO 8601 date formats into groups
       
   1843 # as the standard is highly irregular (for instance, is 030104 the date
        
   1844 # 2003-01-04 or 0301-04-01?), so we use templates instead.
       
  1845 # Please note the order in templates is significant because we need a
       
  1846 # greedy match.
       
  1847 _iso8601_tmpl = ['YYYY-?MM-?DD', 'YYYY-MM', 'YYYY-?OOO',
       
  1848                 'YY-?MM-?DD', 'YY-?OOO', 'YYYY', 
       
  1849                 '-YY-?MM', '-OOO', '-YY',
       
  1850                 '--MM-?DD', '--MM',
       
  1851                 '---DD',
       
  1852                 'CC', '']
       
  1853 _iso8601_re = [
       
  1854     tmpl.replace(
       
  1855     'YYYY', r'(?P<year>\d{4})').replace(
       
  1856     'YY', r'(?P<year>\d\d)').replace(
       
  1857     'MM', r'(?P<month>[01]\d)').replace(
       
  1858     'DD', r'(?P<day>[0123]\d)').replace(
       
  1859     'OOO', r'(?P<ordinal>[0123]\d\d)').replace(
       
  1860     'CC', r'(?P<century>\d\d$)')
       
  1861     + r'(T?(?P<hour>\d{2}):(?P<minute>\d{2})'
       
  1862     + r'(:(?P<second>\d{2}))?'
       
  1863     + r'(?P<tz>[+-](?P<tzhour>\d{2})(:(?P<tzmin>\d{2}))?|Z)?)?'
       
  1864     for tmpl in _iso8601_tmpl]
       
  1865 del tmpl
       
  1866 _iso8601_matches = [re.compile(regex).match for regex in _iso8601_re]
       
  1867 del regex
       
  1868 def _parse_date_iso8601(dateString):
       
  1869     '''Parse a variety of ISO-8601-compatible formats like 20040105'''
       
  1870     m = None
       
  1871     for _iso8601_match in _iso8601_matches:
       
  1872         m = _iso8601_match(dateString)
       
  1873         if m: break
       
  1874     if not m: return
       
  1875     if m.span() == (0, 0): return
       
  1876     params = m.groupdict()
       
  1877     ordinal = params.get('ordinal', 0)
       
  1878     if ordinal:
       
  1879         ordinal = int(ordinal)
       
  1880     else:
       
  1881         ordinal = 0
       
  1882     year = params.get('year', '--')
       
  1883     if not year or year == '--':
       
  1884         year = time.gmtime()[0]
       
  1885     elif len(year) == 2:
       
  1886         # ISO 8601 assumes current century, i.e. 93 -> 2093, NOT 1993
       
  1887         year = 100 * int(time.gmtime()[0] / 100) + int(year)
       
  1888     else:
       
  1889         year = int(year)
       
  1890     month = params.get('month', '-')
       
  1891     if not month or month == '-':
       
  1892         # ordinals are NOT normalized by mktime, we simulate them
       
  1893         # by setting month=1, day=ordinal
       
  1894         if ordinal:
       
  1895             month = 1
       
  1896         else:
       
  1897             month = time.gmtime()[1]
       
  1898     month = int(month)
       
  1899     day = params.get('day', 0)
       
  1900     if not day:
       
  1901         # see above
       
  1902         if ordinal:
       
  1903             day = ordinal
       
  1904         elif params.get('century', 0) or \
       
  1905                  params.get('year', 0) or params.get('month', 0):
       
  1906             day = 1
       
  1907         else:
       
  1908             day = time.gmtime()[2]
       
  1909     else:
       
  1910         day = int(day)
       
  1911     # special case of the century - is the first year of the 21st century
       
  1912     # 2000 or 2001 ? The debate goes on...
       
  1913     if 'century' in params.keys():
       
  1914         year = (int(params['century']) - 1) * 100 + 1
       
  1915     # in ISO 8601 most fields are optional
       
  1916     for field in ['hour', 'minute', 'second', 'tzhour', 'tzmin']:
       
  1917         if not params.get(field, None):
       
  1918             params[field] = 0
       
  1919     hour = int(params.get('hour', 0))
       
  1920     minute = int(params.get('minute', 0))
       
  1921     second = int(params.get('second', 0))
       
  1922     # weekday is normalized by mktime(), we can ignore it
       
  1923     weekday = 0
       
  1924     # daylight savings is complex, but not needed for feedparser's purposes
       
  1925     # as time zones, if specified, include mention of whether it is active
       
   1926     # (e.g. PST vs. PDT, CET). Using -1 is implementation-dependent,
        
   1927     # and most implementations have DST bugs
       
  1928     daylight_savings_flag = 0
       
  1929     tm = [year, month, day, hour, minute, second, weekday,
       
  1930           ordinal, daylight_savings_flag]
       
  1931     # ISO 8601 time zone adjustments
       
  1932     tz = params.get('tz')
       
  1933     if tz and tz != 'Z':
       
  1934         if tz[0] == '-':
       
  1935             tm[3] += int(params.get('tzhour', 0))
       
  1936             tm[4] += int(params.get('tzmin', 0))
       
  1937         elif tz[0] == '+':
       
  1938             tm[3] -= int(params.get('tzhour', 0))
       
  1939             tm[4] -= int(params.get('tzmin', 0))
       
  1940         else:
       
  1941             return None
       
  1942     # Python's time.mktime() is a wrapper around the ANSI C mktime(3c)
       
  1943     # which is guaranteed to normalize d/m/y/h/m/s.
       
  1944     # Many implementations have bugs, but we'll pretend they don't.
       
  1945     return time.localtime(time.mktime(tm))
       
  1946 registerDateHandler(_parse_date_iso8601)
       
  1947     
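# Usage sketch: compact and extended forms hit the same template above,
# e.g. '20040105' and '2004-01-05' both match 'YYYY-?MM-?DD':
#
#     >>> _parse_date_iso8601('2004-01-05')[:3]
#     (2004, 1, 5)
#
# (the time fields go through time.mktime and are therefore local-time
# normalized, so only the date portion is shown here)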
       
  1948 # 8-bit date handling routines written by ytrewq1.
       
  1949 _korean_year  = u'\ub144' # b3e2 in euc-kr
       
  1950 _korean_month = u'\uc6d4' # bff9 in euc-kr
       
  1951 _korean_day   = u'\uc77c' # c0cf in euc-kr
       
  1952 _korean_am    = u'\uc624\uc804' # bfc0 c0fc in euc-kr
       
  1953 _korean_pm    = u'\uc624\ud6c4' # bfc0 c8c4 in euc-kr
       
  1954 
       
  1955 _korean_onblog_date_re = \
       
  1956     re.compile('(\d{4})%s\s+(\d{2})%s\s+(\d{2})%s\s+(\d{2}):(\d{2}):(\d{2})' % \
       
  1957                (_korean_year, _korean_month, _korean_day))
       
  1958 _korean_nate_date_re = \
       
  1959     re.compile(u'(\d{4})-(\d{2})-(\d{2})\s+(%s|%s)\s+(\d{,2}):(\d{,2}):(\d{,2})' % \
       
  1960                (_korean_am, _korean_pm))
       
  1961 def _parse_date_onblog(dateString):
       
  1962     '''Parse a string according to the OnBlog 8-bit date format'''
       
  1963     m = _korean_onblog_date_re.match(dateString)
       
  1964     if not m: return
       
  1965     w3dtfdate = '%(year)s-%(month)s-%(day)sT%(hour)s:%(minute)s:%(second)s%(zonediff)s' % \
       
  1966                 {'year': m.group(1), 'month': m.group(2), 'day': m.group(3),\
       
  1967                  'hour': m.group(4), 'minute': m.group(5), 'second': m.group(6),\
       
  1968                  'zonediff': '+09:00'}
       
  1969     if _debug: sys.stderr.write('OnBlog date parsed as: %s\n' % w3dtfdate)
       
  1970     return _parse_date_w3dtf(w3dtfdate)
       
  1971 registerDateHandler(_parse_date_onblog)
       
  1972 
       
  1973 def _parse_date_nate(dateString):
       
  1974     '''Parse a string according to the Nate 8-bit date format'''
       
  1975     m = _korean_nate_date_re.match(dateString)
       
  1976     if not m: return
       
  1977     hour = int(m.group(5))
       
  1978     ampm = m.group(4)
       
  1979     if (ampm == _korean_pm):
       
  1980         hour += 12
       
  1981     hour = str(hour)
       
  1982     if len(hour) == 1:
       
  1983         hour = '0' + hour
       
  1984     w3dtfdate = '%(year)s-%(month)s-%(day)sT%(hour)s:%(minute)s:%(second)s%(zonediff)s' % \
       
  1985                 {'year': m.group(1), 'month': m.group(2), 'day': m.group(3),\
       
  1986                  'hour': hour, 'minute': m.group(6), 'second': m.group(7),\
       
  1987                  'zonediff': '+09:00'}
       
  1988     if _debug: sys.stderr.write('Nate date parsed as: %s\n' % w3dtfdate)
       
  1989     return _parse_date_w3dtf(w3dtfdate)
       
  1990 registerDateHandler(_parse_date_nate)
       
  1991 
       
  1992 _mssql_date_re = \
       
  1993     re.compile('(\d{4})-(\d{2})-(\d{2})\s+(\d{2}):(\d{2}):(\d{2})(\.\d+)?')
       
  1994 def _parse_date_mssql(dateString):
       
  1995     '''Parse a string according to the MS SQL date format'''
       
  1996     m = _mssql_date_re.match(dateString)
       
  1997     if not m: return
       
  1998     w3dtfdate = '%(year)s-%(month)s-%(day)sT%(hour)s:%(minute)s:%(second)s%(zonediff)s' % \
       
  1999                 {'year': m.group(1), 'month': m.group(2), 'day': m.group(3),\
       
  2000                  'hour': m.group(4), 'minute': m.group(5), 'second': m.group(6),\
       
   2001                  'zonediff': '+09:00'} # hardcoded KST offset; this handler targets Korean feeds that emit MS SQL-style timestamps
       
  2002     if _debug: sys.stderr.write('MS SQL date parsed as: %s\n' % w3dtfdate)
       
  2003     return _parse_date_w3dtf(w3dtfdate)
       
  2004 registerDateHandler(_parse_date_mssql)
       
  2005 
       
  2006 # Unicode strings for Greek date strings
       
  2007 _greek_months = \
       
  2008   { \
       
  2009    u'\u0399\u03b1\u03bd': u'Jan',       # c9e1ed in iso-8859-7
       
  2010    u'\u03a6\u03b5\u03b2': u'Feb',       # d6e5e2 in iso-8859-7
       
  2011    u'\u039c\u03ac\u03ce': u'Mar',       # ccdcfe in iso-8859-7
       
  2012    u'\u039c\u03b1\u03ce': u'Mar',       # cce1fe in iso-8859-7
       
  2013    u'\u0391\u03c0\u03c1': u'Apr',       # c1f0f1 in iso-8859-7
       
  2014    u'\u039c\u03ac\u03b9': u'May',       # ccdce9 in iso-8859-7
       
  2015    u'\u039c\u03b1\u03ca': u'May',       # cce1fa in iso-8859-7
       
  2016    u'\u039c\u03b1\u03b9': u'May',       # cce1e9 in iso-8859-7
       
  2017    u'\u0399\u03bf\u03cd\u03bd': u'Jun', # c9effded in iso-8859-7
       
  2018    u'\u0399\u03bf\u03bd': u'Jun',       # c9efed in iso-8859-7
       
  2019    u'\u0399\u03bf\u03cd\u03bb': u'Jul', # c9effdeb in iso-8859-7
       
   2020    u'\u0399\u03bf\u03bb': u'Jul',       # c9efeb in iso-8859-7
       
  2021    u'\u0391\u03cd\u03b3': u'Aug',       # c1fde3 in iso-8859-7
       
  2022    u'\u0391\u03c5\u03b3': u'Aug',       # c1f5e3 in iso-8859-7
       
  2023    u'\u03a3\u03b5\u03c0': u'Sep',       # d3e5f0 in iso-8859-7
       
  2024    u'\u039f\u03ba\u03c4': u'Oct',       # cfeaf4 in iso-8859-7
       
  2025    u'\u039d\u03bf\u03ad': u'Nov',       # cdefdd in iso-8859-7
       
  2026    u'\u039d\u03bf\u03b5': u'Nov',       # cdefe5 in iso-8859-7
       
  2027    u'\u0394\u03b5\u03ba': u'Dec',       # c4e5ea in iso-8859-7
       
  2028   }
       
  2029 
       
  2030 _greek_wdays = \
       
  2031   { \
       
  2032    u'\u039a\u03c5\u03c1': u'Sun', # caf5f1 in iso-8859-7
       
  2033    u'\u0394\u03b5\u03c5': u'Mon', # c4e5f5 in iso-8859-7
       
  2034    u'\u03a4\u03c1\u03b9': u'Tue', # d4f1e9 in iso-8859-7
       
  2035    u'\u03a4\u03b5\u03c4': u'Wed', # d4e5f4 in iso-8859-7
       
  2036    u'\u03a0\u03b5\u03bc': u'Thu', # d0e5ec in iso-8859-7
       
  2037    u'\u03a0\u03b1\u03c1': u'Fri', # d0e1f1 in iso-8859-7
       
  2038    u'\u03a3\u03b1\u03b2': u'Sat', # d3e1e2 in iso-8859-7   
       
  2039   }
       
  2040 
       
  2041 _greek_date_format_re = \
       
  2042     re.compile(u'([^,]+),\s+(\d{2})\s+([^\s]+)\s+(\d{4})\s+(\d{2}):(\d{2}):(\d{2})\s+([^\s]+)')
       
  2043 
       
  2044 def _parse_date_greek(dateString):
       
  2045     '''Parse a string according to a Greek 8-bit date format.'''
       
  2046     m = _greek_date_format_re.match(dateString)
       
  2047     if not m: return
       
  2048     try:
       
  2049         wday = _greek_wdays[m.group(1)]
       
  2050         month = _greek_months[m.group(3)]
       
  2051     except:
       
  2052         return
       
  2053     rfc822date = '%(wday)s, %(day)s %(month)s %(year)s %(hour)s:%(minute)s:%(second)s %(zonediff)s' % \
       
  2054                  {'wday': wday, 'day': m.group(2), 'month': month, 'year': m.group(4),\
       
  2055                   'hour': m.group(5), 'minute': m.group(6), 'second': m.group(7),\
       
  2056                   'zonediff': m.group(8)}
       
  2057     if _debug: sys.stderr.write('Greek date parsed as: %s\n' % rfc822date)
       
  2058     return _parse_date_rfc822(rfc822date)
       
  2059 registerDateHandler(_parse_date_greek)
       
  2060 
       
  2061 # Unicode strings for Hungarian date strings
       
  2062 _hungarian_months = \
       
  2063   { \
       
  2064     u'janu\u00e1r':   u'01',  # e1 in iso-8859-2
       
   2065     u'febru\u00e1r':  u'02',  # e1 in iso-8859-2
       
  2066     u'm\u00e1rcius':  u'03',  # e1 in iso-8859-2
       
  2067     u'\u00e1prilis':  u'04',  # e1 in iso-8859-2
       
   2068     u'm\u00e1jus':    u'05',  # e1 in iso-8859-2
       
  2069     u'j\u00fanius':   u'06',  # fa in iso-8859-2
       
  2070     u'j\u00falius':   u'07',  # fa in iso-8859-2
       
  2071     u'augusztus':     u'08',
       
  2072     u'szeptember':    u'09',
       
  2073     u'okt\u00f3ber':  u'10',  # f3 in iso-8859-2
       
  2074     u'november':      u'11',
       
  2075     u'december':      u'12',
       
  2076   }
       
  2077 
       
  2078 _hungarian_date_format_re = \
       
  2079   re.compile(u'(\d{4})-([^-]+)-(\d{,2})T(\d{,2}):(\d{2})((\+|-)(\d{,2}:\d{2}))')
       
  2080 
       
  2081 def _parse_date_hungarian(dateString):
       
  2082     '''Parse a string according to a Hungarian 8-bit date format.'''
       
  2083     m = _hungarian_date_format_re.match(dateString)
       
  2084     if not m: return
       
  2085     try:
       
  2086         month = _hungarian_months[m.group(2)]
       
  2087         day = m.group(3)
       
  2088         if len(day) == 1:
       
  2089             day = '0' + day
       
  2090         hour = m.group(4)
       
  2091         if len(hour) == 1:
       
  2092             hour = '0' + hour
       
  2093     except:
       
  2094         return
       
  2095     w3dtfdate = '%(year)s-%(month)s-%(day)sT%(hour)s:%(minute)s%(zonediff)s' % \
       
  2096                 {'year': m.group(1), 'month': month, 'day': day,\
       
  2097                  'hour': hour, 'minute': m.group(5),\
       
  2098                  'zonediff': m.group(6)}
       
  2099     if _debug: sys.stderr.write('Hungarian date parsed as: %s\n' % w3dtfdate)
       
  2100     return _parse_date_w3dtf(w3dtfdate)
       
  2101 registerDateHandler(_parse_date_hungarian)
       
  2102 
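        # The locale-specific handlers above (Nate, MS SQL, Greek, Hungarian)
        # all follow the same pattern: recognize the exotic format, rewrite it
        # as a W3DTF or RFC 822 date, and delegate to the generic parser.
        # Callers can add support for further formats at runtime the same way.
        # A minimal sketch -- the compact 'YYYYMMDDhhmmss' format and the
        # handler below are hypothetical, not formats shipped with this module:
        #
        #     _compact_date_re = re.compile(r'^(\d{4})(\d{2})(\d{2})(\d{2})(\d{2})(\d{2})$')
        #     def _parse_date_compact(dateString):
        #         '''Parse a compact YYYYMMDDhhmmss timestamp (assumed UTC)'''
        #         m = _compact_date_re.match(dateString)
        #         if not m: return
        #         return _parse_date_w3dtf('%s-%s-%sT%s:%s:%sZ' % m.groups())
        #     registerDateHandler(_parse_date_compact)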
       
  2103 # W3DTF-style date parsing adapted from PyXML xml.utils.iso8601, written by
       
  2104 # Drake and licensed under the Python license.  Removed all range checking
       
  2105 # for month, day, hour, minute, and second, since mktime will normalize
       
  2106 # these later
       
  2107 def _parse_date_w3dtf(dateString):
       
  2108     def __extract_date(m):
       
  2109         year = int(m.group('year'))
       
  2110         if year < 100:
       
  2111             year = 100 * int(time.gmtime()[0] / 100) + int(year)
       
  2112         if year < 1000:
       
  2113             return 0, 0, 0
       
  2114         julian = m.group('julian')
       
  2115         if julian:
       
  2116             julian = int(julian)
       
  2117             month = julian / 30 + 1
       
  2118             day = julian % 30 + 1
       
  2119             jday = None
       
  2120             while jday != julian:
       
  2121                 t = time.mktime((year, month, day, 0, 0, 0, 0, 0, 0))
       
  2122                 jday = time.gmtime(t)[-2]
       
  2123                 diff = abs(jday - julian)
       
  2124                 if jday > julian:
       
  2125                     if diff < day:
       
  2126                         day = day - diff
       
  2127                     else:
       
  2128                         month = month - 1
       
  2129                         day = 31
       
  2130                 elif jday < julian:
       
  2131                     if day + diff < 28:
       
   2132                         day = day + diff
       
  2133                     else:
       
  2134                         month = month + 1
       
  2135             return year, month, day
       
  2136         month = m.group('month')
       
  2137         day = 1
       
  2138         if month is None:
       
  2139             month = 1
       
  2140         else:
       
  2141             month = int(month)
       
  2142             day = m.group('day')
       
  2143             if day:
       
  2144                 day = int(day)
       
  2145             else:
       
  2146                 day = 1
       
  2147         return year, month, day
       
  2148 
       
  2149     def __extract_time(m):
       
  2150         if not m:
       
  2151             return 0, 0, 0
       
  2152         hours = m.group('hours')
       
  2153         if not hours:
       
  2154             return 0, 0, 0
       
  2155         hours = int(hours)
       
  2156         minutes = int(m.group('minutes'))
       
  2157         seconds = m.group('seconds')
       
  2158         if seconds:
       
  2159             seconds = int(seconds)
       
  2160         else:
       
  2161             seconds = 0
       
  2162         return hours, minutes, seconds
       
  2163 
       
  2164     def __extract_tzd(m):
       
  2165         '''Return the Time Zone Designator as an offset in seconds from UTC.'''
       
  2166         if not m:
       
  2167             return 0
       
  2168         tzd = m.group('tzd')
       
  2169         if not tzd:
       
  2170             return 0
       
  2171         if tzd == 'Z':
       
  2172             return 0
       
  2173         hours = int(m.group('tzdhours'))
       
  2174         minutes = m.group('tzdminutes')
       
  2175         if minutes:
       
  2176             minutes = int(minutes)
       
  2177         else:
       
  2178             minutes = 0
       
  2179         offset = (hours*60 + minutes) * 60
       
  2180         if tzd[0] == '+':
       
   2181             return -offset  # a '+' designator is ahead of UTC, so shift back to reach GMT
       
  2182         return offset
       
  2183 
       
  2184     __date_re = ('(?P<year>\d\d\d\d)'
       
  2185                  '(?:(?P<dsep>-|)'
       
  2186                  '(?:(?P<julian>\d\d\d)'
       
  2187                  '|(?P<month>\d\d)(?:(?P=dsep)(?P<day>\d\d))?))?')
       
  2188     __tzd_re = '(?P<tzd>[-+](?P<tzdhours>\d\d)(?::?(?P<tzdminutes>\d\d))|Z)'
       
  2189     __tzd_rx = re.compile(__tzd_re)
       
  2190     __time_re = ('(?P<hours>\d\d)(?P<tsep>:|)(?P<minutes>\d\d)'
       
  2191                  '(?:(?P=tsep)(?P<seconds>\d\d(?:[.,]\d+)?))?'
       
  2192                  + __tzd_re)
       
  2193     __datetime_re = '%s(?:T%s)?' % (__date_re, __time_re)
       
  2194     __datetime_rx = re.compile(__datetime_re)
       
  2195     m = __datetime_rx.match(dateString)
       
  2196     if (m is None) or (m.group() != dateString): return
       
  2197     gmt = __extract_date(m) + __extract_time(m) + (0, 0, 0)
       
  2198     if gmt[0] == 0: return
       
  2199     return time.gmtime(time.mktime(gmt) + __extract_tzd(m) - time.timezone)
       
  2200 registerDateHandler(_parse_date_w3dtf)
       
  2201 
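        # Example (illustrative values, not from the original test suite):
        # W3DTF/ISO 8601 variants normalize to a 9-tuple in GMT, e.g.
        #
        #     _parse_date_w3dtf('2003-12-31T10:14:55Z')[:6]
        #     # -> (2003, 12, 31, 10, 14, 55)
        #     _parse_date_w3dtf('2003-12-31T10:14:55-08:00')[:6]
        #     # -> (2003, 12, 31, 18, 14, 55), shifted to GMT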
       
  2202 def _parse_date_rfc822(dateString):
       
  2203     '''Parse an RFC822, RFC1123, RFC2822, or asctime-style date'''
       
  2204     data = dateString.split()
       
  2205     if data[0][-1] in (',', '.') or data[0].lower() in rfc822._daynames:
       
  2206         del data[0]
       
  2207     if len(data) == 4:
       
  2208         s = data[3]
       
  2209         i = s.find('+')
       
  2210         if i > 0:
       
  2211             data[3:] = [s[:i], s[i+1:]]
       
  2212         else:
       
  2213             data.append('')
       
  2214         dateString = " ".join(data)
       
  2215     if len(data) < 5:
       
  2216         dateString += ' 00:00:00 GMT'
       
  2217     tm = rfc822.parsedate_tz(dateString)
       
  2218     if tm:
       
  2219         return time.gmtime(rfc822.mktime_tz(tm))
       
  2220 # rfc822.py defines several time zones, but we define some extra ones.
       
  2221 # 'ET' is equivalent to 'EST', etc.
       
  2222 _additional_timezones = {'AT': -400, 'ET': -500, 'CT': -600, 'MT': -700, 'PT': -800}
       
  2223 rfc822._timezones.update(_additional_timezones)
       
  2224 registerDateHandler(_parse_date_rfc822)    
       
  2225 
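        # Example (illustrative): standard RFC 822 zones and the extra names
        # registered above are both understood, e.g.
        #
        #     _parse_date_rfc822('Thu, 01 Jan 2004 19:48:21 GMT')[:6]
        #     # -> (2004, 1, 1, 19, 48, 21)
        #     _parse_date_rfc822('Thu, 01 Jan 2004 14:48:21 ET')[:6]
        #     # -> (2004, 1, 1, 19, 48, 21), 'ET' being treated as -0500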
       
  2226 def _parse_date(dateString):
       
  2227     '''Parses a variety of date formats into a 9-tuple in GMT'''
       
  2228     for handler in _date_handlers:
       
  2229         try:
       
  2230             date9tuple = handler(dateString)
       
  2231             if not date9tuple: continue
       
  2232             if len(date9tuple) != 9:
       
  2233                 if _debug: sys.stderr.write('date handler function must return 9-tuple\n')
       
  2234                 raise ValueError
       
  2235             map(int, date9tuple)
       
  2236             return date9tuple
       
  2237         except Exception, e:
       
  2238             if _debug: sys.stderr.write('%s raised %s\n' % (handler.__name__, repr(e)))
       
  2239             pass
       
  2240     return None
       
  2241 
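        # Example (illustrative): _parse_date is format-agnostic; each
        # registered handler is tried in turn until one succeeds, so
        #
        #     _parse_date('2004-01-01T19:48:21Z')
        #     _parse_date('Thu, 01 Jan 2004 19:48:21 GMT')
        #
        # both yield the same 9-tuple in GMT, and unparseable input
        # returns None.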
       
  2242 def _getCharacterEncoding(http_headers, xml_data):
       
  2243     '''Get the character encoding of the XML document
       
  2244 
       
  2245     http_headers is a dictionary
       
  2246     xml_data is a raw string (not Unicode)
       
  2247     
       
  2248     This is so much trickier than it sounds, it's not even funny.
       
  2249     According to RFC 3023 ('XML Media Types'), if the HTTP Content-Type
       
  2250     is application/xml, application/*+xml,
       
  2251     application/xml-external-parsed-entity, or application/xml-dtd,
       
  2252     the encoding given in the charset parameter of the HTTP Content-Type
       
  2253     takes precedence over the encoding given in the XML prefix within the
       
   2254     document, and defaults to 'utf-8' if neither is specified.  But if
       
  2255     the HTTP Content-Type is text/xml, text/*+xml, or
       
  2256     text/xml-external-parsed-entity, the encoding given in the XML prefix
       
  2257     within the document is ALWAYS IGNORED and only the encoding given in
       
  2258     the charset parameter of the HTTP Content-Type header should be
       
  2259     respected, and it defaults to 'us-ascii' if not specified.
       
  2260 
       
  2261     Furthermore, discussion on the atom-syntax mailing list with the
       
  2262     author of RFC 3023 leads me to the conclusion that any document
       
  2263     served with a Content-Type of text/* and no charset parameter
       
  2264     must be treated as us-ascii.  (We now do this.)  And also that it
       
  2265     must always be flagged as non-well-formed.  (We now do this too.)
       
  2266     
       
  2267     If Content-Type is unspecified (input was local file or non-HTTP source)
       
  2268     or unrecognized (server just got it totally wrong), then go by the
       
  2269     encoding given in the XML prefix of the document and default to
       
  2270     'iso-8859-1' as per the HTTP specification (RFC 2616).
       
  2271     
       
  2272     Then, assuming we didn't find a character encoding in the HTTP headers
       
  2273     (and the HTTP Content-type allowed us to look in the body), we need
       
  2274     to sniff the first few bytes of the XML data and try to determine
       
  2275     whether the encoding is ASCII-compatible.  Section F of the XML
       
  2276     specification shows the way here:
       
  2277     http://www.w3.org/TR/REC-xml/#sec-guessing-no-ext-info
       
  2278 
       
  2279     If the sniffed encoding is not ASCII-compatible, we need to make it
       
  2280     ASCII compatible so that we can sniff further into the XML declaration
       
  2281     to find the encoding attribute, which will tell us the true encoding.
       
  2282 
       
  2283     Of course, none of this guarantees that we will be able to parse the
       
  2284     feed in the declared character encoding (assuming it was declared
       
  2285     correctly, which many are not).  CJKCodecs and iconv_codec help a lot;
       
  2286     you should definitely install them if you can.
       
  2287     http://cjkpython.i18n.org/
       
  2288     '''
       
  2289 
       
  2290     def _parseHTTPContentType(content_type):
       
  2291         '''takes HTTP Content-Type header and returns (content type, charset)
       
  2292 
       
  2293         If no charset is specified, returns (content type, '')
       
  2294         If no content type is specified, returns ('', '')
       
  2295         Both return parameters are guaranteed to be lowercase strings
       
  2296         '''
       
  2297         content_type = content_type or ''
       
  2298         content_type, params = cgi.parse_header(content_type)
       
   2299         return content_type, params.get('charset', '').replace("'", '').lower()
       
  2300 
       
  2301     sniffed_xml_encoding = ''
       
  2302     xml_encoding = ''
       
  2303     true_encoding = ''
       
  2304     http_content_type, http_encoding = _parseHTTPContentType(http_headers.get('content-type'))
       
  2305     # Must sniff for non-ASCII-compatible character encodings before
       
  2306     # searching for XML declaration.  This heuristic is defined in
       
  2307     # section F of the XML specification:
       
  2308     # http://www.w3.org/TR/REC-xml/#sec-guessing-no-ext-info
       
  2309     try:
       
  2310         if xml_data[:4] == '\x4c\x6f\xa7\x94':
       
  2311             # EBCDIC
       
  2312             xml_data = _ebcdic_to_ascii(xml_data)
       
  2313         elif xml_data[:4] == '\x00\x3c\x00\x3f':
       
  2314             # UTF-16BE
       
  2315             sniffed_xml_encoding = 'utf-16be'
       
  2316             xml_data = unicode(xml_data, 'utf-16be').encode('utf-8')
       
  2317         elif (len(xml_data) >= 4) and (xml_data[:2] == '\xfe\xff') and (xml_data[2:4] != '\x00\x00'):
       
  2318             # UTF-16BE with BOM
       
  2319             sniffed_xml_encoding = 'utf-16be'
       
  2320             xml_data = unicode(xml_data[2:], 'utf-16be').encode('utf-8')
       
  2321         elif xml_data[:4] == '\x3c\x00\x3f\x00':
       
  2322             # UTF-16LE
       
  2323             sniffed_xml_encoding = 'utf-16le'
       
  2324             xml_data = unicode(xml_data, 'utf-16le').encode('utf-8')
       
  2325         elif (len(xml_data) >= 4) and (xml_data[:2] == '\xff\xfe') and (xml_data[2:4] != '\x00\x00'):
       
  2326             # UTF-16LE with BOM
       
  2327             sniffed_xml_encoding = 'utf-16le'
       
  2328             xml_data = unicode(xml_data[2:], 'utf-16le').encode('utf-8')
       
  2329         elif xml_data[:4] == '\x00\x00\x00\x3c':
       
  2330             # UTF-32BE
       
  2331             sniffed_xml_encoding = 'utf-32be'
       
  2332             xml_data = unicode(xml_data, 'utf-32be').encode('utf-8')
       
  2333         elif xml_data[:4] == '\x3c\x00\x00\x00':
       
  2334             # UTF-32LE
       
  2335             sniffed_xml_encoding = 'utf-32le'
       
  2336             xml_data = unicode(xml_data, 'utf-32le').encode('utf-8')
       
  2337         elif xml_data[:4] == '\x00\x00\xfe\xff':
       
  2338             # UTF-32BE with BOM
       
  2339             sniffed_xml_encoding = 'utf-32be'
       
  2340             xml_data = unicode(xml_data[4:], 'utf-32be').encode('utf-8')
       
  2341         elif xml_data[:4] == '\xff\xfe\x00\x00':
       
  2342             # UTF-32LE with BOM
       
  2343             sniffed_xml_encoding = 'utf-32le'
       
  2344             xml_data = unicode(xml_data[4:], 'utf-32le').encode('utf-8')
       
  2345         elif xml_data[:3] == '\xef\xbb\xbf':
       
  2346             # UTF-8 with BOM
       
  2347             sniffed_xml_encoding = 'utf-8'
       
  2348             xml_data = unicode(xml_data[3:], 'utf-8').encode('utf-8')
       
  2349         else:
       
  2350             # ASCII-compatible
       
  2351             pass
       
  2352         xml_encoding_match = re.compile('^<\?.*encoding=[\'"](.*?)[\'"].*\?>').match(xml_data)
       
  2353     except:
       
  2354         xml_encoding_match = None
       
  2355     if xml_encoding_match:
       
  2356         xml_encoding = xml_encoding_match.groups()[0].lower()
       
  2357         if sniffed_xml_encoding and (xml_encoding in ('iso-10646-ucs-2', 'ucs-2', 'csunicode', 'iso-10646-ucs-4', 'ucs-4', 'csucs4', 'utf-16', 'utf-32', 'utf_16', 'utf_32', 'utf16', 'u16')):
       
  2358             xml_encoding = sniffed_xml_encoding
       
  2359     acceptable_content_type = 0
       
  2360     application_content_types = ('application/xml', 'application/xml-dtd', 'application/xml-external-parsed-entity')
       
  2361     text_content_types = ('text/xml', 'text/xml-external-parsed-entity')
       
  2362     if (http_content_type in application_content_types) or \
       
  2363        (http_content_type.startswith('application/') and http_content_type.endswith('+xml')):
       
  2364         acceptable_content_type = 1
       
  2365         true_encoding = http_encoding or xml_encoding or 'utf-8'
       
  2366     elif (http_content_type in text_content_types) or \
       
   2367          (http_content_type.startswith('text/') and http_content_type.endswith('+xml')):
       
  2368         acceptable_content_type = 1
       
  2369         true_encoding = http_encoding or 'us-ascii'
       
  2370     elif http_content_type.startswith('text/'):
       
  2371         true_encoding = http_encoding or 'us-ascii'
       
  2372     elif http_headers and (not http_headers.has_key('content-type')):
       
  2373         true_encoding = xml_encoding or 'iso-8859-1'
       
  2374     else:
       
  2375         true_encoding = xml_encoding or 'utf-8'
       
  2376     return true_encoding, http_encoding, xml_encoding, sniffed_xml_encoding, acceptable_content_type
       
  2377     
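        # Example (illustrative): an Atom feed served as
        # 'application/atom+xml; charset=utf-8' whose XML declaration claims
        # iso-8859-1 -- per RFC 3023 the HTTP charset wins:
        #
        #     _getCharacterEncoding(
        #         {'content-type': 'application/atom+xml; charset=utf-8'},
        #         '<?xml version="1.0" encoding="iso-8859-1"?><feed/>')
        #     # -> ('utf-8', 'utf-8', 'iso-8859-1', '', 1)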
       
  2378 def _toUTF8(data, encoding):
       
  2379     '''Changes an XML data stream on the fly to specify a new encoding
       
  2380 
       
   2381     data is a raw sequence of bytes (not Unicode) that is presumed to be in the given encoding already
       
  2382     encoding is a string recognized by encodings.aliases
       
  2383     '''
       
  2384     if _debug: sys.stderr.write('entering _toUTF8, trying encoding %s\n' % encoding)
       
  2385     # strip Byte Order Mark (if present)
       
  2386     if (len(data) >= 4) and (data[:2] == '\xfe\xff') and (data[2:4] != '\x00\x00'):
       
  2387         if _debug:
       
  2388             sys.stderr.write('stripping BOM\n')
       
  2389             if encoding != 'utf-16be':
       
  2390                 sys.stderr.write('trying utf-16be instead\n')
       
  2391         encoding = 'utf-16be'
       
  2392         data = data[2:]
       
  2393     elif (len(data) >= 4) and (data[:2] == '\xff\xfe') and (data[2:4] != '\x00\x00'):
       
  2394         if _debug:
       
  2395             sys.stderr.write('stripping BOM\n')
       
  2396             if encoding != 'utf-16le':
       
  2397                 sys.stderr.write('trying utf-16le instead\n')
       
  2398         encoding = 'utf-16le'
       
  2399         data = data[2:]
       
  2400     elif data[:3] == '\xef\xbb\xbf':
       
  2401         if _debug:
       
  2402             sys.stderr.write('stripping BOM\n')
       
  2403             if encoding != 'utf-8':
       
  2404                 sys.stderr.write('trying utf-8 instead\n')
       
  2405         encoding = 'utf-8'
       
  2406         data = data[3:]
       
  2407     elif data[:4] == '\x00\x00\xfe\xff':
       
  2408         if _debug:
       
  2409             sys.stderr.write('stripping BOM\n')
       
  2410             if encoding != 'utf-32be':
       
  2411                 sys.stderr.write('trying utf-32be instead\n')
       
  2412         encoding = 'utf-32be'
       
  2413         data = data[4:]
       
  2414     elif data[:4] == '\xff\xfe\x00\x00':
       
  2415         if _debug:
       
  2416             sys.stderr.write('stripping BOM\n')
       
  2417             if encoding != 'utf-32le':
       
  2418                 sys.stderr.write('trying utf-32le instead\n')
       
  2419         encoding = 'utf-32le'
       
  2420         data = data[4:]
       
  2421     newdata = unicode(data, encoding)
       
  2422     if _debug: sys.stderr.write('successfully converted %s data to unicode\n' % encoding)
       
  2423     declmatch = re.compile('^<\?xml[^>]*?>')
       
  2424     newdecl = '''<?xml version='1.0' encoding='utf-8'?>'''
       
  2425     if declmatch.search(newdata):
       
  2426         newdata = declmatch.sub(newdecl, newdata)
       
  2427     else:
       
  2428         newdata = newdecl + u'\n' + newdata
       
  2429     return newdata.encode('utf-8')
       
  2430 
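        # Example (illustrative): the declaration is rewritten so that the
        # re-encoded document describes itself correctly:
        #
        #     _toUTF8("<?xml version='1.0' encoding='iso-8859-1'?><a/>",
        #             'iso-8859-1')
        #     # -> "<?xml version='1.0' encoding='utf-8'?><a/>"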
       
  2431 def _stripDoctype(data):
       
  2432     '''Strips DOCTYPE from XML document, returns (rss_version, stripped_data)
       
  2433 
       
  2434     rss_version may be 'rss091n' or None
       
  2435     stripped_data is the same XML document, minus the DOCTYPE
       
  2436     '''
       
  2437     entity_pattern = re.compile(r'<!ENTITY([^>]*?)>', re.MULTILINE)
       
  2438     data = entity_pattern.sub('', data)
       
  2439     doctype_pattern = re.compile(r'<!DOCTYPE([^>]*?)>', re.MULTILINE)
       
  2440     doctype_results = doctype_pattern.findall(data)
       
  2441     doctype = doctype_results and doctype_results[0] or ''
       
  2442     if doctype.lower().count('netscape'):
       
  2443         version = 'rss091n'
       
  2444     else:
       
  2445         version = None
       
  2446     data = doctype_pattern.sub('', data)
       
  2447     return version, data
       
  2448     
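        # Example (illustrative): only a Netscape DOCTYPE implies a version;
        # any other DOCTYPE (and any entity declarations) are simply stripped:
        #
        #     _stripDoctype('<!DOCTYPE rss PUBLIC '
        #                   '"-//Netscape Communications//DTD RSS 0.91//EN"><rss/>')
        #     # -> ('rss091n', '<rss/>')
        #     _stripDoctype('<!DOCTYPE rss SYSTEM "rss-2.0.dtd"><rss/>')
        #     # -> (None, '<rss/>')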
       
  2449 def parse(url_file_stream_or_string, etag=None, modified=None, agent=None, referrer=None, handlers=[]):
       
  2450     '''Parse a feed from a URL, file, stream, or string'''
       
  2451     result = FeedParserDict()
       
  2452     result['feed'] = FeedParserDict()
       
  2453     result['entries'] = []
       
  2454     if _XML_AVAILABLE:
       
  2455         result['bozo'] = 0
       
  2456     if type(handlers) == types.InstanceType:
       
  2457         handlers = [handlers]
       
  2458     try:
       
  2459         f = _open_resource(url_file_stream_or_string, etag, modified, agent, referrer, handlers)
       
  2460         data = f.read()
       
  2461     except Exception, e:
       
  2462         result['bozo'] = 1
       
  2463         result['bozo_exception'] = e
       
  2464         data = ''
       
  2465         f = None
       
  2466 
       
  2467     # if feed is gzip-compressed, decompress it
       
  2468     if f and data and hasattr(f, 'headers'):
       
  2469         if gzip and f.headers.get('content-encoding', '') == 'gzip':
       
  2470             try:
       
  2471                 data = gzip.GzipFile(fileobj=_StringIO(data)).read()
       
  2472             except Exception, e:
       
  2473                 # Some feeds claim to be gzipped but they're not, so
       
  2474                 # we get garbage.  Ideally, we should re-request the
       
  2475                 # feed without the 'Accept-encoding: gzip' header,
       
  2476                 # but we don't.
       
  2477                 result['bozo'] = 1
       
  2478                 result['bozo_exception'] = e
       
  2479                 data = ''
       
  2480         elif zlib and f.headers.get('content-encoding', '') == 'deflate':
       
  2481             try:
       
  2482                 data = zlib.decompress(data, -zlib.MAX_WBITS)
       
  2483             except Exception, e:
       
  2484                 result['bozo'] = 1
       
  2485                 result['bozo_exception'] = e
       
  2486                 data = ''
       
  2487 
       
  2488     # save HTTP headers
       
  2489     if hasattr(f, 'info'):
       
  2490         info = f.info()
       
  2491         result['etag'] = info.getheader('ETag')
       
  2492         last_modified = info.getheader('Last-Modified')
       
  2493         if last_modified:
       
  2494             result['modified'] = _parse_date(last_modified)
       
  2495     if hasattr(f, 'url'):
       
  2496         result['href'] = f.url
       
  2497         result['status'] = 200
       
  2498     if hasattr(f, 'status'):
       
  2499         result['status'] = f.status
       
  2500     if hasattr(f, 'headers'):
       
  2501         result['headers'] = f.headers.dict
       
  2502     if hasattr(f, 'close'):
       
  2503         f.close()
       
  2504 
       
  2505     # there are four encodings to keep track of:
       
  2506     # - http_encoding is the encoding declared in the Content-Type HTTP header
       
  2507     # - xml_encoding is the encoding declared in the <?xml declaration
       
  2508     # - sniffed_encoding is the encoding sniffed from the first 4 bytes of the XML data
       
  2509     # - result['encoding'] is the actual encoding, as per RFC 3023 and a variety of other conflicting specifications
       
  2510     http_headers = result.get('headers', {})
       
  2511     result['encoding'], http_encoding, xml_encoding, sniffed_xml_encoding, acceptable_content_type = \
       
  2512         _getCharacterEncoding(http_headers, data)
       
  2513     if http_headers and (not acceptable_content_type):
       
  2514         if http_headers.has_key('content-type'):
       
  2515             bozo_message = '%s is not an XML media type' % http_headers['content-type']
       
  2516         else:
       
  2517             bozo_message = 'no Content-type specified'
       
  2518         result['bozo'] = 1
       
  2519         result['bozo_exception'] = NonXMLContentType(bozo_message)
       
  2520         
       
  2521     result['version'], data = _stripDoctype(data)
       
  2522 
       
  2523     baseuri = http_headers.get('content-location', result.get('href'))
       
  2524     baselang = http_headers.get('content-language', None)
       
  2525 
       
  2526     # if server sent 304, we're done
       
  2527     if result.get('status', 0) == 304:
       
  2528         result['version'] = ''
       
  2529         result['debug_message'] = 'The feed has not changed since you last checked, ' + \
       
  2530             'so the server sent no data.  This is a feature, not a bug!'
       
  2531         return result
       
  2532 
       
  2533     # if there was a problem downloading, we're done
       
  2534     if not data:
       
  2535         return result
       
  2536 
       
  2537     # determine character encoding
       
  2538     use_strict_parser = 0
       
  2539     known_encoding = 0
       
  2540     tried_encodings = []
       
  2541     # try: HTTP encoding, declared XML encoding, encoding sniffed from BOM
       
  2542     for proposed_encoding in (result['encoding'], xml_encoding, sniffed_xml_encoding):
       
  2543         if not proposed_encoding: continue
       
  2544         if proposed_encoding in tried_encodings: continue
       
  2545         tried_encodings.append(proposed_encoding)
       
  2546         try:
       
  2547             data = _toUTF8(data, proposed_encoding)
       
  2548             known_encoding = use_strict_parser = 1
       
  2549             break
       
  2550         except:
       
  2551             pass
       
  2552     # if no luck and we have auto-detection library, try that
       
  2553     if (not known_encoding) and chardet:
       
  2554         try:
       
  2555             proposed_encoding = chardet.detect(data)['encoding']
       
  2556             if proposed_encoding and (proposed_encoding not in tried_encodings):
       
  2557                 tried_encodings.append(proposed_encoding)
       
  2558                 data = _toUTF8(data, proposed_encoding)
       
  2559                 known_encoding = use_strict_parser = 1
       
  2560         except:
       
  2561             pass
       
  2562     # if still no luck and we haven't tried utf-8 yet, try that
       
  2563     if (not known_encoding) and ('utf-8' not in tried_encodings):
       
  2564         try:
       
  2565             proposed_encoding = 'utf-8'
       
  2566             tried_encodings.append(proposed_encoding)
       
  2567             data = _toUTF8(data, proposed_encoding)
       
  2568             known_encoding = use_strict_parser = 1
       
  2569         except:
       
  2570             pass
       
  2571     # if still no luck and we haven't tried windows-1252 yet, try that
       
  2572     if (not known_encoding) and ('windows-1252' not in tried_encodings):
       
  2573         try:
       
  2574             proposed_encoding = 'windows-1252'
       
  2575             tried_encodings.append(proposed_encoding)
       
  2576             data = _toUTF8(data, proposed_encoding)
       
  2577             known_encoding = use_strict_parser = 1
       
  2578         except:
       
  2579             pass
       
  2580     # if still no luck, give up
       
  2581     if not known_encoding:
       
  2582         result['bozo'] = 1
       
  2583         result['bozo_exception'] = CharacterEncodingUnknown( \
       
  2584             'document encoding unknown, I tried ' + \
       
  2585             '%s, %s, utf-8, and windows-1252 but nothing worked' % \
       
  2586             (result['encoding'], xml_encoding))
       
  2587         result['encoding'] = ''
       
  2588     elif proposed_encoding != result['encoding']:
       
  2589         result['bozo'] = 1
       
  2590         result['bozo_exception'] = CharacterEncodingOverride( \
       
   2591             'document declared as %s, but parsed as %s' % \
       
  2592             (result['encoding'], proposed_encoding))
       
  2593         result['encoding'] = proposed_encoding
       
  2594 
       
  2595     if not _XML_AVAILABLE:
       
  2596         use_strict_parser = 0
       
  2597     if use_strict_parser:
       
  2598         # initialize the SAX parser
       
  2599         feedparser = _StrictFeedParser(baseuri, baselang, 'utf-8')
       
  2600         saxparser = xml.sax.make_parser(PREFERRED_XML_PARSERS)
       
  2601         saxparser.setFeature(xml.sax.handler.feature_namespaces, 1)
       
  2602         saxparser.setContentHandler(feedparser)
       
  2603         saxparser.setErrorHandler(feedparser)
       
  2604         source = xml.sax.xmlreader.InputSource()
       
  2605         source.setByteStream(_StringIO(data))
       
  2606         if hasattr(saxparser, '_ns_stack'):
       
  2607             # work around bug in built-in SAX parser (doesn't recognize xml: namespace)
       
  2608             # PyXML doesn't have this problem, and it doesn't have _ns_stack either
       
  2609             saxparser._ns_stack.append({'http://www.w3.org/XML/1998/namespace':'xml'})
       
  2610         try:
       
  2611             saxparser.parse(source)
       
  2612         except Exception, e:
       
  2613             if _debug:
       
  2614                 import traceback
       
  2615                 traceback.print_stack()
       
  2616                 traceback.print_exc()
       
  2617                 sys.stderr.write('xml parsing failed\n')
       
  2618             result['bozo'] = 1
       
  2619             result['bozo_exception'] = feedparser.exc or e
       
  2620             use_strict_parser = 0
       
  2621     if not use_strict_parser:
       
  2622         feedparser = _LooseFeedParser(baseuri, baselang, known_encoding and 'utf-8' or '')
       
  2623         feedparser.feed(data)
       
  2624     result['feed'] = feedparser.feeddata
       
  2625     result['entries'] = feedparser.entries
       
  2626     result['version'] = result['version'] or feedparser.version
       
  2627     result['namespaces'] = feedparser.namespacesInUse
       
  2628     return result
       
  2629 
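        # Example (illustrative; the URL is a placeholder): typical usage is
        #
        #     d = parse('http://example.org/atom10.xml')
        #     d['feed'].get('title')      # feed-level metadata
        #     d['entries']                # list of entry dictionaries
        #     d['bozo']                   # 1 if the feed was not well-formed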
       
  2630 if __name__ == '__main__':
       
  2631     if not sys.argv[1:]:
       
  2632         print __doc__
       
  2633         sys.exit(0)
       
  2634     else:
       
  2635         urls = sys.argv[1:]
       
  2636     zopeCompatibilityHack()
       
  2637     from pprint import pprint
       
  2638     for url in urls:
       
  2639         print url
       
  2640         print
       
  2641         result = parse(url)
       
  2642         pprint(result)
       
  2643         print
       
  2644 
       
  2645 #REVISION HISTORY
       
  2646 #1.0 - 9/27/2002 - MAP - fixed namespace processing on prefixed RSS 2.0 elements,
       
  2647 #  added Simon Fell's test suite
       
  2648 #1.1 - 9/29/2002 - MAP - fixed infinite loop on incomplete CDATA sections
       
  2649 #2.0 - 10/19/2002
       
  2650 #  JD - use inchannel to watch out for image and textinput elements which can
       
  2651 #  also contain title, link, and description elements
       
  2652 #  JD - check for isPermaLink='false' attribute on guid elements
       
  2653 #  JD - replaced openAnything with open_resource supporting ETag and
       
  2654 #  If-Modified-Since request headers
       
  2655 #  JD - parse now accepts etag, modified, agent, and referrer optional
       
  2656 #  arguments
       
  2657 #  JD - modified parse to return a dictionary instead of a tuple so that any
       
  2658 #  etag or modified information can be returned and cached by the caller
       
  2659 #2.0.1 - 10/21/2002 - MAP - changed parse() so that if we don't get anything
       
  2660 #  because of etag/modified, return the old etag/modified to the caller to
       
  2661 #  indicate why nothing is being returned
       
   2662 #2.0.2 - 10/21/2002 - JB - added the inchannel to the if statement, otherwise it's
       
  2663 #  useless.  Fixes the problem JD was addressing by adding it.
       
  2664 #2.1 - 11/14/2002 - MAP - added gzip support
       
  2665 #2.2 - 1/27/2003 - MAP - added attribute support, admin:generatorAgent.
       
  2666 #  start_admingeneratoragent is an example of how to handle elements with
       
  2667 #  only attributes, no content.
       
  2668 #2.3 - 6/11/2003 - MAP - added USER_AGENT for default (if caller doesn't specify);
       
  2669 #  also, make sure we send the User-Agent even if urllib2 isn't available.
       
  2670 #  Match any variation of backend.userland.com/rss namespace.
       
  2671 #2.3.1 - 6/12/2003 - MAP - if item has both link and guid, return both as-is.
       
  2672 #2.4 - 7/9/2003 - MAP - added preliminary Pie/Atom/Echo support based on Sam Ruby's
       
  2673 #  snapshot of July 1 <http://www.intertwingly.net/blog/1506.html>; changed
       
  2674 #  project name
       
  2675 #2.5 - 7/25/2003 - MAP - changed to Python license (all contributors agree);
       
  2676 #  removed unnecessary urllib code -- urllib2 should always be available anyway;
       
  2677 #  return actual url, status, and full HTTP headers (as result['url'],
       
  2678 #  result['status'], and result['headers']) if parsing a remote feed over HTTP --
       
  2679 #  this should pass all the HTTP tests at <http://diveintomark.org/tests/client/http/>;
       
  2680 #  added the latest namespace-of-the-week for RSS 2.0
       
  2681 #2.5.1 - 7/26/2003 - RMK - clear opener.addheaders so we only send our custom
       
  2682 #  User-Agent (otherwise urllib2 sends two, which confuses some servers)
       
  2683 #2.5.2 - 7/28/2003 - MAP - entity-decode inline xml properly; added support for
       
  2684 #  inline <xhtml:body> and <xhtml:div> as used in some RSS 2.0 feeds
       
  2685 #2.5.3 - 8/6/2003 - TvdV - patch to track whether we're inside an image or
       
  2686 #  textInput, and also to return the character encoding (if specified)
       
  2687 #2.6 - 1/1/2004 - MAP - dc:author support (MarekK); fixed bug tracking
       
  2688 #  nested divs within content (JohnD); fixed missing sys import (JohanS);
       
  2689 #  fixed regular expression to capture XML character encoding (Andrei);
       
  2690 #  added support for Atom 0.3-style links; fixed bug with textInput tracking;
       
  2691 #  added support for cloud (MartijnP); added support for multiple
       
  2692 #  category/dc:subject (MartijnP); normalize content model: 'description' gets
       
  2693 #  description (which can come from description, summary, or full content if no
       
  2694 #  description), 'content' gets dict of base/language/type/value (which can come
       
  2695 #  from content:encoded, xhtml:body, content, or fullitem);
       
  2696 #  fixed bug matching arbitrary Userland namespaces; added xml:base and xml:lang
       
  2697 #  tracking; fixed bug tracking unknown tags; fixed bug tracking content when
       
  2698 #  <content> element is not in default namespace (like Pocketsoap feed);
       
  2699 #  resolve relative URLs in link, guid, docs, url, comments, wfw:comment,
       
  2700 #  wfw:commentRSS; resolve relative URLs within embedded HTML markup in
       
  2701 #  description, xhtml:body, content, content:encoded, title, subtitle,
       
  2702 #  summary, info, tagline, and copyright; added support for pingback and
       
  2703 #  trackback namespaces
       
  2704 #2.7 - 1/5/2004 - MAP - really added support for trackback and pingback
       
  2705 #  namespaces, as opposed to 2.6 when I said I did but didn't really;
       
  2706 #  sanitize HTML markup within some elements; added mxTidy support (if
       
  2707 #  installed) to tidy HTML markup within some elements; fixed indentation
       
  2708 #  bug in _parse_date (FazalM); use socket.setdefaulttimeout if available
       
   2709 #  (FazalM); universal date parsing and normalization (FazalM): 'created', 'modified',
       
  2710 #  'issued' are parsed into 9-tuple date format and stored in 'created_parsed',
       
  2711 #  'modified_parsed', and 'issued_parsed'; 'date' is duplicated in 'modified'
       
  2712 #  and vice-versa; 'date_parsed' is duplicated in 'modified_parsed' and vice-versa
       
  2713 #2.7.1 - 1/9/2004 - MAP - fixed bug handling &quot; and &apos;.  fixed memory
       
  2714 #  leak not closing url opener (JohnD); added dc:publisher support (MarekK);
       
  2715 #  added admin:errorReportsTo support (MarekK); Python 2.1 dict support (MarekK)
       
  2716 #2.7.4 - 1/14/2004 - MAP - added workaround for improperly formed <br/> tags in
       
  2717 #  encoded HTML (skadz); fixed unicode handling in normalize_attrs (ChrisL);
       
  2718 #  fixed relative URI processing for guid (skadz); added ICBM support; added
       
  2719 #  base64 support
       
  2720 #2.7.5 - 1/15/2004 - MAP - added workaround for malformed DOCTYPE (seen on many
       
  2721 #  blogspot.com sites); added _debug variable
       
  2722 #2.7.6 - 1/16/2004 - MAP - fixed bug with StringIO importing
       
  2723 #3.0b3 - 1/23/2004 - MAP - parse entire feed with real XML parser (if available);
       
  2724 #  added several new supported namespaces; fixed bug tracking naked markup in
       
  2725 #  description; added support for enclosure; added support for source; re-added
       
  2726 #  support for cloud which got dropped somehow; added support for expirationDate
       
  2727 #3.0b4 - 1/26/2004 - MAP - fixed xml:lang inheritance; fixed multiple bugs tracking
       
  2728 #  xml:base URI, one for documents that don't define one explicitly and one for
       
  2729 #  documents that define an outer and an inner xml:base that goes out of scope
       
  2730 #  before the end of the document
       
  2731 #3.0b5 - 1/26/2004 - MAP - fixed bug parsing multiple links at feed level
       
  2732 #3.0b6 - 1/27/2004 - MAP - added feed type and version detection, result['version']
       
  2733 #  will be one of SUPPORTED_VERSIONS.keys() or empty string if unrecognized;
       
  2734 #  added support for creativeCommons:license and cc:license; added support for
       
  2735 #  full Atom content model in title, tagline, info, copyright, summary; fixed bug
       
  2736 #  with gzip encoding (not always telling server we support it when we do)
       
  2737 #3.0b7 - 1/28/2004 - MAP - support Atom-style author element in author_detail
       
  2738 #  (dictionary of 'name', 'url', 'email'); map author to author_detail if author
       
  2739 #  contains name + email address
       
  2740 #3.0b8 - 1/28/2004 - MAP - added support for contributor
       
  2741 #3.0b9 - 1/29/2004 - MAP - fixed check for presence of dict function; added
       
  2742 #  support for summary
       
  2743 #3.0b10 - 1/31/2004 - MAP - incorporated ISO-8601 date parsing routines from
       
  2744 #  xml.util.iso8601
       
  2745 #3.0b11 - 2/2/2004 - MAP - added 'rights' to list of elements that can contain
       
  2746 #  dangerous markup; fiddled with decodeEntities (not right); liberalized
       
  2747 #  date parsing even further
       
  2748 #3.0b12 - 2/6/2004 - MAP - fiddled with decodeEntities (still not right);
       
  2749 #  added support to Atom 0.2 subtitle; added support for Atom content model
       
  2750 #  in copyright; better sanitizing of dangerous HTML elements with end tags
       
  2751 #  (script, frameset)
       
  2752 #3.0b13 - 2/8/2004 - MAP - better handling of empty HTML tags (br, hr, img,
       
  2753 #  etc.) in embedded markup, in either HTML or XHTML form (<br>, <br/>, <br />)
       
  2754 #3.0b14 - 2/8/2004 - MAP - fixed CDATA handling in non-wellformed feeds under
       
  2755 #  Python 2.1
       
  2756 #3.0b15 - 2/11/2004 - MAP - fixed bug resolving relative links in wfw:commentRSS;
       
  2757 #  fixed bug capturing author and contributor URL; fixed bug resolving relative
       
   2758 #  links in author and contributor URL; fixed bug resolving relative links in
       
  2759 #  generator URL; added support for recognizing RSS 1.0; passed Simon Fell's
       
  2760 #  namespace tests, and included them permanently in the test suite with his
       
  2761 #  permission; fixed namespace handling under Python 2.1
       
  2762 #3.0b16 - 2/12/2004 - MAP - fixed support for RSS 0.90 (broken in b15)
       
  2763 #3.0b17 - 2/13/2004 - MAP - determine character encoding as per RFC 3023
       
  2764 #3.0b18 - 2/17/2004 - MAP - always map description to summary_detail (Andrei);
       
  2765 #  use libxml2 (if available)
       
  2766 #3.0b19 - 3/15/2004 - MAP - fixed bug exploding author information when author
       
  2767 #  name was in parentheses; removed ultra-problematic mxTidy support; patch to
       
  2768 #  workaround crash in PyXML/expat when encountering invalid entities
       
  2769 #  (MarkMoraes); support for textinput/textInput
       
  2770 #3.0b20 - 4/7/2004 - MAP - added CDF support
       
  2771 #3.0b21 - 4/14/2004 - MAP - added Hot RSS support
       
  2772 #3.0b22 - 4/19/2004 - MAP - changed 'channel' to 'feed', 'item' to 'entries' in
       
  2773 #  results dict; changed results dict to allow getting values with results.key
       
  2774 #  as well as results[key]; work around embedded illformed HTML with half
       
  2775 #  a DOCTYPE; work around malformed Content-Type header; if character encoding
       
  2776 #  is wrong, try several common ones before falling back to regexes (if this
       
  2777 #  works, bozo_exception is set to CharacterEncodingOverride); fixed character
       
  2778 #  encoding issues in BaseHTMLProcessor by tracking encoding and converting
       
  2779 #  from Unicode to raw strings before feeding data to sgmllib.SGMLParser;
       
  2780 #  convert each value in results to Unicode (if possible), even if using
       
  2781 #  regex-based parsing
       
  2782 #3.0b23 - 4/21/2004 - MAP - fixed UnicodeDecodeError for feeds that contain
       
  2783 #  high-bit characters in attributes in embedded HTML in description (thanks
       
  2784 #  Thijs van de Vossen); moved guid, date, and date_parsed to mapped keys in
       
  2785 #  FeedParserDict; tweaked FeedParserDict.has_key to return True if asking
       
  2786 #  about a mapped key
       
  2787 #3.0fc1 - 4/23/2004 - MAP - made results.entries[0].links[0] and
       
  2788 #  results.entries[0].enclosures[0] into FeedParserDict; fixed typo that could
       
  2789 #  cause the same encoding to be tried twice (even if it failed the first time);
       
  2790 #  fixed DOCTYPE stripping when DOCTYPE contained entity declarations;
       
  2791 #  better textinput and image tracking in illformed RSS 1.0 feeds
       
  2792 #3.0fc2 - 5/10/2004 - MAP - added and passed Sam's amp tests; added and passed
       
  2793 #  my blink tag tests
       
  2794 #3.0fc3 - 6/18/2004 - MAP - fixed bug in _changeEncodingDeclaration that
       
  2795 #  failed to parse utf-16 encoded feeds; made source into a FeedParserDict;
       
  2796 #  duplicate admin:generatorAgent/@rdf:resource in generator_detail.url;
       
  2797 #  added support for image; refactored parse() fallback logic to try other
       
  2798 #  encodings if SAX parsing fails (previously it would only try other encodings
       
  2799 #  if re-encoding failed); remove unichr madness in normalize_attrs now that
       
  2800 #  we're properly tracking encoding in and out of BaseHTMLProcessor; set
       
  2801 #  feed.language from root-level xml:lang; set entry.id from rdf:about;
       
  2802 #  send Accept header
       
  2803 #3.0 - 6/21/2004 - MAP - don't try iso-8859-1 (can't distinguish between
       
  2804 #  iso-8859-1 and windows-1252 anyway, and most incorrectly marked feeds are
       
  2805 #  windows-1252); fixed regression that could cause the same encoding to be
       
  2806 #  tried twice (even if it failed the first time)
       
  2807 #3.0.1 - 6/22/2004 - MAP - default to us-ascii for all text/* content types;
       
  2808 #  recover from malformed content-type header parameter with no equals sign
       
  2809 #  ('text/xml; charset:iso-8859-1')
       
  2810 #3.1 - 6/28/2004 - MAP - added and passed tests for converting HTML entities
       
  2811 #  to Unicode equivalents in illformed feeds (aaronsw); added and
       
  2812 #  passed tests for converting character entities to Unicode equivalents
       
  2813 #  in illformed feeds (aaronsw); test for valid parsers when setting
       
  2814 #  XML_AVAILABLE; make version and encoding available when server returns
       
  2815 #  a 304; add handlers parameter to pass arbitrary urllib2 handlers (like
       
  2816 #  digest auth or proxy support); add code to parse username/password
       
  2817 #  out of url and send as basic authentication; expose downloading-related
       
  2818 #  exceptions in bozo_exception (aaronsw); added __contains__ method to
       
  2819 #  FeedParserDict (aaronsw); added publisher_detail (aaronsw)
       
  2820 #3.2 - 7/3/2004 - MAP - use cjkcodecs and iconv_codec if available; always
       
  2821 #  convert feed to UTF-8 before passing to XML parser; completely revamped
       
  2822 #  logic for determining character encoding and attempting XML parsing
       
  2823 #  (much faster); increased default timeout to 20 seconds; test for presence
       
  2824 #  of Location header on redirects; added tests for many alternate character
       
  2825 #  encodings; support various EBCDIC encodings; support UTF-16BE and
       
   2826 #  UTF-16LE with or without a BOM; support UTF-8 with a BOM; support
       
  2827 #  UTF-32BE and UTF-32LE with or without a BOM; fixed crashing bug if no
       
  2828 #  XML parsers are available; added support for 'Content-encoding: deflate';
       
  2829 #  send blank 'Accept-encoding: ' header if neither gzip nor zlib modules
       
  2830 #  are available
       
  2831 #3.3 - 7/15/2004 - MAP - optimize EBCDIC to ASCII conversion; fix obscure
       
  2832 #  problem tracking xml:base and xml:lang if element declares it, child
       
  2833 #  doesn't, first grandchild redeclares it, and second grandchild doesn't;
       
  2834 #  refactored date parsing; defined public registerDateHandler so callers
       
  2835 #  can add support for additional date formats at runtime; added support
       
  2836 #  for OnBlog, Nate, MSSQL, Greek, and Hungarian dates (ytrewq1); added
       
  2837 #  zopeCompatibilityHack() which turns FeedParserDict into a regular
       
  2838 #  dictionary, required for Zope compatibility, and also makes command-
       
  2839 #  line debugging easier because pprint module formats real dictionaries
       
  2840 #  better than dictionary-like objects; added NonXMLContentType exception,
       
  2841 #  which is stored in bozo_exception when a feed is served with a non-XML
       
  2842 #  media type such as 'text/plain'; respect Content-Language as default
       
  2843 #  language if not xml:lang is present; cloud dict is now FeedParserDict;
       
  2844 #  generator dict is now FeedParserDict; better tracking of xml:lang,
       
  2845 #  including support for xml:lang='' to unset the current language;
       
  2846 #  recognize RSS 1.0 feeds even when RSS 1.0 namespace is not the default
       
  2847 #  namespace; don't overwrite final status on redirects (scenarios:
       
  2848 #  redirecting to a URL that returns 304, redirecting to a URL that
       
  2849 #  redirects to another URL with a different type of redirect); add
       
  2850 #  support for HTTP 303 redirects
       
  2851 #4.0 - MAP - support for relative URIs in xml:base attribute; fixed
       
  2852 #  encoding issue with mxTidy (phopkins); preliminary support for RFC 3229;
       
  2853 #  support for Atom 1.0; support for iTunes extensions; new 'tags' for
       
  2854 #  categories/keywords/etc. as array of dict
       
  2855 #  {'term': term, 'scheme': scheme, 'label': label} to match Atom 1.0
       
  2856 #  terminology; parse RFC 822-style dates with no time; lots of other
       
  2857 #  bug fixes
       
  2858 #4.1 - MAP - removed socket timeout; added support for chardet library